From 6acfa43ad9e254f10e653d9e84316c52842fb768 Mon Sep 17 00:00:00 2001
From: Chirag Patil
Date: Sun, 14 Jun 2026 13:14:40 +0530
Subject: [PATCH 1/2] feat: stream MkDocs sites by URL
- Accept deployed MkDocs root URLs in the listen command
- Discover canonical pages from sitemap.xml
- Enforce bounded HTTPS fetching and versioned caching
- Add offline tests with complete line and branch coverage
---
README.md | 21 +-
binge_docs/cli.py | 60 +--
binge_docs/documentation_fetching.py | 244 +++++++++++++
binge_docs/documentation_sources.py | 525 ++++++++++-----------------
binge_docs/errors.py | 4 +
binge_docs/speech.py | 4 +-
pyproject.toml | 16 +-
tests/test_cli.py | 368 +++++++++++++++++++
tests/test_documentation_fetching.py | 254 +++++++++++++
tests/test_documentation_sources.py | 304 ++++++++++++++++
tests/test_model_store.py | 152 ++++++++
tests/test_narration.py | 85 +++++
tests/test_speech.py | 214 +++++++++++
uv.lock | 189 +++++++++-
14 files changed, 2047 insertions(+), 393 deletions(-)
create mode 100644 binge_docs/documentation_fetching.py
create mode 100644 tests/test_cli.py
create mode 100644 tests/test_documentation_fetching.py
create mode 100644 tests/test_documentation_sources.py
create mode 100644 tests/test_model_store.py
create mode 100644 tests/test_narration.py
create mode 100644 tests/test_speech.py
diff --git a/README.md b/README.md
index d5be41d..10db012 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Binge Docs 🎧
-Listen to official FastAPI and Typer documentation from your terminal.
+Listen to deployed MkDocs documentation from your terminal.
Binge Docs removes visual content, then streams the useful prose through
Kokoro.
@@ -30,24 +30,26 @@ The first playback downloads about 120 MB of Kokoro model files.
## 📚 Listen
-Browse a built-in source interactively:
+Browse any deployed MkDocs site interactively:
```bash
-binge-docs fastapi
-binge-docs typer
+binge-docs listen https://fastapi.tiangolo.com/
+binge-docs listen https://doughayden.github.io/agent-foundation/
```
-Open a page directly with a slug or official URL:
+Open a page directly with a relative path or full URL:
```bash
-binge-docs fastapi --page tutorial/first-steps
-binge-docs typer --page https://typer.tiangolo.com/tutorial/first-steps/
+binge-docs listen https://sqlmodel.tiangolo.com/ --page tutorial/
+binge-docs listen https://typer.tiangolo.com/ \
+ --page https://typer.tiangolo.com/tutorial/first-steps/
```
Use `--voice` and `--speed` to customize playback:
```bash
-binge-docs fastapi --page tutorial/first-steps --voice bf_emma --speed 1.2
+binge-docs listen https://fastapi.tiangolo.com/ \
+ --page tutorial/first-steps/ --voice bf_emma --speed 1.2
```
Speed must be between `0.5` and `2.0`. Defaults are `af_heart` and `1.0`.
@@ -62,8 +64,7 @@ Speed must be between `0.5` and `2.0`. Defaults are `af_heart` and `1.0`.
## 🛠️ Commands
```text
-binge-docs fastapi [--page PAGE] [--voice VOICE] [--speed SPEED]
-binge-docs typer [--page PAGE] [--voice VOICE] [--speed SPEED]
+binge-docs listen MKDOCS_URL [--page PAGE] [--voice VOICE] [--speed SPEED]
binge-docs voices
binge-docs setup
binge-docs --version
diff --git a/binge_docs/cli.py b/binge_docs/cli.py
index 97fa653..8790703 100644
--- a/binge_docs/cli.py
+++ b/binge_docs/cli.py
@@ -27,11 +27,9 @@
from binge_docs import __version__
from binge_docs.documentation_sources import (
- FASTAPI_SOURCE,
- TYPER_SOURCE,
CatalogResult,
DocumentationPage,
- DocumentationSource,
+ MkDocsSource,
)
from binge_docs.errors import BingeDocsError
from binge_docs.model_store import ModelPaths, ModelStore
@@ -47,7 +45,7 @@
app = typer.Typer(
name="binge-docs",
- help="Choose official documentation and listen from your terminal.",
+ help="Choose a deployed MkDocs site and listen from your terminal.",
no_args_is_help=True,
add_completion=True,
)
@@ -78,43 +76,18 @@ def main(
"""Stream official documentation as speech."""
-@app.command("fastapi")
-def fastapi_command(
- page: Annotated[
- str | None,
- typer.Option(
- "--page",
- "-p",
- help="Page slug or full fastapi.tiangolo.com URL.",
- ),
- ] = None,
- voice: Annotated[
+@app.command()
+def listen(
+ mkdocs_url: Annotated[
str,
- typer.Option("--voice", "-v", help="Kokoro US or UK English voice."),
- ] = DEFAULT_VOICE,
- speed: Annotated[
- float,
- typer.Option("--speed", "-s", help="Playback speed from 0.5 to 2.0."),
- ] = 1.0,
-) -> None:
- """Choose and stream one page from the official FastAPI documentation."""
-
- _stream_source_command(
- FASTAPI_SOURCE,
- page=page,
- voice=voice,
- speed=speed,
- )
-
-
-@app.command("typer")
-def typer_command(
+ typer.Argument(help="HTTPS root URL of a deployed MkDocs site."),
+ ],
page: Annotated[
str | None,
typer.Option(
"--page",
"-p",
- help="Page slug or full typer.tiangolo.com URL.",
+ help="Page path or full URL within the MkDocs site.",
),
] = None,
voice: Annotated[
@@ -126,18 +99,17 @@ def typer_command(
typer.Option("--speed", "-s", help="Playback speed from 0.5 to 2.0."),
] = 1.0,
) -> None:
- """Choose and stream one page from the official Typer documentation."""
+ """Choose and stream one page from a deployed MkDocs site."""
- _stream_source_command(
- TYPER_SOURCE,
- page=page,
- voice=voice,
- speed=speed,
- )
+ try:
+ source = MkDocsSource(mkdocs_url)
+ except BingeDocsError as error:
+ _exit_with_error(str(error))
+ _stream_source_command(source, page=page, voice=voice, speed=speed)
def _stream_source_command(
- source: DocumentationSource,
+ source: MkDocsSource,
*,
page: str | None,
voice: str,
@@ -405,5 +377,5 @@ def _handle_playback_key(key: str, playback_controller: PlaybackController) -> N
console.print("[yellow]Stopping playback...[/yellow]")
-if __name__ == "__main__":
+if __name__ == "__main__": # pragma: no cover
app()
diff --git a/binge_docs/documentation_fetching.py b/binge_docs/documentation_fetching.py
new file mode 100644
index 0000000..940e14d
--- /dev/null
+++ b/binge_docs/documentation_fetching.py
@@ -0,0 +1,244 @@
+"""Secure fetching and caching for documentation content."""
+
+from __future__ import annotations
+
+import time
+from collections.abc import Callable
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.parse import unquote, urljoin, urlparse
+
+import httpx
+from platformdirs import user_cache_path
+
+DEFAULT_CACHE_TTL_SECONDS = 24 * 60 * 60
+DEFAULT_MAX_RESPONSE_BYTES = 10 * 1024 * 1024
+DEFAULT_MAX_REDIRECTS = 5
+
+
+@dataclass(frozen=True)
+class CachedContent:
+ """Content loaded from the network or a stale local cache."""
+
+ # Raw payload bytes, either freshly downloaded or read back from the cache file.
+ content: bytes
+ # True only when the download failed and an expired cache entry was served instead.
+ used_stale_cache: bool
+
+
+@dataclass(frozen=True)
+class FetchedDocument:
+ """Fetched bytes and the validated URL that produced them."""
+
+ # Response body, already checked against the fetcher's size limit.
+ content: bytes
+ # Policy-validated URL that served the body, after any redirects were followed.
+ final_url: str
+
+
+@dataclass(frozen=True)
+class URLPolicy:
+    """Restrict which documentation URLs can be fetched.
+
+    A URL is accepted only when its ``https://<host>`` origin is listed in
+    ``allowed_origins`` and its normalized path lies under one of
+    ``allowed_path_prefixes``.
+    """
+
+    allowed_origins: tuple[str, ...]
+    allowed_path_prefixes: tuple[str, ...] = ("/",)
+
+    def validate_url(self, url: str) -> str:
+        """Return a normalized URL if it satisfies the policy.
+
+        Args:
+            url: Absolute URL to check.
+
+        Returns:
+            The origin followed by the normalized (percent-decoded) path.
+
+        Raises:
+            ValueError: If the URL is not HTTPS, carries credentials, a custom port,
+                a query string or a fragment (literal or percent-encoded), or falls
+                outside the allowed origins or path prefixes.
+        """
+
+        parsed = urlparse(url)
+        if parsed.scheme != "https":
+            raise ValueError("Only HTTPS documentation pages are supported.")
+
+        if parsed.username or parsed.password:
+            raise ValueError("Documentation URLs cannot include user credentials.")
+
+        if parsed.port not in (None, 443):
+            raise ValueError("Documentation URLs cannot include a custom port.")
+
+        if parsed.query:
+            raise ValueError("Documentation URLs cannot include a query string.")
+
+        if parsed.fragment:
+            raise ValueError("Documentation URLs cannot include a fragment.")
+
+        origin = f"{parsed.scheme}://{parsed.hostname}"
+        if origin not in self.allowed_origins:
+            raise ValueError("This documentation URL is outside the allowed source origin.")
+
+        normalized_path = normalize_path(parsed.path or "/")
+        # normalize_path percent-decodes the path, so an encoded "?" or "#" (%3F / %23)
+        # would reappear as a real delimiter in the returned URL and slip past the
+        # query and fragment checks above.
+        if "?" in normalized_path or "#" in normalized_path:
+            raise ValueError(
+                "Documentation URLs cannot include an encoded query string or fragment."
+            )
+
+        if not path_matches_prefix(normalized_path, self.allowed_path_prefixes):
+            raise ValueError("This documentation URL is outside the allowed source path.")
+
+        return f"{origin}{normalized_path}"
+
+
+class _UnsafeRedirectError(OSError, ValueError):
+    """Raised when a redirect target violates the URL policy.
+
+    It is an ``OSError`` so callers that treat download failures as ``OSError`` also
+    cover rejected redirects, and a ``ValueError`` so handlers written for policy
+    violations keep working.
+    """
+
+
+class SecureWebFetcher:
+    """Fetch remote content with explicit URL validation and redirect handling."""
+
+    def __init__(
+        self,
+        *,
+        max_response_bytes: int = DEFAULT_MAX_RESPONSE_BYTES,
+        max_redirects: int = DEFAULT_MAX_REDIRECTS,
+        timeout: httpx.Timeout | None = None,
+        client_factory: type[httpx.Client] = httpx.Client,
+    ) -> None:
+        """Store the size, redirect and timeout limits applied to every fetch."""
+
+        self.max_response_bytes = max_response_bytes
+        self.max_redirects = max_redirects
+        self.timeout = timeout or httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
+        self.client_factory = client_factory
+
+    def fetch_bytes(self, url: str, policy: URLPolicy) -> FetchedDocument:
+        """Fetch one validated document without following unsafe redirects.
+
+        Redirects are followed manually so that every hop is re-validated against
+        ``policy`` before it is requested.
+
+        Raises:
+            ValueError: If the initial ``url`` violates the policy.
+            OSError: If the download fails, the redirect limit is exceeded, a redirect
+                has no location or points outside the policy, or the body is too large.
+        """
+
+        current_url = policy.validate_url(url)
+        redirect_count = 0
+
+        with self.client_factory(follow_redirects=False, timeout=self.timeout) as client:
+            while True:
+                try:
+                    with client.stream("GET", current_url) as response:
+                        if response.status_code in {301, 302, 303, 307, 308}:
+                            location = response.headers.get("location")
+                            if not location:
+                                raise OSError("Redirect response did not include a location.")
+
+                            redirect_count += 1
+                            if redirect_count > self.max_redirects:
+                                raise OSError("Too many redirects while fetching documentation.")
+
+                            current_url = self._validated_redirect_target(
+                                current_url,
+                                location,
+                                policy,
+                            )
+                            continue
+
+                        response.raise_for_status()
+                        return FetchedDocument(
+                            content=read_bounded_response(
+                                response,
+                                max_response_bytes=self.max_response_bytes,
+                            ),
+                            final_url=current_url,
+                        )
+                except httpx.HTTPError as error:
+                    raise OSError(f"Could not download {current_url}") from error
+
+    @staticmethod
+    def _validated_redirect_target(current_url: str, location: str, policy: URLPolicy) -> str:
+        """Resolve a Location header against the current URL and re-check the policy."""
+
+        try:
+            return policy.validate_url(urljoin(current_url, location))
+        except ValueError as error:
+            # Surface a rejected hop as a download failure: callers only handle OSError,
+            # so a bare ValueError here would escape as an unhandled traceback.
+            raise _UnsafeRedirectError(
+                f"Refusing to follow a redirect from {current_url} "
+                "to a URL outside the allowed documentation source."
+            ) from error
+
+
+class WebCache:
+    """Small file cache with stale-on-network-failure behavior."""
+
+    def __init__(
+        self,
+        cache_dir: Path | None = None,
+        ttl_seconds: int = DEFAULT_CACHE_TTL_SECONDS,
+        clock: Callable[[], float] = time.time,
+    ) -> None:
+        """Configure the cache directory, entry lifetime and time source."""
+
+        self.cache_dir = cache_dir or user_cache_path("binge-docs")
+        self.ttl_seconds = ttl_seconds
+        self.clock = clock
+
+    def get(self, url: str, cache_key: str, fetcher: Callable[[str], bytes]) -> CachedContent:
+        """Return fresh content, using stale cache only for download failures.
+
+        Args:
+            url: Address handed to ``fetcher`` when the entry must be refreshed.
+            cache_key: Path of the entry relative to ``cache_dir``.
+            fetcher: Callable that downloads ``url`` and signals failure with ``OSError``.
+
+        Raises:
+            OSError: If the download fails and no cached copy exists.
+        """
+
+        cache_path = self.cache_dir / cache_key
+        cached_content = self._read(cache_path)
+        cache_is_fresh = cached_content is not None and self.is_fresh(cache_path)
+
+        if cache_is_fresh:
+            return CachedContent(cached_content, used_stale_cache=False)
+
+        try:
+            content = fetcher(url)
+        except OSError as error:
+            if cached_content is not None:
+                return CachedContent(cached_content, used_stale_cache=True)
+            raise OSError(f"Could not download {url}") from error
+
+        self._write_atomically(cache_path, content)
+        return CachedContent(content, used_stale_cache=False)
+
+    def is_fresh(self, cache_path: Path) -> bool:
+        """Return whether a file is younger than the cache TTL.
+
+        A file that does not exist is reported as not fresh instead of raising.
+        """
+
+        try:
+            modified_at = cache_path.stat().st_mtime
+        except FileNotFoundError:
+            # The entry was never written, or was removed after the caller read it.
+            return False
+
+        age_seconds = self.clock() - modified_at
+        return age_seconds < self.ttl_seconds
+
+    @staticmethod
+    def _read(cache_path: Path) -> bytes | None:
+        """Return the cached bytes, or None when the entry does not exist."""
+
+        try:
+            return cache_path.read_bytes()
+        except FileNotFoundError:
+            return None
+
+    @staticmethod
+    def _write_atomically(cache_path: Path, content: bytes) -> None:
+        """Write to a sibling ``.tmp`` file, then rename it over the target."""
+
+        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        temporary_path = cache_path.with_suffix(f"{cache_path.suffix}.tmp")
+        try:
+            temporary_path.write_bytes(content)
+            temporary_path.replace(cache_path)
+        finally:
+            # After a successful replace the temporary file is gone; this only
+            # cleans up when the write or rename failed part-way.
+            temporary_path.unlink(missing_ok=True)
+
+
+def read_bounded_response(response: httpx.Response, *, max_response_bytes: int) -> bytes:
+    """Collect a streamed body, refusing anything larger than the limit.
+
+    The limit is applied to the chunks yielded by ``response.iter_bytes()``, and the
+    read is abandoned as soon as the running total would exceed it.
+
+    Raises:
+        OSError: If the body is larger than ``max_response_bytes``.
+    """
+
+    body = bytearray()
+
+    for piece in response.iter_bytes():
+        if len(body) + len(piece) > max_response_bytes:
+            raise OSError("Documentation content exceeded the maximum supported size.")
+        body += piece
+
+    return bytes(body)
+
+
+def normalize_path(path: str) -> str:
+    """Return a decoded, slash-collapsed path, rejecting ``.`` and ``..`` segments.
+
+    The path is percent-decoded, given a leading slash, and stripped of empty
+    segments. A trailing slash on the input is kept, except on the root path.
+
+    Raises:
+        ValueError: If any segment is ``.`` or ``..``.
+    """
+
+    decoded = unquote(path or "/")
+    if not decoded.startswith("/"):
+        decoded = "/" + decoded
+
+    parts = [part for part in decoded.split("/") if part]
+    if any(part in (".", "..") for part in parts):
+        raise ValueError("Documentation paths cannot contain traversal segments.")
+
+    collapsed = "/" + "/".join(parts)
+    keep_trailing_slash = decoded.endswith("/") and collapsed != "/"
+    return f"{collapsed}/" if keep_trailing_slash else collapsed
+
+
+def normalize_path_prefix(prefix: str) -> str:
+    """Normalize an allowed prefix so that it always ends with a slash."""
+
+    prefix_path = normalize_path(prefix or "/")
+    if prefix_path == "/" or prefix_path.endswith("/"):
+        return prefix_path
+    return prefix_path + "/"
+
+
+def path_matches_prefix(path: str, allowed_prefixes: tuple[str, ...]) -> bool:
+    """Return whether a path is the root of, or nested under, any allowed prefix."""
+
+    candidate = normalize_path(path)
+
+    def covers(prefix: str) -> bool:
+        # A normalized prefix always ends with "/", so dropping the last character
+        # gives the prefix directory itself written without a trailing slash.
+        root = normalize_path_prefix(prefix)
+        return root == "/" or candidate == root[:-1] or candidate.startswith(root)
+
+    return any(covers(prefix) for prefix in allowed_prefixes)
diff --git a/binge_docs/documentation_sources.py b/binge_docs/documentation_sources.py
index 4cd8fd5..527e177 100644
--- a/binge_docs/documentation_sources.py
+++ b/binge_docs/documentation_sources.py
@@ -1,28 +1,27 @@
-"""Documentation source loading, normalization, and caching."""
+"""Load and cache documentation from a deployed MkDocs site."""
from __future__ import annotations
import hashlib
import json
-import time
-from collections.abc import Callable
-from dataclasses import asdict, dataclass, field
-from pathlib import Path, PurePosixPath
-from typing import Protocol
+from dataclasses import asdict, dataclass
+from pathlib import Path
from urllib.parse import urljoin, urlparse
+from xml.etree import ElementTree
-import httpx
from bs4 import BeautifulSoup
-from platformdirs import user_cache_path
-from binge_docs.errors import CatalogError, PageError
+from binge_docs.documentation_fetching import SecureWebFetcher, URLPolicy, WebCache, normalize_path
+from binge_docs.errors import CatalogError, PageError, SourceError
from binge_docs.narration import extract_narration
-DEFAULT_CACHE_TTL_SECONDS = 24 * 60 * 60
-DEFAULT_NAVIGATION_SELECTOR = "nav.md-nav--primary"
-DEFAULT_ARTICLE_SELECTOR = "article.md-content__inner"
-
-FetchBytes = Callable[[str], bytes]
+CATALOG_SNAPSHOT_VERSION = 4
+ARTICLE_SELECTORS = (
+ "article.md-content__inner",
+ '[role="main"]',
+ "main",
+ "article",
+)
@dataclass(frozen=True)
@@ -36,14 +35,6 @@ class DocumentationPage:
section_title: str
-@dataclass(frozen=True)
-class CachedContent:
- """Content loaded from the network or local cache."""
-
- content: bytes
- used_stale_cache: bool
-
-
@dataclass(frozen=True)
class CatalogResult:
"""Documentation catalog and cache status."""
@@ -61,161 +52,81 @@ class PageResult:
used_stale_cache: bool
-@dataclass(frozen=True)
-class MkDocsSourceDefinition:
- """Declarative configuration for one built-in MkDocs documentation site."""
-
- key: str
- display_name: str
- base_url: str
- allowed_host: str
- cache_namespace: str
- navigation_selector: str = DEFAULT_NAVIGATION_SELECTOR
- article_selector: str = DEFAULT_ARTICLE_SELECTOR
- section_titles: dict[str, str] = field(default_factory=dict)
- root_page_sections: dict[str, str] = field(default_factory=dict)
-
-
-class DocumentationSource(Protocol):
- """Small contract for a documentation source."""
-
- key: str
- display_name: str
-
- def load_catalog(self) -> CatalogResult:
- """Load the published catalog for this source."""
-
- def load_page(
- self,
- page_reference: str,
- pages: tuple[DocumentationPage, ...],
- ) -> PageResult:
- """Resolve and load one page from the known catalog."""
-
- def extract_narration(self, html: str) -> str:
- """Extract narratable prose from one rendered page."""
-
-
-def fetch_bytes(url: str) -> bytes:
- """Fetch a URL using bounded timeouts and redirects."""
-
- timeout = httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0)
- with httpx.Client(follow_redirects=True, timeout=timeout) as client:
- response = client.get(url)
- response.raise_for_status()
- return response.content
-
-
-class WebCache:
- """Small file cache with stale-on-network-failure behavior."""
-
- def __init__(
- self,
- cache_dir: Path | None = None,
- ttl_seconds: int = DEFAULT_CACHE_TTL_SECONDS,
- fetcher: FetchBytes = fetch_bytes,
- clock: Callable[[], float] = time.time,
- ) -> None:
- self.cache_dir = cache_dir or user_cache_path("binge-docs")
- self.ttl_seconds = ttl_seconds
- self.fetcher = fetcher
- self.clock = clock
-
- def get(self, url: str, cache_key: str) -> CachedContent:
- """Return fresh content, refreshing stale entries when possible."""
-
- cache_path = self.cache_dir / cache_key
- cached_content = self._read(cache_path)
- cache_is_fresh = cached_content is not None and self._is_fresh(cache_path)
-
- if cache_is_fresh:
- return CachedContent(cached_content, used_stale_cache=False)
-
- try:
- content = self.fetcher(url)
- except Exception as error:
- if cached_content is not None:
- return CachedContent(cached_content, used_stale_cache=True)
- raise OSError(f"Could not download {url}") from error
-
- self._write_atomically(cache_path, content)
- return CachedContent(content, used_stale_cache=False)
-
- def _is_fresh(self, cache_path: Path) -> bool:
- age_seconds = self.clock() - cache_path.stat().st_mtime
- return age_seconds < self.ttl_seconds
-
- @staticmethod
- def _read(cache_path: Path) -> bytes | None:
- try:
- return cache_path.read_bytes()
- except FileNotFoundError:
- return None
-
- @staticmethod
- def _write_atomically(cache_path: Path, content: bytes) -> None:
- cache_path.parent.mkdir(parents=True, exist_ok=True)
- temporary_path = cache_path.with_suffix(f"{cache_path.suffix}.tmp")
- try:
- temporary_path.write_bytes(content)
- temporary_path.replace(cache_path)
- finally:
- temporary_path.unlink(missing_ok=True)
-
-
-class MkDocsDocumentationSource:
- """Load a published MkDocs documentation catalog and its pages."""
+class MkDocsSource:
+ """Discover and load one deployed MkDocs documentation site."""
def __init__(
self,
- definition: MkDocsSourceDefinition,
+ base_url: str,
+ *,
cache: WebCache | None = None,
+ fetcher: SecureWebFetcher | None = None,
) -> None:
- self.definition = definition
+ self.base_url = normalize_base_url(base_url)
+ parsed = urlparse(self.base_url)
+ self.display_name = f"{parsed.hostname}{parsed.path}".rstrip("/")
+ self.url_policy = URLPolicy(
+ allowed_origins=(f"https://{parsed.hostname}",),
+ allowed_path_prefixes=(parsed.path,),
+ )
self.cache = cache or WebCache()
- self.key = definition.key
- self.display_name = definition.display_name
+ self.fetcher = fetcher or SecureWebFetcher()
+ self.cache_namespace = hashlib.sha256(self.base_url.encode("utf-8")).hexdigest()[:16]
def load_catalog(self) -> CatalogResult:
- """Load and parse the source's published navigation."""
+ """Load the cached or published MkDocs sitemap."""
- catalog_cache_path = self._catalog_snapshot_path()
- html_result = self._load_catalog_html()
+ snapshot_path = self._catalog_snapshot_path()
+ cached_pages = self._read_catalog_snapshot(snapshot_path)
+ if cached_pages is not None and self.cache.is_fresh(snapshot_path):
+ return CatalogResult(cached_pages, used_stale_cache=False)
+ sitemap_url = urljoin(self.base_url, "sitemap.xml")
try:
- html = html_result.content.decode("utf-8")
- pages = parse_mkdocs_catalog(html, self.definition)
- except (UnicodeDecodeError, ValueError) as error:
- cached_pages = self._read_catalog_snapshot(catalog_cache_path)
- if cached_pages:
+ fetched_sitemap = self.fetcher.fetch_bytes(sitemap_url, self.url_policy)
+ except OSError as error:
+ if cached_pages is not None:
return CatalogResult(cached_pages, used_stale_cache=True)
raise CatalogError(
- f"{self.display_name}'s documentation catalog could not be understood."
+ f"The MkDocs sitemap at {sitemap_url} could not be downloaded. "
+ "Check the URL and your internet connection."
+ ) from error
+
+ try:
+ pages = parse_sitemap(fetched_sitemap.content, self.base_url, self.url_policy)
+ except (ElementTree.ParseError, UnicodeDecodeError, ValueError) as error:
+ raise CatalogError(
+ f"The sitemap at {sitemap_url} could not be understood."
) from error
- self._write_catalog_snapshot(catalog_cache_path, pages)
- return CatalogResult(pages, html_result.used_stale_cache)
+ self._write_catalog_snapshot(snapshot_path, pages)
+ return CatalogResult(pages, used_stale_cache=False)
def load_page(
self,
page_reference: str,
pages: tuple[DocumentationPage, ...],
) -> PageResult:
- """Resolve and load one page from the known catalog."""
+ """Resolve and load one page from the sitemap."""
- slug = normalize_page_reference(page_reference, self.definition)
- page = next((candidate for candidate in pages if candidate.slug == slug), None)
+ page_slug = resolve_page_slug(page_reference, self.base_url, self.url_policy)
+ page = next((candidate for candidate in pages if candidate.slug == page_slug), None)
if page is None:
raise PageError(
- f"Unknown {self.display_name} documentation page '{page_reference}'. "
- f"Run 'binge-docs {self.key}' to choose an available page."
+ f"Unknown MkDocs page '{page_reference}'. "
+ f"Run 'binge-docs listen {self.base_url}' to choose an available page."
)
cache_name = hashlib.sha256(page.url.encode("utf-8")).hexdigest() + ".html"
- cache_key = f"{self.definition.cache_namespace}/pages/{cache_name}"
-
+ cache_key = (
+ f"mkdocs/{self.cache_namespace}/v{CATALOG_SNAPSHOT_VERSION}/pages/{cache_name}"
+ )
try:
- result = self.cache.get(page.url, cache_key)
+ result = self.cache.get(
+ page.url,
+ cache_key,
+ lambda url: self.fetcher.fetch_bytes(url, self.url_policy).content,
+ )
except OSError as error:
raise PageError(
"This page is not cached and could not be downloaded. "
@@ -230,221 +141,179 @@ def load_page(
return PageResult(page, html, result.used_stale_cache)
def extract_narration(self, html: str) -> str:
- """Extract narratable prose from one rendered page."""
-
- return extract_narration(
- html,
- article_selector=self.definition.article_selector,
- source_display_name=self.display_name,
+ """Extract prose from the first recognized MkDocs content root."""
+
+ soup = BeautifulSoup(html, "html.parser")
+ for selector in ARTICLE_SELECTORS:
+ if soup.select_one(selector) is not None:
+ return extract_narration(
+ html,
+ article_selector=selector,
+ source_display_name=self.display_name,
+ )
+ raise PageError(
+ f"The selected {self.display_name} page does not contain a readable article."
)
def _catalog_snapshot_path(self) -> Path:
- cache_dir = self.cache.cache_dir
- namespace = self.definition.cache_namespace
- return cache_dir / namespace / "catalog.json"
-
- def _load_catalog_html(self) -> CachedContent:
- cache_key = f"{self.definition.cache_namespace}/catalog-source.html"
- try:
- return self.cache.get(self.definition.base_url, cache_key)
- except OSError as error:
- raise CatalogError(
- f"The {self.display_name} catalog is not cached and could not be downloaded. "
- "Check your internet connection and try again."
- ) from error
+ return self.cache.cache_dir / "mkdocs" / self.cache_namespace / "catalog.json"
- @staticmethod
- def _read_catalog_snapshot(
- snapshot_path: Path,
- ) -> tuple[DocumentationPage, ...] | None:
+ def _read_catalog_snapshot(self, snapshot_path: Path) -> tuple[DocumentationPage, ...] | None:
try:
- raw_pages = json.loads(snapshot_path.read_text(encoding="utf-8"))
+ snapshot = json.loads(snapshot_path.read_text(encoding="utf-8"))
except (FileNotFoundError, json.JSONDecodeError, OSError):
return None
+ if (
+ snapshot.get("version") != CATALOG_SNAPSHOT_VERSION
+ or snapshot.get("base_url") != self.base_url
+ ):
+ return None
+
+ raw_pages = snapshot.get("pages")
+ if not isinstance(raw_pages, list):
+ return None
+
try:
return tuple(DocumentationPage(**raw_page) for raw_page in raw_pages)
- except (TypeError, KeyError):
+ except TypeError:
return None
- @staticmethod
def _write_catalog_snapshot(
+ self,
snapshot_path: Path,
pages: tuple[DocumentationPage, ...],
) -> None:
- serialized_pages = json.dumps([asdict(page) for page in pages], indent=2)
- WebCache._write_atomically(snapshot_path, serialized_pages.encode("utf-8"))
-
-
-def parse_mkdocs_catalog(
- html: str,
- definition: MkDocsSourceDefinition,
+ snapshot = json.dumps(
+ {
+ "version": CATALOG_SNAPSHOT_VERSION,
+ "base_url": self.base_url,
+ "pages": [asdict(page) for page in pages],
+ },
+ indent=2,
+ sort_keys=True,
+ )
+ WebCache._write_atomically(snapshot_path, snapshot.encode("utf-8"))
+
+
+def normalize_base_url(url: str) -> str:
+    """Validate and normalize a deployed MkDocs root URL.
+
+    Returns:
+        ``https://<hostname><path>`` where the path is normalized and ends with a slash.
+
+    Raises:
+        SourceError: If the URL is not HTTPS, lacks a hostname, carries credentials,
+            a custom or invalid port, a query string or fragment (literal or
+            percent-encoded), or contains traversal segments.
+    """
+
+    parsed = urlparse(url.strip())
+    try:
+        port = parsed.port
+    except ValueError as error:
+        raise SourceError("The MkDocs URL contains an invalid port.") from error
+
+    if parsed.scheme != "https":
+        raise SourceError("The MkDocs URL must use HTTPS.")
+    if not parsed.hostname:
+        raise SourceError("The MkDocs URL must include a hostname.")
+    if parsed.username or parsed.password:
+        raise SourceError("The MkDocs URL cannot include user credentials.")
+    if port not in (None, 443):
+        raise SourceError("The MkDocs URL cannot include a custom port.")
+    if parsed.query or parsed.fragment:
+        raise SourceError("The MkDocs URL cannot include a query string or fragment.")
+
+    try:
+        path = normalize_path(parsed.path or "/")
+    except ValueError as error:
+        raise SourceError("The MkDocs URL cannot contain traversal segments.") from error
+    # normalize_path percent-decodes the path; an encoded "?" or "#" would turn into a
+    # real delimiter here and make the returned URL re-parse with a different path.
+    if "?" in path or "#" in path:
+        raise SourceError("The MkDocs URL cannot include a query string or fragment.")
+    if not path.endswith("/"):
+        path = f"{path}/"
+
+    return f"https://{parsed.hostname}{path}"
+
+
+def parse_sitemap(
+ content: bytes,
+ base_url: str,
+ policy: URLPolicy,
) -> tuple[DocumentationPage, ...]:
- """Extract ordered documentation pages from an MkDocs primary navigation."""
-
- soup = BeautifulSoup(html, "html.parser")
- navigation = soup.select_one(definition.navigation_selector)
- if navigation is None:
- raise ValueError("Primary documentation navigation was not found.")
+ """Parse canonical pages from a MkDocs sitemap."""
+ root = ElementTree.fromstring(content.decode("utf-8"))
pages: list[DocumentationPage] = []
- seen_slugs: set[str] = set()
+ seen_urls: set[str] = set()
- for link in navigation.select("a.md-nav__link[href]"):
- href = link.get("href", "").strip()
- if not href or href.startswith("#"):
+ for location in root.findall(".//{*}loc"):
+ if location.text is None:
continue
-
- absolute_url = urljoin(definition.base_url, href)
try:
- slug = normalize_page_reference(absolute_url, definition)
- except PageError:
- continue
-
- if not slug or slug in seen_slugs:
+ page_url = policy.validate_url(location.text.strip())
+ except ValueError:
continue
-
- title = " ".join(link.get_text(" ", strip=True).split())
- if not title:
+ if page_url in seen_urls:
continue
- section = section_for_slug(slug, definition)
- section_title = definition.section_titles.get(
- section,
- section.replace("-", " ").title(),
- )
- pages.append(
- DocumentationPage(
- title=title,
- slug=slug,
- url=urljoin(definition.base_url, f"{slug}/"),
- section=section,
- section_title=section_title,
- )
- )
- seen_slugs.add(slug)
+ relative_path = relative_page_path(page_url, base_url)
+ pages.append(page_from_url(page_url, relative_path))
+ seen_urls.add(page_url)
if not pages:
- raise ValueError("No documentation pages were found.")
-
+ raise ValueError("The sitemap does not contain pages for this documentation site.")
return tuple(pages)
-def normalize_page_reference(
- page_reference: str,
- definition: MkDocsSourceDefinition,
-) -> str:
- """Normalize a source URL or page slug to a canonical slug."""
-
- reference = page_reference.strip()
- if not reference:
- raise PageError(f"A {definition.display_name} documentation page is required.")
+def relative_page_path(page_url: str, base_url: str) -> str:
+ """Return a page path relative to the documentation root."""
- parsed = urlparse(reference)
- is_url = bool(parsed.scheme or parsed.netloc)
+ page_path = urlparse(page_url).path
+ base_path = urlparse(base_url).path
+ if page_path == base_path.rstrip("/"):
+ return ""
+ if not page_path.startswith(base_path):
+ raise ValueError("The page URL is outside the documentation root.")
+ return page_path[len(base_path) :].strip("/")
- if is_url:
- is_https = parsed.scheme == "https"
- is_allowed_host = parsed.netloc == definition.allowed_host
- if not is_https or not is_allowed_host:
- raise PageError(f"Only {definition.base_url} pages are supported.")
- path = parsed.path
- else:
- path = reference
- normalized_path = str(PurePosixPath("/" + path.strip("/")))
- slug = normalized_path.strip("/")
+def page_from_url(page_url: str, relative_path: str) -> DocumentationPage:
+ """Create display metadata from a sitemap URL."""
- if slug in {"", "."}:
- raise PageError(
- f"The {definition.display_name} home page is not a selectable documentation page."
+ if not relative_path:
+ return DocumentationPage(
+ title="Home",
+ slug="/",
+ url=page_url,
+ section="home",
+ section_title="Home",
)
- return slug
-
-
-def section_for_slug(slug: str, definition: MkDocsSourceDefinition) -> str:
- """Return the user-facing catalog section for a page slug."""
-
- first_segment = slug.split("/", maxsplit=1)[0]
- return definition.root_page_sections.get(first_segment, first_segment)
-
-
-FASTAPI_SOURCE = MkDocsDocumentationSource(
- MkDocsSourceDefinition(
- key="fastapi",
- display_name="FastAPI",
- base_url="https://fastapi.tiangolo.com/",
- allowed_host="fastapi.tiangolo.com",
- cache_namespace="fastapi",
- section_titles={
- "overview": "Overview",
- "learn": "Learn",
- "tutorial": "Tutorial",
- "advanced": "Advanced",
- "fastapi-cli": "FastAPI CLI",
- "editor-support": "Editor Support",
- "deployment": "Deployment",
- "how-to": "How-To",
- "reference": "Reference",
- "resources": "Resources",
- "about": "About",
- "release-notes": "Release Notes",
- },
- root_page_sections={
- "features": "overview",
- "learn": "learn",
- "python-types": "learn",
- "async": "learn",
- "environment-variables": "learn",
- "virtual-environments": "learn",
- "fastapi-cli": "fastapi-cli",
- "editor-support": "editor-support",
- "fastapi-people": "resources",
- "help-fastapi": "resources",
- "contributing": "resources",
- "translations": "resources",
- "project-generation": "resources",
- "external-links": "resources",
- "newsletter": "resources",
- "alternatives": "about",
- "history-design-future": "about",
- "benchmarks": "about",
- "management": "about",
- "release-notes": "release-notes",
- },
+ segments = relative_path.split("/")
+ title = title_from_segment(segments[-1])
+ section = segments[0]
+ return DocumentationPage(
+ title=title,
+ slug=relative_path,
+ url=page_url,
+ section=section,
+ section_title=title_from_segment(section),
)
-)
-TYPER_SOURCE = MkDocsDocumentationSource(
- MkDocsSourceDefinition(
- key="typer",
- display_name="Typer",
- base_url="https://typer.tiangolo.com/",
- allowed_host="typer.tiangolo.com",
- cache_namespace="typer",
- section_titles={
- "features": "Features",
- "learn": "Learn",
- "reference": "Reference",
- "resources": "Resources",
- "about": "About",
- "release-notes": "Release Notes",
- },
- root_page_sections={
- "features": "features",
- "tutorial": "learn",
- "environment-variables": "learn",
- "virtual-environments": "learn",
- "reference": "reference",
- "resources": "resources",
- "help-typer": "resources",
- "contributing": "resources",
- "about": "about",
- "alternatives": "about",
- "management": "about",
- "release-notes": "release-notes",
- },
- )
-)
+def title_from_segment(segment: str) -> str:
+    """Turn a URL path segment into a title-cased label.
+
+    A trailing ``.html`` is dropped and hyphens and underscores become spaces.
+    """
+
+    stem = segment.removesuffix(".html")
+    words = stem.translate(str.maketrans("-_", "  "))
+    return words.title()
+
+
+def resolve_page_slug(page_reference: str, base_url: str, policy: URLPolicy) -> str:
+    """Map a user-supplied page path or absolute URL to its catalog slug.
+
+    Returns:
+        The page path relative to ``base_url`` without surrounding slashes, or ``"/"``
+        for the documentation root.
+
+    Raises:
+        PageError: If the reference is blank or is rejected by ``policy``.
+    """
+
+    cleaned = page_reference.strip()
+    if not cleaned:
+        raise PageError("A MkDocs page is required.")
+
+    if cleaned == "/":
+        target = base_url
+    elif "://" in cleaned:
+        target = cleaned
+    else:
+        target = urljoin(base_url, cleaned)
+
+    try:
+        validated_url = policy.validate_url(target)
+    except ValueError as error:
+        raise PageError(f"Only pages within {base_url} are supported.") from error
+
+    slug = relative_page_path(validated_url, base_url).removesuffix("/")
+    return slug if slug else "/"
diff --git a/binge_docs/errors.py b/binge_docs/errors.py
index f628e2a..e8b75e7 100644
--- a/binge_docs/errors.py
+++ b/binge_docs/errors.py
@@ -13,6 +13,10 @@ class PageError(BingeDocsError):
"""Raised when a documentation page cannot be loaded or parsed."""
+class SourceError(BingeDocsError):
+    """Signal that a documentation source URL was rejected as invalid."""
+
+
class ModelError(BingeDocsError):
"""Raised when speech model assets cannot be prepared."""
diff --git a/binge_docs/speech.py b/binge_docs/speech.py
index d609796..8a6d9f8 100644
--- a/binge_docs/speech.py
+++ b/binge_docs/speech.py
@@ -239,7 +239,7 @@ def _split_paragraph(paragraph: str) -> list[str]:
segments.append(current_segment)
current_segment = sentence
- if current_segment:
+ if current_segment: # pragma: no branch
segments.append(current_segment)
return segments
@@ -267,7 +267,7 @@ def _merge_segments_for_streaming(segments: list[str]) -> list[str]:
merged_segments.append(current_batch)
current_batch = segment
- if current_batch:
+ if current_batch: # pragma: no branch
merged_segments.append(current_batch)
return merged_segments
diff --git a/pyproject.toml b/pyproject.toml
index 651455d..301f975 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,11 +26,6 @@ dependencies = [
[project.scripts]
binge-docs = "binge_docs.cli:app"
-[project.optional-dependencies]
-dev = [
- "ruff>=0.12,<1",
-]
-
[tool.hatch.build.targets.wheel]
packages = ["binge_docs"]
@@ -40,3 +35,14 @@ line-length = 100
[tool.ruff.lint]
select = ["E", "F", "I", "UP", "B", "SIM"]
+
+[dependency-groups]
+dev = [
+ "ruff>=0.12,<1",
+ "pytest>=9.1.0",
+ "pytest-cov>=7.1.0",
+]
+
+[tool.pytest.ini_options]
+addopts = "--cov=binge_docs --cov-branch --cov-fail-under=100"
+testpaths = ["tests"]
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..16fde44
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,368 @@
+from __future__ import annotations
+
+import threading
+import types
+from pathlib import Path
+
+import pytest
+import typer
+from typer.testing import CliRunner
+
+from binge_docs import __version__, cli
+from binge_docs.documentation_sources import CatalogResult, DocumentationPage, PageResult
+from binge_docs.errors import BingeDocsError
+from binge_docs.model_store import ModelPaths
+from binge_docs.speech import PlaybackController
+
+runner = CliRunner()
+
+
+def make_model_paths() -> ModelPaths:
+ return ModelPaths(model=Path("/tmp/model.onnx"), voices=Path("/tmp/voices.bin"))
+
+
+class StubSource:  # in-memory stand-in patched over cli.MkDocsSource; returns fixed data only
+    display_name = "Stub Docs"
+
+    def load_catalog(self) -> CatalogResult:
+        return CatalogResult(
+            pages=(
+                DocumentationPage(
+                    title="First Steps",
+                    slug="tutorial/first-steps",
+                    url="https://docs.example.com/tutorial/first-steps/",
+                    section="tutorial",
+                    section_title="Tutorial",
+                ),
+            ),
+            used_stale_cache=False,
+        )
+
+    def load_page(self, page_reference: str, pages: tuple[DocumentationPage, ...]) -> PageResult:
+        if page_reference != "tutorial/first-steps":
+            raise BingeDocsError("bad page")
+        return PageResult(
+            page=pages[0],
+            html="<p>Hello</p>",  # NOTE(review): tags were stripped in this copy; confirm markup
+            used_stale_cache=False,
+        )
+
+    def extract_narration(self, html: str) -> str:
+        return "Narration"  # the html argument is ignored, so the stub's markup is immaterial
+
+
+class FakeThread:
+ def __init__(self, *args: object, **kwargs: object) -> None:
+ self.args = args
+ self.kwargs = kwargs
+ self.started = False
+ self.join_timeout: float | None = None
+
+ def start(self) -> None:
+ self.started = True
+
+ def join(self, timeout: float | None = None) -> None:
+ self.join_timeout = timeout
+
+
+class FakeStdin:
+ def __init__(self, chars: list[str], *, tty: bool = True) -> None:
+ self._chars = chars
+ self._tty = tty
+
+ def isatty(self) -> bool:
+ return self._tty
+
+ def fileno(self) -> int:
+ return 0
+
+ def read(self, _count: int) -> str:
+ return self._chars.pop(0)
+
+
+class FakePrompt:
+ def __init__(self, answer: str | None) -> None:
+ self.answer = answer
+
+ def ask(self) -> str | None:
+ return self.answer
+
+
+class FakeStdout:
+ def __init__(self) -> None:
+ self.output = ""
+
+ def isatty(self) -> bool:
+ return True
+
+ def write(self, text: str) -> int:
+ self.output += text
+ return len(text)
+
+ def flush(self) -> None:
+ return None
+
+
+def test_cli_listen_voices_and_version(monkeypatch) -> None:
+ monkeypatch.setattr(cli, "MkDocsSource", lambda _url: StubSource())
+ monkeypatch.setattr(cli, "_prepare_models", lambda verify_existing=False: make_model_paths())
+ monkeypatch.setattr(
+ cli,
+ "_playback_input_listener",
+ lambda _controller: cli.contextlib.nullcontext(),
+ )
+
+ async def fake_stream_speech(*_args, **_kwargs) -> None:
+ return None
+
+ monkeypatch.setattr(cli, "stream_speech", fake_stream_speech)
+
+ listen_result = runner.invoke(
+ cli.app,
+ ["listen", "https://docs.example.com/", "--page", "tutorial/first-steps"],
+ )
+ assert listen_result.exit_code == 0
+ assert "First Steps" in listen_result.stdout
+
+ voices_result = runner.invoke(cli.app, ["voices"])
+ assert voices_result.exit_code == 0
+ assert "af_heart" in voices_result.stdout
+
+ version_result = runner.invoke(cli.app, ["--version"])
+ assert version_result.exit_code == 0
+ assert f"binge-docs {__version__}" in version_result.stdout
+
+
+def test_cli_reports_errors_and_setup_paths(monkeypatch) -> None:
+ def bad_source(_url: str):
+ raise BingeDocsError("invalid MkDocs URL")
+
+ monkeypatch.setattr(cli, "MkDocsSource", bad_source)
+ error_result = runner.invoke(cli.app, ["listen", "not-a-url"])
+ assert error_result.exit_code == 1
+
+ monkeypatch.setattr(cli, "MkDocsSource", lambda _url: StubSource())
+ no_tty_result = runner.invoke(cli.app, ["listen", "https://docs.example.com/"])
+ assert no_tty_result.exit_code == 1
+ assert "Interactive selection requires a terminal." in no_tty_result.stderr
+
+ sources_result = runner.invoke(cli.app, ["sources"])
+ assert sources_result.exit_code == 2
+
+ def raise_keyboard_interrupt(*, verify_existing: bool = False) -> ModelPaths:
+ raise KeyboardInterrupt
+
+ monkeypatch.setattr(cli, "_prepare_models", raise_keyboard_interrupt)
+ setup_interrupt = runner.invoke(cli.app, ["setup"])
+ assert setup_interrupt.exit_code == 130
+
+ def raise_binge_docs_error(*, verify_existing: bool = False) -> ModelPaths:
+ raise BingeDocsError("boom")
+
+ monkeypatch.setattr(cli, "_prepare_models", raise_binge_docs_error)
+ setup_error = runner.invoke(cli.app, ["setup"])
+ assert setup_error.exit_code == 1
+
+
+def test_stream_source_command_handles_keyboard_interrupt_and_application_errors(
+ monkeypatch,
+) -> None:
+ source = StubSource()
+ monkeypatch.setattr(cli, "_prepare_models", lambda verify_existing=False: make_model_paths())
+ monkeypatch.setattr(
+ cli,
+ "_playback_input_listener",
+ lambda _controller: cli.contextlib.nullcontext(),
+ )
+
+ async def raise_keyboard_interrupt(*_args, **_kwargs) -> None:
+ raise KeyboardInterrupt
+
+ monkeypatch.setattr(cli, "stream_speech", raise_keyboard_interrupt)
+ with pytest.raises(typer.Exit) as stopped:
+ cli._stream_source_command(source, page="tutorial/first-steps", voice="af_heart", speed=1.0)
+ assert stopped.value.exit_code == 130
+
+ class BadSource(StubSource):
+ def load_catalog(self) -> CatalogResult:
+ raise BingeDocsError("catalog failed")
+
+ with pytest.raises(typer.Exit) as failed:
+ cli._stream_source_command(
+ BadSource(),
+ page="tutorial/first-steps",
+ voice="af_heart",
+ speed=1.0,
+ )
+ assert failed.value.exit_code == 1
+
+
+def test_select_page_prepare_models_and_setup_success(monkeypatch) -> None:
+ catalog = StubSource().load_catalog()
+ answers = iter(["tutorial", "tutorial/first-steps"])
+ monkeypatch.setattr(cli.sys, "stdin", FakeStdin([], tty=True))
+ monkeypatch.setattr(cli.sys, "stdout", FakeStdout())
+ monkeypatch.setattr(
+ cli.questionary,
+ "select",
+ lambda *_args, **_kwargs: FakePrompt(next(answers)),
+ )
+
+ assert cli._select_page(catalog) == "tutorial/first-steps"
+
+ monkeypatch.setattr(
+ cli.questionary,
+ "select",
+ lambda *_args, **_kwargs: FakePrompt(None),
+ )
+ with pytest.raises(KeyboardInterrupt):
+ cli._select_page(catalog)
+
+ answers = iter(["tutorial", None])
+ monkeypatch.setattr(
+ cli.questionary,
+ "select",
+ lambda *_args, **_kwargs: FakePrompt(next(answers)),
+ )
+ with pytest.raises(KeyboardInterrupt):
+ cli._select_page(catalog)
+
+ class FakeStore:
+ def ensure_assets(self, progress, *, verify_existing: bool) -> ModelPaths:
+ progress("model.bin", 1, 2)
+ progress("model.bin", 2, 2)
+ assert verify_existing is True
+ return make_model_paths()
+
+ monkeypatch.setattr(cli, "ModelStore", lambda: FakeStore())
+ assert cli._prepare_models(verify_existing=True) == make_model_paths()
+
+ monkeypatch.setattr(cli, "_prepare_models", lambda verify_existing=False: make_model_paths())
+ cli.setup()
+
+
+def test_playback_helpers_cover_listener_paths(monkeypatch) -> None:
+ controller = PlaybackController(1.0)
+ original_handle_playback_key = cli._handle_playback_key
+ original_windows_listener = cli._listen_for_playback_input_windows
+ original_posix_listener = cli._listen_for_playback_input_posix
+ assert "1.0x" in cli._playback_controls_message(1.0)
+
+ cli._warn_if_stale("Stub", True, "catalog")
+ cli._warn_if_stale("Stub", False, "catalog")
+
+ with pytest.raises(typer.Exit) as error:
+ cli._exit_with_error("nope")
+ assert error.value.exit_code == 1
+
+ monkeypatch.setattr(cli.sys, "stdin", FakeStdin([], tty=False))
+ with cli._playback_input_listener(controller):
+ pass
+
+ fake_thread = FakeThread()
+ monkeypatch.setattr(cli.sys, "stdin", FakeStdin([], tty=True))
+ monkeypatch.setattr(cli.threading, "Thread", lambda *args, **kwargs: fake_thread)
+ with cli._playback_input_listener(controller):
+ pass
+ assert fake_thread.started
+ assert fake_thread.join_timeout == 0.5
+
+ dispatches: list[str] = []
+ monkeypatch.setattr(cli.sys, "platform", "win32")
+ monkeypatch.setattr(
+ cli,
+ "_listen_for_playback_input_windows",
+ lambda *_args: dispatches.append("windows"),
+ )
+ cli._listen_for_playback_input(controller, threading.Event())
+
+ monkeypatch.setattr(cli.sys, "platform", "linux")
+ monkeypatch.setattr(
+ cli,
+ "_listen_for_playback_input_posix",
+ lambda *_args: dispatches.append("posix"),
+ )
+ cli._listen_for_playback_input(controller, threading.Event())
+ monkeypatch.setattr(
+ cli,
+ "_listen_for_playback_input_posix",
+ lambda *_args: (_ for _ in ()).throw(RuntimeError("boom")),
+ )
+ cli._listen_for_playback_input(controller, threading.Event())
+ assert dispatches == ["windows", "posix"]
+ monkeypatch.setattr(
+ cli,
+ "_listen_for_playback_input_windows",
+ original_windows_listener,
+ )
+ monkeypatch.setattr(
+ cli,
+ "_listen_for_playback_input_posix",
+ original_posix_listener,
+ )
+
+ stop_listener = threading.Event()
+ key_state = {"calls": 0}
+
+ def kbhit() -> bool:
+ key_state["calls"] += 1
+ return key_state["calls"] > 1
+
+ def getwch() -> str:
+ stop_listener.set()
+ return "q"
+
+ monkeypatch.setitem(
+ __import__("sys").modules,
+ "msvcrt",
+ types.SimpleNamespace(kbhit=kbhit, getwch=getwch),
+ )
+ monkeypatch.setattr(cli.time, "sleep", lambda _seconds: None)
+ cli._listen_for_playback_input_windows(controller, stop_listener)
+
+ posix_stdin = FakeStdin([""], tty=True)
+ monkeypatch.setattr(cli.sys, "stdin", posix_stdin)
+ select_answers = iter([([], [], []), ([posix_stdin], [], [])])
+ monkeypatch.setitem(
+ __import__("sys").modules,
+ "select",
+ types.SimpleNamespace(select=lambda *_args: next(select_answers)),
+ )
+ monkeypatch.setitem(
+ __import__("sys").modules,
+ "termios",
+ types.SimpleNamespace(
+ TCSADRAIN=1,
+ tcgetattr=lambda _fd: "state",
+ tcsetattr=lambda *_args: None,
+ ),
+ )
+ monkeypatch.setitem(
+ __import__("sys").modules,
+ "tty",
+ types.SimpleNamespace(setcbreak=lambda _fd: None),
+ )
+ cli._listen_for_playback_input_posix(controller, threading.Event())
+
+ ready_stdin = FakeStdin(["x"], tty=True)
+ posix_stop_listener = threading.Event()
+ monkeypatch.setattr(cli.sys, "stdin", ready_stdin)
+ monkeypatch.setitem(
+ __import__("sys").modules,
+ "select",
+ types.SimpleNamespace(select=lambda *_args: ([ready_stdin], [], [])),
+ )
+ monkeypatch.setattr(
+ cli,
+ "_handle_playback_key",
+ lambda _key, _controller: posix_stop_listener.set(),
+ )
+ cli._listen_for_playback_input_posix(controller, posix_stop_listener)
+ monkeypatch.setattr(cli, "_handle_playback_key", original_handle_playback_key)
+
+ cli._handle_playback_key(" ", controller)
+ cli._handle_playback_key(" ", controller)
+ cli._handle_playback_key("+", controller)
+ cli._handle_playback_key("-", controller)
+ cli._handle_playback_key("q", controller)
+ cli._handle_playback_key("x", controller)
diff --git a/tests/test_documentation_fetching.py b/tests/test_documentation_fetching.py
new file mode 100644
index 0000000..273d668
--- /dev/null
+++ b/tests/test_documentation_fetching.py
@@ -0,0 +1,254 @@
+from __future__ import annotations
+
+import os
+
+import httpx
+import pytest
+
+from binge_docs.documentation_fetching import (
+ SecureWebFetcher,
+ URLPolicy,
+ WebCache,
+ normalize_path,
+ normalize_path_prefix,
+ path_matches_prefix,
+ read_bounded_response,
+)
+
+
+class FakeResponse:
+ def __init__(
+ self,
+ *,
+ status_code: int = 200,
+ chunks: tuple[bytes, ...] = (),
+ headers: dict[str, str] | None = None,
+ request_url: str = "https://docs.example.com/page/",
+ ) -> None:
+ self.status_code = status_code
+ self.headers = headers or {}
+ self._chunks = chunks
+ self._request = httpx.Request("GET", request_url)
+
+ def __enter__(self) -> FakeResponse:
+ return self
+
+ def __exit__(self, *_args: object) -> None:
+ return None
+
+ def iter_bytes(self):
+ yield from self._chunks
+
+ def raise_for_status(self) -> None:
+ if self.status_code >= 400:
+ raise httpx.HTTPStatusError(
+ "bad response",
+ request=self._request,
+ response=httpx.Response(self.status_code, request=self._request),
+ )
+
+
+class FakeClient:
+ def __init__(self, responses: dict[str, FakeResponse], **_kwargs: object) -> None:
+ self.responses = responses
+ self.calls: list[str] = []
+
+ def __enter__(self) -> FakeClient:
+ return self
+
+ def __exit__(self, *_args: object) -> None:
+ return None
+
+ def stream(self, _method: str, url: str) -> FakeResponse:
+ self.calls.append(url)
+ return self.responses[url]
+
+
+def test_url_policy_validates_and_normalizes_urls() -> None:
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/guide/",),
+ )
+
+ assert (
+ policy.validate_url("https://docs.example.com:443/guide/start/")
+ == "https://docs.example.com/guide/start/"
+ )
+
+
+@pytest.mark.parametrize(
+ ("url", "allowed_prefixes"),
+ [
+ ("http://docs.example.com/guide/start/", ("/guide/",)),
+ ("https://user:pass@docs.example.com/guide/start/", ("/guide/",)),
+ ("https://docs.example.com:444/guide/start/", ("/guide/",)),
+ ("https://docs.example.com/guide/start/?q=1", ("/guide/",)),
+ ("https://docs.example.com/guide/start/#intro", ("/guide/",)),
+ ("https://evil.example.com/guide/start/", ("/guide/",)),
+ ("https://docs.example.com/private/start/", ("/guide/",)),
+ ("https://docs.example.com/guide/%2e%2e/secret/", ("/guide/",)),
+ ],
+)
+def test_url_policy_rejects_invalid_urls(url: str, allowed_prefixes: tuple[str, ...]) -> None:
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=allowed_prefixes,
+ )
+
+ with pytest.raises(ValueError):
+ policy.validate_url(url)
+
+
+def test_secure_web_fetcher_follows_validated_redirects() -> None:
+ responses = {
+ "https://docs.example.com/start/": FakeResponse(
+ status_code=302,
+ headers={"location": "/guide/page/"},
+ request_url="https://docs.example.com/start/",
+ ),
+ "https://docs.example.com/guide/page/": FakeResponse(
+ chunks=(b"hello ", b"world"),
+ request_url="https://docs.example.com/guide/page/",
+ ),
+ }
+ created_clients: list[FakeClient] = []
+
+ def client_factory(**kwargs: object) -> FakeClient:
+ client = FakeClient(responses, **kwargs)
+ created_clients.append(client)
+ return client
+
+ fetcher = SecureWebFetcher(client_factory=client_factory)
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/",),
+ )
+
+ result = fetcher.fetch_bytes("https://docs.example.com/start/", policy)
+
+ assert result.content == b"hello world"
+ assert result.final_url == "https://docs.example.com/guide/page/"
+ assert created_clients[0].calls == [
+ "https://docs.example.com/start/",
+ "https://docs.example.com/guide/page/",
+ ]
+
+
+def test_secure_web_fetcher_rejects_invalid_redirect_targets() -> None:
+ responses = {
+ "https://docs.example.com/start/": FakeResponse(
+ status_code=302,
+ headers={"location": "https://evil.example.com/page/"},
+ request_url="https://docs.example.com/start/",
+ )
+ }
+ fetcher = SecureWebFetcher(client_factory=lambda **kwargs: FakeClient(responses, **kwargs))
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/",),
+ )
+
+ with pytest.raises(ValueError):
+ fetcher.fetch_bytes("https://docs.example.com/start/", policy)
+
+
+def test_secure_web_fetcher_handles_redirect_edge_cases() -> None:
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/",),
+ )
+
+ no_location_responses = {
+ "https://docs.example.com/start/": FakeResponse(
+ status_code=302,
+ request_url="https://docs.example.com/start/",
+ )
+ }
+ no_location_fetcher = SecureWebFetcher(
+ client_factory=lambda **kwargs: FakeClient(no_location_responses, **kwargs)
+ )
+ with pytest.raises(OSError):
+ no_location_fetcher.fetch_bytes("https://docs.example.com/start/", policy)
+
+ loop_responses = {
+ "https://docs.example.com/start/": FakeResponse(
+ status_code=302,
+ headers={"location": "/start/"},
+ request_url="https://docs.example.com/start/",
+ )
+ }
+ loop_fetcher = SecureWebFetcher(
+ max_redirects=1,
+ client_factory=lambda **kwargs: FakeClient(loop_responses, **kwargs),
+ )
+ with pytest.raises(OSError):
+ loop_fetcher.fetch_bytes("https://docs.example.com/start/", policy)
+
+ error_responses = {
+ "https://docs.example.com/start/": FakeResponse(
+ status_code=500,
+ request_url="https://docs.example.com/start/",
+ )
+ }
+ error_fetcher = SecureWebFetcher(
+ client_factory=lambda **kwargs: FakeClient(error_responses, **kwargs)
+ )
+ with pytest.raises(OSError):
+ error_fetcher.fetch_bytes("https://docs.example.com/start/", policy)
+
+
+def test_read_bounded_response_enforces_response_limits() -> None:
+ response = FakeResponse(chunks=(b"abcd", b"efgh"))
+
+ with pytest.raises(OSError):
+ read_bounded_response(response, max_response_bytes=6)
+
+
+def test_web_cache_uses_fresh_and_stale_entries(tmp_path) -> None:
+ cache = WebCache(cache_dir=tmp_path, ttl_seconds=60)
+ cache_key = "docs/page.html"
+ fetch_calls: list[str] = []
+
+ fresh = cache.get(
+ "https://docs.example.com/page/",
+ cache_key,
+ lambda url: fetch_calls.append(url) or b"fresh",
+ )
+ assert fresh.content == b"fresh"
+ assert not fresh.used_stale_cache
+
+ still_fresh = cache.get(
+ "https://docs.example.com/page/",
+ cache_key,
+ lambda _url: (_ for _ in ()).throw(OSError("should not fetch")),
+ )
+ assert still_fresh.content == b"fresh"
+
+ os.utime(tmp_path / cache_key, (1, 1))
+ cache.clock = lambda: 120.0
+ stale = cache.get(
+ "https://docs.example.com/page/",
+ cache_key,
+ lambda _url: (_ for _ in ()).throw(OSError("offline")),
+ )
+ assert stale.content == b"fresh"
+ assert stale.used_stale_cache
+
+
+def test_web_cache_raises_when_no_cache_and_download_fails(tmp_path) -> None:
+ cache = WebCache(cache_dir=tmp_path)
+
+ with pytest.raises(OSError):
+ cache.get(
+ "https://docs.example.com/page/",
+ "docs/missing.html",
+ lambda _url: (_ for _ in ()).throw(OSError("offline")),
+ )
+
+
+def test_normalize_path_helpers_cover_prefix_logic() -> None:
+ assert normalize_path("guide/start/") == "/guide/start/"
+ assert normalize_path_prefix("/guide") == "/guide/"
+ assert path_matches_prefix("/guide", ("/guide/",))
+ assert path_matches_prefix("/guide/start/", ("/guide/",))
+ assert not path_matches_prefix("/reference/start/", ("/guide/",))
diff --git a/tests/test_documentation_sources.py b/tests/test_documentation_sources.py
new file mode 100644
index 0000000..2f796e0
--- /dev/null
+++ b/tests/test_documentation_sources.py
@@ -0,0 +1,304 @@
+from __future__ import annotations
+
+import os
+from xml.etree import ElementTree
+
+import pytest
+
+from binge_docs.documentation_fetching import FetchedDocument, URLPolicy, WebCache
+from binge_docs.documentation_sources import (
+ ARTICLE_SELECTORS,
+ CATALOG_SNAPSHOT_VERSION,
+ DocumentationPage,
+ MkDocsSource,
+ normalize_base_url,
+ page_from_url,
+ parse_sitemap,
+ relative_page_path,
+ resolve_page_slug,
+ title_from_segment,
+)
+from binge_docs.errors import CatalogError, PageError, SourceError
+
+SITEMAP = b"""
+
+ https://docs.example.com/guide/
+ https://docs.example.com/guide/getting-started/
+ https://docs.example.com/guide/tutorial/first_steps/
+ https://docs.example.com/guide/tutorial/first_steps/
+ https://docs.example.com/other/
+ https://external.example.com/guide/page/
+
+
+"""
+
+PAGE_HTML = b"""
+
+ First Steps
+ Hello docs.
+
+"""
+
+
+class StubFetcher:
+ def __init__(self, responses: dict[str, bytes | Exception]) -> None:
+ self.responses = responses
+ self.calls: list[str] = []
+
+ def fetch_bytes(self, url: str, policy: URLPolicy) -> FetchedDocument:
+ validated_url = policy.validate_url(url)
+ self.calls.append(validated_url)
+ response = self.responses[validated_url]
+ if isinstance(response, Exception):
+ raise response
+ return FetchedDocument(response, validated_url)
+
+
+def make_source(tmp_path, responses: dict[str, bytes | Exception]) -> MkDocsSource:
+ return MkDocsSource(
+ "https://docs.example.com/guide/",
+ cache=WebCache(cache_dir=tmp_path),
+ fetcher=StubFetcher(responses),
+ )
+
+
+def test_normalize_base_url_and_source_metadata(tmp_path) -> None:
+ assert normalize_base_url(" https://DOCS.example.com:443/guide ") == (
+ "https://docs.example.com/guide/"
+ )
+
+ source = make_source(tmp_path, {})
+ assert source.display_name == "docs.example.com/guide"
+ assert source.url_policy.allowed_path_prefixes == ("/guide/",)
+ assert len(source.cache_namespace) == 16
+
+
+@pytest.mark.parametrize(
+ "url",
+ [
+ "http://docs.example.com/",
+ "https:///guide/",
+ "https://user:pass@docs.example.com/",
+ "https://docs.example.com:444/",
+ "https://docs.example.com:bad/",
+ "https://docs.example.com/?q=1",
+ "https://docs.example.com/#intro",
+ "https://docs.example.com/%2e%2e/private/",
+ ],
+)
+def test_normalize_base_url_rejects_invalid_urls(url: str) -> None:
+ with pytest.raises(SourceError):
+ normalize_base_url(url)
+
+
+def test_parse_sitemap_preserves_pages_and_derives_labels() -> None:
+ base_url = "https://docs.example.com/guide/"
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/guide/",),
+ )
+
+ pages = parse_sitemap(SITEMAP, base_url, policy)
+
+ assert pages == (
+ DocumentationPage("Home", "/", base_url, "home", "Home"),
+ DocumentationPage(
+ "Getting Started",
+ "getting-started",
+ f"{base_url}getting-started/",
+ "getting-started",
+ "Getting Started",
+ ),
+ DocumentationPage(
+ "First Steps",
+ "tutorial/first_steps",
+ f"{base_url}tutorial/first_steps/",
+ "tutorial",
+ "Tutorial",
+ ),
+ )
+
+
+def test_parse_sitemap_rejects_bad_or_empty_documents() -> None:
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/guide/",),
+ )
+
+ with pytest.raises(ElementTree.ParseError):
+ parse_sitemap(b"", "https://docs.example.com/guide/", policy)
+ with pytest.raises(ValueError):
+ parse_sitemap(b"", "https://docs.example.com/guide/", policy)
+ with pytest.raises(UnicodeDecodeError):
+ parse_sitemap(b"\xff", "https://docs.example.com/guide/", policy)
+
+
+def test_source_loads_and_caches_catalog(tmp_path) -> None:
+ sitemap_url = "https://docs.example.com/guide/sitemap.xml"
+ fetcher = StubFetcher({sitemap_url: SITEMAP})
+ source = MkDocsSource(
+ "https://docs.example.com/guide/",
+ cache=WebCache(cache_dir=tmp_path),
+ fetcher=fetcher,
+ )
+
+ first = source.load_catalog()
+ second = source.load_catalog()
+
+ assert len(first.pages) == 3
+ assert not first.used_stale_cache
+ assert second.pages == first.pages
+ assert fetcher.calls == [sitemap_url]
+
+
+def test_source_uses_stale_catalog_only_for_download_failures(tmp_path) -> None:
+ sitemap_url = "https://docs.example.com/guide/sitemap.xml"
+ cache = WebCache(cache_dir=tmp_path, ttl_seconds=1)
+ source = MkDocsSource(
+ "https://docs.example.com/guide/",
+ cache=cache,
+ fetcher=StubFetcher({sitemap_url: SITEMAP}),
+ )
+ source.load_catalog()
+
+ snapshot_path = source._catalog_snapshot_path()
+ os.utime(snapshot_path, (1, 1))
+ cache.clock = lambda: 120.0
+ source.fetcher = StubFetcher({sitemap_url: OSError("offline")})
+ assert source.load_catalog().used_stale_cache
+
+ source.fetcher = StubFetcher({sitemap_url: b""})
+ with pytest.raises(CatalogError):
+ source.load_catalog()
+
+
+def test_source_reports_catalog_download_and_encoding_failures(tmp_path) -> None:
+ sitemap_url = "https://docs.example.com/guide/sitemap.xml"
+ source = make_source(tmp_path, {sitemap_url: OSError("offline")})
+ with pytest.raises(CatalogError):
+ source.load_catalog()
+
+ source = make_source(tmp_path / "utf8", {sitemap_url: b"\xff"})
+ with pytest.raises(CatalogError):
+ source.load_catalog()
+
+
+def test_source_ignores_invalid_catalog_snapshots(tmp_path) -> None:
+ source = make_source(tmp_path, {})
+ snapshot_path = source._catalog_snapshot_path()
+ snapshot_path.parent.mkdir(parents=True)
+
+ snapshots = [
+ "{",
+ '{"version": 999, "base_url": "https://docs.example.com/guide/", "pages": []}',
+ f'{{"version": {CATALOG_SNAPSHOT_VERSION}, "base_url": "wrong", "pages": []}}',
+ (
+ f'{{"version": {CATALOG_SNAPSHOT_VERSION}, '
+ '"base_url": "https://docs.example.com/guide/", "pages": {}}'
+ ),
+ (
+ f'{{"version": {CATALOG_SNAPSHOT_VERSION}, '
+ '"base_url": "https://docs.example.com/guide/", "pages": [{"title": "bad"}]}'
+ ),
+ ]
+ for snapshot in snapshots:
+ snapshot_path.write_text(snapshot, encoding="utf-8")
+ assert source._read_catalog_snapshot(snapshot_path) is None
+
+
+def test_source_loads_pages_by_path_or_url_and_uses_cache(tmp_path) -> None:
+ sitemap_url = "https://docs.example.com/guide/sitemap.xml"
+ page_url = "https://docs.example.com/guide/tutorial/first_steps/"
+ fetcher = StubFetcher({sitemap_url: SITEMAP, page_url: PAGE_HTML})
+ source = MkDocsSource(
+ "https://docs.example.com/guide/",
+ cache=WebCache(cache_dir=tmp_path),
+ fetcher=fetcher,
+ )
+ pages = source.load_catalog().pages
+
+ first = source.load_page("tutorial/first_steps", pages)
+ second = source.load_page(page_url, pages)
+
+ assert first.page.url == page_url
+ assert second.page == first.page
+ assert fetcher.calls == [sitemap_url, page_url]
+
+
+def test_source_page_errors_and_stale_cache(tmp_path) -> None:
+ sitemap_url = "https://docs.example.com/guide/sitemap.xml"
+ page_url = "https://docs.example.com/guide/getting-started/"
+ cache = WebCache(cache_dir=tmp_path, ttl_seconds=1)
+ source = MkDocsSource(
+ "https://docs.example.com/guide/",
+ cache=cache,
+ fetcher=StubFetcher({sitemap_url: SITEMAP, page_url: PAGE_HTML}),
+ )
+ pages = source.load_catalog().pages
+ source.load_page("getting-started/", pages)
+
+ page_cache = next(
+ (tmp_path / "mkdocs").glob(f"*/v{CATALOG_SNAPSHOT_VERSION}/pages/*.html")
+ )
+ os.utime(page_cache, (1, 1))
+ cache.clock = lambda: 120.0
+ source.fetcher = StubFetcher({page_url: OSError("offline")})
+ assert source.load_page("getting-started", pages).used_stale_cache
+
+ with pytest.raises(PageError):
+ source.load_page("missing", pages)
+ with pytest.raises(PageError):
+ source.load_page("https://evil.example.com/page/", pages)
+
+ source = make_source(tmp_path / "missing", {sitemap_url: SITEMAP, page_url: OSError("offline")})
+ pages = source.load_catalog().pages
+ with pytest.raises(PageError):
+ source.load_page("getting-started", pages)
+
+ source = make_source(tmp_path / "encoding", {sitemap_url: SITEMAP, page_url: b"\xff"})
+ pages = source.load_catalog().pages
+ with pytest.raises(PageError):
+ source.load_page("getting-started", pages)
+
+
+@pytest.mark.parametrize(
+    ("selector", "html"),
+    [  # NOTE(review): HTML tags were stripped from this copy; markup below is a best guess
+        (ARTICLE_SELECTORS[0], '<article class="md-content__inner"><p>Material.</p></article>'),
+        (ARTICLE_SELECTORS[1], ""),  # NOTE(review): original markup unrecoverable; restore it
+        (ARTICLE_SELECTORS[2], "<main><p>Main.</p></main>"),
+        (ARTICLE_SELECTORS[3], "<article><p>Article.</p></article>"),
+    ],
+)
+def test_source_extracts_supported_theme_content(tmp_path, selector: str, html: str) -> None:
+    source = make_source(tmp_path, {})
+    assert source.extract_narration(html)
+
+
+def test_source_rejects_pages_without_readable_content(tmp_path) -> None:
+ source = make_source(tmp_path, {})
+ with pytest.raises(PageError):
+ source.extract_narration("No article")
+
+
+def test_page_helpers_cover_home_nested_and_invalid_references() -> None:
+ base_url = "https://docs.example.com/guide/"
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/guide/",),
+ )
+
+ assert relative_page_path(base_url.rstrip("/"), base_url) == ""
+ assert relative_page_path(f"{base_url}tutorial/page/", base_url) == "tutorial/page"
+ with pytest.raises(ValueError):
+ relative_page_path("https://docs.example.com/other/", base_url)
+ assert title_from_segment("first_steps.html") == "First Steps"
+ assert page_from_url(base_url, "").title == "Home"
+ assert resolve_page_slug("tutorial/page", base_url, policy) == "tutorial/page"
+ assert resolve_page_slug(base_url, base_url, policy) == "/"
+ assert resolve_page_slug("/", base_url, policy) == "/"
+
+ with pytest.raises(PageError):
+ resolve_page_slug("", base_url, policy)
+ with pytest.raises(PageError):
+ resolve_page_slug("https://docs.example.com/guidebook/page/", base_url, policy)
diff --git a/tests/test_model_store.py b/tests/test_model_store.py
new file mode 100644
index 0000000..9a91a00
--- /dev/null
+++ b/tests/test_model_store.py
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import httpx
+import pytest
+
+from binge_docs.errors import ModelError
+from binge_docs.model_store import (
+ MODEL_ASSET,
+ REQUIRED_ASSETS,
+ VOICES_ASSET,
+ ModelStore,
+ _content_length,
+ calculate_sha256,
+ download_asset,
+)
+
+
+class StreamResponse:  # context-manager fake returned by the patched httpx.stream
+    def __init__(self, *, chunks: tuple[bytes, ...], headers: dict[str, str] | None = None) -> None:
+        self.request = httpx.Request("GET", "https://example.com/file")
+        self.headers = headers or {}
+        self._chunks = chunks
+
+    def __enter__(self) -> StreamResponse:
+        return self
+
+    def __exit__(self, *_args: object) -> None:
+        """Leave the ``with`` block without suppressing any exception."""
+
+    def raise_for_status(self) -> None:
+        """Always succeed: this fake never reports an HTTP error status."""
+
+    def iter_bytes(self, _chunk_size: int):
+        return iter(self._chunks)
+
+
+def test_model_store_uses_existing_assets_without_redownloading(tmp_path) -> None:
+    """Assets already on disk are resolved without ever invoking the downloader."""
+    for asset in REQUIRED_ASSETS:
+        (tmp_path / asset.filename).write_text("ready", encoding="utf-8")
+    expected = {asset.filename: tmp_path / asset.filename for asset in REQUIRED_ASSETS}
+
+    # The downloader raises as soon as it is called, so any download attempt fails the test.
+    store = ModelStore(
+        model_dir=tmp_path,
+        downloader=lambda *_args: (_ for _ in ()).throw(RuntimeError),
+    )
+    resolved = store.ensure_assets()
+
+    assert resolved.model == expected[MODEL_ASSET.filename]
+    assert resolved.voices == expected[VOICES_ASSET.filename]
+
+
+def test_model_store_redownloads_invalid_assets_and_wraps_errors(tmp_path, monkeypatch) -> None:
+    """Cover re-downloading assets that fail verification and both downloader error paths."""
+
+    def raising(error: Exception):
+        # Build a downloader that fails with ``error`` as soon as it is invoked.
+        def downloader(_asset, _destination, _progress) -> None:
+            raise error
+
+        return downloader
+
+    for asset in REQUIRED_ASSETS:
+        (tmp_path / asset.filename).write_text("wrong", encoding="utf-8")
+
+    # Presumably consumed per asset as: existing file -> "bad", fresh download -> real hash.
+    checksums = iter(("bad", MODEL_ASSET.sha256, "bad", VOICES_ASSET.sha256))
+    monkeypatch.setattr("binge_docs.model_store.calculate_sha256", lambda _path: next(checksums))
+    fetched: list[str] = []
+
+    def recording_downloader(asset, destination: Path, _progress) -> None:
+        fetched.append(asset.filename)
+        destination.write_bytes(asset.filename.encode("utf-8"))
+
+    store = ModelStore(model_dir=tmp_path, downloader=recording_downloader)
+    store.ensure_assets(verify_existing=True)
+    assert fetched == [MODEL_ASSET.filename, VOICES_ASSET.filename]
+
+    # An OSError raised while downloading must surface as ModelError.
+    offline_store = ModelStore(model_dir=tmp_path, downloader=raising(OSError("offline")))
+    for asset in REQUIRED_ASSETS:
+        (tmp_path / asset.filename).unlink(missing_ok=True)
+    with pytest.raises(ModelError):
+        offline_store.ensure_assets()
+
+    # A ModelError raised by the downloader reaches the caller as ModelError too.
+    model_error_store = ModelStore(model_dir=tmp_path, downloader=raising(ModelError("nope")))
+    with pytest.raises(ModelError):
+        model_error_store.ensure_assets()
+
+
+def test_model_store_raises_on_checksum_mismatch(tmp_path, monkeypatch) -> None:
+    """``ensure_assets`` raises ModelError when every computed checksum is wrong."""
+    monkeypatch.setattr("binge_docs.model_store.calculate_sha256", lambda _path: "wrong")
+
+    def write_payload(_asset, destination, _progress) -> int:
+        return destination.write_bytes(b"downloaded")
+
+    with pytest.raises(ModelError):
+        ModelStore(model_dir=tmp_path, downloader=write_payload).ensure_assets()
+
+
+def test_download_asset_retries_and_reports_progress(tmp_path, monkeypatch) -> None:
+    """A failed first request is retried, and the progress callback is optional."""
+    stream_calls: list[tuple[object, ...]] = []
+
+    def flaky_stream(*args, **_kwargs):
+        stream_calls.append(args)
+        if len(stream_calls) == 1:
+            raise httpx.HTTPError("try again")
+        return StreamResponse(chunks=(b"ab", b"cd"), headers={"content-length": "4"})
+
+    monkeypatch.setattr("binge_docs.model_store.httpx.stream", flaky_stream)
+    reported: list[tuple[str, int, int | None]] = []
+
+    def record_progress(name, completed, total) -> None:
+        reported.append((name, completed, total))
+
+    target = tmp_path / "asset.bin"
+    download_asset(MODEL_ASSET, target, record_progress)
+    assert target.read_bytes() == b"abcd"
+    assert len(stream_calls) == 2
+    assert reported[-1] == (MODEL_ASSET.filename, 4, 4)
+
+    # Second scenario: no content-length header and no progress callback.
+    monkeypatch.setattr(
+        "binge_docs.model_store.httpx.stream",
+        lambda *_args, **_kwargs: StreamResponse(chunks=(b"ab",)),
+    )
+    silent_target = tmp_path / "asset-no-progress.bin"
+    download_asset(MODEL_ASSET, silent_target)
+    assert silent_target.read_bytes() == b"ab"
+
+
+def test_download_asset_raises_after_retries_and_support_helpers(tmp_path, monkeypatch) -> None:
+    """Persistent HTTP failures become ModelError; also smoke-test two small helpers."""
+
+    def always_failing_stream(*_args, **_kwargs):
+        raise httpx.HTTPError("still failing")
+
+    monkeypatch.setattr("binge_docs.model_store.httpx.stream", always_failing_stream)
+    with pytest.raises(ModelError):
+        download_asset(MODEL_ASSET, tmp_path / "asset.bin")
+    sample = tmp_path / "checksum.txt"
+    sample.write_text("checksum", encoding="utf-8")
+    assert calculate_sha256(sample)
+    assert _content_length(httpx.Response(200, headers={"content-length": "12"})) == 12
+    assert _content_length(httpx.Response(200)) is None
+    assert _content_length(httpx.Response(200, headers={"content-length": "bad"})) is None
diff --git a/tests/test_narration.py b/tests/test_narration.py
new file mode 100644
index 0000000..3b9e43e
--- /dev/null
+++ b/tests/test_narration.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+import pytest
+from bs4 import BeautifulSoup
+
+from binge_docs.errors import PageError
+from binge_docs.narration import (
+ _end_with_period,
+ _extract_element_text,
+ _has_skipped_ancestor,
+ _is_skipped,
+ extract_narration,
+)
+
+
+class FakeElement:  # minimal tag stand-in exposing only ``name``, ``parents`` and ``get``
+    def __init__(self, name: str, parents: list[object]) -> None:
+        self.name = name
+        self.parents = parents
+
+    def get(self, _key: str, default=None):
+        return default  # every attribute lookup misses, so callers always get their default
+
+
+class FakeDescendant(str):  # str subclass: unlike plain str, instances accept extra attributes
+    def __new__(cls, value: str, parents: list[object]):
+        instance = str.__new__(cls, value)
+        instance.parents = parents  # attached after creation because str itself is immutable
+        return instance
+
+
+def test_extract_narration_reads_content_and_skips_visual_noise() -> None:
+ html = """
+
+ Heading
+ Hello world
+ Hello world
+ Nested block
Tail
+ skip classKeep
+ Voice
+
+ code
+
+
+ """
+
+ narration = extract_narration(html, source_display_name="Stub Docs")
+
+ assert narration == "Heading.\n\nHello world\n\nTail\n\nKeep\n\nVoice\n\nBullet"
+
+
+def test_extract_narration_raises_for_missing_or_empty_articles() -> None:
+ with pytest.raises(PageError):
+ extract_narration("")
+
+ empty_html = 'code
'
+ with pytest.raises(PageError):
+ extract_narration(empty_html)
+
+
+def test_narration_helpers_cover_skipped_elements() -> None:
+ soup = BeautifulSoup(
+ """
+
+ Hidden
+ Hello world
+
+ """,
+ "html.parser",
+ )
+ hidden = soup.find_all("p")[0]
+ visible = soup.find_all("p")[1]
+ headerlink_text = visible.find("a").string
+ skipped_tag = BeautifulSoup("code
", "html.parser").pre
+
+ assert _is_skipped(hidden)
+ assert not _is_skipped(FakeElement("p", [object()]))
+ assert _is_skipped(skipped_tag)
+ assert _extract_element_text(visible) == "Hello world"
+ assert _has_skipped_ancestor(headerlink_text, visible)
+ assert not _has_skipped_ancestor(visible.find("span").string, visible)
+ assert not _has_skipped_ancestor(FakeDescendant("text", [object(), visible]), visible)
+ assert not _has_skipped_ancestor(FakeDescendant("text", [object()]), visible)
+ assert _end_with_period("Hello") == "Hello."
+ assert _end_with_period("Hello!") == "Hello!"
diff --git a/tests/test_speech.py b/tests/test_speech.py
new file mode 100644
index 0000000..b5fb940
--- /dev/null
+++ b/tests/test_speech.py
@@ -0,0 +1,214 @@
+from __future__ import annotations
+
+import asyncio
+import types
+from pathlib import Path
+
+import pytest
+
+from binge_docs.errors import PlaybackError
+from binge_docs.model_store import ModelPaths
+from binge_docs.speech import (
+ PlaybackController,
+ _merge_segments_for_streaming,
+ _split_paragraph,
+ _split_text_into_segments,
+ _StreamingAudioPlayer,
+ language_for_voice,
+ stream_speech,
+ validate_speed,
+ validate_voice,
+)
+
+
+class FakeFrame:  # list-backed stand-in for a frame: sized, sliceable, and scalable
+    def __init__(self, values: list[int]) -> None:
+        self.values = [*values]
+
+    def __len__(self) -> int:
+        return len(self.values)
+
+    def __getitem__(self, item):
+        picked = self.values[item]
+        # Slices stay wrapped in a FakeFrame; a single index returns the raw value.
+        return FakeFrame(picked) if isinstance(item, slice) else picked
+
+    def __mul__(self, multiplier: int) -> FakeFrame:
+        return FakeFrame([sample * multiplier for sample in self.values])
+
+
+class FakeSamples:  # stand-in for the sample container passed to ``play_chunk``
+    def __init__(self, values: list[int]) -> None:
+        self.values = values
+
+    def reshape(self, *_args: object) -> FakeFrame:
+        return FakeFrame(self.values)  # requested shape is ignored; values are wrapped as-is
+
+
+class FakeOutputStream:  # records lifecycle calls and written frames instead of playing audio
+    def __init__(self, **kwargs: object) -> None:
+        self.kwargs = kwargs  # constructor keyword arguments, stored as received
+        self.started = False
+        self.stopped = False
+        self.closed = False
+        self.writes: list[FakeFrame] = []  # every frame passed to write(), in call order
+
+    def start(self) -> None:
+        self.started = True
+
+    def write(self, frame: FakeFrame) -> None:
+        self.writes.append(frame)
+
+    def stop(self) -> None:
+        self.stopped = True
+
+    def close(self) -> None:
+        self.closed = True
+
+
+class FakeKokoro:  # synthesiser fake: logs each request and yields one fixed chunk
+    def __init__(self, model_path: str, voices_path: str) -> None:
+        self.model_path = model_path
+        self.voices_path = voices_path
+        self.calls: list[tuple[str, str, float, str]] = []  # (segment, voice, speed, lang)
+
+    def create_stream(self, segment: str, *, voice: str, speed: float, lang: str):
+        self.calls.append((segment, voice, speed, lang))
+
+        async def generator():
+            yield FakeSamples([1, 2, 3, 4]), 22_050  # presumably (samples, sample_rate) - confirm
+
+        return generator()
+
+
+def test_validate_voice_and_speed_helpers() -> None:
+    """Valid voices/speeds are normalised or passed through; invalid ones raise PlaybackError."""
+    assert validate_voice("AF_HEART") == "af_heart"
+    assert validate_speed(1.2) == 1.2
+    for voice, language in (("bf_emma", "en-gb"), ("af_heart", "en-us")):
+        assert language_for_voice(voice) == language
+
+    for validator, rejected in ((validate_voice, "not-real"), (validate_speed, 0.1)):
+        with pytest.raises(PlaybackError):
+            validator(rejected)
+
+
+def test_playback_controller_state_transitions() -> None:
+    import threading  # function-local so the module import block stays untouched
+    controller = PlaybackController(1.0)
+    assert controller.speed == 1.0
+    assert controller.toggle_pause() is True
+    assert controller.is_paused is True
+    assert controller.toggle_pause() is False
+    assert controller.increase_speed() == 1.1
+    assert controller.decrease_speed() == 1.0
+    controller.toggle_pause()
+    waiter = threading.Thread(target=controller.wait_if_paused, daemon=True)  # must not block exit
+    waiter.start()
+    controller.toggle_pause()
+    waiter.join(timeout=1)
+    assert not waiter.is_alive(), "wait_if_paused did not return after playback resumed"
+    controller.request_stop()
+    with pytest.raises(KeyboardInterrupt):
+        controller.ensure_not_stopped()
+
+
+def test_text_segmentation_helpers() -> None:
+    """Check paragraph splitting, segment merging, and blank-input handling."""
+    many_sentences = "Sentence one. Sentence two. Sentence three. Sentence four." * 10
+    assert len(_split_paragraph(many_sentences)) > 1
+    assert _split_paragraph("x" * 400) == ["x" * 400]
+    two_long_sentences = f"{'x' * 330}. {'y' * 330}."
+    assert len(_split_paragraph(two_long_sentences)) == 2
+    assert _merge_segments_for_streaming(["one", "two"]) == ["one\n\ntwo"]
+    assert len(_merge_segments_for_streaming(["x" * 600, "y" * 600])) == 2
+    assert _merge_segments_for_streaming([]) == []
+    assert _split_text_into_segments("Hello\n\nWorld")
+    assert _split_text_into_segments(" ") == []
+
+
+def test_streaming_audio_player_primes_stream_and_closes() -> None:
+    """Play two chunks, close twice, then prime a second player with an empty frame."""
+    controller = PlaybackController(1.0)
+    opened: list[FakeOutputStream] = []
+
+    def output_stream_factory(**kwargs: object) -> FakeOutputStream:
+        opened.append(FakeOutputStream(**kwargs))
+        return opened[-1]
+
+    player = _StreamingAudioPlayer(output_stream_factory, controller)
+    for values in ([1, 2, 3, 4], [5, 6]):
+        player.play_chunk(FakeSamples(values), 16_000)
+    player.close()
+    player.close()  # a repeated close must not raise
+
+    idle_player = _StreamingAudioPlayer(output_stream_factory, controller)
+    idle_player.output_stream = FakeOutputStream()
+    idle_player._prime_output_stream(FakeFrame([]))  # empty-frame path; effects not asserted
+
+    first_stream = opened[0]
+    assert first_stream.started
+    assert first_stream.stopped
+    assert first_stream.closed
+    assert len(first_stream.writes) >= 2
+
+
+def test_stream_speech_supports_injected_and_default_factories(monkeypatch) -> None:
+    """Run ``stream_speech`` with injected factories, then with the patched default modules."""
+    import sys  # function-local so the module import block stays untouched
+
+    model_paths = ModelPaths(model=Path("/tmp/model.onnx"), voices=Path("/tmp/voices.bin"))
+    created_streams: list[FakeOutputStream] = []
+    created_kokoros: list[FakeKokoro] = []
+
+    def kokoro_factory(model_path: str, voices_path: str) -> FakeKokoro:
+        kokoro = FakeKokoro(model_path, voices_path)
+        created_kokoros.append(kokoro)
+        return kokoro
+
+    def output_stream_factory(**kwargs: object) -> FakeOutputStream:
+        stream = FakeOutputStream(**kwargs)
+        created_streams.append(stream)
+        return stream
+
+    injected = {"kokoro_factory": kokoro_factory, "output_stream_factory": output_stream_factory}
+    asyncio.run(stream_speech("Hello world", model_paths, **injected))
+    kokoro_count, stream_count = len(created_kokoros), len(created_streams)
+    assert kokoro_count > 0 and stream_count > 0, "injected factories were not used"
+
+    # Without explicit factories the fakes are reachable only through the patched modules.
+    fake_kokoro_module = types.SimpleNamespace(Kokoro=kokoro_factory)
+    fake_sounddevice_module = types.SimpleNamespace(OutputStream=output_stream_factory)
+    monkeypatch.setitem(sys.modules, "kokoro_onnx", fake_kokoro_module)
+    monkeypatch.setitem(sys.modules, "sounddevice", fake_sounddevice_module)
+    asyncio.run(stream_speech("Hello again", model_paths))
+
+    assert len(created_kokoros) > kokoro_count, "default Kokoro factory was not used"
+    assert len(created_streams) > stream_count, "default OutputStream factory was not used"
+
+
+def test_stream_speech_wraps_failures_and_preserves_keyboard_interrupt() -> None:
+    """Factory failures become PlaybackError; a pre-requested stop raises KeyboardInterrupt."""
+    model_paths = ModelPaths(model=Path("/tmp/model.onnx"), voices=Path("/tmp/voices.bin"))
+
+    with pytest.raises(PlaybackError):
+        failing_run = stream_speech(
+            "Hello",
+            model_paths,
+            kokoro_factory=lambda *_args: (_ for _ in ()).throw(RuntimeError("boom")),
+            output_stream_factory=lambda **_kwargs: FakeOutputStream(),
+        )
+        asyncio.run(failing_run)
+
+    # Stop is requested before the run starts; it is expected to surface as KeyboardInterrupt.
+    stopped_controller = PlaybackController(1.0)
+    stopped_controller.request_stop()
+    with pytest.raises(KeyboardInterrupt):
+        stopped_run = stream_speech(
+            "Hello",
+            model_paths,
+            kokoro_factory=lambda *_args: FakeKokoro("model", "voices"),
+            output_stream_factory=lambda **_kwargs: FakeOutputStream(),
+            playback_controller=stopped_controller,
+        )
+        asyncio.run(stopped_run)
diff --git a/uv.lock b/uv.lock
index 156b46a..e0ff5f9 100644
--- a/uv.lock
+++ b/uv.lock
@@ -75,8 +75,10 @@ dependencies = [
{ name = "typer" },
]
-[package.optional-dependencies]
+[package.dev-dependencies]
dev = [
+ { name = "pytest" },
+ { name = "pytest-cov" },
{ name = "ruff" },
]
@@ -88,11 +90,16 @@ requires-dist = [
{ name = "platformdirs", specifier = ">=4.3,<5" },
{ name = "questionary", specifier = ">=2.1,<3" },
{ name = "rich", specifier = ">=14,<15" },
- { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.12,<1" },
{ name = "sounddevice", specifier = ">=0.5,<1" },
{ name = "typer", specifier = ">=0.16,<1" },
]
-provides-extras = ["dev"]
+
+[package.metadata.requires-dev]
+dev = [
+ { name = "pytest", specifier = ">=9.1.0" },
+ { name = "pytest-cov", specifier = ">=7.1.0" },
+ { name = "ruff", specifier = ">=0.12,<1" },
+]
[[package]]
name = "certifi"
@@ -172,6 +179,94 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
+[[package]]
+name = "coverage"
+version = "7.14.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/fd/0ab2772530e946e1be1abd0bc09e647ec9b02e88f0867857601fefca8953/coverage-7.14.1.tar.gz", hash = "sha256:30c08f7d90415aa98b3c990385dea2939b0da55f38515e5b369b83655f8523be", size = 920132, upload-time = "2026-05-26T20:41:36.783Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/92/69/0d2ef01ff4b8fcecd4cba920d11e92fa4f96ae412441d3b56a90a258e69b/coverage-7.14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3e3680291c4a1d0dadfa84a2c459576a4af5133abb617905714339a0c73138cf", size = 219722, upload-time = "2026-05-26T20:38:14.002Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/ae/9afdeaa31b9d9ce98124b6abf8bb49119bf71aecae04f8567c189d91299f/coverage-7.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a5274669f37f2343635a347b91a60777621341ab3378e9c6ac9335eee704bddf", size = 220240, upload-time = "2026-05-26T20:38:17.424Z" },
+ { url = "https://files.pythonhosted.org/packages/51/69/c998589871df7ea7dba865cc5ee32b5a3e1d47ba6c68ef91104c7c46fa5e/coverage-7.14.1-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cfe5a5fec635799ef33428f1e5e61bafa45a92a96190ba731561ba558ccc214d", size = 246981, upload-time = "2026-05-26T20:38:19.266Z" },
+ { url = "https://files.pythonhosted.org/packages/fc/10/1c7d04c13040dac531d21b712bbe08f902e6dd9b58f5d77875c4d030f8f2/coverage-7.14.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:62a9f70b52e0b5a95cfef4a5c5641b06983cadc5e538a3feeb5c00211f523ac2", size = 248812, upload-time = "2026-05-26T20:38:20.75Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/65/2a38a4607ef27cadcfbcee034dba5830ae2569f90144a0f4c7dbf47d30b0/coverage-7.14.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c18ebc343e15be53049b3a2dce38fe82d58f37e20ab9094b3a39c0aa4f6bb47", size = 250675, upload-time = "2026-05-26T20:38:22.159Z" },
+ { url = "https://files.pythonhosted.org/packages/c9/a2/a446ed9752a4a59b79e0fb6cbb319f6facb2183045c0725462625e66f87e/coverage-7.14.1-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b84ffdf877644e7096aa936991efeed873f7f3df57b9cd001312b7668ab08550", size = 252590, upload-time = "2026-05-26T20:38:23.63Z" },
+ { url = "https://files.pythonhosted.org/packages/9e/fd/e81fbd7ba752365546e9842b1cbdaad3d6919d2a522c590aef16a281ec5e/coverage-7.14.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e854312c4103f2ad4c0dc023b69b77ebfd2c89db5f86c4c94dc2353f9a92167e", size = 247691, upload-time = "2026-05-26T20:38:25.057Z" },
+ { url = "https://files.pythonhosted.org/packages/53/35/f3c26fdaae9ea937d154ca4d372e5ea0a4167ff70d36c6074ac2eacb2f83/coverage-7.14.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c643734307300234fafa36bf2a040a7235f8f177ea1fd6ec1423aea6fb7b929f", size = 248716, upload-time = "2026-05-26T20:38:26.406Z" },
+ { url = "https://files.pythonhosted.org/packages/2e/14/940b6c49551fd343e8507ee2b0ba7af5d0aa04ed5bf768285cb7c72a9884/coverage-7.14.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:84ac9499e48700399a5dd0ea7085b5091961fec52c68d66b4ec0d3cf7f4441b1", size = 246721, upload-time = "2026-05-26T20:38:28.282Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/2c/40fc0634186c28292a662dff578866b3913983d6c375a3c2a74020938719/coverage-7.14.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:7f02d09f70776579b926d889a4c9c235070a1f47c40458aeaca563fae5acfdb5", size = 250533, upload-time = "2026-05-26T20:38:29.753Z" },
+ { url = "https://files.pythonhosted.org/packages/de/e3/2c26bf1e811f9df991ff2a9bdddebdd13ee0665d564df7d05979f9146297/coverage-7.14.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:ce66d8e46da2bb5ee313a745cbd2e391d319176c1f7a9451bfcd3a2fb920859b", size = 246990, upload-time = "2026-05-26T20:38:31.516Z" },
+ { url = "https://files.pythonhosted.org/packages/a8/b0/060260ef56bd92363ebdce0c7095ce422b06e69aae71828efeca473ab1ca/coverage-7.14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c912c259304cfb5ee584481cfb7ce1ff932b4d61e6c9140b8f19cb7b5ed82332", size = 247593, upload-time = "2026-05-26T20:38:33.065Z" },
+ { url = "https://files.pythonhosted.org/packages/63/f3/501502046efeb0d6d94b5ca54941d95f1184183dd6bdb7f283985783bb4a/coverage-7.14.1-cp310-cp310-win32.whl", hash = "sha256:1238cb94638e610e972c60dac68e813f868dc7d6e982535270558443058d9d59", size = 222330, upload-time = "2026-05-26T20:38:35.36Z" },
+ { url = "https://files.pythonhosted.org/packages/a0/5d/1bf99f2c558f128faf7906817ccbdb576ba815d3b41ce2ac1719b70a3663/coverage-7.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:fc459e5d73be2d6332fcfe8dbf3d8994671fe33c700f4565988ecfa511547253", size = 223261, upload-time = "2026-05-26T20:38:37.196Z" },
+ { url = "https://files.pythonhosted.org/packages/7d/d7/477ad149490e6cb849f28abea1dabb9c823cea72e7500c81b4240ce619c0/coverage-7.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:478b5bcd63c2e1357c5c7e16c070690df7b07f676b1c114d7b93e533c664309f", size = 219848, upload-time = "2026-05-26T20:38:38.715Z" },
+ { url = "https://files.pythonhosted.org/packages/91/82/a5eb47257c50601bb7b9a9d2857c67b7a3a85ad74180eb2c98bb1fbe0ce5/coverage-7.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a24a81f9715ee42ef59a316cc11611c98fe23920f7c81861315c9f3ff4a230f4", size = 220354, upload-time = "2026-05-26T20:38:40.232Z" },
+ { url = "https://files.pythonhosted.org/packages/43/8b/78419b5391a5cb706b6544390507e469d83ffc9a8248b02c4011aceb9365/coverage-7.14.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:196a13319ad88d6d8ef5ab489ec4f44ddde2143c0c7d5b27786f6c3ffd56a7e1", size = 250771, upload-time = "2026-05-26T20:38:41.782Z" },
+ { url = "https://files.pythonhosted.org/packages/77/63/e77aaacd491182210d639636b7a8bba23ffffa9b82aa3762da9431855fa9/coverage-7.14.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3d452fd08b5c72c5167c93e6867b5c08500bd40f2a21e1e854a500550b6cc36f", size = 252683, upload-time = "2026-05-26T20:38:43.305Z" },
+ { url = "https://files.pythonhosted.org/packages/65/1c/a022e3cfbec2ac241640003cb3a817e161d9c7f5aa9b49173756cdc03204/coverage-7.14.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23bf7fa51ac02e07fc7c96849b82946da47ae862dc8f86d183b2a4864fc38129", size = 254791, upload-time = "2026-05-26T20:38:45.361Z" },
+ { url = "https://files.pythonhosted.org/packages/61/d6/967e408aca4c1ceb88cb0cc677169110ae7f5995fb5eaf5fb1f5a1bb8f5d/coverage-7.14.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bcaa50684dcaadfa599ac48f81103c756d791cfd85c97203d2217c593d48b860", size = 256748, upload-time = "2026-05-26T20:38:46.91Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/be/869188f7fe28638078ec479331ace6dc5f7b40b7153eb616f47ab79404d8/coverage-7.14.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4ea1c034f95c9b056e856b794630b17f9fa3d57e4800ff1e503d3be0f9c9078c", size = 250907, upload-time = "2026-05-26T20:38:48.493Z" },
+ { url = "https://files.pythonhosted.org/packages/07/aa/adb7d3b4278d690e68703abcd76ab1b948242e3668d921711551b78f9ddb/coverage-7.14.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c7e057326434e441306226fbeb5d1aaf14a2637efe97ba668306635835f32ad7", size = 252483, upload-time = "2026-05-26T20:38:50.074Z" },
+ { url = "https://files.pythonhosted.org/packages/43/61/331c74103c62dcb0c4b9b3a0de9a61aca016208b0a90f109592a9f9ecc28/coverage-7.14.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:59baf88468dbc8d63b1887afd92bda52e40bb1561696e5819670601403810cec", size = 250545, upload-time = "2026-05-26T20:38:51.613Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/b6/c5dae3c104d89be04828f61810e6b3473825482e4c288cc4ed04553e08ae/coverage-7.14.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d34d75f892b3ab73ba11cab5442cce7b3e168fd64162b16f0e1e0d09c508edef", size = 254310, upload-time = "2026-05-26T20:38:53.503Z" },
+ { url = "https://files.pythonhosted.org/packages/ad/a1/2b9d5863e3b83c01ad8199e3c597802fbb3a9dc90b058885804c20296d31/coverage-7.14.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3a56abc20a472baf0304c455721bc601477440d28ecfde8a03dde79ede07e0df", size = 250266, upload-time = "2026-05-26T20:38:55.414Z" },
+ { url = "https://files.pythonhosted.org/packages/7f/5e/0e511fbdb269359be26fe678a1c3fa1f2aa2a01573cc3f54268c8d6d4797/coverage-7.14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6a3cb83d1552c0cd1b4906655b6a33fd4a8473229633a901c6b73bf86914dee9", size = 251174, upload-time = "2026-05-26T20:38:57.141Z" },
+ { url = "https://files.pythonhosted.org/packages/85/10/e55307b622b3dd9671cb321824502dc10f93e72f2802b9946159a8edadeb/coverage-7.14.1-cp311-cp311-win32.whl", hash = "sha256:10274a1fbeb8ec5d72966e17bb198a3104257aca4ac09d98667c5f8aca8c8548", size = 222354, upload-time = "2026-05-26T20:38:58.727Z" },
+ { url = "https://files.pythonhosted.org/packages/71/cf/107421693cfb71e4f1ca5bf70443f64d4161878068d07a3e51c7ad21d17b/coverage-7.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:87ebdf787d4888e3f3f2d523eadc6e18c6d18c6d0eb173801a189641627fb37e", size = 223290, upload-time = "2026-05-26T20:39:00.413Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/1d/3e3644585eb29e9dafefb19555078529a4d7cce12bd21929664eea989277/coverage-7.14.1-cp311-cp311-win_arm64.whl", hash = "sha256:dd34767fa19848d35659ffc0a75314f58c7af3f1cd87ec521e8292a1238398a3", size = 221953, upload-time = "2026-05-26T20:39:02.159Z" },
+ { url = "https://files.pythonhosted.org/packages/3d/b7/bdbb725ba02c5b42825b200c940f38b7a54fcad24627b7192f78f8110d76/coverage-7.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a06c76364a9360e33d6d23769aefdf7f66f38e2ffb60ceb1baaa4989d83b695c", size = 220022, upload-time = "2026-05-26T20:39:03.702Z" },
+ { url = "https://files.pythonhosted.org/packages/72/81/fdc0898a55c6219223291ec1a1fe89966ef212ce82276aa0899df84b5de0/coverage-7.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fad54e871165f6ec2f536063ac74c3104508a12963e64072ba44bd822de52b0c", size = 220379, upload-time = "2026-05-26T20:39:05.381Z" },
+ { url = "https://files.pythonhosted.org/packages/de/72/de048c4a25e13bce59ac6a339351c10bdf2515e07459afcdaf04dc3143a2/coverage-7.14.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:84b535f00655ecafe1d929d1fb00ed5d6fa3051ea643ab2c161a3887b86f294b", size = 251888, upload-time = "2026-05-26T20:39:07.367Z" },
+ { url = "https://files.pythonhosted.org/packages/28/30/300c343f68beb9d4cbb64ec81e58c5b6b80b56927f72d2b38654ac26e013/coverage-7.14.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6b6b0853b895fe0e98cbfc580d1ec3393d9302b4b1e96a77b3f5c91fdab899e6", size = 254624, upload-time = "2026-05-26T20:39:09.037Z" },
+ { url = "https://files.pythonhosted.org/packages/b1/ed/7b25642496e8170b6bac14adce00537c6e5fa2d586159401a4de3e8b49e6/coverage-7.14.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:442cc9c952b2df400cda54bb04ab87330cf2cd08a8692cbbea36773531eb6f37", size = 255739, upload-time = "2026-05-26T20:39:10.889Z" },
+ { url = "https://files.pythonhosted.org/packages/7f/a2/abd210b8c4e29c24e4624916db97bb519097a91034aaeb767f937e7da794/coverage-7.14.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8270544c361ed405a27a060dbc9ed2c124b084d96dfdc2d9a2510482aef981ad", size = 257998, upload-time = "2026-05-26T20:39:12.722Z" },
+ { url = "https://files.pythonhosted.org/packages/7f/24/7c50beed3792fe62f6ce0545c6686ce83379719e2c0276179333d97eae92/coverage-7.14.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:48b283b1dd6372e8de2a7a9a4c4d5dc06f4d4fd209b876f3c88a7a205a0c8f84", size = 252296, upload-time = "2026-05-26T20:39:14.259Z" },
+ { url = "https://files.pythonhosted.org/packages/15/05/0f874628ebcbfc77ead559ff210281ef06a97db08481832e7dd39274a135/coverage-7.14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5b0c99ba93a07d56f6df340bb79be53202a082b2fdb81bfe6190b741a3470d54", size = 253658, upload-time = "2026-05-26T20:39:15.923Z" },
+ { url = "https://files.pythonhosted.org/packages/99/6f/ca6ad067364b337ef997802115e7ecad2abd2248b05471464b0dea02b4d4/coverage-7.14.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e471bc5769ff073b058cfadb0d736b56ce067c8560eabeb0da88462df98c23e7", size = 251803, upload-time = "2026-05-26T20:39:17.537Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/30/b9b4d377cd9f40baf228068f5a81faf8450c6228503011bd499708483a50/coverage-7.14.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f497a1ea81d4cd7c10ddcaa685135b9aabd291af3d55775a9ddf3cb7a364cdd9", size = 255873, upload-time = "2026-05-26T20:39:19.414Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/21/7c721a9e5e6bb88547d30a787aefb97512d3f54c1324c7488d9b3743f7f9/coverage-7.14.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2222be86d0b54f5dd5a38f45f17f315f737245e857bf0bdedc70734f84a13c02", size = 251372, upload-time = "2026-05-26T20:39:21.169Z" },
+ { url = "https://files.pythonhosted.org/packages/9d/8c/f8ae5a2200130e1503cd7661a6cd3b2b7bacef98277fbf3571fb13f8b766/coverage-7.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:85e85586565842f6932abebd4c18bcb1074223dc0b3576e7d173ca710622813a", size = 253245, upload-time = "2026-05-26T20:39:23.097Z" },
+ { url = "https://files.pythonhosted.org/packages/34/62/70a9024672a5f6910517d9628c52c9afbdd3cf8f46426af52bb148a56fff/coverage-7.14.1-cp312-cp312-win32.whl", hash = "sha256:4a28fd227808366b196a75476dced2eb35b351d6766ba9c858dc93319e87f4f1", size = 222567, upload-time = "2026-05-26T20:39:24.868Z" },
+ { url = "https://files.pythonhosted.org/packages/f6/81/8b7cd386839b039ebe1855733b9f9449a8dec5d79564018234f185a7fa70/coverage-7.14.1-cp312-cp312-win_amd64.whl", hash = "sha256:54acdb6674a4661768d7bf7db32dfb9f46ab1d764f8aba6df75ce1a6a088724e", size = 223372, upload-time = "2026-05-26T20:39:26.603Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/ba/b44d472022f620d289d95fa830143235c0c36461c6f2437ea8d51e5481ed/coverage-7.14.1-cp312-cp312-win_arm64.whl", hash = "sha256:99cd41ff91afd94896fea3bc002706b6ae4ce95727d06e4a0f39c0a8d8bd8b1a", size = 221989, upload-time = "2026-05-26T20:39:28.242Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/9e/5f6d56327c62b185225d145191c607e07515294a0aa6338e58805cd4a5ac/coverage-7.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:be9f2c802dcfce3f71298303aa5dad0dce440a76c52f2f60dacd8656dab78793", size = 220044, upload-time = "2026-05-26T20:39:29.902Z" },
+ { url = "https://files.pythonhosted.org/packages/75/92/e82aca356744cbbc0f77a0b623e38918c1872361963413a3bab5d0340393/coverage-7.14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6223a72fd0e4c7156353ec0f08a5f93623e1d3034d0e2683b9bb8ea674131b1d", size = 220412, upload-time = "2026-05-26T20:39:31.561Z" },
+ { url = "https://files.pythonhosted.org/packages/27/c9/385bde0bf7ed0f4bf3a7ee5367060a86b5d218718cfd6fb943c0f836b34f/coverage-7.14.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7279d2110a28cebc738b6459ecda2771735a4c18465fbbd36b3288fe5ed92247", size = 251412, upload-time = "2026-05-26T20:39:33.337Z" },
+ { url = "https://files.pythonhosted.org/packages/51/8c/23faf6a2343a0d17f960a4bd56c43bc7eb4cf312f774dd6ceebd82c7d8fc/coverage-7.14.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9eeb3fcbc13ba40dfbdb22d01d196a28e9cef9ed4c29b60061a1e0e823a9929d", size = 254008, upload-time = "2026-05-26T20:39:35.009Z" },
+ { url = "https://files.pythonhosted.org/packages/42/06/36f4aa9ca8a815e6036156e80706a67828bb97bd826948244f6996dda957/coverage-7.14.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f0cfc27c539f07cf5c0a4cfe211d0b6cae039f8f40526dbaa71944e64b50a7b", size = 255241, upload-time = "2026-05-26T20:39:36.71Z" },
+ { url = "https://files.pythonhosted.org/packages/ca/79/95266316352f90f6b1c6736bb413302edfde2453fb32422d3911642691b3/coverage-7.14.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:221c70f316241a78e77e607c227cefc8808d4e08f28d99c04f35694690e940be", size = 257373, upload-time = "2026-05-26T20:39:38.412Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/9c/58316d1f66c488b5fca8a0eb3e98348807813efa8a0d0833b9021be27488/coverage-7.14.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:da028256b04ec30e5e0114b6f76172938c313991f0a2d3d894271315cf5d5e43", size = 251635, upload-time = "2026-05-26T20:39:40.268Z" },
+ { url = "https://files.pythonhosted.org/packages/ef/5a/ca2398a568e16fed7bb713e84ba3603a7164fb65779abe645c565ec890d5/coverage-7.14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:76a085d7005236a767e3426148b2c407e53ad61695c562f8a81da2d373324901", size = 253373, upload-time = "2026-05-26T20:39:42.145Z" },
+ { url = "https://files.pythonhosted.org/packages/6e/2c/0396562c32deaebe7be51d865b3a41e9a87d7561acafe1a28f53b07e019a/coverage-7.14.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b553d04b5e778a8e56d57eb134aff42a92718ecba45e79c4764ecfa40efd92ff", size = 251341, upload-time = "2026-05-26T20:39:43.907Z" },
+ { url = "https://files.pythonhosted.org/packages/fd/8f/a94f9221184c9cae1ee115820e3798e48b6b17777a9f19e46fb9a0c8dc74/coverage-7.14.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:46f714d2fb8ae2f4f29f23ada7f1e79b759fff5a70f94a1dac23af204c3ec9e4", size = 255497, upload-time = "2026-05-26T20:39:46.166Z" },
+ { url = "https://files.pythonhosted.org/packages/71/69/505d70e47db1eaebcd002c39759707621ef184cd6b1ae084d9f41293f323/coverage-7.14.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:1896f5e19ff3f0431c7ce2172adc54890fd97f86b59ced8ca1649145d9ffe35d", size = 251159, upload-time = "2026-05-26T20:39:48.03Z" },
+ { url = "https://files.pythonhosted.org/packages/e0/aa/58681c383aa33a9d2ed40a02d7a22fbf780d1fa4d575396365777828198c/coverage-7.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:62fd185ef9df3c33d1c8178c5af105f762afbad96038de9a4ae100aa6297ca33", size = 252934, upload-time = "2026-05-26T20:39:49.872Z" },
+ { url = "https://files.pythonhosted.org/packages/eb/fd/11c928cd6bdffc7074bb5965c173d9ebf517fb00205e1da524b98d29ef92/coverage-7.14.1-cp313-cp313-win32.whl", hash = "sha256:ab4af6352741a604c431c6072fce5bee33bf0f20dc7a56618d6bf6bb89e9810c", size = 222584, upload-time = "2026-05-26T20:39:51.68Z" },
+ { url = "https://files.pythonhosted.org/packages/6f/92/fb416fc26d340dcba19518c418d6048e913186e17243982c5e435e41fa7a/coverage-7.14.1-cp313-cp313-win_amd64.whl", hash = "sha256:7af486dabe8954d03b087f0021540897afe084f04e16ff5579e08cc46f871416", size = 223394, upload-time = "2026-05-26T20:39:53.472Z" },
+ { url = "https://files.pythonhosted.org/packages/73/c6/02d56e3867972f77d5036de924643f26c056e848f00452cafb4dbc3c29b4/coverage-7.14.1-cp313-cp313-win_arm64.whl", hash = "sha256:2224f89ffd0c5605ccce1ed7a584da162bc7c55f601ab1c946bc9de31a486b42", size = 222015, upload-time = "2026-05-26T20:39:55.374Z" },
+ { url = "https://files.pythonhosted.org/packages/4d/9e/fcc77914050df73f7662fa1f00902774c79c075a8388ab334074574bf77e/coverage-7.14.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:de286598cc65d2b489411174b1faec2f5a7775fb3201fd925db2a76b4030f37d", size = 220733, upload-time = "2026-05-26T20:39:57.189Z" },
+ { url = "https://files.pythonhosted.org/packages/f7/67/2963cbdaf5cbadec44efa3a1e39eaa1f02df4079585f05387607a221e126/coverage-7.14.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:042c46ded7c288aeb07cf14a28b6c1e10b78fcba40171c3fa1e939377eeef0b5", size = 221086, upload-time = "2026-05-26T20:39:59.019Z" },
+ { url = "https://files.pythonhosted.org/packages/c8/c5/8701645574e11881f2f47d8930f98bc48b5d43b25eb5b4430dfc4a2f9f48/coverage-7.14.1-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f4ddbe407477f04c45115d1a4e5bc480f753553b534d338d4c3358b1cdd0ea52", size = 262381, upload-time = "2026-05-26T20:40:00.822Z" },
+ { url = "https://files.pythonhosted.org/packages/7c/28/7a64d73598263e0c5abd5084211a8474488d31b3c552ff531c719dfcff62/coverage-7.14.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d13e6725992e2d2fd7d81d4f5241952d13740121dfd501da09201be39b2c003a", size = 264458, upload-time = "2026-05-26T20:40:02.506Z" },
+ { url = "https://files.pythonhosted.org/packages/fa/d8/4969179db9f7eb4df218e69540adf829d1c835f59452513d065d15446802/coverage-7.14.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f747dc8edcfe740130f28f32f3995e955494285717e86ee25af51db2219df08a", size = 266884, upload-time = "2026-05-26T20:40:04.421Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/78/a45d5794dbc9bafd97afc96a4377c86c7820d78b6cf51b89bc1d4e919275/coverage-7.14.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ced2f09ef276fd58611a1ef502164ad266d2b75174e5a40cabbdb4033f9f6cf2", size = 268022, upload-time = "2026-05-26T20:40:06.298Z" },
+ { url = "https://files.pythonhosted.org/packages/21/cb/4f5e354e9e3e67af96bd4e57113e6db6b22298c7168b13eec408a549903d/coverage-7.14.1-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b84800013769a78ccb9ef4659402e26d06867e337b61ec365f77ad008adea80e", size = 261631, upload-time = "2026-05-26T20:40:08.226Z" },
+ { url = "https://files.pythonhosted.org/packages/ec/49/eced49af4cb996d5d8b7e94e736175c513e4facd3398507b89892b4326d8/coverage-7.14.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ea8cd6ca0ee9f616aaef3afc6882e32c2cbf18b00d96313ffd76af650574034d", size = 264443, upload-time = "2026-05-26T20:40:10.137Z" },
+ { url = "https://files.pythonhosted.org/packages/f1/d8/5603a88a7c5913a6b54f6cb1a8c46f7b39cbb30f27cd3f492908da09b2d7/coverage-7.14.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:aa5e304a873fabddc11e484e9b6b738bd38bd7bed17b09aa84eecf5332e8b8bb", size = 262069, upload-time = "2026-05-26T20:40:11.999Z" },
+ { url = "https://files.pythonhosted.org/packages/f0/59/2ae3cb79da554a06c8619d6c88ea19dd1e4aed4b834b6a83bb1fa243bdc5/coverage-7.14.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:5a1c5215be81035e629d5bc756650634d0bf31991038db7a0eccb90f025ce16d", size = 265780, upload-time = "2026-05-26T20:40:13.858Z" },
+ { url = "https://files.pythonhosted.org/packages/af/5f/b130c1dc999031f2648bd25317fbce505ad8d5562079b4ed81e736a84967/coverage-7.14.1-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:79058c47dae6788504b5effb319961bcd72d7240551464b91d474bc0ed186d69", size = 260970, upload-time = "2026-05-26T20:40:16.142Z" },
+ { url = "https://files.pythonhosted.org/packages/87/d1/ec13ccddeb48ec963bdfa72a11224bac2584bd045ba13beca82f8113e9c7/coverage-7.14.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:370c5afae3fa0658e11694a32b24c2778f6bc2d17718121f94ee185e69f26b54", size = 263157, upload-time = "2026-05-26T20:40:18.382Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/c2/cd91ead503045161092d3845f7bb95ea2f25131ce96d3e314dd835d91b9c/coverage-7.14.1-cp313-cp313t-win32.whl", hash = "sha256:3758dd0a7f1fa57365ef2e781df0f0731d38b6e3772259d13dae4bd8a958d4b1", size = 223259, upload-time = "2026-05-26T20:40:20.381Z" },
+ { url = "https://files.pythonhosted.org/packages/71/9f/1e28d97e6bd2c76b07f38b7c02870f1371255ff6717f54eca578fcbbdd0e/coverage-7.14.1-cp313-cp313t-win_amd64.whl", hash = "sha256:6ff665fb023a77386fe11685190cee1f60a7d635994a30d9b0a061533d470fce", size = 224320, upload-time = "2026-05-26T20:40:22.316Z" },
+ { url = "https://files.pythonhosted.org/packages/a9/e0/d936e908f0e1efa55e52b91e01b52f1055cef5e1ab2718493390ed8e2fb8/coverage-7.14.1-cp313-cp313t-win_arm64.whl", hash = "sha256:17a5a241e5997621a956a7f402a7433ef4221e5152809b785bec79e2323799f1", size = 222577, upload-time = "2026-05-26T20:40:24.894Z" },
+ { url = "https://files.pythonhosted.org/packages/8a/3c/1a983b9a745d7f83d53f057bcc5bf79ba6a2bbc08266b3f0c7d6fe630c9b/coverage-7.14.1-py3-none-any.whl", hash = "sha256:a252f21c27e38347e60111a3266b03827422a7d5525951aceee313aa68bab1d2", size = 211815, upload-time = "2026-05-26T20:41:34.078Z" },
+]
+
+[package.optional-dependencies]
+toml = [
+ { name = "tomli", marker = "python_full_version <= '3.11'" },
+]
+
[[package]]
name = "csvw"
version = "4.0.0"
@@ -219,7 +314,7 @@ name = "exceptiongroup"
version = "1.3.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
- { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+ { name = "typing-extensions", marker = "python_full_version < '3.11'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" }
wheels = [
@@ -280,6 +375,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/5e/d4e9f1a599fb8e573b7b87160658329fbf28d19eac2718f51fc3def3aa5a/idna-3.18-py3-none-any.whl", hash = "sha256:7f952cbe720b688055e3f87de14f5c3e5fdaa8bc3928985c4077ca689de849a2", size = 65455, upload-time = "2026-06-02T14:34:06.319Z" },
]
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
+]
+
[[package]]
name = "isodate"
version = "0.7.2"
@@ -609,6 +713,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/81/e6/cd9575ac904136b3cbf7aa7ee819ef86eedb7274e46f230e94ea4342e729/platformdirs-4.10.0-py3-none-any.whl", hash = "sha256:fb516cdb12eb0d857d0cd85a7c57cea4d060bee4578d6cf5a14dfdf8cbf8784a", size = 22743, upload-time = "2026-05-28T03:32:52.175Z" },
]
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
+]
+
[[package]]
name = "prompt-toolkit"
version = "3.0.52"
@@ -663,6 +776,38 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/10/bd/c038d7cc38edc1aa5bf91ab8068b63d4308c66c4c8bb3cbba7dfbc049f9c/pyparsing-3.3.2-py3-none-any.whl", hash = "sha256:850ba148bd908d7e2411587e247a1e4f0327839c40e2e5e6d05a007ecc69911d", size = 122781, upload-time = "2026-01-21T03:57:55.912Z" },
]
+[[package]]
+name = "pytest"
+version = "9.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "colorama", marker = "sys_platform == 'win32'" },
+ { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
+ { name = "iniconfig" },
+ { name = "packaging" },
+ { name = "pluggy" },
+ { name = "pygments" },
+ { name = "tomli", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/84/0e/b5858858d74958632c49b72cb25a3976ff9f632397626715be71c89d3971/pytest-9.1.0.tar.gz", hash = "sha256:41dd9148c08072446394cefd3d79701701335a9f4cae69ba92e39f6c7f5c061c", size = 1634181, upload-time = "2026-06-13T18:52:45.983Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/8b/5a/ba30a81239b909821b3153e303e7def45178bf353da4f72380e6c5e8793b/pytest-9.1.0-py3-none-any.whl", hash = "sha256:8ebb0e7888bdf2bdfc602ec51f8f62d50200af37356c74e503c79a94f5c81f32", size = 386453, upload-time = "2026-06-13T18:52:44.045Z" },
+]
+
+[[package]]
+name = "pytest-cov"
+version = "7.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "coverage", extra = ["toml"] },
+ { name = "pluggy" },
+ { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/51/a849f96e117386044471c8ec2bd6cfebacda285da9525c9106aeb28da671/pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2", size = 55592, upload-time = "2026-03-21T20:11:16.284Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/9d/7a/d968e294073affff457b041c2be9868a40c1c71f4a35fcc1e45e5493067b/pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678", size = 22876, upload-time = "2026-03-21T20:11:14.438Z" },
+]
+
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -1106,6 +1251,42 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/33/d1/8bb87d21e9aeb323cc03034f5eaf2c8f69841e40e4853c2627edf8111ed3/termcolor-3.3.0-py3-none-any.whl", hash = "sha256:cf642efadaf0a8ebbbf4bc7a31cec2f9b5f21a9f726f4ccbb08192c9c26f43a5", size = 7734, upload-time = "2025-12-29T12:55:20.718Z" },
]
+[[package]]
+name = "tomli"
+version = "2.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30", size = 154704, upload-time = "2026-03-25T20:21:10.473Z" },
+ { url = "https://files.pythonhosted.org/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a", size = 149454, upload-time = "2026-03-25T20:21:12.036Z" },
+ { url = "https://files.pythonhosted.org/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076", size = 237561, upload-time = "2026-03-25T20:21:13.098Z" },
+ { url = "https://files.pythonhosted.org/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9", size = 243824, upload-time = "2026-03-25T20:21:14.569Z" },
+ { url = "https://files.pythonhosted.org/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c", size = 242227, upload-time = "2026-03-25T20:21:15.712Z" },
+ { url = "https://files.pythonhosted.org/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc", size = 247859, upload-time = "2026-03-25T20:21:17.001Z" },
+ { url = "https://files.pythonhosted.org/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049", size = 97204, upload-time = "2026-03-25T20:21:18.079Z" },
+ { url = "https://files.pythonhosted.org/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e", size = 108084, upload-time = "2026-03-25T20:21:18.978Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece", size = 95285, upload-time = "2026-03-25T20:21:20.309Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a", size = 155924, upload-time = "2026-03-25T20:21:21.626Z" },
+ { url = "https://files.pythonhosted.org/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085", size = 150018, upload-time = "2026-03-25T20:21:23.002Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9", size = 244948, upload-time = "2026-03-25T20:21:24.04Z" },
+ { url = "https://files.pythonhosted.org/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5", size = 253341, upload-time = "2026-03-25T20:21:25.177Z" },
+ { url = "https://files.pythonhosted.org/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585", size = 248159, upload-time = "2026-03-25T20:21:26.364Z" },
+ { url = "https://files.pythonhosted.org/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1", size = 253290, upload-time = "2026-03-25T20:21:27.46Z" },
+ { url = "https://files.pythonhosted.org/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917", size = 98141, upload-time = "2026-03-25T20:21:28.492Z" },
+ { url = "https://files.pythonhosted.org/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9", size = 108847, upload-time = "2026-03-25T20:21:29.386Z" },
+ { url = "https://files.pythonhosted.org/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257", size = 95088, upload-time = "2026-03-25T20:21:30.677Z" },
+ { url = "https://files.pythonhosted.org/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54", size = 155866, upload-time = "2026-03-25T20:21:31.65Z" },
+ { url = "https://files.pythonhosted.org/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a", size = 149887, upload-time = "2026-03-25T20:21:33.028Z" },
+ { url = "https://files.pythonhosted.org/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897", size = 243704, upload-time = "2026-03-25T20:21:34.51Z" },
+ { url = "https://files.pythonhosted.org/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f", size = 251628, upload-time = "2026-03-25T20:21:36.012Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d", size = 247180, upload-time = "2026-03-25T20:21:37.136Z" },
+ { url = "https://files.pythonhosted.org/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5", size = 251674, upload-time = "2026-03-25T20:21:38.298Z" },
+ { url = "https://files.pythonhosted.org/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd", size = 97976, upload-time = "2026-03-25T20:21:39.316Z" },
+ { url = "https://files.pythonhosted.org/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36", size = 108755, upload-time = "2026-03-25T20:21:40.248Z" },
+ { url = "https://files.pythonhosted.org/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd", size = 95265, upload-time = "2026-03-25T20:21:41.219Z" },
+ { url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" },
+]
+
[[package]]
name = "typer"
version = "0.26.7"
From 26ec0bfc53356f680106a385f039ae839d2d52d1 Mon Sep 17 00:00:00 2001
From: Chirag Patil
Date: Sun, 14 Jun 2026 13:29:29 +0530
Subject: [PATCH 2/2] fix: harden MkDocs URL and sitemap handling
- Reject backslashes and unsafe protocol-relative references
- Preserve stale catalogs when fresh sitemaps cannot be parsed
- Parse XML using declared encodings and skip mismatched paths
- Add regression tests for reviewed edge cases
---
binge_docs/documentation_fetching.py | 2 +
binge_docs/documentation_sources.py | 17 +++++--
tests/test_documentation_fetching.py | 2 +
tests/test_documentation_sources.py | 73 +++++++++++++++++++++++++---
4 files changed, 83 insertions(+), 11 deletions(-)
diff --git a/binge_docs/documentation_fetching.py b/binge_docs/documentation_fetching.py
index 940e14d..0f101c4 100644
--- a/binge_docs/documentation_fetching.py
+++ b/binge_docs/documentation_fetching.py
@@ -196,6 +196,8 @@ def normalize_path(path: str) -> str:
"""Normalize a path and reject traversal or dot segments."""
decoded_path = unquote(path or "/")
+ if "\\" in decoded_path:
+ raise ValueError("Documentation paths cannot contain backslashes.")
if not decoded_path.startswith("/"):
decoded_path = f"/{decoded_path}"
diff --git a/binge_docs/documentation_sources.py b/binge_docs/documentation_sources.py
index 527e177..256031d 100644
--- a/binge_docs/documentation_sources.py
+++ b/binge_docs/documentation_sources.py
@@ -94,7 +94,9 @@ def load_catalog(self) -> CatalogResult:
try:
pages = parse_sitemap(fetched_sitemap.content, self.base_url, self.url_policy)
- except (ElementTree.ParseError, UnicodeDecodeError, ValueError) as error:
+ except (ElementTree.ParseError, LookupError, ValueError) as error:
+ if cached_pages is not None:
+ return CatalogResult(cached_pages, used_stale_cache=True)
raise CatalogError(
f"The sitemap at {sitemap_url} could not be understood."
) from error
@@ -233,7 +235,7 @@ def parse_sitemap(
) -> tuple[DocumentationPage, ...]:
"""Parse canonical pages from a MkDocs sitemap."""
- root = ElementTree.fromstring(content.decode("utf-8"))
+ root = ElementTree.fromstring(content)
pages: list[DocumentationPage] = []
seen_urls: set[str] = set()
@@ -247,7 +249,10 @@ def parse_sitemap(
if page_url in seen_urls:
continue
- relative_path = relative_page_path(page_url, base_url)
+ try:
+ relative_path = relative_page_path(page_url, base_url)
+ except ValueError:
+ continue
pages.append(page_from_url(page_url, relative_path))
seen_urls.add(page_url)
@@ -309,7 +314,11 @@ def resolve_page_slug(page_reference: str, base_url: str, policy: URLPolicy) ->
if reference == "/":
candidate = base_url
else:
- candidate = reference if "://" in reference else urljoin(base_url, reference)
+ parsed_reference = urlparse(reference)
+ if parsed_reference.scheme or parsed_reference.netloc:
+ candidate = reference
+ else:
+ candidate = urljoin(base_url, reference.lstrip("/"))
try:
page_url = policy.validate_url(candidate)
except ValueError as error:
diff --git a/tests/test_documentation_fetching.py b/tests/test_documentation_fetching.py
index 273d668..01bec4a 100644
--- a/tests/test_documentation_fetching.py
+++ b/tests/test_documentation_fetching.py
@@ -87,6 +87,8 @@ def test_url_policy_validates_and_normalizes_urls() -> None:
("https://evil.example.com/guide/start/", ("/guide/",)),
("https://docs.example.com/private/start/", ("/guide/",)),
("https://docs.example.com/guide/%2e%2e/secret/", ("/guide/",)),
+ ("https://docs.example.com/guide\\secret/", ("/guide/",)),
+ ("https://docs.example.com/guide/%5Csecret/", ("/guide/",)),
],
)
def test_url_policy_rejects_invalid_urls(url: str, allowed_prefixes: tuple[str, ...]) -> None:
diff --git a/tests/test_documentation_sources.py b/tests/test_documentation_sources.py
index 2f796e0..6681b22 100644
--- a/tests/test_documentation_sources.py
+++ b/tests/test_documentation_sources.py
@@ -84,6 +84,8 @@ def test_normalize_base_url_and_source_metadata(tmp_path) -> None:
"https://docs.example.com/?q=1",
"https://docs.example.com/#intro",
"https://docs.example.com/%2e%2e/private/",
+ "https://docs.example.com/guide\\private/",
+ "https://docs.example.com/guide/%5Cprivate/",
],
)
def test_normalize_base_url_rejects_invalid_urls(url: str) -> None:
@@ -129,8 +131,50 @@ def test_parse_sitemap_rejects_bad_or_empty_documents() -> None:
parse_sitemap(b"", "https://docs.example.com/guide/", policy)
with pytest.raises(ValueError):
parse_sitemap(b"", "https://docs.example.com/guide/", policy)
- with pytest.raises(UnicodeDecodeError):
+ with pytest.raises(ElementTree.ParseError):
parse_sitemap(b"\xff", "https://docs.example.com/guide/", policy)
+ with pytest.raises(LookupError):
+ parse_sitemap(
+ b'',
+ "https://docs.example.com/guide/",
+ policy,
+ )
+
+
+def test_parse_sitemap_honors_declared_encoding() -> None:
+ base_url = "https://docs.example.com/guide/"
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/guide/",),
+ )
+ sitemap = (
+ ''
+ ''
+ "https://docs.example.com/guide/caf\xe9/"
+ ""
+ ).encode("iso-8859-1")
+
+ pages = parse_sitemap(sitemap, base_url, policy)
+
+ assert pages[0].slug == "caf\xe9"
+
+
+def test_parse_sitemap_skips_urls_outside_base_path() -> None:
+ base_url = "https://docs.example.com/guide/"
+ policy = URLPolicy(
+ allowed_origins=("https://docs.example.com",),
+ allowed_path_prefixes=("/",),
+ )
+ sitemap = b"""
+
+ https://docs.example.com/other/
+ https://docs.example.com/guide/valid/
+
+ """
+
+ pages = parse_sitemap(sitemap, base_url, policy)
+
+ assert tuple(page.slug for page in pages) == ("valid",)
def test_source_loads_and_caches_catalog(tmp_path) -> None:
@@ -151,7 +195,7 @@ def test_source_loads_and_caches_catalog(tmp_path) -> None:
assert fetcher.calls == [sitemap_url]
-def test_source_uses_stale_catalog_only_for_download_failures(tmp_path) -> None:
+def test_source_uses_stale_catalog_for_download_and_parse_failures(tmp_path) -> None:
sitemap_url = "https://docs.example.com/guide/sitemap.xml"
cache = WebCache(cache_dir=tmp_path, ttl_seconds=1)
source = MkDocsSource(
@@ -159,7 +203,7 @@ def test_source_uses_stale_catalog_only_for_download_failures(tmp_path) -> None:
cache=cache,
fetcher=StubFetcher({sitemap_url: SITEMAP}),
)
- source.load_catalog()
+ cached_pages = source.load_catalog().pages
snapshot_path = source._catalog_snapshot_path()
os.utime(snapshot_path, (1, 1))
@@ -167,9 +211,18 @@ def test_source_uses_stale_catalog_only_for_download_failures(tmp_path) -> None:
source.fetcher = StubFetcher({sitemap_url: OSError("offline")})
assert source.load_catalog().used_stale_cache
- source.fetcher = StubFetcher({sitemap_url: b""})
- with pytest.raises(CatalogError):
- source.load_catalog()
+ snapshot_content = snapshot_path.read_bytes()
+ invalid_sitemaps = (
+ b"",
+ b'',
+ b"",
+ )
+ for invalid_sitemap in invalid_sitemaps:
+ source.fetcher = StubFetcher({sitemap_url: invalid_sitemap})
+ parse_fallback = source.load_catalog()
+ assert parse_fallback.used_stale_cache
+ assert parse_fallback.pages == cached_pages
+ assert snapshot_path.read_bytes() == snapshot_content
def test_source_reports_catalog_download_and_encoding_failures(tmp_path) -> None:
@@ -178,7 +231,10 @@ def test_source_reports_catalog_download_and_encoding_failures(tmp_path) -> None
with pytest.raises(CatalogError):
source.load_catalog()
- source = make_source(tmp_path / "utf8", {sitemap_url: b"\xff"})
+ source = make_source(
+ tmp_path / "encoding",
+ {sitemap_url: b''},
+ )
with pytest.raises(CatalogError):
source.load_catalog()
@@ -295,10 +351,13 @@ def test_page_helpers_cover_home_nested_and_invalid_references() -> None:
assert title_from_segment("first_steps.html") == "First Steps"
assert page_from_url(base_url, "").title == "Home"
assert resolve_page_slug("tutorial/page", base_url, policy) == "tutorial/page"
+ assert resolve_page_slug("/tutorial/page", base_url, policy) == "tutorial/page"
assert resolve_page_slug(base_url, base_url, policy) == "/"
assert resolve_page_slug("/", base_url, policy) == "/"
with pytest.raises(PageError):
resolve_page_slug("", base_url, policy)
+ with pytest.raises(PageError):
+ resolve_page_slug("//evil.example.com/page/", base_url, policy)
with pytest.raises(PageError):
resolve_page_slug("https://docs.example.com/guidebook/page/", base_url, policy)