diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 621e55a..759227e 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -6,6 +6,7 @@ import logging import os +import re import shutil import subprocess import time @@ -23,6 +24,29 @@ logger = logging.getLogger(__name__) +def _uses_sqlite_docgen(lean_version: str) -> bool: + """Return whether the matching doc-gen4 release writes api-docs.db. + + Doc-gen4's SQLite output landed after `v4.29.0-rc1` and is present starting + with `v4.29.0-rc2`. + """ + match = extract_lean_version(lean_version) + version_match = re.match(r"^v(\d+)\.(\d+)\.(\d+)(?:-rc(\d+))?$", match) + if version_match is None: + return False + + major, minor, patch, rc = version_match.groups() + version = (int(major), int(minor), int(patch)) + if version > (4, 29, 0): + return True + if version < (4, 29, 0): + return False + + if rc is None: + return True + return int(rc) >= 2 + + def _clear_workspace_cache(workspace_path: Path) -> None: """Clear entire Lake cache to force complete rebuild. @@ -47,8 +71,8 @@ def _clear_workspace_cache(workspace_path: Path) -> None: shutil.rmtree(lake_dir) -def _get_doc_lib_names(package_name: str) -> list[str]: - """Get the library names to run doc-gen4 on for a package. +def _get_library_names(package_name: str) -> list[str]: + """Get the library names to build and run doc-gen4 on for a package. Some packages have custom extract wrappers, others use upstream libraries directly. """ @@ -62,6 +86,57 @@ def _get_doc_lib_names(package_name: str) -> list[str]: return lib_names.get(package_name, [f"{package_name.title()}Extract"]) +def _run_lake_build_target( + workspace_path: Path, + package_name: str, + target: str, + env: dict[str, str], + allow_failure: bool = False, +) -> bool: + """Run ``lake build `` streaming output to the logger. + + Args: + workspace_path: Path to the Lake workspace directory. + package_name: Name of the package for log messages. + target: The Lake build target to run. + env: Environment variables to pass to the subprocess. + allow_failure: Whether to continue when the target fails. + + Returns: + ``True`` if the target built successfully, otherwise ``False``. + + Raises: + RuntimeError: If the target fails and ``allow_failure`` is ``False``. + """ + logger.info("[%s] Running lake build %s...", package_name, target) + process = subprocess.Popen( + ["lake", "build", target], + cwd=workspace_path, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + ) + if process.stdout: + for line in process.stdout: + logger.info(line.rstrip()) + + returncode = process.wait() + if returncode == 0: + return True + + if allow_failure: + logger.warning( + "[%s] lake build %s failed (continuing with generated docs)", + package_name, + target, + ) + return False + + raise RuntimeError(f"lake build failed for {package_name} target {target}") + + def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]: """Fetch toolchain from GitHub and update lakefile. @@ -155,44 +230,27 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None: if result.returncode != 0: logger.warning("[%s] Cache fetch failed (non-fatal)", package_name) - logger.info("[%s] Running lake build...", package_name) - process = subprocess.Popen( - ["lake", "build"], - cwd=workspace_path, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - env=env, - ) - if process.stdout: - for line in process.stdout: - logger.info(line.rstrip()) - if process.wait() != 0: - raise RuntimeError(f"lake build failed for {package_name}") + # For SQLite-format doc-gen4 we only need :docInfo, which populates + # api-docs.db and stops there. The :docs facet additionally runs `fromDb` + # to generate all HTML and the search index, which we don't consume. + # Older BMP-format doc-gen4 has no :docInfo facet, so :docs is the only + # option there. + toolchain_file = workspace_path / "lean-toolchain" + lake_target = "docs" + if toolchain_file.is_file(): + toolchain = toolchain_file.read_text().strip() + if toolchain and _uses_sqlite_docgen(toolchain): + lake_target = "docInfo" - lib_names = _get_doc_lib_names(package_name) + lib_names = _get_library_names(package_name) for lib_name in lib_names: - logger.info("[%s] Running doc-gen4 (%s:docs)...", package_name, lib_name) - - process = subprocess.Popen( - ["lake", "build", f"{lib_name}:docs"], - cwd=workspace_path, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - env=env, + _run_lake_build_target( + workspace_path, + package_name, + f"{lib_name}:{lake_target}", + env, + allow_failure=True, ) - if process.stdout: - for line in process.stdout: - logger.info(line.rstrip()) - returncode = process.wait() - if returncode != 0: - logger.warning( - "[%s] doc-gen4 had failures for %s (continuing with generated docs)", - package_name, lib_name, - ) async def run_doc_gen4( @@ -227,12 +285,29 @@ async def run_doc_gen4( workspace_path = Path("lean") / package_name logger.info("\n%s\nPackage: %s\n%s", "=" * 50, package_name, "=" * 50) + toolchain = None + ref = None if fresh: - _clear_workspace_cache(workspace_path) + if setup: + toolchain, ref = _setup_workspace(config) + logger.info("Toolchain: %s, ref: %s", toolchain, ref) + + # Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2. + # The SQLite format handles incremental updates, while legacy BMP + # output requires a cache clear to avoid stale files. + if toolchain and _uses_sqlite_docgen(toolchain): + logger.info( + "[%s] Skipping cache clear " + "(api-docs.db handles incremental updates)", + package_name, + ) + else: + _clear_workspace_cache(workspace_path) if setup: - toolchain, ref = _setup_workspace(config) - logger.info("Toolchain: %s, ref: %s", toolchain, ref) + if toolchain is None or ref is None: + toolchain, ref = _setup_workspace(config) + logger.info("Toolchain: %s, ref: %s", toolchain, ref) _run_lake_for_package(package_name, verbose) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 707e62a..2ca2aa4 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -1,12 +1,17 @@ """Parser for Lean doc-gen4 output files. -This module parses doc-gen4 JSON data and extracts Lean source code -to produce Declaration objects ready for database insertion. +This module parses doc-gen4 output and extracts Lean source code to produce +Declaration objects ready for database insertion. + +Supports two doc-gen4 output formats: +- SQLite database (api-docs.db): Used by doc-gen4 >= v4.29.0-rc2 +- BMP JSON files (.bmp): Used by doc-gen4 < v4.29.0-rc2 """ import json import logging import re +import sqlite3 from pathlib import Path from rich.progress import ( @@ -26,6 +31,148 @@ logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# RenderedCode BLOB parser +# +# Doc-gen4 stores declaration type signatures in the `name_info.type` column +# as a binary BLOB using leansqlite's ToBinary serialization format. +# +# The type is RenderedCode = TaggedText RenderedCode.Tag where: +# TaggedText: text(0) String | tag(1) Tag TaggedText | append(2) Array +# Tag: keyword(0) | string(1) | const(2) Name | sort-none(3) +# | sort-type(4) | sort-prop(5) | sort-sort(6) | otherExpr(7) +# Name: anonymous(0) | str(1) Name String | num(2) Name Nat +# +# Encoding primitives (big-endian, leansqlite Classes.lean): +# Nat – variable-length 7-bit chunks, high bit = continuation +# String – Nat(utf8_byte_length) + raw UTF-8 bytes +# Array – Nat(count) + elements +# --------------------------------------------------------------------------- + + +class _BlobReader: + """Minimal reader for leansqlite ToBinary format.""" + + __slots__ = ("_data", "_cursor") + + def __init__(self, data: bytes) -> None: + self._data = data + self._cursor = 0 + + def _read_byte(self) -> int: + if self._cursor >= len(self._data): + raise ValueError("Unexpected end of BLOB data") + value = self._data[self._cursor] + self._cursor += 1 + return value + + def _read_nat(self) -> int: + """Read a variable-length natural number (7-bit chunks, MSB = more).""" + result = 0 + shift = 0 + while True: + byte = self._read_byte() + if byte >= 128: + result |= (byte & 0x7F) << shift + else: + result |= byte << shift + break + shift += 7 + return result + + def _read_string(self) -> str: + byte_length = self._read_nat() + if self._cursor + byte_length > len(self._data): + raise ValueError("String extends past end of BLOB data") + raw = self._data[self._cursor : self._cursor + byte_length] + self._cursor += byte_length + return raw.decode("utf-8") + + def _read_name(self) -> str: + """Read a Lean Name and return its dot-separated string form.""" + tag = self._read_byte() + if tag == 0: # anonymous + return "" + if tag == 1: # str parent s + parent = self._read_name() + component = self._read_string() + return f"{parent}.{component}" if parent else component + if tag == 2: # num parent n + parent = self._read_name() + number = self._read_nat() + return f"{parent}.{number}" if parent else str(number) + raise ValueError(f"Invalid Name tag: {tag}") + + def _skip_name(self) -> None: + """Skip over a Name without allocating strings.""" + tag = self._read_byte() + if tag == 0: + return + if tag == 1: + self._skip_name() + byte_length = self._read_nat() + self._cursor += byte_length + return + if tag == 2: + self._skip_name() + self._read_nat() + return + raise ValueError(f"Invalid Name tag: {tag}") + + +def _extract_names_from_rendered_code(blob: bytes) -> list[str]: + """Extract referenced declaration names from a RenderedCode BLOB. + + Walks the TaggedText tree and collects Lean Names from every + RenderedCode.Tag.const node (tag byte 2). + + Args: + blob: Raw bytes of the RenderedCode BLOB from name_info.type. + + Returns: + De-duplicated list of fully-qualified Lean names referenced in the type. + """ + reader = _BlobReader(blob) + names: list[str] = [] + seen: set[str] = set() + + def walk_tagged_text() -> None: + tag = reader._read_byte() + if tag == 0: # text + reader._read_string() + elif tag == 1: # tag + walk_tag() + walk_tagged_text() + elif tag == 2: # append + count = reader._read_nat() + for _ in range(count): + walk_tagged_text() + else: + raise ValueError(f"Invalid TaggedText tag: {tag}") + + def walk_tag() -> None: + tag = reader._read_byte() + if tag <= 1 or (3 <= tag <= 7): + # keyword(0), string(1), sort-none(3), sort-type(4), + # sort-prop(5), sort-sort(6), otherExpr(7) — no payload + return + if tag == 2: # const + name = reader._read_name() + if name and name not in seen: + names.append(name) + seen.add(name) + return + raise ValueError(f"Invalid RenderedCode.Tag tag: {tag}") + + try: + walk_tagged_text() + except (ValueError, IndexError): + logger.debug("Failed to parse RenderedCode BLOB (%d bytes)", len(blob)) + return [] + + return names + + def _strip_lean_comments(source_text: str) -> str: """Strip Lean comments from source text for comparison. @@ -270,6 +417,16 @@ def _extract_source_text( package_name.replace("-", "").lower(), ]: if variant in package_cache: + if variant == "lean4" and file_path_string.startswith("src/lean/"): + adjusted_path = file_path_string[9:] + candidates.append(package_cache[variant] / adjusted_path) + continue + if variant == "lean4" and file_path_string.startswith("src/lake/"): + adjusted_path = file_path_string[9:] + candidates.append( + package_cache[variant].parent / "lake" / adjusted_path + ) + continue if variant == "lean4" and file_path_string.startswith("src/"): adjusted_path = file_path_string[4:] else: @@ -292,6 +449,251 @@ def _extract_source_text( ) +def _read_lean_toolchain_version(workspace_path: Path) -> str | None: + """Read the Lean version from a workspace's lean-toolchain file. + + Args: + workspace_path: Path to the package workspace (e.g., lean/mathlib). + + Returns: + Version string like 'v4.29.0-rc6', or None if not found. + """ + toolchain_file = workspace_path / "lean-toolchain" + if not toolchain_file.exists(): + return None + try: + content = toolchain_file.read_text().strip() + match = re.search(r"v\d+\.\d+\.\d+(?:-rc\d+)?", content) + return match.group() if match else None + except OSError: + return None + + +def _construct_source_link( + module_name: str, + module_source_url: str | None, + start_line: int, + end_line: int, + lean_version: str | None = None, +) -> str | None: + """Construct a GitHub source link from module URL and line range. + + Args: + module_name: Lean module name from api-docs.db. + module_source_url: GitHub URL to the module file from api-docs.db. + start_line: Start line number in the source file. + end_line: End line number in the source file. + lean_version: Lean toolchain version (e.g., 'v4.29.0-rc6') used as + the git ref for core module fallback URLs. + + Returns: + GitHub URL with line range fragment, or None if no source URL exists. + """ + if module_source_url: + return f"{module_source_url}#L{start_line}-L{end_line}" + + git_ref = lean_version or "master" + module_path = module_name.replace(".", "/") + root = module_name.split(".", 1)[0] + if root in {"Init", "Lean", "Std"}: + return ( + f"https://github.com/leanprover/lean4/blob/{git_ref}/" + f"src/lean/{module_path}.lean#L{start_line}-L{end_line}" + ) + if root == "Lake": + return ( + f"https://github.com/leanprover/lean4/blob/{git_ref}/" + f"src/lake/{module_path}.lean#L{start_line}-L{end_line}" + ) + + return None + + +def _parse_declarations_from_sqlite( + database_path: Path, + lean_root: Path, + package_cache: dict[str, Path], + allowed_module_prefixes: list[str], + lean_version: str | None = None, +) -> list[Declaration]: + """Parse declarations from a doc-gen4 SQLite database (api-docs.db). + + Doc-gen4 >= v4.29.0-rc2 outputs declaration data to a SQLite database + instead of individual BMP JSON files. This function reads that database + and produces the same Declaration objects as the BMP parser. + + Args: + database_path: Path to the api-docs.db SQLite database. + lean_root: Root directory of the Lean project. + package_cache: Dictionary mapping package names to their directories. + allowed_module_prefixes: Module prefixes to extract (e.g., ["Mathlib"]). + lean_version: Lean toolchain version for core module source links. + + Returns: + List of parsed Declaration objects. + """ + declarations = [] + + connection = sqlite3.connect(str(database_path)) + connection.row_factory = sqlite3.Row + + try: + # Query declarations with source ranges and both docstring types. + # Doc-gen4 stores docstrings as either markdown text or Verso binary + # BLOBs (never both). We prefer markdown; Verso BLOBs require a + # complex deserializer so we detect but cannot extract them yet. + query = """ + SELECT + n.module_name, + n.position, + n.kind, + n.name, + n.type, + r.start_line, + r.end_line, + d.text AS docstring, + v.content AS verso_docstring, + m.source_url + FROM name_info n + JOIN declaration_ranges r + ON n.module_name = r.module_name AND n.position = r.position + LEFT JOIN declaration_markdown_docstrings d + ON n.module_name = d.module_name AND n.position = d.position + LEFT JOIN declaration_verso_docstrings v + ON n.module_name = v.module_name AND n.position = v.position + JOIN modules m + ON n.module_name = m.name + WHERE n.render = 1 + ORDER BY n.module_name, n.position + """ + rows = connection.execute(query).fetchall() + + logger.info("Found %d declarations in api-docs.db", len(rows)) + + skipped_no_source = 0 + skipped_prefix = 0 + skipped_constructor = 0 + source_errors = 0 + verso_only_docstrings = 0 + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeRemainingColumn(), + ) as progress: + task = progress.add_task( + "[cyan]Parsing api-docs.db...", total=len(rows) + ) + + for row in rows: + module_name = row["module_name"] + declaration_name = row["name"] + + # Filter by module prefix + matches_prefix = any( + module_name == prefix or module_name.startswith(prefix + ".") + for prefix in allowed_module_prefixes + ) + if not matches_prefix: + skipped_prefix += 1 + progress.update(task, advance=1) + continue + + # Skip auto-generated .mk constructors + if declaration_name.endswith(".mk"): + skipped_constructor += 1 + progress.update(task, advance=1) + continue + + source_url = row["source_url"] + start_line = row["start_line"] + end_line = row["end_line"] + + source_link = _construct_source_link( + module_name, source_url, start_line, end_line, + lean_version=lean_version, + ) + if not source_link: + skipped_no_source += 1 + progress.update(task, advance=1) + continue + + # Extract source text from local files + try: + source_text = _extract_source_text( + source_link, lean_root, package_cache + ) + except (FileNotFoundError, ValueError) as error: + source_errors += 1 + if source_errors <= 10: + logger.debug( + "Could not extract source for %s: %s", + declaration_name, error, + ) + progress.update(task, advance=1) + continue + + # Extract dependency names from the type signature BLOB + type_blob = row["type"] + if type_blob: + dep_names = _extract_names_from_rendered_code( + bytes(type_blob) + ) + # Filter out self-references + dep_names = [ + d for d in dep_names if d != declaration_name + ] + dependencies = dep_names or None + else: + dependencies = None + + # Use markdown docstring; detect Verso-only cases + docstring = row["docstring"] + if not docstring and row["verso_docstring"]: + verso_only_docstrings += 1 + + declarations.append( + Declaration( + name=declaration_name, + module=module_name, + docstring=docstring, + source_text=source_text, + source_link=source_link, + dependencies=dependencies, + ) + ) + + progress.update(task, advance=1) + + if skipped_prefix > 0: + logger.info( + "Skipped %d declarations outside allowed prefixes", + skipped_prefix, + ) + if skipped_constructor > 0: + logger.info("Skipped %d .mk constructors", skipped_constructor) + if skipped_no_source > 0: + logger.info("Skipped %d declarations without source URL", skipped_no_source) + if verso_only_docstrings > 0: + logger.warning( + "%d declarations have Verso-only docstrings " + "(not yet supported, stored as docstring=None)", + verso_only_docstrings, + ) + if source_errors > 0: + logger.warning( + "Could not extract source text for %d declarations", + source_errors, + ) + + finally: + connection.close() + + return declarations + + def _parse_declarations_from_files( bmp_files: list[Path], lean_root: Path, @@ -310,6 +712,7 @@ def _parse_declarations_from_files( List of parsed Declaration objects. """ declarations = [] + source_errors = 0 with Progress( SpinnerColumn(), @@ -338,23 +741,33 @@ def _parse_declarations_from_files( for declaration_data in data.get("declarations", []): information = declaration_data["info"] - source_text = _extract_source_text( - information["sourceLink"], lean_root, package_cache - ) + declaration_name = information["name"] + + # Skip auto-generated .mk constructors + if declaration_name.endswith(".mk"): + continue + + try: + source_text = _extract_source_text( + information["sourceLink"], lean_root, package_cache + ) + except (FileNotFoundError, ValueError) as error: + source_errors += 1 + if source_errors <= 10: + logger.debug( + "Could not extract source for %s: %s", + declaration_name, error, + ) + continue header_html = declaration_data.get("header", "") dependencies = _extract_dependencies_from_html(header_html) # Filter out self-references from dependencies - declaration_name = information["name"] filtered_dependencies = [ d for d in dependencies if d != declaration_name ] - # Skip auto-generated .mk constructors - if declaration_name.endswith(".mk"): - continue - declarations.append( Declaration( name=declaration_name, @@ -368,6 +781,12 @@ def _parse_declarations_from_files( progress.update(task, advance=1) + if source_errors > 0: + logger.warning( + "Could not extract source text for %d declarations", + source_errors, + ) + return declarations @@ -428,12 +847,88 @@ async def _insert_declarations_batch( return inserted_count +_REQUIRED_DOCGEN_TABLES = {"name_info", "declaration_ranges", "modules"} + + +def _validate_docgen_sqlite(database_path: Path) -> bool: + """Check that a doc-gen4 api-docs.db is a valid, usable SQLite database. + + Verifies the file is non-empty, opens as SQLite, and contains the tables + that the extraction pipeline requires. + + Args: + database_path: Path to the api-docs.db file. + + Returns: + True if the database is valid and contains the required tables. + """ + if database_path.stat().st_size == 0: + logger.warning("api-docs.db exists but is empty: %s", database_path) + return False + + try: + connection = sqlite3.connect(str(database_path)) + try: + cursor = connection.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + ) + tables = {row[0] for row in cursor.fetchall()} + finally: + connection.close() + except sqlite3.DatabaseError as error: + logger.warning("api-docs.db is not a valid SQLite file: %s", error) + return False + + missing = _REQUIRED_DOCGEN_TABLES - tables + if missing: + logger.warning( + "api-docs.db is missing required tables %s: %s", + missing, database_path, + ) + return False + + return True + + +def _detect_docgen_format(workspace_path: Path) -> str: + """Detect which doc-gen4 output format a workspace uses. + + Doc-gen4 >= v4.29.0-rc2 writes to a SQLite database (api-docs.db). + Earlier versions write individual BMP JSON files to doc-data/. + + The SQLite file is validated before returning "sqlite" to guard against + zero-byte, corrupt, or incompatible databases left by crashed builds. + + Args: + workspace_path: Path to the package workspace (e.g., lean/mathlib). + + Returns: + "sqlite" if a valid api-docs.db exists, "bmp" if BMP files exist, + "none" otherwise. + """ + api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" + if api_docs_db.exists(): + if _validate_docgen_sqlite(api_docs_db): + return "sqlite" + logger.warning( + "Invalid api-docs.db at %s, checking for BMP fallback", + api_docs_db, + ) + + doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" + if doc_data_dir.exists(): + bmp_files = list(doc_data_dir.glob("**/*.bmp")) + if bmp_files: + return "bmp" + + return "none" + + async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> None: """Extract all declarations from doc-gen4 data and load into database. - Looks for BMP files in each package's .lake/build/doc-data directory. - Extracts only declarations matching the package's configured module_prefixes, - ensuring each package's declarations come from its own workspace. + Automatically detects whether each package uses the newer SQLite format + (api-docs.db from doc-gen4 >= v4.29.0-rc2) or the legacy BMP JSON format. Args: engine: SQLAlchemy async engine for database connection. @@ -445,21 +940,13 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N lean_root = Path("lean") all_declarations = [] - # Process each workspace separately with its own package cache for package_name in get_extraction_order(): package_config = PACKAGE_REGISTRY[package_name] - doc_data_dir = lean_root / package_name / ".lake" / "build" / "doc-data" - - if not doc_data_dir.exists(): - logger.warning( - "No doc-data directory for %s: %s", package_name, doc_data_dir - ) - continue - - bmp_files = sorted(doc_data_dir.glob("**/*.bmp")) - logger.info("Found %d BMP files in %s", len(bmp_files), package_name) + workspace_path = lean_root / package_name + docgen_format = _detect_docgen_format(workspace_path) - if not bmp_files: + if docgen_format == "none": + logger.warning("No doc-gen4 output found for %s", package_name) continue # Build workspace-specific package cache to avoid version mismatches @@ -469,9 +956,31 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N package_name, len(package_cache), ) - declarations = _parse_declarations_from_files( - bmp_files, lean_root, package_cache, package_config.module_prefixes - ) + if docgen_format == "sqlite": + api_docs_path = workspace_path / ".lake" / "build" / "api-docs.db" + lean_version = _read_lean_toolchain_version(workspace_path) + logger.info( + "[%s] Using SQLite format (api-docs.db)", package_name + ) + declarations = _parse_declarations_from_sqlite( + api_docs_path, + lean_root, + package_cache, + package_config.module_prefixes, + lean_version=lean_version, + ) + else: + doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" + bmp_files = sorted(doc_data_dir.glob("**/*.bmp")) + logger.info( + "[%s] Using BMP format (%d files)", + package_name, len(bmp_files), + ) + declarations = _parse_declarations_from_files( + bmp_files, lean_root, package_cache, + package_config.module_prefixes, + ) + logger.info( "Extracted %d declarations from %s (prefixes: %s)", len(declarations), package_name, package_config.module_prefixes, @@ -479,7 +988,9 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N all_declarations.extend(declarations) if not all_declarations: - raise FileNotFoundError("No declarations extracted from any package workspace") + raise FileNotFoundError( + "No declarations extracted from any package workspace" + ) logger.info("Total declarations extracted: %d", len(all_declarations)) diff --git a/src/lean_explore/extract/package_registry.py b/src/lean_explore/extract/package_registry.py index a076e80..f9e0dd4 100644 --- a/src/lean_explore/extract/package_registry.py +++ b/src/lean_explore/extract/package_registry.py @@ -17,7 +17,7 @@ "physlean": PackageConfig( name="physlean", git_url="https://github.com/HEPLean/PhysLean", - module_prefixes=["PhysLean"], + module_prefixes=["Physlib", "QuantumInfo"], version_strategy=VersionStrategy.TAGGED, depends_on=["mathlib"], ), diff --git a/src/lean_explore/extract/package_utils.py b/src/lean_explore/extract/package_utils.py index f5912c0..ab5d5b8 100644 --- a/src/lean_explore/extract/package_utils.py +++ b/src/lean_explore/extract/package_utils.py @@ -82,7 +82,12 @@ def get_package_toolchain(package_configuration: PackageConfig) -> tuple[str, st def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> None: - """Update the doc-gen4 version in a lakefile to match the Lean version. + """Update the doc-gen4 version in a lakefile to match the Lean toolchain. + + Doc-gen4 releases are tagged to match Lean toolchain versions, so pinning + to the same version ensures compatibility. The doc-gen4 ``require`` must + appear before the main package ``require`` so that the main package's + transitive dependency versions take precedence during resolution. Args: lakefile_path: Path to lakefile.lean diff --git a/tests/extract/__main___test.py b/tests/extract/__main___test.py index 84607c6..aa9ee4d 100644 --- a/tests/extract/__main___test.py +++ b/tests/extract/__main___test.py @@ -50,32 +50,65 @@ class TestDocGen4Step: @pytest.mark.external async def test_run_doc_gen4_step_success(self, temp_directory): """Test successful doc-gen4 execution.""" - lean_directory = temp_directory / "lean" - lean_directory.mkdir() - - # Mock subprocess to avoid actually running lake - with patch("lean_explore.extract.doc_gen4.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.stdout = iter(["Building...\n", "Complete!\n"]) - mock_process.wait.return_value = 0 - mock_popen.return_value = mock_process + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4.subprocess.run" + ) as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="", + stderr="", + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter(["Building...\n", "Complete!\n"]) + mock_process.wait.return_value = 0 + mock_popen.return_value = mock_process - await _run_doc_gen4_step() + await _run_doc_gen4_step() - # Called twice: "lake build" and "lake build LeanExtract:docs" - assert mock_popen.call_count == 2 + assert mock_run.call_count == 2 + # Called twice: "lake build" and "lake build MathExtract:docs" + assert mock_popen.call_count == 2 @pytest.mark.external async def test_run_doc_gen4_step_failure(self): """Test doc-gen4 execution failure.""" - with patch("lean_explore.extract.doc_gen4.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.stdout = iter(["Error!\n"]) - mock_process.wait.return_value = 1 - mock_popen.return_value = mock_process - - with pytest.raises(RuntimeError, match="failed"): - await _run_doc_gen4_step() + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4.subprocess.run" + ) as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="", + stderr="", + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter(["Error!\n"]) + mock_process.wait.return_value = 1 + mock_popen.return_value = mock_process + + with pytest.raises(RuntimeError, match="failed"): + await _run_doc_gen4_step() class TestExtractionStep: diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py new file mode 100644 index 0000000..ac4fa77 --- /dev/null +++ b/tests/extract/doc_gen4_test.py @@ -0,0 +1,124 @@ +"""Tests for doc-gen4 workspace orchestration.""" + +from unittest.mock import MagicMock, patch + +from lean_explore.extract.doc_gen4 import ( + _run_lake_for_package, + _uses_sqlite_docgen, + run_doc_gen4, +) +from lean_explore.extract.package_registry import PACKAGE_REGISTRY + + +class TestDocGen4VersionDetection: + """Tests for doc-gen4 format detection by Lean version.""" + + def test_uses_sqlite_docgen_from_rc2(self): + """Test the SQLite cutoff at v4.29.0-rc2.""" + assert _uses_sqlite_docgen("leanprover/lean4:v4.29.0-rc2") + assert _uses_sqlite_docgen("leanprover/lean4:v4.29.0") + assert not _uses_sqlite_docgen("leanprover/lean4:v4.29.0-rc1") + assert not _uses_sqlite_docgen("leanprover/lean4:v4.28.0") + + +class TestRunDocGen4FreshHandling: + """Tests for fresh-cache handling in run_doc_gen4.""" + + async def test_fresh_legacy_workspace_clears_cache(self): + """Test that legacy doc-gen4 workspaces still clear the cache.""" + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc1", "v4.29.0-rc1"), + ): + with patch( + "lean_explore.extract.doc_gen4._clear_workspace_cache" + ) as mock_clear: + with patch( + "lean_explore.extract.doc_gen4._run_lake_for_package" + ) as mock_run: + await run_doc_gen4(fresh=True) + + mock_clear.assert_called_once() + mock_run.assert_called_once_with("mathlib", False) + + async def test_fresh_sqlite_workspace_skips_cache_clear(self): + """Test that SQLite doc-gen4 workspaces keep the cache.""" + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4._clear_workspace_cache" + ) as mock_clear: + with patch( + "lean_explore.extract.doc_gen4._run_lake_for_package" + ) as mock_run: + await run_doc_gen4(fresh=True) + + mock_clear.assert_not_called() + mock_run.assert_called_once_with("mathlib", False) + + +class TestLakeBuildTargets: + """Tests for Lake build target selection.""" + + def _run_and_capture_target( + self, tmp_path, package_name, library_name, toolchain + ): + workspace_path = tmp_path / "lean" / package_name + workspace_path.mkdir(parents=True) + (workspace_path / "lean-toolchain").write_text(toolchain + "\n") + + with patch("lean_explore.extract.doc_gen4.Path") as mock_path: + mock_path.side_effect = lambda *args: tmp_path.joinpath(*args) + with patch("lean_explore.extract.doc_gen4._run_lake_update_with_retry"): + with patch( + "lean_explore.extract.doc_gen4.subprocess.run" + ) as mock_run: + mock_run.return_value = MagicMock( + returncode=0, stdout="", stderr="" + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter([]) + mock_process.wait.return_value = 0 + mock_popen.return_value = mock_process + + _run_lake_for_package(package_name) + + return mock_popen.call_args_list[0].args[0] + + def test_sqlite_workspace_uses_docinfo_target(self, tmp_path): + """SQLite-format workspaces should build :docInfo, skipping HTML gen.""" + args = self._run_and_capture_target( + tmp_path, "physlean", "PhysExtract", "leanprover/lean4:v4.29.0-rc2" + ) + assert args == ["lake", "build", "PhysExtract:docInfo"] + + def test_legacy_workspace_uses_docs_target(self, tmp_path): + """Pre-SQLite workspaces have no :docInfo facet, must build :docs.""" + args = self._run_and_capture_target( + tmp_path, "physlean", "PhysExtract", "leanprover/lean4:v4.29.0-rc1" + ) + assert args == ["lake", "build", "PhysExtract:docs"] + + +class TestPackageRegistry: + """Tests for package registry metadata.""" + + def test_physlean_uses_upstream_module_roots(self): + """Test that PhysLean filters on the upstream module roots.""" + assert PACKAGE_REGISTRY["physlean"].module_prefixes == [ + "Physlib", + "QuantumInfo", + ] diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index 68a0842..08ab60b 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -5,20 +5,28 @@ """ import json +import sqlite3 from unittest.mock import AsyncMock, patch import pytest from sqlalchemy import select from lean_explore.extract.doc_parser import ( + _BlobReader, _build_package_cache, + _construct_source_link, + _detect_docgen_format, _extract_dependencies_from_html, + _extract_names_from_rendered_code, _extract_source_text, _filter_auto_generated_projections, _insert_declarations_batch, _parse_declarations_from_files, + _parse_declarations_from_sqlite, + _read_lean_toolchain_version, _read_source_lines, _strip_lean_comments, + _validate_docgen_sqlite, extract_declarations, ) from lean_explore.extract.types import Declaration @@ -155,6 +163,253 @@ def test_extract_source_text_file_not_found(self, temp_directory): with pytest.raises(FileNotFoundError): _extract_source_text(source_link, lean_root, package_cache) + def test_extract_source_text_from_lake_toolchain(self, temp_directory): + """Test extracting source text from the Lake toolchain path.""" + lean_root = temp_directory / "lean" + toolchain_root = temp_directory / "toolchain" / "src" + lean_source_dir = toolchain_root / "lean" + lake_source_dir = toolchain_root / "lake" + lean_source_dir.mkdir(parents=True) + lake_file = lake_source_dir / "Lake" / "Config" / "Monad.lean" + lake_file.parent.mkdir(parents=True) + lake_file.write_text("def Lake.Config.Monad.run := 1\n") + + package_cache = {"lean4": lean_source_dir} + source_link = ( + "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lake/Lake/Config/Monad.lean#L1-L1" + ) + + result = _extract_source_text(source_link, lean_root, package_cache) + + assert "Lake.Config.Monad.run" in result + + +def _encode_nat(n: int) -> bytes: + """Encode a natural number in leansqlite's variable-length format.""" + chunks = [] + while n >= 128: + chunks.append((n & 0x7F) | 0x80) + n >>= 7 + chunks.append(n) + return bytes(chunks) + + +def _encode_string(s: str) -> bytes: + """Encode a string: Nat(utf8_byte_length) + UTF-8 bytes.""" + encoded = s.encode("utf-8") + return _encode_nat(len(encoded)) + encoded + + +def _encode_name(name: str) -> bytes: + """Encode a dotted Lean name (e.g. 'Nat.add') into binary.""" + if not name: + return b"\x00" # anonymous + parts = name.split(".") + result = b"\x00" # start with anonymous + for part in parts: + # Name.str = tag 1 + parent + string + result = b"\x01" + result + _encode_string(part) + return result + + +def _make_const_tag(name: str) -> bytes: + """Build a RenderedCode.Tag.const(name) blob fragment.""" + return b"\x02" + _encode_name(name) + + +def _make_text_node(s: str) -> bytes: + """Build a TaggedText.text(s) blob fragment.""" + return b"\x00" + _encode_string(s) + + +def _make_tag_node(tag_bytes: bytes, inner: bytes) -> bytes: + """Build a TaggedText.tag(tag, inner) blob fragment.""" + return b"\x01" + tag_bytes + inner + + +def _make_append_node(children: list[bytes]) -> bytes: + """Build a TaggedText.append(children) blob fragment.""" + return b"\x02" + _encode_nat(len(children)) + b"".join(children) + + +class TestRenderedCodeBlobParser: + """Tests for RenderedCode BLOB parsing to extract dependency names.""" + + def test_extract_single_const(self): + """Test extracting a single const name from a type BLOB.""" + # TaggedText.tag(Tag.const("Nat"), TaggedText.text("Nat")) + blob = _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_extract_multiple_consts(self): + """Test extracting multiple const names from a type BLOB.""" + # append([tag(const Nat, text Nat), text " → ", tag(const Bool, text Bool)]) + blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Bool"), _make_text_node("Bool")), + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat", "Bool"] + + def test_extract_dotted_name(self): + """Test extracting a dotted name like Nat.add.""" + blob = _make_tag_node( + _make_const_tag("Nat.add"), _make_text_node("Nat.add") + ) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat.add"] + + def test_deduplicates_names(self): + """Test that duplicate const references are deduplicated.""" + blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_skips_non_const_tags(self): + """Test that keyword, string, sort, otherExpr tags are skipped.""" + blob = _make_append_node([ + _make_tag_node(b"\x00", _make_text_node("def")), # keyword + _make_text_node(" "), + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_tag_node(b"\x07", _make_text_node("x")), # otherExpr + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_empty_blob_returns_empty(self): + """Test that an empty or invalid BLOB returns empty list.""" + assert _extract_names_from_rendered_code(b"") == [] + assert _extract_names_from_rendered_code(b"\xff") == [] + + def test_text_only_returns_empty(self): + """Test that a BLOB with only text returns no names.""" + blob = _make_text_node("hello world") + names = _extract_names_from_rendered_code(blob) + assert names == [] + + def test_nested_tagged_text(self): + """Test parsing nested tag nodes.""" + # tag(otherExpr, tag(const("List"), text("List"))) + inner = _make_tag_node(_make_const_tag("List"), _make_text_node("List")) + blob = _make_tag_node(b"\x07", inner) # otherExpr wrapping + names = _extract_names_from_rendered_code(blob) + assert names == ["List"] + + def test_blob_reader_nat_encoding(self): + """Test that Nat encoding/decoding matches leansqlite format.""" + reader = _BlobReader(_encode_nat(0)) + assert reader._read_nat() == 0 + + reader = _BlobReader(_encode_nat(127)) + assert reader._read_nat() == 127 + + reader = _BlobReader(_encode_nat(128)) + assert reader._read_nat() == 128 + + reader = _BlobReader(_encode_nat(300)) + assert reader._read_nat() == 300 + + reader = _BlobReader(_encode_nat(100000)) + assert reader._read_nat() == 100000 + + def test_blob_reader_name_roundtrip(self): + """Test that Name encoding produces correct dot-separated output.""" + reader = _BlobReader(_encode_name("Mathlib.Data.Nat.Basic")) + assert reader._read_name() == "Mathlib.Data.Nat.Basic" + + reader = _BlobReader(_encode_name("")) + assert reader._read_name() == "" + + def test_sort_tags_handled(self): + """Test that sort tag variants (3-6) are handled without error.""" + for sort_byte in [3, 4, 5, 6]: + blob = _make_tag_node( + bytes([sort_byte]), _make_text_node("Type") + ) + names = _extract_names_from_rendered_code(blob) + assert names == [] + + +class TestSqliteHelpers: + """Tests for SQLite-specific parsing helpers.""" + + def test_construct_source_link_for_core_module_with_version(self): + """Test source links for core modules use the toolchain version.""" + result = _construct_source_link( + "Init.Data.Nat.Basic", None, 12, 15, + lean_version="v4.29.0-rc6", + ) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" + "src/lean/Init/Data/Nat/Basic.lean#L12-L15" + ) + + def test_construct_source_link_for_lake_module_with_version(self): + """Test source links for Lake modules use the toolchain version.""" + result = _construct_source_link( + "Lake.Config.Monad", None, 7, 9, + lean_version="v4.29.0-rc6", + ) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" + "src/lake/Lake/Config/Monad.lean#L7-L9" + ) + + def test_construct_source_link_falls_back_to_master(self): + """Test that missing lean_version falls back to 'master'.""" + result = _construct_source_link( + "Init.Data.Nat.Basic", None, 12, 15, + ) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/master/" + "src/lean/Init/Data/Nat/Basic.lean#L12-L15" + ) + + def test_construct_source_link_prefers_source_url(self): + """Test that a non-None source_url is used directly.""" + url = "https://github.com/leanprover-community/mathlib4/blob/abc123/Foo.lean" + result = _construct_source_link("Foo.Bar", url, 1, 10) + + assert result == f"{url}#L1-L10" + + def test_read_lean_toolchain_version(self, temp_directory): + """Test reading version from a lean-toolchain file.""" + workspace = temp_directory / "workspace" + workspace.mkdir() + toolchain = workspace / "lean-toolchain" + toolchain.write_text("leanprover/lean4:v4.29.0-rc6\n") + + assert _read_lean_toolchain_version(workspace) == "v4.29.0-rc6" + + def test_read_lean_toolchain_version_release(self, temp_directory): + """Test reading a release version (no -rc suffix).""" + workspace = temp_directory / "workspace" + workspace.mkdir() + toolchain = workspace / "lean-toolchain" + toolchain.write_text("leanprover/lean4:v4.29.0\n") + + assert _read_lean_toolchain_version(workspace) == "v4.29.0" + + def test_read_lean_toolchain_version_missing(self, temp_directory): + """Test returns None when lean-toolchain doesn't exist.""" + workspace = temp_directory / "workspace" + workspace.mkdir() + + assert _read_lean_toolchain_version(workspace) is None + class TestDependencyExtraction: """Tests for dependency extraction from HTML.""" @@ -273,6 +528,458 @@ def test_parse_declarations_filters_packages(self, temp_directory): assert len(declarations) == 0 + def test_parse_declarations_from_sqlite_filters_non_rendered( + self, temp_directory + ): + """Test SQLite parsing only keeps rendered declarations.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text( + "theorem Nat.visible : True := trivial\n" + "def Nat.hidden : Nat := 0\n" + ) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4/blob/master/" + "Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.executemany( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + "Mathlib.Data.Nat.Basic", + 1, + "theorem", + "Nat.visible", + _make_text_node("True"), + 0, + 1, + ), + ( + "Mathlib.Data.Nat.Basic", + 2, + "definition", + "Nat.hidden", + _make_text_node("Nat"), + 0, + 0, + ), + ], + ) + connection.executemany( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 32, 32), + ("Mathlib.Data.Nat.Basic", 2, 2, 0, 0, 2, 22, 22), + ], + ) + connection.execute( + """ + INSERT INTO declaration_markdown_docstrings + (module_name, position, text) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, "Visible theorem"), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert [declaration.name for declaration in declarations] == ["Nat.visible"] + assert declarations[0].docstring == "Visible theorem" + + def test_parse_declarations_from_sqlite_extracts_dependencies( + self, temp_directory + ): + """Test that SQLite parsing extracts dependencies from type BLOBs.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text("def Nat.myFunc (n : Nat) : Bool := true\n") + + # Build a type BLOB: "Nat → Bool" + type_blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Bool"), _make_text_node("Bool")), + ]) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4/blob/master/" + "Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.execute( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, "definition", "Nat.myFunc", type_blob, 0, 1), + ) + connection.execute( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 40, 40), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert len(declarations) == 1 + assert declarations[0].name == "Nat.myFunc" + assert declarations[0].dependencies == ["Nat", "Bool"] + + def test_parse_declarations_from_sqlite_detects_verso_docstrings( + self, temp_directory, caplog + ): + """Test that Verso-only docstrings are detected and logged.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = ( + lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + ) + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text( + "def Nat.withVerso : Nat := 0\n" + "def Nat.withMarkdown : Nat := 1\n" + ) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4" + "/blob/master/Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.executemany( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + "Mathlib.Data.Nat.Basic", 1, "definition", + "Nat.withVerso", _make_text_node("Nat"), 0, 1, + ), + ( + "Mathlib.Data.Nat.Basic", 2, "definition", + "Nat.withMarkdown", _make_text_node("Nat"), 0, 1, + ), + ], + ) + connection.executemany( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, + start_utf16, end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 27, 27), + ("Mathlib.Data.Nat.Basic", 2, 2, 0, 0, 2, 30, 30), + ], + ) + # Declaration 1: Verso-only docstring (binary BLOB) + connection.execute( + """ + INSERT INTO declaration_verso_docstrings + (module_name, position, content) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, b"\x00\x01\x02"), + ) + # Declaration 2: Markdown docstring + connection.execute( + """ + INSERT INTO declaration_markdown_docstrings + (module_name, position, text) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 2, "A markdown docstring"), + ) + connection.commit() + connection.close() + + import logging + + with caplog.at_level(logging.WARNING): + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert len(declarations) == 2 + # Verso-only: docstring is None + verso_decl = next( + d for d in declarations if d.name == "Nat.withVerso" + ) + assert verso_decl.docstring is None + # Markdown: docstring is present + md_decl = next( + d for d in declarations if d.name == "Nat.withMarkdown" + ) + assert md_decl.docstring == "A markdown docstring" + # Warning logged about Verso-only docstrings + assert "1 declarations have Verso-only docstrings" in caplog.text + + def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( + self, temp_directory + ): + """Test SQLite parsing for core modules without a stored source URL.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + toolchain_root = temp_directory / "toolchain" / "src" + lean_source_dir = toolchain_root / "lean" + init_file = lean_source_dir / "Init" / "Data" / "Nat" / "Basic.lean" + init_file.parent.mkdir(parents=True) + init_file.write_text("theorem Nat.core : True := trivial\n") + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ("Init.Data.Nat.Basic", None), + ) + connection.execute( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ( + "Init.Data.Nat.Basic", 1, "theorem", "Nat.core", + _make_text_node("True"), 0, 1, + ), + ) + connection.execute( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ("Init.Data.Nat.Basic", 1, 1, 0, 0, 1, 31, 31), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + {"lean4": lean_source_dir}, + allowed_module_prefixes=["Init"], + lean_version="v4.29.0-rc6", + ) + + assert len(declarations) == 1 + assert declarations[0].name == "Nat.core" + assert ( + declarations[0].source_link + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" + "src/lean/Init/Data/Nat/Basic.lean#L1-L1" + ) + class TestDeclarationInsertion: """Tests for database insertion.""" @@ -641,3 +1348,125 @@ def test_single_declaration(self): assert len(filtered) == 1 assert removed_count == 0 + + +class TestDocgenFormatDetection: + """Tests for doc-gen4 output format detection and SQLite validation.""" + + def test_detect_sqlite_with_valid_db(self, temp_directory): + """Test detection returns 'sqlite' for a valid api-docs.db.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + + connection = sqlite3.connect(db_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT, position INTEGER, kind TEXT, name TEXT, + type BLOB, sorried INTEGER, render INTEGER, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT, position INTEGER, + start_line INTEGER, start_column INTEGER, start_utf16 INTEGER, + end_line INTEGER, end_column INTEGER, end_utf16 INTEGER, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.close() + + assert _detect_docgen_format(workspace) == "sqlite" + + def test_detect_bmp_format(self, temp_directory): + """Test detection returns 'bmp' when only BMP files exist.""" + workspace = temp_directory / "pkg" + doc_data = workspace / ".lake" / "build" / "doc-data" + doc_data.mkdir(parents=True) + (doc_data / "declaration-data-Foo.bmp").write_text("{}") + + assert _detect_docgen_format(workspace) == "bmp" + + def test_detect_none_when_empty(self, temp_directory): + """Test detection returns 'none' when no doc output exists.""" + workspace = temp_directory / "pkg" + workspace.mkdir(parents=True) + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_to_bmp_for_empty_db(self, temp_directory): + """Test that an empty api-docs.db falls back to BMP if available.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"") + + doc_data = workspace / ".lake" / "build" / "doc-data" + doc_data.mkdir(parents=True) + (doc_data / "declaration-data-Foo.bmp").write_text("{}") + + assert _detect_docgen_format(workspace) == "bmp" + + def test_detect_returns_none_for_empty_db_without_bmp(self, temp_directory): + """Test that an empty api-docs.db with no BMP fallback returns 'none'.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"") + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_for_corrupt_db(self, temp_directory): + """Test that a corrupt (non-SQLite) file falls back gracefully.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"this is not a sqlite database") + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_for_db_missing_tables(self, temp_directory): + """Test that a SQLite DB missing required tables falls back.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + + connection = sqlite3.connect(db_path) + connection.execute( + "CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT)" + ) + connection.close() + + # Has modules but missing name_info and declaration_ranges + assert _detect_docgen_format(workspace) == "none" + + def test_validate_docgen_sqlite_valid(self, temp_directory): + """Test validation passes for a well-formed database.""" + db_path = temp_directory / "api-docs.db" + connection = sqlite3.connect(db_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY); + CREATE TABLE name_info (module_name TEXT, position INTEGER); + CREATE TABLE declaration_ranges (module_name TEXT, position INTEGER); + """ + ) + connection.close() + + assert _validate_docgen_sqlite(db_path) is True + + def test_validate_docgen_sqlite_empty_file(self, temp_directory): + """Test validation rejects an empty file.""" + db_path = temp_directory / "api-docs.db" + db_path.write_bytes(b"") + + assert _validate_docgen_sqlite(db_path) is False + + def test_validate_docgen_sqlite_corrupt_file(self, temp_directory): + """Test validation rejects a non-SQLite file.""" + db_path = temp_directory / "api-docs.db" + db_path.write_bytes(b"not a database") + + assert _validate_docgen_sqlite(db_path) is False