From 881559b882d19fd2b2637c592fd1b62cfce92391 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 21 Mar 2026 21:06:16 -0400 Subject: [PATCH 01/22] Support doc-gen4 SQLite database format for extraction Doc-gen4 >= v4.29.0-rc1 outputs declaration data to a SQLite database (api-docs.db) instead of individual BMP JSON files. The extraction pipeline now auto-detects which format each package uses and parses accordingly. For packages using the new SQLite format, the workspace cache clear is skipped during fresh builds since api-docs.db handles incremental updates correctly. The full cache clear is still performed for packages using the legacy BMP format to avoid stale output files. --- src/lean_explore/extract/doc_gen4.py | 13 +- src/lean_explore/extract/doc_parser.py | 255 +++++++++++++++++++++++-- 2 files changed, 246 insertions(+), 22 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 621e55a..838c7e2 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -228,7 +228,18 @@ async def run_doc_gen4( logger.info("\n%s\nPackage: %s\n%s", "=" * 50, package_name, "=" * 50) if fresh: - _clear_workspace_cache(workspace_path) + # Packages using doc-gen4 >= v4.29 output to a SQLite database + # (api-docs.db) which handles incremental updates correctly. + # Only packages using the legacy BMP format need a full cache + # clear to avoid stale output files. + api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" + if api_docs_db.exists(): + logger.info( + f"[{package_name}] Skipping cache clear " + f"(api-docs.db handles incremental updates)" + ) + else: + _clear_workspace_cache(workspace_path) if setup: toolchain, ref = _setup_workspace(config) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 707e62a..bbd7447 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -1,12 +1,17 @@ """Parser for Lean doc-gen4 output files. -This module parses doc-gen4 JSON data and extracts Lean source code -to produce Declaration objects ready for database insertion. +This module parses doc-gen4 output and extracts Lean source code to produce +Declaration objects ready for database insertion. + +Supports two doc-gen4 output formats: +- SQLite database (api-docs.db): Used by doc-gen4 >= v4.29.0-rc1 +- BMP JSON files (.bmp): Used by doc-gen4 < v4.29.0-rc1 """ import json import logging import re +import sqlite3 from pathlib import Path from rich.progress import ( @@ -292,6 +297,176 @@ def _extract_source_text( ) +def _construct_source_link( + module_source_url: str | None, + module_name: str, + start_line: int, + end_line: int, +) -> str | None: + """Construct a GitHub source link from module URL and line range. + + Args: + module_source_url: GitHub URL to the module file from api-docs.db. + module_name: Fully qualified module name (e.g., "Mathlib.Algebra.Group"). + start_line: Start line number in the source file. + end_line: End line number in the source file. + + Returns: + GitHub URL with line range fragment, or None if no source URL exists. + """ + if not module_source_url: + return None + return f"{module_source_url}#L{start_line}-L{end_line}" + + +def _parse_declarations_from_sqlite( + database_path: Path, + lean_root: Path, + package_cache: dict[str, Path], + allowed_module_prefixes: list[str], +) -> list[Declaration]: + """Parse declarations from a doc-gen4 SQLite database (api-docs.db). + + Doc-gen4 >= v4.29.0-rc1 outputs declaration data to a SQLite database + instead of individual BMP JSON files. This function reads that database + and produces the same Declaration objects as the BMP parser. + + Args: + database_path: Path to the api-docs.db SQLite database. + lean_root: Root directory of the Lean project. + package_cache: Dictionary mapping package names to their directories. + allowed_module_prefixes: Module prefixes to extract (e.g., ["Mathlib"]). + + Returns: + List of parsed Declaration objects. + """ + declarations = [] + + connection = sqlite3.connect(str(database_path)) + connection.row_factory = sqlite3.Row + + try: + # Query all declarations with their source ranges and docstrings + query = """ + SELECT + n.module_name, + n.position, + n.kind, + n.name, + r.start_line, + r.end_line, + d.text AS docstring, + m.source_url + FROM name_info n + JOIN declaration_ranges r + ON n.module_name = r.module_name AND n.position = r.position + LEFT JOIN declaration_markdown_docstrings d + ON n.module_name = d.module_name AND n.position = d.position + JOIN modules m + ON n.module_name = m.name + ORDER BY n.module_name, n.position + """ + rows = connection.execute(query).fetchall() + + logger.info( + f"Found {len(rows)} declarations in api-docs.db" + ) + + skipped_no_source = 0 + skipped_prefix = 0 + skipped_constructor = 0 + source_errors = 0 + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeRemainingColumn(), + ) as progress: + task = progress.add_task( + "[cyan]Parsing api-docs.db...", total=len(rows) + ) + + for row in rows: + module_name = row["module_name"] + declaration_name = row["name"] + + # Filter by module prefix + matches_prefix = any( + module_name == prefix or module_name.startswith(prefix + ".") + for prefix in allowed_module_prefixes + ) + if not matches_prefix: + skipped_prefix += 1 + progress.update(task, advance=1) + continue + + # Skip auto-generated .mk constructors + if declaration_name.endswith(".mk"): + skipped_constructor += 1 + progress.update(task, advance=1) + continue + + # Skip internal/auto-generated names + kind = row["kind"] + if kind == "constructor": + progress.update(task, advance=1) + continue + + source_url = row["source_url"] + start_line = row["start_line"] + end_line = row["end_line"] + + source_link = _construct_source_link( + source_url, module_name, start_line, end_line + ) + if not source_link: + skipped_no_source += 1 + progress.update(task, advance=1) + continue + + # Extract source text from local files + try: + source_text = _extract_source_text( + source_link, lean_root, package_cache + ) + except (FileNotFoundError, ValueError) as error: + source_errors += 1 + if source_errors <= 10: + logger.debug( + f"Could not extract source for " + f"{declaration_name}: {error}" + ) + progress.update(task, advance=1) + continue + + declarations.append( + Declaration( + name=declaration_name, + module=module_name, + docstring=row["docstring"], + source_text=source_text, + source_link=source_link, + dependencies=None, + ) + ) + + progress.update(task, advance=1) + + if skipped_no_source > 0: + logger.info("Skipped %d declarations without source URL", skipped_no_source) + if source_errors > 0: + logger.warning( + f"Could not extract source text for {source_errors} declarations" + ) + + finally: + connection.close() + + return declarations + + def _parse_declarations_from_files( bmp_files: list[Path], lean_root: Path, @@ -428,12 +603,36 @@ async def _insert_declarations_batch( return inserted_count +def _detect_docgen_format(workspace_path: Path) -> str: + """Detect which doc-gen4 output format a workspace uses. + + Doc-gen4 >= v4.29.0-rc1 writes to a SQLite database (api-docs.db). + Earlier versions write individual BMP JSON files to doc-data/. + + Args: + workspace_path: Path to the package workspace (e.g., lean/mathlib). + + Returns: + "sqlite" if api-docs.db exists, "bmp" if BMP files exist, "none" otherwise. + """ + api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" + if api_docs_db.exists(): + return "sqlite" + + doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" + if doc_data_dir.exists(): + bmp_files = list(doc_data_dir.glob("**/*.bmp")) + if bmp_files: + return "bmp" + + return "none" + + async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> None: """Extract all declarations from doc-gen4 data and load into database. - Looks for BMP files in each package's .lake/build/doc-data directory. - Extracts only declarations matching the package's configured module_prefixes, - ensuring each package's declarations come from its own workspace. + Automatically detects whether each package uses the newer SQLite format + (api-docs.db from doc-gen4 >= v4.29.0-rc1) or the legacy BMP JSON format. Args: engine: SQLAlchemy async engine for database connection. @@ -445,21 +644,13 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N lean_root = Path("lean") all_declarations = [] - # Process each workspace separately with its own package cache for package_name in get_extraction_order(): package_config = PACKAGE_REGISTRY[package_name] - doc_data_dir = lean_root / package_name / ".lake" / "build" / "doc-data" + workspace_path = lean_root / package_name + docgen_format = _detect_docgen_format(workspace_path) - if not doc_data_dir.exists(): - logger.warning( - "No doc-data directory for %s: %s", package_name, doc_data_dir - ) - continue - - bmp_files = sorted(doc_data_dir.glob("**/*.bmp")) - logger.info("Found %d BMP files in %s", len(bmp_files), package_name) - - if not bmp_files: + if docgen_format == "none": + logger.warning("No doc-gen4 output found for %s", package_name) continue # Build workspace-specific package cache to avoid version mismatches @@ -469,9 +660,29 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N package_name, len(package_cache), ) - declarations = _parse_declarations_from_files( - bmp_files, lean_root, package_cache, package_config.module_prefixes - ) + if docgen_format == "sqlite": + api_docs_path = workspace_path / ".lake" / "build" / "api-docs.db" + logger.info( + "[%s] Using SQLite format (api-docs.db)", package_name + ) + declarations = _parse_declarations_from_sqlite( + api_docs_path, + lean_root, + package_cache, + package_config.module_prefixes, + ) + else: + doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" + bmp_files = sorted(doc_data_dir.glob("**/*.bmp")) + logger.info( + "[%s] Using BMP format (%d files)", + package_name, len(bmp_files), + ) + declarations = _parse_declarations_from_files( + bmp_files, lean_root, package_cache, + package_config.module_prefixes, + ) + logger.info( "Extracted %d declarations from %s (prefixes: %s)", len(declarations), package_name, package_config.module_prefixes, @@ -479,7 +690,9 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N all_declarations.extend(declarations) if not all_declarations: - raise FileNotFoundError("No declarations extracted from any package workspace") + raise FileNotFoundError( + "No declarations extracted from any package workspace" + ) logger.info("Total declarations extracted: %d", len(all_declarations)) From b8a3075189064272155400a6226d8d533316beea Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:12:38 -0400 Subject: [PATCH 02/22] Remove dead constructor kind check in SQLite parser The check `kind == "constructor"` never matched because doc-gen4 stores the kind as "ctor" in the name_info table. Removing it rather than fixing the string keeps behavior consistent with the BMP parser, which also only filters by the .mk suffix. --- src/lean_explore/extract/doc_parser.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index bbd7447..fe68905 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -408,12 +408,6 @@ def _parse_declarations_from_sqlite( progress.update(task, advance=1) continue - # Skip internal/auto-generated names - kind = row["kind"] - if kind == "constructor": - progress.update(task, advance=1) - continue - source_url = row["source_url"] start_line = row["start_line"] end_line = row["end_line"] From d2c78826fc57d18311a4a9df66275840368808fb Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:13:04 -0400 Subject: [PATCH 03/22] Use %-style logging consistently in new SQLite code Convert remaining f-string logger calls to %-style formatting to match the convention used throughout the rest of the PR. --- src/lean_explore/extract/doc_gen4.py | 5 +++-- src/lean_explore/extract/doc_parser.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 838c7e2..7008f9a 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -235,8 +235,9 @@ async def run_doc_gen4( api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" if api_docs_db.exists(): logger.info( - f"[{package_name}] Skipping cache clear " - f"(api-docs.db handles incremental updates)" + "[%s] Skipping cache clear " + "(api-docs.db handles incremental updates)", + package_name, ) else: _clear_workspace_cache(workspace_path) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index fe68905..8e02baf 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -368,9 +368,7 @@ def _parse_declarations_from_sqlite( """ rows = connection.execute(query).fetchall() - logger.info( - f"Found {len(rows)} declarations in api-docs.db" - ) + logger.info("Found %d declarations in api-docs.db", len(rows)) skipped_no_source = 0 skipped_prefix = 0 @@ -429,8 +427,8 @@ def _parse_declarations_from_sqlite( source_errors += 1 if source_errors <= 10: logger.debug( - f"Could not extract source for " - f"{declaration_name}: {error}" + "Could not extract source for %s: %s", + declaration_name, error, ) progress.update(task, advance=1) continue @@ -452,7 +450,8 @@ def _parse_declarations_from_sqlite( logger.info("Skipped %d declarations without source URL", skipped_no_source) if source_errors > 0: logger.warning( - f"Could not extract source text for {source_errors} declarations" + "Could not extract source text for %d declarations", + source_errors, ) finally: From 9360b4916b27d3bb2a493476e6a5b1008cf30d94 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:13:30 -0400 Subject: [PATCH 04/22] Log skipped_prefix and skipped_constructor counters These counters were tracked but never printed, making it harder to debug extraction issues. Now logged alongside the existing skipped_no_source and source_errors messages. --- src/lean_explore/extract/doc_parser.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 8e02baf..5b6d146 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -446,6 +446,13 @@ def _parse_declarations_from_sqlite( progress.update(task, advance=1) + if skipped_prefix > 0: + logger.info( + "Skipped %d declarations outside allowed prefixes", + skipped_prefix, + ) + if skipped_constructor > 0: + logger.info("Skipped %d .mk constructors", skipped_constructor) if skipped_no_source > 0: logger.info("Skipped %d declarations without source URL", skipped_no_source) if source_errors > 0: From 6934848b36ddf0cedeeba9416398d004a9ba49e1 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:13:49 -0400 Subject: [PATCH 05/22] Remove unused module_name parameter from _construct_source_link The parameter was documented but never used in the function body. --- src/lean_explore/extract/doc_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 5b6d146..668bb1a 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -299,7 +299,6 @@ def _extract_source_text( def _construct_source_link( module_source_url: str | None, - module_name: str, start_line: int, end_line: int, ) -> str | None: @@ -307,7 +306,6 @@ def _construct_source_link( Args: module_source_url: GitHub URL to the module file from api-docs.db. - module_name: Fully qualified module name (e.g., "Mathlib.Algebra.Group"). start_line: Start line number in the source file. end_line: End line number in the source file. @@ -411,7 +409,7 @@ def _parse_declarations_from_sqlite( end_line = row["end_line"] source_link = _construct_source_link( - source_url, module_name, start_line, end_line + source_url, start_line, end_line ) if not source_link: skipped_no_source += 1 From a2ba5c563c4d9d96d896b4c7ca3a76f9dde2b140 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:47:45 -0400 Subject: [PATCH 06/22] Fix doc data script lint --- src/lean_explore/extract/doc_parser.py | 39 +++- tests/extract/doc_parser_test.py | 249 +++++++++++++++++++++++++ 2 files changed, 284 insertions(+), 4 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 668bb1a..7c95072 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -275,6 +275,16 @@ def _extract_source_text( package_name.replace("-", "").lower(), ]: if variant in package_cache: + if variant == "lean4" and file_path_string.startswith("src/lean/"): + adjusted_path = file_path_string[9:] + candidates.append(package_cache[variant] / adjusted_path) + continue + if variant == "lean4" and file_path_string.startswith("src/lake/"): + adjusted_path = file_path_string[9:] + candidates.append( + package_cache[variant].parent / "lake" / adjusted_path + ) + continue if variant == "lean4" and file_path_string.startswith("src/"): adjusted_path = file_path_string[4:] else: @@ -298,6 +308,7 @@ def _extract_source_text( def _construct_source_link( + module_name: str, module_source_url: str | None, start_line: int, end_line: int, @@ -305,6 +316,7 @@ def _construct_source_link( """Construct a GitHub source link from module URL and line range. Args: + module_name: Lean module name from api-docs.db. module_source_url: GitHub URL to the module file from api-docs.db. start_line: Start line number in the source file. end_line: End line number in the source file. @@ -312,9 +324,23 @@ def _construct_source_link( Returns: GitHub URL with line range fragment, or None if no source URL exists. """ - if not module_source_url: - return None - return f"{module_source_url}#L{start_line}-L{end_line}" + if module_source_url: + return f"{module_source_url}#L{start_line}-L{end_line}" + + module_path = module_name.replace(".", "/") + root = module_name.split(".", 1)[0] + if root in {"Init", "Lean", "Std"}: + return ( + "https://github.com/leanprover/lean4/blob/toolchain/" + f"src/lean/{module_path}.lean#L{start_line}-L{end_line}" + ) + if root == "Lake": + return ( + "https://github.com/leanprover/lean4/blob/toolchain/" + f"src/lake/{module_path}.lean#L{start_line}-L{end_line}" + ) + + return None def _parse_declarations_from_sqlite( @@ -351,6 +377,7 @@ def _parse_declarations_from_sqlite( n.position, n.kind, n.name, + n.render, r.start_line, r.end_line, d.text AS docstring, @@ -409,13 +436,17 @@ def _parse_declarations_from_sqlite( end_line = row["end_line"] source_link = _construct_source_link( - source_url, start_line, end_line + module_name, source_url, start_line, end_line ) if not source_link: skipped_no_source += 1 progress.update(task, advance=1) continue + if not row["render"]: + progress.update(task, advance=1) + continue + # Extract source text from local files try: source_text = _extract_source_text( diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index 68a0842..0afdc67 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -5,6 +5,7 @@ """ import json +import sqlite3 from unittest.mock import AsyncMock, patch import pytest @@ -12,11 +13,13 @@ from lean_explore.extract.doc_parser import ( _build_package_cache, + _construct_source_link, _extract_dependencies_from_html, _extract_source_text, _filter_auto_generated_projections, _insert_declarations_batch, _parse_declarations_from_files, + _parse_declarations_from_sqlite, _read_source_lines, _strip_lean_comments, extract_declarations, @@ -155,6 +158,51 @@ def test_extract_source_text_file_not_found(self, temp_directory): with pytest.raises(FileNotFoundError): _extract_source_text(source_link, lean_root, package_cache) + def test_extract_source_text_from_lake_toolchain(self, temp_directory): + """Test extracting source text from the Lake toolchain path.""" + lean_root = temp_directory / "lean" + toolchain_root = temp_directory / "toolchain" / "src" + lean_source_dir = toolchain_root / "lean" + lake_source_dir = toolchain_root / "lake" + lean_source_dir.mkdir(parents=True) + lake_file = lake_source_dir / "Lake" / "Config" / "Monad.lean" + lake_file.parent.mkdir(parents=True) + lake_file.write_text("def Lake.Config.Monad.run := 1\n") + + package_cache = {"lean4": lean_source_dir} + source_link = ( + "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lake/Lake/Config/Monad.lean#L1-L1" + ) + + result = _extract_source_text(source_link, lean_root, package_cache) + + assert "Lake.Config.Monad.run" in result + + +class TestSqliteHelpers: + """Tests for SQLite-specific parsing helpers.""" + + def test_construct_source_link_for_core_module(self): + """Test source links for core modules without a stored source URL.""" + result = _construct_source_link("Init.Data.Nat.Basic", None, 12, 15) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lean/Init/Data/Nat/Basic.lean#L12-L15" + ) + + def test_construct_source_link_for_lake_module(self): + """Test source links for Lake modules without a stored source URL.""" + result = _construct_source_link("Lake.Config.Monad", None, 7, 9) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lake/Lake/Config/Monad.lean#L7-L9" + ) + class TestDependencyExtraction: """Tests for dependency extraction from HTML.""" @@ -273,6 +321,207 @@ def test_parse_declarations_filters_packages(self, temp_directory): assert len(declarations) == 0 + def test_parse_declarations_from_sqlite_filters_non_rendered( + self, temp_directory + ): + """Test SQLite parsing only keeps rendered declarations.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text( + "theorem Nat.visible : True := trivial\n" + "def Nat.hidden : Nat := 0\n" + ) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4/blob/master/" + "Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.executemany( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + "Mathlib.Data.Nat.Basic", + 1, + "theorem", + "Nat.visible", + b"", + 0, + 1, + ), + ( + "Mathlib.Data.Nat.Basic", + 2, + "definition", + "Nat.hidden", + b"", + 0, + 0, + ), + ], + ) + connection.executemany( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 32, 32), + ("Mathlib.Data.Nat.Basic", 2, 2, 0, 0, 2, 22, 22), + ], + ) + connection.execute( + """ + INSERT INTO declaration_markdown_docstrings + (module_name, position, text) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, "Visible theorem"), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert [declaration.name for declaration in declarations] == ["Nat.visible"] + assert declarations[0].docstring == "Visible theorem" + + def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( + self, temp_directory + ): + """Test SQLite parsing for core modules without a stored source URL.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + toolchain_root = temp_directory / "toolchain" / "src" + lean_source_dir = toolchain_root / "lean" + init_file = lean_source_dir / "Init" / "Data" / "Nat" / "Basic.lean" + init_file.parent.mkdir(parents=True) + init_file.write_text("theorem Nat.core : True := trivial\n") + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ("Init.Data.Nat.Basic", None), + ) + connection.execute( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ("Init.Data.Nat.Basic", 1, "theorem", "Nat.core", b"", 0, 1), + ) + connection.execute( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ("Init.Data.Nat.Basic", 1, 1, 0, 0, 1, 31, 31), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + {"lean4": lean_source_dir}, + allowed_module_prefixes=["Init"], + ) + + assert len(declarations) == 1 + assert declarations[0].name == "Nat.core" + assert ( + declarations[0].source_link + == "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lean/Init/Data/Nat/Basic.lean#L1-L1" + ) + class TestDeclarationInsertion: """Tests for database insertion.""" From e25c77d44421b410a08f8c1db1e08c4fbc6d1117 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:49:35 -0400 Subject: [PATCH 07/22] Fix doc-gen4 version handling and tests --- src/lean_explore/extract/doc_gen4.py | 45 ++++++++++++++--- tests/extract/__main___test.py | 73 ++++++++++++++++++++-------- tests/extract/doc_gen4_test.py | 62 +++++++++++++++++++++++ 3 files changed, 152 insertions(+), 28 deletions(-) create mode 100644 tests/extract/doc_gen4_test.py diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 7008f9a..763476c 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -6,6 +6,7 @@ import logging import os +import re import shutil import subprocess import time @@ -23,6 +24,29 @@ logger = logging.getLogger(__name__) +def _uses_sqlite_docgen(lean_version: str) -> bool: + """Return whether the matching doc-gen4 release writes api-docs.db. + + Doc-gen4's SQLite output landed after `v4.29.0-rc1` and is present starting + with `v4.29.0-rc2`. + """ + match = extract_lean_version(lean_version) + version_match = re.match(r"^v(\d+)\.(\d+)\.(\d+)(?:-rc(\d+))?$", match) + if version_match is None: + return False + + major, minor, patch, rc = version_match.groups() + version = (int(major), int(minor), int(patch)) + if version > (4, 29, 0): + return True + if version < (4, 29, 0): + return False + + if rc is None: + return True + return int(rc) >= 2 + + def _clear_workspace_cache(workspace_path: Path) -> None: """Clear entire Lake cache to force complete rebuild. @@ -227,13 +251,17 @@ async def run_doc_gen4( workspace_path = Path("lean") / package_name logger.info("\n%s\nPackage: %s\n%s", "=" * 50, package_name, "=" * 50) + toolchain = None + ref = None if fresh: - # Packages using doc-gen4 >= v4.29 output to a SQLite database - # (api-docs.db) which handles incremental updates correctly. - # Only packages using the legacy BMP format need a full cache - # clear to avoid stale output files. - api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" - if api_docs_db.exists(): + if setup: + toolchain, ref = _setup_workspace(config) + logger.info("Toolchain: %s, ref: %s", toolchain, ref) + + # Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2. + # The SQLite format handles incremental updates, while legacy BMP + # output requires a cache clear to avoid stale files. + if toolchain and _uses_sqlite_docgen(toolchain): logger.info( "[%s] Skipping cache clear " "(api-docs.db handles incremental updates)", @@ -243,8 +271,9 @@ async def run_doc_gen4( _clear_workspace_cache(workspace_path) if setup: - toolchain, ref = _setup_workspace(config) - logger.info("Toolchain: %s, ref: %s", toolchain, ref) + if toolchain is None or ref is None: + toolchain, ref = _setup_workspace(config) + logger.info("Toolchain: %s, ref: %s", toolchain, ref) _run_lake_for_package(package_name, verbose) diff --git a/tests/extract/__main___test.py b/tests/extract/__main___test.py index 84607c6..aa9ee4d 100644 --- a/tests/extract/__main___test.py +++ b/tests/extract/__main___test.py @@ -50,32 +50,65 @@ class TestDocGen4Step: @pytest.mark.external async def test_run_doc_gen4_step_success(self, temp_directory): """Test successful doc-gen4 execution.""" - lean_directory = temp_directory / "lean" - lean_directory.mkdir() - - # Mock subprocess to avoid actually running lake - with patch("lean_explore.extract.doc_gen4.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.stdout = iter(["Building...\n", "Complete!\n"]) - mock_process.wait.return_value = 0 - mock_popen.return_value = mock_process + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4.subprocess.run" + ) as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="", + stderr="", + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter(["Building...\n", "Complete!\n"]) + mock_process.wait.return_value = 0 + mock_popen.return_value = mock_process - await _run_doc_gen4_step() + await _run_doc_gen4_step() - # Called twice: "lake build" and "lake build LeanExtract:docs" - assert mock_popen.call_count == 2 + assert mock_run.call_count == 2 + # Called twice: "lake build" and "lake build MathExtract:docs" + assert mock_popen.call_count == 2 @pytest.mark.external async def test_run_doc_gen4_step_failure(self): """Test doc-gen4 execution failure.""" - with patch("lean_explore.extract.doc_gen4.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.stdout = iter(["Error!\n"]) - mock_process.wait.return_value = 1 - mock_popen.return_value = mock_process - - with pytest.raises(RuntimeError, match="failed"): - await _run_doc_gen4_step() + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4.subprocess.run" + ) as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="", + stderr="", + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter(["Error!\n"]) + mock_process.wait.return_value = 1 + mock_popen.return_value = mock_process + + with pytest.raises(RuntimeError, match="failed"): + await _run_doc_gen4_step() class TestExtractionStep: diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py new file mode 100644 index 0000000..f8443bf --- /dev/null +++ b/tests/extract/doc_gen4_test.py @@ -0,0 +1,62 @@ +"""Tests for doc-gen4 workspace orchestration.""" + +from unittest.mock import patch + +from lean_explore.extract.doc_gen4 import _uses_sqlite_docgen, run_doc_gen4 + + +class TestDocGen4VersionDetection: + """Tests for doc-gen4 format detection by Lean version.""" + + def test_uses_sqlite_docgen_from_rc2(self): + """Test the SQLite cutoff at v4.29.0-rc2.""" + assert _uses_sqlite_docgen("leanprover/lean4:v4.29.0-rc2") + assert _uses_sqlite_docgen("leanprover/lean4:v4.29.0") + assert not _uses_sqlite_docgen("leanprover/lean4:v4.29.0-rc1") + assert not _uses_sqlite_docgen("leanprover/lean4:v4.28.0") + + +class TestRunDocGen4FreshHandling: + """Tests for fresh-cache handling in run_doc_gen4.""" + + async def test_fresh_legacy_workspace_clears_cache(self): + """Test that legacy doc-gen4 workspaces still clear the cache.""" + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc1", "v4.29.0-rc1"), + ): + with patch( + "lean_explore.extract.doc_gen4._clear_workspace_cache" + ) as mock_clear: + with patch( + "lean_explore.extract.doc_gen4._run_lake_for_package" + ) as mock_run: + await run_doc_gen4(fresh=True) + + mock_clear.assert_called_once() + mock_run.assert_called_once_with("mathlib", False) + + async def test_fresh_sqlite_workspace_skips_cache_clear(self): + """Test that SQLite doc-gen4 workspaces keep the cache.""" + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4._clear_workspace_cache" + ) as mock_clear: + with patch( + "lean_explore.extract.doc_gen4._run_lake_for_package" + ) as mock_run: + await run_doc_gen4(fresh=True) + + mock_clear.assert_not_called() + mock_run.assert_called_once_with("mathlib", False) From 8224f5d96e689dc722d6c86d7b76922362750826 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:49:41 -0400 Subject: [PATCH 08/22] Correct doc-gen4 SQLite cutoff references --- src/lean_explore/extract/doc_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 7c95072..25fc154 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -4,8 +4,8 @@ Declaration objects ready for database insertion. Supports two doc-gen4 output formats: -- SQLite database (api-docs.db): Used by doc-gen4 >= v4.29.0-rc1 -- BMP JSON files (.bmp): Used by doc-gen4 < v4.29.0-rc1 +- SQLite database (api-docs.db): Used by doc-gen4 >= v4.29.0-rc2 +- BMP JSON files (.bmp): Used by doc-gen4 < v4.29.0-rc2 """ import json @@ -351,7 +351,7 @@ def _parse_declarations_from_sqlite( ) -> list[Declaration]: """Parse declarations from a doc-gen4 SQLite database (api-docs.db). - Doc-gen4 >= v4.29.0-rc1 outputs declaration data to a SQLite database + Doc-gen4 >= v4.29.0-rc2 outputs declaration data to a SQLite database instead of individual BMP JSON files. This function reads that database and produces the same Declaration objects as the BMP parser. @@ -635,7 +635,7 @@ async def _insert_declarations_batch( def _detect_docgen_format(workspace_path: Path) -> str: """Detect which doc-gen4 output format a workspace uses. - Doc-gen4 >= v4.29.0-rc1 writes to a SQLite database (api-docs.db). + Doc-gen4 >= v4.29.0-rc2 writes to a SQLite database (api-docs.db). Earlier versions write individual BMP JSON files to doc-data/. Args: @@ -661,7 +661,7 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N """Extract all declarations from doc-gen4 data and load into database. Automatically detects whether each package uses the newer SQLite format - (api-docs.db from doc-gen4 >= v4.29.0-rc1) or the legacy BMP JSON format. + (api-docs.db from doc-gen4 >= v4.29.0-rc2) or the legacy BMP JSON format. Args: engine: SQLAlchemy async engine for database connection. From 954bff5f4a521b52ef0f7f8f63f07a0adc4ba8b2 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:05:57 -0400 Subject: [PATCH 09/22] Fix dependency extraction for SQLite-backed packages The SQLite extraction path was hard-coding dependencies=None for every declaration, silently breaking dependency-based search ranking and informalization layering for mathlib/cslib. Add a RenderedCode BLOB parser that deserializes the leansqlite binary format used in name_info.type to extract const references (declaration dependencies) from type signatures. The parser handles the full TaggedText/RenderedCode.Tag/Name encoding including variable-length Nat and nested structures. Also adds the n.type column to the SQLite query so the BLOB is available for parsing. --- src/lean_explore/extract/doc_parser.py | 159 +++++++++++++++- tests/extract/doc_parser_test.py | 254 ++++++++++++++++++++++++- 2 files changed, 409 insertions(+), 4 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 25fc154..bbab127 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -31,6 +31,148 @@ logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# RenderedCode BLOB parser +# +# Doc-gen4 stores declaration type signatures in the `name_info.type` column +# as a binary BLOB using leansqlite's ToBinary serialization format. +# +# The type is RenderedCode = TaggedText RenderedCode.Tag where: +# TaggedText: text(0) String | tag(1) Tag TaggedText | append(2) Array +# Tag: keyword(0) | string(1) | const(2) Name | sort-none(3) +# | sort-type(4) | sort-prop(5) | sort-sort(6) | otherExpr(7) +# Name: anonymous(0) | str(1) Name String | num(2) Name Nat +# +# Encoding primitives (big-endian, leansqlite Classes.lean): +# Nat – variable-length 7-bit chunks, high bit = continuation +# String – Nat(utf8_byte_length) + raw UTF-8 bytes +# Array – Nat(count) + elements +# --------------------------------------------------------------------------- + + +class _BlobReader: + """Minimal reader for leansqlite ToBinary format.""" + + __slots__ = ("_data", "_cursor") + + def __init__(self, data: bytes) -> None: + self._data = data + self._cursor = 0 + + def _read_byte(self) -> int: + if self._cursor >= len(self._data): + raise ValueError("Unexpected end of BLOB data") + value = self._data[self._cursor] + self._cursor += 1 + return value + + def _read_nat(self) -> int: + """Read a variable-length natural number (7-bit chunks, MSB = more).""" + result = 0 + shift = 0 + while True: + byte = self._read_byte() + if byte >= 128: + result |= (byte & 0x7F) << shift + else: + result |= byte << shift + break + shift += 7 + return result + + def _read_string(self) -> str: + byte_length = self._read_nat() + if self._cursor + byte_length > len(self._data): + raise ValueError("String extends past end of BLOB data") + raw = self._data[self._cursor : self._cursor + byte_length] + self._cursor += byte_length + return raw.decode("utf-8") + + def _read_name(self) -> str: + """Read a Lean Name and return its dot-separated string form.""" + tag = self._read_byte() + if tag == 0: # anonymous + return "" + if tag == 1: # str parent s + parent = self._read_name() + component = self._read_string() + return f"{parent}.{component}" if parent else component + if tag == 2: # num parent n + parent = self._read_name() + number = self._read_nat() + return f"{parent}.{number}" if parent else str(number) + raise ValueError(f"Invalid Name tag: {tag}") + + def _skip_name(self) -> None: + """Skip over a Name without allocating strings.""" + tag = self._read_byte() + if tag == 0: + return + if tag == 1: + self._skip_name() + byte_length = self._read_nat() + self._cursor += byte_length + return + if tag == 2: + self._skip_name() + self._read_nat() + return + raise ValueError(f"Invalid Name tag: {tag}") + + +def _extract_names_from_rendered_code(blob: bytes) -> list[str]: + """Extract referenced declaration names from a RenderedCode BLOB. + + Walks the TaggedText tree and collects Lean Names from every + RenderedCode.Tag.const node (tag byte 2). + + Args: + blob: Raw bytes of the RenderedCode BLOB from name_info.type. + + Returns: + De-duplicated list of fully-qualified Lean names referenced in the type. + """ + reader = _BlobReader(blob) + names: list[str] = [] + seen: set[str] = set() + + def walk_tagged_text() -> None: + tag = reader._read_byte() + if tag == 0: # text + reader._read_string() + elif tag == 1: # tag + walk_tag() + walk_tagged_text() + elif tag == 2: # append + count = reader._read_nat() + for _ in range(count): + walk_tagged_text() + else: + raise ValueError(f"Invalid TaggedText tag: {tag}") + + def walk_tag() -> None: + tag = reader._read_byte() + if tag <= 1 or (3 <= tag <= 7): + # keyword(0), string(1), sort-none(3), sort-type(4), + # sort-prop(5), sort-sort(6), otherExpr(7) — no payload + return + if tag == 2: # const + name = reader._read_name() + if name and name not in seen: + names.append(name) + seen.add(name) + return + raise ValueError(f"Invalid RenderedCode.Tag tag: {tag}") + + try: + walk_tagged_text() + except (ValueError, IndexError): + logger.debug("Failed to parse RenderedCode BLOB (%d bytes)", len(blob)) + return [] + + return names + + def _strip_lean_comments(source_text: str) -> str: """Strip Lean comments from source text for comparison. @@ -377,6 +519,7 @@ def _parse_declarations_from_sqlite( n.position, n.kind, n.name, + n.type, n.render, r.start_line, r.end_line, @@ -462,6 +605,20 @@ def _parse_declarations_from_sqlite( progress.update(task, advance=1) continue + # Extract dependency names from the type signature BLOB + type_blob = row["type"] + if type_blob: + dep_names = _extract_names_from_rendered_code( + bytes(type_blob) + ) + # Filter out self-references + dep_names = [ + d for d in dep_names if d != declaration_name + ] + dependencies = dep_names or None + else: + dependencies = None + declarations.append( Declaration( name=declaration_name, @@ -469,7 +626,7 @@ def _parse_declarations_from_sqlite( docstring=row["docstring"], source_text=source_text, source_link=source_link, - dependencies=None, + dependencies=dependencies, ) ) diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index 0afdc67..0d92815 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -12,9 +12,11 @@ from sqlalchemy import select from lean_explore.extract.doc_parser import ( + _BlobReader, _build_package_cache, _construct_source_link, _extract_dependencies_from_html, + _extract_names_from_rendered_code, _extract_source_text, _filter_auto_generated_projections, _insert_declarations_batch, @@ -180,6 +182,158 @@ def test_extract_source_text_from_lake_toolchain(self, temp_directory): assert "Lake.Config.Monad.run" in result +def _encode_nat(n: int) -> bytes: + """Encode a natural number in leansqlite's variable-length format.""" + chunks = [] + while n >= 128: + chunks.append((n & 0x7F) | 0x80) + n >>= 7 + chunks.append(n) + return bytes(chunks) + + +def _encode_string(s: str) -> bytes: + """Encode a string: Nat(utf8_byte_length) + UTF-8 bytes.""" + encoded = s.encode("utf-8") + return _encode_nat(len(encoded)) + encoded + + +def _encode_name(name: str) -> bytes: + """Encode a dotted Lean name (e.g. 'Nat.add') into binary.""" + if not name: + return b"\x00" # anonymous + parts = name.split(".") + result = b"\x00" # start with anonymous + for part in parts: + # Name.str = tag 1 + parent + string + result = b"\x01" + result + _encode_string(part) + return result + + +def _make_const_tag(name: str) -> bytes: + """Build a RenderedCode.Tag.const(name) blob fragment.""" + return b"\x02" + _encode_name(name) + + +def _make_text_node(s: str) -> bytes: + """Build a TaggedText.text(s) blob fragment.""" + return b"\x00" + _encode_string(s) + + +def _make_tag_node(tag_bytes: bytes, inner: bytes) -> bytes: + """Build a TaggedText.tag(tag, inner) blob fragment.""" + return b"\x01" + tag_bytes + inner + + +def _make_append_node(children: list[bytes]) -> bytes: + """Build a TaggedText.append(children) blob fragment.""" + return b"\x02" + _encode_nat(len(children)) + b"".join(children) + + +class TestRenderedCodeBlobParser: + """Tests for RenderedCode BLOB parsing to extract dependency names.""" + + def test_extract_single_const(self): + """Test extracting a single const name from a type BLOB.""" + # TaggedText.tag(Tag.const("Nat"), TaggedText.text("Nat")) + blob = _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_extract_multiple_consts(self): + """Test extracting multiple const names from a type BLOB.""" + # append([tag(const Nat, text Nat), text " → ", tag(const Bool, text Bool)]) + blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Bool"), _make_text_node("Bool")), + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat", "Bool"] + + def test_extract_dotted_name(self): + """Test extracting a dotted name like Nat.add.""" + blob = _make_tag_node( + _make_const_tag("Nat.add"), _make_text_node("Nat.add") + ) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat.add"] + + def test_deduplicates_names(self): + """Test that duplicate const references are deduplicated.""" + blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_skips_non_const_tags(self): + """Test that keyword, string, sort, otherExpr tags are skipped.""" + blob = _make_append_node([ + _make_tag_node(b"\x00", _make_text_node("def")), # keyword + _make_text_node(" "), + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_tag_node(b"\x07", _make_text_node("x")), # otherExpr + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_empty_blob_returns_empty(self): + """Test that an empty or invalid BLOB returns empty list.""" + assert _extract_names_from_rendered_code(b"") == [] + assert _extract_names_from_rendered_code(b"\xff") == [] + + def test_text_only_returns_empty(self): + """Test that a BLOB with only text returns no names.""" + blob = _make_text_node("hello world") + names = _extract_names_from_rendered_code(blob) + assert names == [] + + def test_nested_tagged_text(self): + """Test parsing nested tag nodes.""" + # tag(otherExpr, tag(const("List"), text("List"))) + inner = _make_tag_node(_make_const_tag("List"), _make_text_node("List")) + blob = _make_tag_node(b"\x07", inner) # otherExpr wrapping + names = _extract_names_from_rendered_code(blob) + assert names == ["List"] + + def test_blob_reader_nat_encoding(self): + """Test that Nat encoding/decoding matches leansqlite format.""" + reader = _BlobReader(_encode_nat(0)) + assert reader._read_nat() == 0 + + reader = _BlobReader(_encode_nat(127)) + assert reader._read_nat() == 127 + + reader = _BlobReader(_encode_nat(128)) + assert reader._read_nat() == 128 + + reader = _BlobReader(_encode_nat(300)) + assert reader._read_nat() == 300 + + reader = _BlobReader(_encode_nat(100000)) + assert reader._read_nat() == 100000 + + def test_blob_reader_name_roundtrip(self): + """Test that Name encoding produces correct dot-separated output.""" + reader = _BlobReader(_encode_name("Mathlib.Data.Nat.Basic")) + assert reader._read_name() == "Mathlib.Data.Nat.Basic" + + reader = _BlobReader(_encode_name("")) + assert reader._read_name() == "" + + def test_sort_tags_handled(self): + """Test that sort tag variants (3-6) are handled without error.""" + for sort_byte in [3, 4, 5, 6]: + blob = _make_tag_node( + bytes([sort_byte]), _make_text_node("Type") + ) + names = _extract_names_from_rendered_code(blob) + assert names == [] + + class TestSqliteHelpers: """Tests for SQLite-specific parsing helpers.""" @@ -389,7 +543,7 @@ def test_parse_declarations_from_sqlite_filters_non_rendered( 1, "theorem", "Nat.visible", - b"", + _make_text_node("True"), 0, 1, ), @@ -398,7 +552,7 @@ def test_parse_declarations_from_sqlite_filters_non_rendered( 2, "definition", "Nat.hidden", - b"", + _make_text_node("Nat"), 0, 0, ), @@ -437,6 +591,97 @@ def test_parse_declarations_from_sqlite_filters_non_rendered( assert [declaration.name for declaration in declarations] == ["Nat.visible"] assert declarations[0].docstring == "Visible theorem" + def test_parse_declarations_from_sqlite_extracts_dependencies( + self, temp_directory + ): + """Test that SQLite parsing extracts dependencies from type BLOBs.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text("def Nat.myFunc (n : Nat) : Bool := true\n") + + # Build a type BLOB: "Nat → Bool" + type_blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Bool"), _make_text_node("Bool")), + ]) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4/blob/master/" + "Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.execute( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, "definition", "Nat.myFunc", type_blob, 0, 1), + ) + connection.execute( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 40, 40), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert len(declarations) == 1 + assert declarations[0].name == "Nat.myFunc" + assert declarations[0].dependencies == ["Nat", "Bool"] + def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( self, temp_directory ): @@ -493,7 +738,10 @@ def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( (module_name, position, kind, name, type, sorried, render) VALUES (?, ?, ?, ?, ?, ?, ?) """, - ("Init.Data.Nat.Basic", 1, "theorem", "Nat.core", b"", 0, 1), + ( + "Init.Data.Nat.Basic", 1, "theorem", "Nat.core", + _make_text_node("True"), 0, 1, + ), ) connection.execute( """ From 73a90db6690419e0dda08025c1937a63823bbaf3 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:08:16 -0400 Subject: [PATCH 10/22] Fix dead source links for core modules in SQLite path The fallback source links for Init/Lean/Std/Lake modules used a 'toolchain' placeholder as the git ref, producing non-functional GitHub URLs. Doc-gen4 itself uses the actual Lean compiler hash for the same purpose. Read the lean-toolchain file from each workspace to get the real version tag (e.g., v4.29.0-rc6) and use it as the git ref. Falls back to 'master' when the toolchain file is unavailable. --- src/lean_explore/extract/doc_parser.py | 35 +++++++++++-- tests/extract/doc_parser_test.py | 70 ++++++++++++++++++++++---- 2 files changed, 93 insertions(+), 12 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index bbab127..98535dc 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -449,11 +449,32 @@ def _extract_source_text( ) +def _read_lean_toolchain_version(workspace_path: Path) -> str | None: + """Read the Lean version from a workspace's lean-toolchain file. + + Args: + workspace_path: Path to the package workspace (e.g., lean/mathlib). + + Returns: + Version string like 'v4.29.0-rc6', or None if not found. + """ + toolchain_file = workspace_path / "lean-toolchain" + if not toolchain_file.exists(): + return None + try: + content = toolchain_file.read_text().strip() + match = re.search(r"v\d+\.\d+\.\d+(?:-rc\d+)?", content) + return match.group() if match else None + except OSError: + return None + + def _construct_source_link( module_name: str, module_source_url: str | None, start_line: int, end_line: int, + lean_version: str | None = None, ) -> str | None: """Construct a GitHub source link from module URL and line range. @@ -462,6 +483,8 @@ def _construct_source_link( module_source_url: GitHub URL to the module file from api-docs.db. start_line: Start line number in the source file. end_line: End line number in the source file. + lean_version: Lean toolchain version (e.g., 'v4.29.0-rc6') used as + the git ref for core module fallback URLs. Returns: GitHub URL with line range fragment, or None if no source URL exists. @@ -469,16 +492,17 @@ def _construct_source_link( if module_source_url: return f"{module_source_url}#L{start_line}-L{end_line}" + git_ref = lean_version or "master" module_path = module_name.replace(".", "/") root = module_name.split(".", 1)[0] if root in {"Init", "Lean", "Std"}: return ( - "https://github.com/leanprover/lean4/blob/toolchain/" + f"https://github.com/leanprover/lean4/blob/{git_ref}/" f"src/lean/{module_path}.lean#L{start_line}-L{end_line}" ) if root == "Lake": return ( - "https://github.com/leanprover/lean4/blob/toolchain/" + f"https://github.com/leanprover/lean4/blob/{git_ref}/" f"src/lake/{module_path}.lean#L{start_line}-L{end_line}" ) @@ -490,6 +514,7 @@ def _parse_declarations_from_sqlite( lean_root: Path, package_cache: dict[str, Path], allowed_module_prefixes: list[str], + lean_version: str | None = None, ) -> list[Declaration]: """Parse declarations from a doc-gen4 SQLite database (api-docs.db). @@ -502,6 +527,7 @@ def _parse_declarations_from_sqlite( lean_root: Root directory of the Lean project. package_cache: Dictionary mapping package names to their directories. allowed_module_prefixes: Module prefixes to extract (e.g., ["Mathlib"]). + lean_version: Lean toolchain version for core module source links. Returns: List of parsed Declaration objects. @@ -579,7 +605,8 @@ def _parse_declarations_from_sqlite( end_line = row["end_line"] source_link = _construct_source_link( - module_name, source_url, start_line, end_line + module_name, source_url, start_line, end_line, + lean_version=lean_version, ) if not source_link: skipped_no_source += 1 @@ -848,6 +875,7 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N if docgen_format == "sqlite": api_docs_path = workspace_path / ".lake" / "build" / "api-docs.db" + lean_version = _read_lean_toolchain_version(workspace_path) logger.info( "[%s] Using SQLite format (api-docs.db)", package_name ) @@ -856,6 +884,7 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N lean_root, package_cache, package_config.module_prefixes, + lean_version=lean_version, ) else: doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index 0d92815..c3f25d6 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -22,6 +22,7 @@ _insert_declarations_batch, _parse_declarations_from_files, _parse_declarations_from_sqlite, + _read_lean_toolchain_version, _read_source_lines, _strip_lean_comments, extract_declarations, @@ -337,26 +338,76 @@ def test_sort_tags_handled(self): class TestSqliteHelpers: """Tests for SQLite-specific parsing helpers.""" - def test_construct_source_link_for_core_module(self): - """Test source links for core modules without a stored source URL.""" - result = _construct_source_link("Init.Data.Nat.Basic", None, 12, 15) + def test_construct_source_link_for_core_module_with_version(self): + """Test source links for core modules use the toolchain version.""" + result = _construct_source_link( + "Init.Data.Nat.Basic", None, 12, 15, + lean_version="v4.29.0-rc6", + ) assert ( result - == "https://github.com/leanprover/lean4/blob/toolchain/" + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" "src/lean/Init/Data/Nat/Basic.lean#L12-L15" ) - def test_construct_source_link_for_lake_module(self): - """Test source links for Lake modules without a stored source URL.""" - result = _construct_source_link("Lake.Config.Monad", None, 7, 9) + def test_construct_source_link_for_lake_module_with_version(self): + """Test source links for Lake modules use the toolchain version.""" + result = _construct_source_link( + "Lake.Config.Monad", None, 7, 9, + lean_version="v4.29.0-rc6", + ) assert ( result - == "https://github.com/leanprover/lean4/blob/toolchain/" + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" "src/lake/Lake/Config/Monad.lean#L7-L9" ) + def test_construct_source_link_falls_back_to_master(self): + """Test that missing lean_version falls back to 'master'.""" + result = _construct_source_link( + "Init.Data.Nat.Basic", None, 12, 15, + ) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/master/" + "src/lean/Init/Data/Nat/Basic.lean#L12-L15" + ) + + def test_construct_source_link_prefers_source_url(self): + """Test that a non-None source_url is used directly.""" + url = "https://github.com/leanprover-community/mathlib4/blob/abc123/Foo.lean" + result = _construct_source_link("Foo.Bar", url, 1, 10) + + assert result == f"{url}#L1-L10" + + def test_read_lean_toolchain_version(self, temp_directory): + """Test reading version from a lean-toolchain file.""" + workspace = temp_directory / "workspace" + workspace.mkdir() + toolchain = workspace / "lean-toolchain" + toolchain.write_text("leanprover/lean4:v4.29.0-rc6\n") + + assert _read_lean_toolchain_version(workspace) == "v4.29.0-rc6" + + def test_read_lean_toolchain_version_release(self, temp_directory): + """Test reading a release version (no -rc suffix).""" + workspace = temp_directory / "workspace" + workspace.mkdir() + toolchain = workspace / "lean-toolchain" + toolchain.write_text("leanprover/lean4:v4.29.0\n") + + assert _read_lean_toolchain_version(workspace) == "v4.29.0" + + def test_read_lean_toolchain_version_missing(self, temp_directory): + """Test returns None when lean-toolchain doesn't exist.""" + workspace = temp_directory / "workspace" + workspace.mkdir() + + assert _read_lean_toolchain_version(workspace) is None + class TestDependencyExtraction: """Tests for dependency extraction from HTML.""" @@ -760,13 +811,14 @@ def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( lean_root, {"lean4": lean_source_dir}, allowed_module_prefixes=["Init"], + lean_version="v4.29.0-rc6", ) assert len(declarations) == 1 assert declarations[0].name == "Nat.core" assert ( declarations[0].source_link - == "https://github.com/leanprover/lean4/blob/toolchain/" + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" "src/lean/Init/Data/Nat/Basic.lean#L1-L1" ) From 1f7b1d4a310b3882cef3eef53ea090f2395fd778 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:10:04 -0400 Subject: [PATCH 11/22] Detect Verso-only docstrings in SQLite extraction The SQLite query only joined declaration_markdown_docstrings, silently dropping docstrings stored in Verso format (declaration_verso_docstrings). Doc-gen4 stores each docstring as either markdown text or a Verso binary BLOB, never both. Add a LEFT JOIN on declaration_verso_docstrings to detect Verso-only cases. Since Verso BLOBs use a complex nested serialization format (Doc.Block/Doc.Inline trees with opaque Dynamic values), full deserialization is deferred. For now, Verso-only docstrings are counted and logged as a warning so the gap is visible rather than silent. --- src/lean_explore/extract/doc_parser.py | 22 +++- tests/extract/doc_parser_test.py | 156 +++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 2 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 98535dc..64685d0 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -538,7 +538,10 @@ def _parse_declarations_from_sqlite( connection.row_factory = sqlite3.Row try: - # Query all declarations with their source ranges and docstrings + # Query declarations with source ranges and both docstring types. + # Doc-gen4 stores docstrings as either markdown text or Verso binary + # BLOBs (never both). We prefer markdown; Verso BLOBs require a + # complex deserializer so we detect but cannot extract them yet. query = """ SELECT n.module_name, @@ -550,12 +553,15 @@ def _parse_declarations_from_sqlite( r.start_line, r.end_line, d.text AS docstring, + v.content AS verso_docstring, m.source_url FROM name_info n JOIN declaration_ranges r ON n.module_name = r.module_name AND n.position = r.position LEFT JOIN declaration_markdown_docstrings d ON n.module_name = d.module_name AND n.position = d.position + LEFT JOIN declaration_verso_docstrings v + ON n.module_name = v.module_name AND n.position = v.position JOIN modules m ON n.module_name = m.name ORDER BY n.module_name, n.position @@ -568,6 +574,7 @@ def _parse_declarations_from_sqlite( skipped_prefix = 0 skipped_constructor = 0 source_errors = 0 + verso_only_docstrings = 0 with Progress( SpinnerColumn(), @@ -646,11 +653,16 @@ def _parse_declarations_from_sqlite( else: dependencies = None + # Use markdown docstring; detect Verso-only cases + docstring = row["docstring"] + if not docstring and row["verso_docstring"]: + verso_only_docstrings += 1 + declarations.append( Declaration( name=declaration_name, module=module_name, - docstring=row["docstring"], + docstring=docstring, source_text=source_text, source_link=source_link, dependencies=dependencies, @@ -668,6 +680,12 @@ def _parse_declarations_from_sqlite( logger.info("Skipped %d .mk constructors", skipped_constructor) if skipped_no_source > 0: logger.info("Skipped %d declarations without source URL", skipped_no_source) + if verso_only_docstrings > 0: + logger.warning( + "%d declarations have Verso-only docstrings " + "(not yet supported, stored as docstring=None)", + verso_only_docstrings, + ) if source_errors > 0: logger.warning( "Could not extract source text for %d declarations", diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index c3f25d6..bd53a70 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -572,6 +572,12 @@ def test_parse_declarations_from_sqlite_filters_non_rendered( text TEXT NOT NULL, PRIMARY KEY (module_name, position) ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); """ ) connection.execute( @@ -692,6 +698,12 @@ def test_parse_declarations_from_sqlite_extracts_dependencies( text TEXT NOT NULL, PRIMARY KEY (module_name, position) ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); """ ) connection.execute( @@ -733,6 +745,144 @@ def test_parse_declarations_from_sqlite_extracts_dependencies( assert declarations[0].name == "Nat.myFunc" assert declarations[0].dependencies == ["Nat", "Bool"] + def test_parse_declarations_from_sqlite_detects_verso_docstrings( + self, temp_directory, caplog + ): + """Test that Verso-only docstrings are detected and logged.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = ( + lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + ) + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text( + "def Nat.withVerso : Nat := 0\n" + "def Nat.withMarkdown : Nat := 1\n" + ) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4" + "/blob/master/Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.executemany( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + "Mathlib.Data.Nat.Basic", 1, "definition", + "Nat.withVerso", _make_text_node("Nat"), 0, 1, + ), + ( + "Mathlib.Data.Nat.Basic", 2, "definition", + "Nat.withMarkdown", _make_text_node("Nat"), 0, 1, + ), + ], + ) + connection.executemany( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, + start_utf16, end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 27, 27), + ("Mathlib.Data.Nat.Basic", 2, 2, 0, 0, 2, 30, 30), + ], + ) + # Declaration 1: Verso-only docstring (binary BLOB) + connection.execute( + """ + INSERT INTO declaration_verso_docstrings + (module_name, position, content) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, b"\x00\x01\x02"), + ) + # Declaration 2: Markdown docstring + connection.execute( + """ + INSERT INTO declaration_markdown_docstrings + (module_name, position, text) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 2, "A markdown docstring"), + ) + connection.commit() + connection.close() + + import logging + + with caplog.at_level(logging.WARNING): + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert len(declarations) == 2 + # Verso-only: docstring is None + verso_decl = next( + d for d in declarations if d.name == "Nat.withVerso" + ) + assert verso_decl.docstring is None + # Markdown: docstring is present + md_decl = next( + d for d in declarations if d.name == "Nat.withMarkdown" + ) + assert md_decl.docstring == "A markdown docstring" + # Warning logged about Verso-only docstrings + assert "1 declarations have Verso-only docstrings" in caplog.text + def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( self, temp_directory ): @@ -777,6 +927,12 @@ def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( text TEXT NOT NULL, PRIMARY KEY (module_name, position) ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); """ ) connection.execute( From a611dc0b30b33e10335b106dcf0987dd849f3195 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:32:19 -0400 Subject: [PATCH 12/22] Validate api-docs.db before trusting SQLite format detection _detect_docgen_format previously checked only file existence, which would misclassify a zero-byte, corrupt, or incomplete api-docs.db as usable. Now _validate_docgen_sqlite verifies the file is non-empty, opens as valid SQLite, and contains the required tables (name_info, declaration_ranges, modules). On failure the detector falls back to BMP if available, or returns "none". --- src/lean_explore/extract/doc_parser.py | 56 ++++++++++- tests/extract/doc_parser_test.py | 124 +++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 2 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 64685d0..3ee9a27 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -834,21 +834,73 @@ async def _insert_declarations_batch( return inserted_count +_REQUIRED_DOCGEN_TABLES = {"name_info", "declaration_ranges", "modules"} + + +def _validate_docgen_sqlite(database_path: Path) -> bool: + """Check that a doc-gen4 api-docs.db is a valid, usable SQLite database. + + Verifies the file is non-empty, opens as SQLite, and contains the tables + that the extraction pipeline requires. + + Args: + database_path: Path to the api-docs.db file. + + Returns: + True if the database is valid and contains the required tables. + """ + if database_path.stat().st_size == 0: + logger.warning("api-docs.db exists but is empty: %s", database_path) + return False + + try: + connection = sqlite3.connect(str(database_path)) + try: + cursor = connection.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + ) + tables = {row[0] for row in cursor.fetchall()} + finally: + connection.close() + except sqlite3.DatabaseError as error: + logger.warning("api-docs.db is not a valid SQLite file: %s", error) + return False + + missing = _REQUIRED_DOCGEN_TABLES - tables + if missing: + logger.warning( + "api-docs.db is missing required tables %s: %s", + missing, database_path, + ) + return False + + return True + + def _detect_docgen_format(workspace_path: Path) -> str: """Detect which doc-gen4 output format a workspace uses. Doc-gen4 >= v4.29.0-rc2 writes to a SQLite database (api-docs.db). Earlier versions write individual BMP JSON files to doc-data/. + The SQLite file is validated before returning "sqlite" to guard against + zero-byte, corrupt, or incompatible databases left by crashed builds. + Args: workspace_path: Path to the package workspace (e.g., lean/mathlib). Returns: - "sqlite" if api-docs.db exists, "bmp" if BMP files exist, "none" otherwise. + "sqlite" if a valid api-docs.db exists, "bmp" if BMP files exist, + "none" otherwise. """ api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" if api_docs_db.exists(): - return "sqlite" + if _validate_docgen_sqlite(api_docs_db): + return "sqlite" + logger.warning( + "Invalid api-docs.db at %s, checking for BMP fallback", + api_docs_db, + ) doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" if doc_data_dir.exists(): diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index bd53a70..08ab60b 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -15,6 +15,7 @@ _BlobReader, _build_package_cache, _construct_source_link, + _detect_docgen_format, _extract_dependencies_from_html, _extract_names_from_rendered_code, _extract_source_text, @@ -25,6 +26,7 @@ _read_lean_toolchain_version, _read_source_lines, _strip_lean_comments, + _validate_docgen_sqlite, extract_declarations, ) from lean_explore.extract.types import Declaration @@ -1346,3 +1348,125 @@ def test_single_declaration(self): assert len(filtered) == 1 assert removed_count == 0 + + +class TestDocgenFormatDetection: + """Tests for doc-gen4 output format detection and SQLite validation.""" + + def test_detect_sqlite_with_valid_db(self, temp_directory): + """Test detection returns 'sqlite' for a valid api-docs.db.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + + connection = sqlite3.connect(db_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT, position INTEGER, kind TEXT, name TEXT, + type BLOB, sorried INTEGER, render INTEGER, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT, position INTEGER, + start_line INTEGER, start_column INTEGER, start_utf16 INTEGER, + end_line INTEGER, end_column INTEGER, end_utf16 INTEGER, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.close() + + assert _detect_docgen_format(workspace) == "sqlite" + + def test_detect_bmp_format(self, temp_directory): + """Test detection returns 'bmp' when only BMP files exist.""" + workspace = temp_directory / "pkg" + doc_data = workspace / ".lake" / "build" / "doc-data" + doc_data.mkdir(parents=True) + (doc_data / "declaration-data-Foo.bmp").write_text("{}") + + assert _detect_docgen_format(workspace) == "bmp" + + def test_detect_none_when_empty(self, temp_directory): + """Test detection returns 'none' when no doc output exists.""" + workspace = temp_directory / "pkg" + workspace.mkdir(parents=True) + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_to_bmp_for_empty_db(self, temp_directory): + """Test that an empty api-docs.db falls back to BMP if available.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"") + + doc_data = workspace / ".lake" / "build" / "doc-data" + doc_data.mkdir(parents=True) + (doc_data / "declaration-data-Foo.bmp").write_text("{}") + + assert _detect_docgen_format(workspace) == "bmp" + + def test_detect_returns_none_for_empty_db_without_bmp(self, temp_directory): + """Test that an empty api-docs.db with no BMP fallback returns 'none'.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"") + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_for_corrupt_db(self, temp_directory): + """Test that a corrupt (non-SQLite) file falls back gracefully.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"this is not a sqlite database") + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_for_db_missing_tables(self, temp_directory): + """Test that a SQLite DB missing required tables falls back.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + + connection = sqlite3.connect(db_path) + connection.execute( + "CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT)" + ) + connection.close() + + # Has modules but missing name_info and declaration_ranges + assert _detect_docgen_format(workspace) == "none" + + def test_validate_docgen_sqlite_valid(self, temp_directory): + """Test validation passes for a well-formed database.""" + db_path = temp_directory / "api-docs.db" + connection = sqlite3.connect(db_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY); + CREATE TABLE name_info (module_name TEXT, position INTEGER); + CREATE TABLE declaration_ranges (module_name TEXT, position INTEGER); + """ + ) + connection.close() + + assert _validate_docgen_sqlite(db_path) is True + + def test_validate_docgen_sqlite_empty_file(self, temp_directory): + """Test validation rejects an empty file.""" + db_path = temp_directory / "api-docs.db" + db_path.write_bytes(b"") + + assert _validate_docgen_sqlite(db_path) is False + + def test_validate_docgen_sqlite_corrupt_file(self, temp_directory): + """Test validation rejects a non-SQLite file.""" + db_path = temp_directory / "api-docs.db" + db_path.write_bytes(b"not a database") + + assert _validate_docgen_sqlite(db_path) is False From 60fa48a739abd134006640019d3c9e9715cf41a3 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:32:57 -0400 Subject: [PATCH 13/22] Filter non-rendered declarations in SQL instead of Python Move the render=1 check from a Python-side continue into a WHERE clause on the SQLite query. This avoids fetching and iterating rows that will be immediately discarded, reducing work for large packages like mathlib. --- src/lean_explore/extract/doc_parser.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 3ee9a27..a0ac869 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -549,7 +549,6 @@ def _parse_declarations_from_sqlite( n.kind, n.name, n.type, - n.render, r.start_line, r.end_line, d.text AS docstring, @@ -564,6 +563,7 @@ def _parse_declarations_from_sqlite( ON n.module_name = v.module_name AND n.position = v.position JOIN modules m ON n.module_name = m.name + WHERE n.render = 1 ORDER BY n.module_name, n.position """ rows = connection.execute(query).fetchall() @@ -620,10 +620,6 @@ def _parse_declarations_from_sqlite( progress.update(task, advance=1) continue - if not row["render"]: - progress.update(task, advance=1) - continue - # Extract source text from local files try: source_text = _extract_source_text( From c2482448d58dfe9954e2316d3434912ffcfdd8d2 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 13:02:05 -0400 Subject: [PATCH 14/22] Handle missing source files gracefully in BMP parser The BMP parser crashed with FileNotFoundError when a source file referenced in a declaration's sourceLink was not present locally (e.g. after a package rename). The SQLite parser already handled this with a try/except. Apply the same pattern to the BMP parser: skip the declaration, count the error, and log a warning at the end. --- src/lean_explore/extract/doc_parser.py | 33 +++++++++++++++++++------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index a0ac869..2ca2aa4 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -712,6 +712,7 @@ def _parse_declarations_from_files( List of parsed Declaration objects. """ declarations = [] + source_errors = 0 with Progress( SpinnerColumn(), @@ -740,23 +741,33 @@ def _parse_declarations_from_files( for declaration_data in data.get("declarations", []): information = declaration_data["info"] - source_text = _extract_source_text( - information["sourceLink"], lean_root, package_cache - ) + declaration_name = information["name"] + + # Skip auto-generated .mk constructors + if declaration_name.endswith(".mk"): + continue + + try: + source_text = _extract_source_text( + information["sourceLink"], lean_root, package_cache + ) + except (FileNotFoundError, ValueError) as error: + source_errors += 1 + if source_errors <= 10: + logger.debug( + "Could not extract source for %s: %s", + declaration_name, error, + ) + continue header_html = declaration_data.get("header", "") dependencies = _extract_dependencies_from_html(header_html) # Filter out self-references from dependencies - declaration_name = information["name"] filtered_dependencies = [ d for d in dependencies if d != declaration_name ] - # Skip auto-generated .mk constructors - if declaration_name.endswith(".mk"): - continue - declarations.append( Declaration( name=declaration_name, @@ -770,6 +781,12 @@ def _parse_declarations_from_files( progress.update(task, advance=1) + if source_errors > 0: + logger.warning( + "Could not extract source text for %d declarations", + source_errors, + ) + return declarations From 35f840244db8399d5c33aa2bb4a59166eabd6e64 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 13:02:22 -0400 Subject: [PATCH 15/22] Pin doc-gen4 to main branch instead of version tags Pinning doc-gen4 to specific release tags (e.g. @ "v4.30.0-rc1") caused transitive dependency conflicts with packages like plausible, breaking lake exe cache get. Using @ "main" lets Lake resolve a compatible version automatically since doc-gen4 main tracks the latest Lean toolchain. Also reorder require statements so doc-gen4 comes before the main package dependency, ensuring the main package's dependency versions take precedence during resolution. --- src/lean_explore/extract/doc_gen4.py | 5 ++--- src/lean_explore/extract/package_utils.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 763476c..04523b9 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -18,7 +18,7 @@ from lean_explore.extract.package_utils import ( get_extraction_order, get_package_toolchain, - update_lakefile_docgen_version, + strip_lakefile_docgen_version, ) logger = logging.getLogger(__name__) @@ -97,9 +97,8 @@ def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]: toolchain_file = workspace_path / "lean-toolchain" lean_toolchain, git_ref = get_package_toolchain(package_config) - lean_version = extract_lean_version(lean_toolchain) - update_lakefile_docgen_version(lakefile_path, lean_version) + strip_lakefile_docgen_version(lakefile_path) toolchain_file.write_text(lean_toolchain + "\n") return lean_toolchain, git_ref diff --git a/src/lean_explore/extract/package_utils.py b/src/lean_explore/extract/package_utils.py index f5912c0..97e40af 100644 --- a/src/lean_explore/extract/package_utils.py +++ b/src/lean_explore/extract/package_utils.py @@ -81,12 +81,16 @@ def get_package_toolchain(package_configuration: PackageConfig) -> tuple[str, st return toolchain, latest_tag -def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> None: - """Update the doc-gen4 version in a lakefile to match the Lean version. +def strip_lakefile_docgen_version(lakefile_path: Path) -> None: + """Pin doc-gen4 to its main branch instead of a specific version tag. + + Doc-gen4's main branch tracks the latest Lean toolchain. Pinning to a + specific release tag causes transitive dependency conflicts (e.g. plausible + version mismatches) that break ``lake exe cache get``. Using ``@ "main"`` + lets Lake resolve a compatible version automatically. Args: lakefile_path: Path to lakefile.lean - lean_version: Lean version like 'v4.27.0' """ content = lakefile_path.read_text() @@ -95,13 +99,13 @@ def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> No r'"https://github\.com/leanprover/doc-gen4"(?:\s+@\s+"[^"]*")?' ) replacement = ( - f"require «doc-gen4» from git\n" - f' "https://github.com/leanprover/doc-gen4" @ "{lean_version}"' + 'require «doc-gen4» from git\n' + ' "https://github.com/leanprover/doc-gen4" @ "main"' ) new_content = re.sub(pattern, replacement, content) if new_content != content: lakefile_path.write_text(new_content) logger.info( - "Updated doc-gen4 version to %s in %s", lean_version, lakefile_path + "Pinned doc-gen4 to main branch in %s", lakefile_path ) From 8295e2be25d3f841ac48ba7ee24891d3d0a45544 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 13:28:58 -0400 Subject: [PATCH 16/22] Restore doc-gen4 version pinning to match package toolchain Pinning doc-gen4 to @ "main" breaks packages on older toolchains (e.g. PhysLean at v4.29.0 vs doc-gen4 main at v4.30.0-rc1) due to incompatible olean headers. Restore version-matched pinning so each package gets a doc-gen4 compatible with its Lean toolchain. The require ordering fix (doc-gen4 first, main package last) is kept to avoid transitive dependency conflicts. --- src/lean_explore/extract/doc_gen4.py | 5 +++-- src/lean_explore/extract/package_utils.py | 19 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 04523b9..763476c 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -18,7 +18,7 @@ from lean_explore.extract.package_utils import ( get_extraction_order, get_package_toolchain, - strip_lakefile_docgen_version, + update_lakefile_docgen_version, ) logger = logging.getLogger(__name__) @@ -97,8 +97,9 @@ def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]: toolchain_file = workspace_path / "lean-toolchain" lean_toolchain, git_ref = get_package_toolchain(package_config) + lean_version = extract_lean_version(lean_toolchain) - strip_lakefile_docgen_version(lakefile_path) + update_lakefile_docgen_version(lakefile_path, lean_version) toolchain_file.write_text(lean_toolchain + "\n") return lean_toolchain, git_ref diff --git a/src/lean_explore/extract/package_utils.py b/src/lean_explore/extract/package_utils.py index 97e40af..ab5d5b8 100644 --- a/src/lean_explore/extract/package_utils.py +++ b/src/lean_explore/extract/package_utils.py @@ -81,16 +81,17 @@ def get_package_toolchain(package_configuration: PackageConfig) -> tuple[str, st return toolchain, latest_tag -def strip_lakefile_docgen_version(lakefile_path: Path) -> None: - """Pin doc-gen4 to its main branch instead of a specific version tag. +def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> None: + """Update the doc-gen4 version in a lakefile to match the Lean toolchain. - Doc-gen4's main branch tracks the latest Lean toolchain. Pinning to a - specific release tag causes transitive dependency conflicts (e.g. plausible - version mismatches) that break ``lake exe cache get``. Using ``@ "main"`` - lets Lake resolve a compatible version automatically. + Doc-gen4 releases are tagged to match Lean toolchain versions, so pinning + to the same version ensures compatibility. The doc-gen4 ``require`` must + appear before the main package ``require`` so that the main package's + transitive dependency versions take precedence during resolution. Args: lakefile_path: Path to lakefile.lean + lean_version: Lean version like 'v4.27.0' """ content = lakefile_path.read_text() @@ -99,13 +100,13 @@ def strip_lakefile_docgen_version(lakefile_path: Path) -> None: r'"https://github\.com/leanprover/doc-gen4"(?:\s+@\s+"[^"]*")?' ) replacement = ( - 'require «doc-gen4» from git\n' - ' "https://github.com/leanprover/doc-gen4" @ "main"' + f"require «doc-gen4» from git\n" + f' "https://github.com/leanprover/doc-gen4" @ "{lean_version}"' ) new_content = re.sub(pattern, replacement, content) if new_content != content: lakefile_path.write_text(new_content) logger.info( - "Pinned doc-gen4 to main branch in %s", lakefile_path + "Updated doc-gen4 version to %s in %s", lean_version, lakefile_path ) From ed58bc0ef60c4c86889d636e26787a3abc2a2e5e Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 15:50:51 -0400 Subject: [PATCH 17/22] Clear cache on toolchain change even for SQLite-capable packages The cache-skip optimization for SQLite-format packages assumed oleans were always compatible. When the toolchain version changes between runs (e.g. from v4.30.0-rc1 to v4.29.0), stale oleans cause "incompatible header" build failures. Now the old toolchain is read before setup, and if it differs from the new one, the cache is cleared regardless of SQLite support. --- src/lean_explore/extract/doc_gen4.py | 22 ++++++++++++++---- tests/extract/doc_gen4_test.py | 34 +++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 763476c..6fd5afe 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -254,14 +254,28 @@ async def run_doc_gen4( toolchain = None ref = None if fresh: + # Read the current toolchain before setup potentially overwrites it + old_toolchain_file = workspace_path / "lean-toolchain" + old_toolchain = None + if old_toolchain_file.exists(): + old_toolchain = old_toolchain_file.read_text().strip() + if setup: toolchain, ref = _setup_workspace(config) logger.info("Toolchain: %s, ref: %s", toolchain, ref) - # Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2. - # The SQLite format handles incremental updates, while legacy BMP - # output requires a cache clear to avoid stale files. - if toolchain and _uses_sqlite_docgen(toolchain): + # Doc-gen4 >= v4.29.0-rc2 writes to api-docs.db which handles + # incremental updates, so we can skip the cache clear — but only + # if the toolchain hasn't changed. A toolchain change means all + # oleans are stale and must be rebuilt from scratch. + toolchain_changed = toolchain and toolchain != old_toolchain + if toolchain_changed: + logger.info( + "[%s] Toolchain changed (%s -> %s), clearing cache", + package_name, old_toolchain, toolchain, + ) + _clear_workspace_cache(workspace_path) + elif toolchain and _uses_sqlite_docgen(toolchain): logger.info( "[%s] Skipping cache clear " "(api-docs.db handles incremental updates)", diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index f8443bf..c6b76da 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -1,5 +1,6 @@ """Tests for doc-gen4 workspace orchestration.""" +from pathlib import Path from unittest.mock import patch from lean_explore.extract.doc_gen4 import _uses_sqlite_docgen, run_doc_gen4 @@ -16,11 +17,19 @@ def test_uses_sqlite_docgen_from_rc2(self): assert not _uses_sqlite_docgen("leanprover/lean4:v4.28.0") +def _write_toolchain(version: str) -> None: + """Write a lean-toolchain file for the mathlib workspace.""" + path = Path("lean/mathlib/lean-toolchain") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(version + "\n") + + class TestRunDocGen4FreshHandling: """Tests for fresh-cache handling in run_doc_gen4.""" async def test_fresh_legacy_workspace_clears_cache(self): """Test that legacy doc-gen4 workspaces still clear the cache.""" + _write_toolchain("leanprover/lean4:v4.29.0-rc1") with patch( "lean_explore.extract.doc_gen4.get_extraction_order", return_value=["mathlib"], @@ -41,7 +50,8 @@ async def test_fresh_legacy_workspace_clears_cache(self): mock_run.assert_called_once_with("mathlib", False) async def test_fresh_sqlite_workspace_skips_cache_clear(self): - """Test that SQLite doc-gen4 workspaces keep the cache.""" + """Test that SQLite workspaces with unchanged toolchain skip clear.""" + _write_toolchain("leanprover/lean4:v4.29.0-rc2") with patch( "lean_explore.extract.doc_gen4.get_extraction_order", return_value=["mathlib"], @@ -60,3 +70,25 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): mock_clear.assert_not_called() mock_run.assert_called_once_with("mathlib", False) + + async def test_fresh_sqlite_workspace_clears_on_toolchain_change(self): + """Test that a toolchain version change forces a cache clear.""" + _write_toolchain("leanprover/lean4:v4.29.0-rc2") + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.30.0-rc1", "v4.30.0-rc1"), + ): + with patch( + "lean_explore.extract.doc_gen4._clear_workspace_cache" + ) as mock_clear: + with patch( + "lean_explore.extract.doc_gen4._run_lake_for_package" + ) as mock_run: + await run_doc_gen4(fresh=True) + + mock_clear.assert_called_once() + mock_run.assert_called_once_with("mathlib", False) From 65d4efaa83a54fd0e3678fe8e8075642b359c667 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 16:03:18 -0400 Subject: [PATCH 18/22] Revert toolchain change detection in cache-skip logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The toolchain change detection was an overcorrection for a one-off issue caused by the @ "main" experiment. Lake handles toolchain changes and olean staleness itself — the cache-skip logic only needs to distinguish SQLite (incremental) from BMP (stale files) format. --- src/lean_explore/extract/doc_gen4.py | 22 ++++-------------- tests/extract/doc_gen4_test.py | 34 +--------------------------- 2 files changed, 5 insertions(+), 51 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 6fd5afe..763476c 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -254,28 +254,14 @@ async def run_doc_gen4( toolchain = None ref = None if fresh: - # Read the current toolchain before setup potentially overwrites it - old_toolchain_file = workspace_path / "lean-toolchain" - old_toolchain = None - if old_toolchain_file.exists(): - old_toolchain = old_toolchain_file.read_text().strip() - if setup: toolchain, ref = _setup_workspace(config) logger.info("Toolchain: %s, ref: %s", toolchain, ref) - # Doc-gen4 >= v4.29.0-rc2 writes to api-docs.db which handles - # incremental updates, so we can skip the cache clear — but only - # if the toolchain hasn't changed. A toolchain change means all - # oleans are stale and must be rebuilt from scratch. - toolchain_changed = toolchain and toolchain != old_toolchain - if toolchain_changed: - logger.info( - "[%s] Toolchain changed (%s -> %s), clearing cache", - package_name, old_toolchain, toolchain, - ) - _clear_workspace_cache(workspace_path) - elif toolchain and _uses_sqlite_docgen(toolchain): + # Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2. + # The SQLite format handles incremental updates, while legacy BMP + # output requires a cache clear to avoid stale files. + if toolchain and _uses_sqlite_docgen(toolchain): logger.info( "[%s] Skipping cache clear " "(api-docs.db handles incremental updates)", diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index c6b76da..f8443bf 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -1,6 +1,5 @@ """Tests for doc-gen4 workspace orchestration.""" -from pathlib import Path from unittest.mock import patch from lean_explore.extract.doc_gen4 import _uses_sqlite_docgen, run_doc_gen4 @@ -17,19 +16,11 @@ def test_uses_sqlite_docgen_from_rc2(self): assert not _uses_sqlite_docgen("leanprover/lean4:v4.28.0") -def _write_toolchain(version: str) -> None: - """Write a lean-toolchain file for the mathlib workspace.""" - path = Path("lean/mathlib/lean-toolchain") - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(version + "\n") - - class TestRunDocGen4FreshHandling: """Tests for fresh-cache handling in run_doc_gen4.""" async def test_fresh_legacy_workspace_clears_cache(self): """Test that legacy doc-gen4 workspaces still clear the cache.""" - _write_toolchain("leanprover/lean4:v4.29.0-rc1") with patch( "lean_explore.extract.doc_gen4.get_extraction_order", return_value=["mathlib"], @@ -50,8 +41,7 @@ async def test_fresh_legacy_workspace_clears_cache(self): mock_run.assert_called_once_with("mathlib", False) async def test_fresh_sqlite_workspace_skips_cache_clear(self): - """Test that SQLite workspaces with unchanged toolchain skip clear.""" - _write_toolchain("leanprover/lean4:v4.29.0-rc2") + """Test that SQLite doc-gen4 workspaces keep the cache.""" with patch( "lean_explore.extract.doc_gen4.get_extraction_order", return_value=["mathlib"], @@ -70,25 +60,3 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): mock_clear.assert_not_called() mock_run.assert_called_once_with("mathlib", False) - - async def test_fresh_sqlite_workspace_clears_on_toolchain_change(self): - """Test that a toolchain version change forces a cache clear.""" - _write_toolchain("leanprover/lean4:v4.29.0-rc2") - with patch( - "lean_explore.extract.doc_gen4.get_extraction_order", - return_value=["mathlib"], - ): - with patch( - "lean_explore.extract.doc_gen4._setup_workspace", - return_value=("leanprover/lean4:v4.30.0-rc1", "v4.30.0-rc1"), - ): - with patch( - "lean_explore.extract.doc_gen4._clear_workspace_cache" - ) as mock_clear: - with patch( - "lean_explore.extract.doc_gen4._run_lake_for_package" - ) as mock_run: - await run_doc_gen4(fresh=True) - - mock_clear.assert_called_once() - mock_run.assert_called_once_with("mathlib", False) From 4d19e4764c3776928a1434435ea1c6821e21f9e0 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 20:00:06 -0400 Subject: [PATCH 19/22] Fix PhysLean extractor build targets --- src/lean_explore/extract/doc_gen4.py | 99 ++++++++++++-------- src/lean_explore/extract/package_registry.py | 2 +- tests/extract/doc_gen4_test.py | 50 +++++++++- 3 files changed, 111 insertions(+), 40 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 763476c..99ee3f4 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -71,8 +71,8 @@ def _clear_workspace_cache(workspace_path: Path) -> None: shutil.rmtree(lake_dir) -def _get_doc_lib_names(package_name: str) -> list[str]: - """Get the library names to run doc-gen4 on for a package. +def _get_library_names(package_name: str) -> list[str]: + """Get the library names to build and run doc-gen4 on for a package. Some packages have custom extract wrappers, others use upstream libraries directly. """ @@ -86,6 +86,57 @@ def _get_doc_lib_names(package_name: str) -> list[str]: return lib_names.get(package_name, [f"{package_name.title()}Extract"]) +def _run_lake_build_target( + workspace_path: Path, + package_name: str, + target: str, + env: dict[str, str], + allow_failure: bool = False, +) -> bool: + """Run ``lake build `` streaming output to the logger. + + Args: + workspace_path: Path to the Lake workspace directory. + package_name: Name of the package for log messages. + target: The Lake build target to run. + env: Environment variables to pass to the subprocess. + allow_failure: Whether to continue when the target fails. + + Returns: + ``True`` if the target built successfully, otherwise ``False``. + + Raises: + RuntimeError: If the target fails and ``allow_failure`` is ``False``. + """ + logger.info("[%s] Running lake build %s...", package_name, target) + process = subprocess.Popen( + ["lake", "build", target], + cwd=workspace_path, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + ) + if process.stdout: + for line in process.stdout: + logger.info(line.rstrip()) + + returncode = process.wait() + if returncode == 0: + return True + + if allow_failure: + logger.warning( + "[%s] lake build %s failed (continuing with generated docs)", + package_name, + target, + ) + return False + + raise RuntimeError(f"lake build failed for {package_name} target {target}") + + def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]: """Fetch toolchain from GitHub and update lakefile. @@ -179,44 +230,18 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None: if result.returncode != 0: logger.warning("[%s] Cache fetch failed (non-fatal)", package_name) - logger.info("[%s] Running lake build...", package_name) - process = subprocess.Popen( - ["lake", "build"], - cwd=workspace_path, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - env=env, - ) - if process.stdout: - for line in process.stdout: - logger.info(line.rstrip()) - if process.wait() != 0: - raise RuntimeError(f"lake build failed for {package_name}") - - lib_names = _get_doc_lib_names(package_name) + lib_names = _get_library_names(package_name) for lib_name in lib_names: - logger.info("[%s] Running doc-gen4 (%s:docs)...", package_name, lib_name) + _run_lake_build_target(workspace_path, package_name, lib_name, env) - process = subprocess.Popen( - ["lake", "build", f"{lib_name}:docs"], - cwd=workspace_path, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - env=env, + for lib_name in lib_names: + _run_lake_build_target( + workspace_path, + package_name, + f"{lib_name}:docs", + env, + allow_failure=True, ) - if process.stdout: - for line in process.stdout: - logger.info(line.rstrip()) - returncode = process.wait() - if returncode != 0: - logger.warning( - "[%s] doc-gen4 had failures for %s (continuing with generated docs)", - package_name, lib_name, - ) async def run_doc_gen4( diff --git a/src/lean_explore/extract/package_registry.py b/src/lean_explore/extract/package_registry.py index a076e80..f9e0dd4 100644 --- a/src/lean_explore/extract/package_registry.py +++ b/src/lean_explore/extract/package_registry.py @@ -17,7 +17,7 @@ "physlean": PackageConfig( name="physlean", git_url="https://github.com/HEPLean/PhysLean", - module_prefixes=["PhysLean"], + module_prefixes=["Physlib", "QuantumInfo"], version_strategy=VersionStrategy.TAGGED, depends_on=["mathlib"], ), diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index f8443bf..e05339c 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -1,8 +1,13 @@ """Tests for doc-gen4 workspace orchestration.""" -from unittest.mock import patch +from unittest.mock import MagicMock, patch -from lean_explore.extract.doc_gen4 import _uses_sqlite_docgen, run_doc_gen4 +from lean_explore.extract.doc_gen4 import ( + _run_lake_for_package, + _uses_sqlite_docgen, + run_doc_gen4, +) +from lean_explore.extract.package_registry import PACKAGE_REGISTRY class TestDocGen4VersionDetection: @@ -60,3 +65,44 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): mock_clear.assert_not_called() mock_run.assert_called_once_with("mathlib", False) + + +class TestLakeBuildTargets: + """Tests for explicit Lake build target selection.""" + + def test_physlean_builds_wrapper_before_docs(self): + """Test that PhysLean builds the wrapper target before doc generation.""" + with patch("lean_explore.extract.doc_gen4._run_lake_update_with_retry"): + with patch("lean_explore.extract.doc_gen4.subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="", + stderr="", + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter([]) + mock_process.wait.return_value = 0 + mock_popen.return_value = mock_process + + _run_lake_for_package("physlean") + + assert mock_popen.call_args_list[0].args[0] == ["lake", "build", "PhysExtract"] + assert mock_popen.call_args_list[1].args[0] == [ + "lake", + "build", + "PhysExtract:docs", + ] + + +class TestPackageRegistry: + """Tests for package registry metadata.""" + + def test_physlean_uses_upstream_module_roots(self): + """Test that PhysLean filters on the upstream module roots.""" + assert PACKAGE_REGISTRY["physlean"].module_prefixes == [ + "Physlib", + "QuantumInfo", + ] From 4ad6b240ced3d7ee727664b5b277f5ede0e3f9e8 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 20:28:51 -0400 Subject: [PATCH 20/22] Allow library build failures in doc-gen4 pipeline Doc-gen4 can fail on individual modules (e.g. timeout computing equational lemmata) without invalidating the rest of the output. The explicit library build step should tolerate these failures like the :docs step already does, rather than aborting the entire pipeline. --- src/lean_explore/extract/doc_gen4.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 99ee3f4..d915695 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -232,7 +232,9 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None: lib_names = _get_library_names(package_name) for lib_name in lib_names: - _run_lake_build_target(workspace_path, package_name, lib_name, env) + _run_lake_build_target( + workspace_path, package_name, lib_name, env, allow_failure=True, + ) for lib_name in lib_names: _run_lake_build_target( From d9ce242105746a38570695e6180de7b3d936e97f Mon Sep 17 00:00:00 2001 From: justincasher Date: Sun, 12 Apr 2026 01:34:09 -0400 Subject: [PATCH 21/22] Skip redundant library build before :docs target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The :docs facet transitively depends on :docInfo, which depends on the library oleans. Running lake build separately before lake build :docs causes Lake to process the doc-gen4 analysis twice — once during the explicit library build, then again as a replay during the :docs step. For mathlib this doubled the wall time (21K jobs instead of ~10K). The PhysLean issue this step was meant to fix is already resolved by the correct imports in PhysExtract.lean; the :docs target alone now builds everything it needs transitively. --- src/lean_explore/extract/doc_gen4.py | 8 +++----- tests/extract/doc_gen4_test.py | 9 ++++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index d915695..e3e62ae 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -230,12 +230,10 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None: if result.returncode != 0: logger.warning("[%s] Cache fetch failed (non-fatal)", package_name) + # The :docs facet transitively depends on :docInfo, which depends on the + # library oleans, so a single lake build per library is sufficient. The + # explicit library build beforehand is redundant and doubles the work. lib_names = _get_library_names(package_name) - for lib_name in lib_names: - _run_lake_build_target( - workspace_path, package_name, lib_name, env, allow_failure=True, - ) - for lib_name in lib_names: _run_lake_build_target( workspace_path, diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index e05339c..b1c8c49 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -68,10 +68,10 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): class TestLakeBuildTargets: - """Tests for explicit Lake build target selection.""" + """Tests for Lake build target selection.""" - def test_physlean_builds_wrapper_before_docs(self): - """Test that PhysLean builds the wrapper target before doc generation.""" + def test_physlean_runs_docs_target(self): + """Test that PhysLean runs the :docs target for its wrapper library.""" with patch("lean_explore.extract.doc_gen4._run_lake_update_with_retry"): with patch("lean_explore.extract.doc_gen4.subprocess.run") as mock_run: mock_run.return_value = MagicMock( @@ -89,8 +89,7 @@ def test_physlean_builds_wrapper_before_docs(self): _run_lake_for_package("physlean") - assert mock_popen.call_args_list[0].args[0] == ["lake", "build", "PhysExtract"] - assert mock_popen.call_args_list[1].args[0] == [ + assert mock_popen.call_args_list[0].args[0] == [ "lake", "build", "PhysExtract:docs", From e3341bd890211c04214bf55e0ab2b001afb6e1d9 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sun, 19 Apr 2026 11:24:29 -0400 Subject: [PATCH 22/22] Build :docInfo instead of :docs for SQLite workspaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The :docs facet populates api-docs.db via :docInfo, then runs doc-gen4 fromDb to generate every HTML page and the search index. We read api-docs.db directly and never consume the HTML, so the fromDb pass is wasted work — on Mathlib that's a second full scan over thousands of modules per rebuild. :docInfo stops after the database is populated, which is all the extractor needs. It was added alongside the SQLite output in doc-gen4 #347 (v4.29.0-rc2), so fall back to :docs on older toolchains where the facet doesn't exist. --- src/lean_explore/extract/doc_gen4.py | 17 ++++++-- tests/extract/doc_gen4_test.py | 63 ++++++++++++++++++---------- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index e3e62ae..759227e 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -230,15 +230,24 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None: if result.returncode != 0: logger.warning("[%s] Cache fetch failed (non-fatal)", package_name) - # The :docs facet transitively depends on :docInfo, which depends on the - # library oleans, so a single lake build per library is sufficient. The - # explicit library build beforehand is redundant and doubles the work. + # For SQLite-format doc-gen4 we only need :docInfo, which populates + # api-docs.db and stops there. The :docs facet additionally runs `fromDb` + # to generate all HTML and the search index, which we don't consume. + # Older BMP-format doc-gen4 has no :docInfo facet, so :docs is the only + # option there. + toolchain_file = workspace_path / "lean-toolchain" + lake_target = "docs" + if toolchain_file.is_file(): + toolchain = toolchain_file.read_text().strip() + if toolchain and _uses_sqlite_docgen(toolchain): + lake_target = "docInfo" + lib_names = _get_library_names(package_name) for lib_name in lib_names: _run_lake_build_target( workspace_path, package_name, - f"{lib_name}:docs", + f"{lib_name}:{lake_target}", env, allow_failure=True, ) diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index b1c8c49..ac4fa77 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -70,30 +70,47 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): class TestLakeBuildTargets: """Tests for Lake build target selection.""" - def test_physlean_runs_docs_target(self): - """Test that PhysLean runs the :docs target for its wrapper library.""" - with patch("lean_explore.extract.doc_gen4._run_lake_update_with_retry"): - with patch("lean_explore.extract.doc_gen4.subprocess.run") as mock_run: - mock_run.return_value = MagicMock( - returncode=0, - stdout="", - stderr="", - ) + def _run_and_capture_target( + self, tmp_path, package_name, library_name, toolchain + ): + workspace_path = tmp_path / "lean" / package_name + workspace_path.mkdir(parents=True) + (workspace_path / "lean-toolchain").write_text(toolchain + "\n") + + with patch("lean_explore.extract.doc_gen4.Path") as mock_path: + mock_path.side_effect = lambda *args: tmp_path.joinpath(*args) + with patch("lean_explore.extract.doc_gen4._run_lake_update_with_retry"): with patch( - "lean_explore.extract.doc_gen4.subprocess.Popen" - ) as mock_popen: - mock_process = MagicMock() - mock_process.stdout = iter([]) - mock_process.wait.return_value = 0 - mock_popen.return_value = mock_process - - _run_lake_for_package("physlean") - - assert mock_popen.call_args_list[0].args[0] == [ - "lake", - "build", - "PhysExtract:docs", - ] + "lean_explore.extract.doc_gen4.subprocess.run" + ) as mock_run: + mock_run.return_value = MagicMock( + returncode=0, stdout="", stderr="" + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter([]) + mock_process.wait.return_value = 0 + mock_popen.return_value = mock_process + + _run_lake_for_package(package_name) + + return mock_popen.call_args_list[0].args[0] + + def test_sqlite_workspace_uses_docinfo_target(self, tmp_path): + """SQLite-format workspaces should build :docInfo, skipping HTML gen.""" + args = self._run_and_capture_target( + tmp_path, "physlean", "PhysExtract", "leanprover/lean4:v4.29.0-rc2" + ) + assert args == ["lake", "build", "PhysExtract:docInfo"] + + def test_legacy_workspace_uses_docs_target(self, tmp_path): + """Pre-SQLite workspaces have no :docInfo facet, must build :docs.""" + args = self._run_and_capture_target( + tmp_path, "physlean", "PhysExtract", "leanprover/lean4:v4.29.0-rc1" + ) + assert args == ["lake", "build", "PhysExtract:docs"] class TestPackageRegistry: