From 444c0be160bad1813278d9b42c443bf185b8c704 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 21 Mar 2026 21:06:16 -0400 Subject: [PATCH 01/22] Support doc-gen4 SQLite database format for extraction Doc-gen4 >= v4.29.0-rc1 outputs declaration data to a SQLite database (api-docs.db) instead of individual BMP JSON files. The extraction pipeline now auto-detects which format each package uses and parses accordingly. For packages using the new SQLite format, the workspace cache clear is skipped during fresh builds since api-docs.db handles incremental updates correctly. The full cache clear is still performed for packages using the legacy BMP format to avoid stale output files. --- src/lean_explore/extract/doc_gen4.py | 13 +- src/lean_explore/extract/doc_parser.py | 255 +++++++++++++++++++++++-- 2 files changed, 246 insertions(+), 22 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 621e55a..838c7e2 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -228,7 +228,18 @@ async def run_doc_gen4( logger.info("\n%s\nPackage: %s\n%s", "=" * 50, package_name, "=" * 50) if fresh: - _clear_workspace_cache(workspace_path) + # Packages using doc-gen4 >= v4.29 output to a SQLite database + # (api-docs.db) which handles incremental updates correctly. + # Only packages using the legacy BMP format need a full cache + # clear to avoid stale output files. + api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" + if api_docs_db.exists(): + logger.info( + f"[{package_name}] Skipping cache clear " + f"(api-docs.db handles incremental updates)" + ) + else: + _clear_workspace_cache(workspace_path) if setup: toolchain, ref = _setup_workspace(config) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 707e62a..bbd7447 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -1,12 +1,17 @@ """Parser for Lean doc-gen4 output files. -This module parses doc-gen4 JSON data and extracts Lean source code -to produce Declaration objects ready for database insertion. +This module parses doc-gen4 output and extracts Lean source code to produce +Declaration objects ready for database insertion. + +Supports two doc-gen4 output formats: +- SQLite database (api-docs.db): Used by doc-gen4 >= v4.29.0-rc1 +- BMP JSON files (.bmp): Used by doc-gen4 < v4.29.0-rc1 """ import json import logging import re +import sqlite3 from pathlib import Path from rich.progress import ( @@ -292,6 +297,176 @@ def _extract_source_text( ) +def _construct_source_link( + module_source_url: str | None, + module_name: str, + start_line: int, + end_line: int, +) -> str | None: + """Construct a GitHub source link from module URL and line range. + + Args: + module_source_url: GitHub URL to the module file from api-docs.db. + module_name: Fully qualified module name (e.g., "Mathlib.Algebra.Group"). + start_line: Start line number in the source file. + end_line: End line number in the source file. + + Returns: + GitHub URL with line range fragment, or None if no source URL exists. + """ + if not module_source_url: + return None + return f"{module_source_url}#L{start_line}-L{end_line}" + + +def _parse_declarations_from_sqlite( + database_path: Path, + lean_root: Path, + package_cache: dict[str, Path], + allowed_module_prefixes: list[str], +) -> list[Declaration]: + """Parse declarations from a doc-gen4 SQLite database (api-docs.db). + + Doc-gen4 >= v4.29.0-rc1 outputs declaration data to a SQLite database + instead of individual BMP JSON files. This function reads that database + and produces the same Declaration objects as the BMP parser. + + Args: + database_path: Path to the api-docs.db SQLite database. + lean_root: Root directory of the Lean project. + package_cache: Dictionary mapping package names to their directories. + allowed_module_prefixes: Module prefixes to extract (e.g., ["Mathlib"]). + + Returns: + List of parsed Declaration objects. + """ + declarations = [] + + connection = sqlite3.connect(str(database_path)) + connection.row_factory = sqlite3.Row + + try: + # Query all declarations with their source ranges and docstrings + query = """ + SELECT + n.module_name, + n.position, + n.kind, + n.name, + r.start_line, + r.end_line, + d.text AS docstring, + m.source_url + FROM name_info n + JOIN declaration_ranges r + ON n.module_name = r.module_name AND n.position = r.position + LEFT JOIN declaration_markdown_docstrings d + ON n.module_name = d.module_name AND n.position = d.position + JOIN modules m + ON n.module_name = m.name + ORDER BY n.module_name, n.position + """ + rows = connection.execute(query).fetchall() + + logger.info( + f"Found {len(rows)} declarations in api-docs.db" + ) + + skipped_no_source = 0 + skipped_prefix = 0 + skipped_constructor = 0 + source_errors = 0 + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeRemainingColumn(), + ) as progress: + task = progress.add_task( + "[cyan]Parsing api-docs.db...", total=len(rows) + ) + + for row in rows: + module_name = row["module_name"] + declaration_name = row["name"] + + # Filter by module prefix + matches_prefix = any( + module_name == prefix or module_name.startswith(prefix + ".") + for prefix in allowed_module_prefixes + ) + if not matches_prefix: + skipped_prefix += 1 + progress.update(task, advance=1) + continue + + # Skip auto-generated .mk constructors + if declaration_name.endswith(".mk"): + skipped_constructor += 1 + progress.update(task, advance=1) + continue + + # Skip internal/auto-generated names + kind = row["kind"] + if kind == "constructor": + progress.update(task, advance=1) + continue + + source_url = row["source_url"] + start_line = row["start_line"] + end_line = row["end_line"] + + source_link = _construct_source_link( + source_url, module_name, start_line, end_line + ) + if not source_link: + skipped_no_source += 1 + progress.update(task, advance=1) + continue + + # Extract source text from local files + try: + source_text = _extract_source_text( + source_link, lean_root, package_cache + ) + except (FileNotFoundError, ValueError) as error: + source_errors += 1 + if source_errors <= 10: + logger.debug( + f"Could not extract source for " + f"{declaration_name}: {error}" + ) + progress.update(task, advance=1) + continue + + declarations.append( + Declaration( + name=declaration_name, + module=module_name, + docstring=row["docstring"], + source_text=source_text, + source_link=source_link, + dependencies=None, + ) + ) + + progress.update(task, advance=1) + + if skipped_no_source > 0: + logger.info("Skipped %d declarations without source URL", skipped_no_source) + if source_errors > 0: + logger.warning( + f"Could not extract source text for {source_errors} declarations" + ) + + finally: + connection.close() + + return declarations + + def _parse_declarations_from_files( bmp_files: list[Path], lean_root: Path, @@ -428,12 +603,36 @@ async def _insert_declarations_batch( return inserted_count +def _detect_docgen_format(workspace_path: Path) -> str: + """Detect which doc-gen4 output format a workspace uses. + + Doc-gen4 >= v4.29.0-rc1 writes to a SQLite database (api-docs.db). + Earlier versions write individual BMP JSON files to doc-data/. + + Args: + workspace_path: Path to the package workspace (e.g., lean/mathlib). + + Returns: + "sqlite" if api-docs.db exists, "bmp" if BMP files exist, "none" otherwise. + """ + api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" + if api_docs_db.exists(): + return "sqlite" + + doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" + if doc_data_dir.exists(): + bmp_files = list(doc_data_dir.glob("**/*.bmp")) + if bmp_files: + return "bmp" + + return "none" + + async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> None: """Extract all declarations from doc-gen4 data and load into database. - Looks for BMP files in each package's .lake/build/doc-data directory. - Extracts only declarations matching the package's configured module_prefixes, - ensuring each package's declarations come from its own workspace. + Automatically detects whether each package uses the newer SQLite format + (api-docs.db from doc-gen4 >= v4.29.0-rc1) or the legacy BMP JSON format. Args: engine: SQLAlchemy async engine for database connection. @@ -445,21 +644,13 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N lean_root = Path("lean") all_declarations = [] - # Process each workspace separately with its own package cache for package_name in get_extraction_order(): package_config = PACKAGE_REGISTRY[package_name] - doc_data_dir = lean_root / package_name / ".lake" / "build" / "doc-data" + workspace_path = lean_root / package_name + docgen_format = _detect_docgen_format(workspace_path) - if not doc_data_dir.exists(): - logger.warning( - "No doc-data directory for %s: %s", package_name, doc_data_dir - ) - continue - - bmp_files = sorted(doc_data_dir.glob("**/*.bmp")) - logger.info("Found %d BMP files in %s", len(bmp_files), package_name) - - if not bmp_files: + if docgen_format == "none": + logger.warning("No doc-gen4 output found for %s", package_name) continue # Build workspace-specific package cache to avoid version mismatches @@ -469,9 +660,29 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N package_name, len(package_cache), ) - declarations = _parse_declarations_from_files( - bmp_files, lean_root, package_cache, package_config.module_prefixes - ) + if docgen_format == "sqlite": + api_docs_path = workspace_path / ".lake" / "build" / "api-docs.db" + logger.info( + "[%s] Using SQLite format (api-docs.db)", package_name + ) + declarations = _parse_declarations_from_sqlite( + api_docs_path, + lean_root, + package_cache, + package_config.module_prefixes, + ) + else: + doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" + bmp_files = sorted(doc_data_dir.glob("**/*.bmp")) + logger.info( + "[%s] Using BMP format (%d files)", + package_name, len(bmp_files), + ) + declarations = _parse_declarations_from_files( + bmp_files, lean_root, package_cache, + package_config.module_prefixes, + ) + logger.info( "Extracted %d declarations from %s (prefixes: %s)", len(declarations), package_name, package_config.module_prefixes, @@ -479,7 +690,9 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N all_declarations.extend(declarations) if not all_declarations: - raise FileNotFoundError("No declarations extracted from any package workspace") + raise FileNotFoundError( + "No declarations extracted from any package workspace" + ) logger.info("Total declarations extracted: %d", len(all_declarations)) From c45dab9249e71234f659e4d326cc6b08173085e6 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:12:38 -0400 Subject: [PATCH 02/22] Remove dead constructor kind check in SQLite parser The check `kind == "constructor"` never matched because doc-gen4 stores the kind as "ctor" in the name_info table. Removing it rather than fixing the string keeps behavior consistent with the BMP parser, which also only filters by the .mk suffix. --- src/lean_explore/extract/doc_parser.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index bbd7447..fe68905 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -408,12 +408,6 @@ def _parse_declarations_from_sqlite( progress.update(task, advance=1) continue - # Skip internal/auto-generated names - kind = row["kind"] - if kind == "constructor": - progress.update(task, advance=1) - continue - source_url = row["source_url"] start_line = row["start_line"] end_line = row["end_line"] From a042661d13b106775e405de8f2c0a5b37b5c780e Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:13:04 -0400 Subject: [PATCH 03/22] Use %-style logging consistently in new SQLite code Convert remaining f-string logger calls to %-style formatting to match the convention used throughout the rest of the PR. --- src/lean_explore/extract/doc_gen4.py | 5 +++-- src/lean_explore/extract/doc_parser.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 838c7e2..7008f9a 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -235,8 +235,9 @@ async def run_doc_gen4( api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" if api_docs_db.exists(): logger.info( - f"[{package_name}] Skipping cache clear " - f"(api-docs.db handles incremental updates)" + "[%s] Skipping cache clear " + "(api-docs.db handles incremental updates)", + package_name, ) else: _clear_workspace_cache(workspace_path) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index fe68905..8e02baf 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -368,9 +368,7 @@ def _parse_declarations_from_sqlite( """ rows = connection.execute(query).fetchall() - logger.info( - f"Found {len(rows)} declarations in api-docs.db" - ) + logger.info("Found %d declarations in api-docs.db", len(rows)) skipped_no_source = 0 skipped_prefix = 0 @@ -429,8 +427,8 @@ def _parse_declarations_from_sqlite( source_errors += 1 if source_errors <= 10: logger.debug( - f"Could not extract source for " - f"{declaration_name}: {error}" + "Could not extract source for %s: %s", + declaration_name, error, ) progress.update(task, advance=1) continue @@ -452,7 +450,8 @@ def _parse_declarations_from_sqlite( logger.info("Skipped %d declarations without source URL", skipped_no_source) if source_errors > 0: logger.warning( - f"Could not extract source text for {source_errors} declarations" + "Could not extract source text for %d declarations", + source_errors, ) finally: From aaae8bb9080243b8bdb27fcd1b86113c0938698e Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:13:30 -0400 Subject: [PATCH 04/22] Log skipped_prefix and skipped_constructor counters These counters were tracked but never printed, making it harder to debug extraction issues. Now logged alongside the existing skipped_no_source and source_errors messages. --- src/lean_explore/extract/doc_parser.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 8e02baf..5b6d146 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -446,6 +446,13 @@ def _parse_declarations_from_sqlite( progress.update(task, advance=1) + if skipped_prefix > 0: + logger.info( + "Skipped %d declarations outside allowed prefixes", + skipped_prefix, + ) + if skipped_constructor > 0: + logger.info("Skipped %d .mk constructors", skipped_constructor) if skipped_no_source > 0: logger.info("Skipped %d declarations without source URL", skipped_no_source) if source_errors > 0: From af03b636e522e5b23a6c733a79c68926f6f4017d Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:13:49 -0400 Subject: [PATCH 05/22] Remove unused module_name parameter from _construct_source_link The parameter was documented but never used in the function body. --- src/lean_explore/extract/doc_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 5b6d146..668bb1a 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -299,7 +299,6 @@ def _extract_source_text( def _construct_source_link( module_source_url: str | None, - module_name: str, start_line: int, end_line: int, ) -> str | None: @@ -307,7 +306,6 @@ def _construct_source_link( Args: module_source_url: GitHub URL to the module file from api-docs.db. - module_name: Fully qualified module name (e.g., "Mathlib.Algebra.Group"). start_line: Start line number in the source file. end_line: End line number in the source file. @@ -411,7 +409,7 @@ def _parse_declarations_from_sqlite( end_line = row["end_line"] source_link = _construct_source_link( - source_url, module_name, start_line, end_line + source_url, start_line, end_line ) if not source_link: skipped_no_source += 1 From e1d2e1676c01c1873b1e4e9f36400f5a9ad8771e Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:47:45 -0400 Subject: [PATCH 06/22] Fix doc data script lint --- scripts/generate_docs_data.py | 20 +- src/lean_explore/extract/doc_parser.py | 39 +++- tests/extract/doc_parser_test.py | 249 +++++++++++++++++++++++++ 3 files changed, 293 insertions(+), 15 deletions(-) diff --git a/scripts/generate_docs_data.py b/scripts/generate_docs_data.py index ed04a03..8b31194 100644 --- a/scripts/generate_docs_data.py +++ b/scripts/generate_docs_data.py @@ -193,7 +193,7 @@ class ModuleDict(TypedDict): def resolve_annotation(annotation: str | Expr | None) -> str: """Converts a griffe annotation to its string representation.""" - if isinstance(annotation, (Expr, str)): + if isinstance(annotation, Expr | str): return str(annotation) return "" @@ -391,16 +391,14 @@ def parse_docstring(docstring_object: object | None) -> DocstringSections: parse_admonition_section(section, sections_data) elif isinstance( section, - ( - DocstringSectionDeprecated, - DocstringSectionWarns, - DocstringSectionYields, - DocstringSectionReceives, - DocstringSectionOtherParameters, - DocstringSectionClasses, - DocstringSectionFunctions, - DocstringSectionModules, - ), + DocstringSectionDeprecated + | DocstringSectionWarns + | DocstringSectionYields + | DocstringSectionReceives + | DocstringSectionOtherParameters + | DocstringSectionClasses + | DocstringSectionFunctions + | DocstringSectionModules, ): sections_data[kind] = parse_generic_section(section) else: diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 668bb1a..7c95072 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -275,6 +275,16 @@ def _extract_source_text( package_name.replace("-", "").lower(), ]: if variant in package_cache: + if variant == "lean4" and file_path_string.startswith("src/lean/"): + adjusted_path = file_path_string[9:] + candidates.append(package_cache[variant] / adjusted_path) + continue + if variant == "lean4" and file_path_string.startswith("src/lake/"): + adjusted_path = file_path_string[9:] + candidates.append( + package_cache[variant].parent / "lake" / adjusted_path + ) + continue if variant == "lean4" and file_path_string.startswith("src/"): adjusted_path = file_path_string[4:] else: @@ -298,6 +308,7 @@ def _extract_source_text( def _construct_source_link( + module_name: str, module_source_url: str | None, start_line: int, end_line: int, @@ -305,6 +316,7 @@ def _construct_source_link( """Construct a GitHub source link from module URL and line range. Args: + module_name: Lean module name from api-docs.db. module_source_url: GitHub URL to the module file from api-docs.db. start_line: Start line number in the source file. end_line: End line number in the source file. @@ -312,9 +324,23 @@ def _construct_source_link( Returns: GitHub URL with line range fragment, or None if no source URL exists. """ - if not module_source_url: - return None - return f"{module_source_url}#L{start_line}-L{end_line}" + if module_source_url: + return f"{module_source_url}#L{start_line}-L{end_line}" + + module_path = module_name.replace(".", "/") + root = module_name.split(".", 1)[0] + if root in {"Init", "Lean", "Std"}: + return ( + "https://github.com/leanprover/lean4/blob/toolchain/" + f"src/lean/{module_path}.lean#L{start_line}-L{end_line}" + ) + if root == "Lake": + return ( + "https://github.com/leanprover/lean4/blob/toolchain/" + f"src/lake/{module_path}.lean#L{start_line}-L{end_line}" + ) + + return None def _parse_declarations_from_sqlite( @@ -351,6 +377,7 @@ def _parse_declarations_from_sqlite( n.position, n.kind, n.name, + n.render, r.start_line, r.end_line, d.text AS docstring, @@ -409,13 +436,17 @@ def _parse_declarations_from_sqlite( end_line = row["end_line"] source_link = _construct_source_link( - source_url, start_line, end_line + module_name, source_url, start_line, end_line ) if not source_link: skipped_no_source += 1 progress.update(task, advance=1) continue + if not row["render"]: + progress.update(task, advance=1) + continue + # Extract source text from local files try: source_text = _extract_source_text( diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index 68a0842..0afdc67 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -5,6 +5,7 @@ """ import json +import sqlite3 from unittest.mock import AsyncMock, patch import pytest @@ -12,11 +13,13 @@ from lean_explore.extract.doc_parser import ( _build_package_cache, + _construct_source_link, _extract_dependencies_from_html, _extract_source_text, _filter_auto_generated_projections, _insert_declarations_batch, _parse_declarations_from_files, + _parse_declarations_from_sqlite, _read_source_lines, _strip_lean_comments, extract_declarations, @@ -155,6 +158,51 @@ def test_extract_source_text_file_not_found(self, temp_directory): with pytest.raises(FileNotFoundError): _extract_source_text(source_link, lean_root, package_cache) + def test_extract_source_text_from_lake_toolchain(self, temp_directory): + """Test extracting source text from the Lake toolchain path.""" + lean_root = temp_directory / "lean" + toolchain_root = temp_directory / "toolchain" / "src" + lean_source_dir = toolchain_root / "lean" + lake_source_dir = toolchain_root / "lake" + lean_source_dir.mkdir(parents=True) + lake_file = lake_source_dir / "Lake" / "Config" / "Monad.lean" + lake_file.parent.mkdir(parents=True) + lake_file.write_text("def Lake.Config.Monad.run := 1\n") + + package_cache = {"lean4": lean_source_dir} + source_link = ( + "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lake/Lake/Config/Monad.lean#L1-L1" + ) + + result = _extract_source_text(source_link, lean_root, package_cache) + + assert "Lake.Config.Monad.run" in result + + +class TestSqliteHelpers: + """Tests for SQLite-specific parsing helpers.""" + + def test_construct_source_link_for_core_module(self): + """Test source links for core modules without a stored source URL.""" + result = _construct_source_link("Init.Data.Nat.Basic", None, 12, 15) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lean/Init/Data/Nat/Basic.lean#L12-L15" + ) + + def test_construct_source_link_for_lake_module(self): + """Test source links for Lake modules without a stored source URL.""" + result = _construct_source_link("Lake.Config.Monad", None, 7, 9) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lake/Lake/Config/Monad.lean#L7-L9" + ) + class TestDependencyExtraction: """Tests for dependency extraction from HTML.""" @@ -273,6 +321,207 @@ def test_parse_declarations_filters_packages(self, temp_directory): assert len(declarations) == 0 + def test_parse_declarations_from_sqlite_filters_non_rendered( + self, temp_directory + ): + """Test SQLite parsing only keeps rendered declarations.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text( + "theorem Nat.visible : True := trivial\n" + "def Nat.hidden : Nat := 0\n" + ) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4/blob/master/" + "Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.executemany( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + "Mathlib.Data.Nat.Basic", + 1, + "theorem", + "Nat.visible", + b"", + 0, + 1, + ), + ( + "Mathlib.Data.Nat.Basic", + 2, + "definition", + "Nat.hidden", + b"", + 0, + 0, + ), + ], + ) + connection.executemany( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 32, 32), + ("Mathlib.Data.Nat.Basic", 2, 2, 0, 0, 2, 22, 22), + ], + ) + connection.execute( + """ + INSERT INTO declaration_markdown_docstrings + (module_name, position, text) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, "Visible theorem"), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert [declaration.name for declaration in declarations] == ["Nat.visible"] + assert declarations[0].docstring == "Visible theorem" + + def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( + self, temp_directory + ): + """Test SQLite parsing for core modules without a stored source URL.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + toolchain_root = temp_directory / "toolchain" / "src" + lean_source_dir = toolchain_root / "lean" + init_file = lean_source_dir / "Init" / "Data" / "Nat" / "Basic.lean" + init_file.parent.mkdir(parents=True) + init_file.write_text("theorem Nat.core : True := trivial\n") + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ("Init.Data.Nat.Basic", None), + ) + connection.execute( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ("Init.Data.Nat.Basic", 1, "theorem", "Nat.core", b"", 0, 1), + ) + connection.execute( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ("Init.Data.Nat.Basic", 1, 1, 0, 0, 1, 31, 31), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + {"lean4": lean_source_dir}, + allowed_module_prefixes=["Init"], + ) + + assert len(declarations) == 1 + assert declarations[0].name == "Nat.core" + assert ( + declarations[0].source_link + == "https://github.com/leanprover/lean4/blob/toolchain/" + "src/lean/Init/Data/Nat/Basic.lean#L1-L1" + ) + class TestDeclarationInsertion: """Tests for database insertion.""" From 37be25486d28d6f665c94207acbe9468b7392d2b Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:49:35 -0400 Subject: [PATCH 07/22] Fix doc-gen4 version handling and tests --- src/lean_explore/extract/doc_gen4.py | 45 ++++++++++++++--- tests/extract/__main___test.py | 73 ++++++++++++++++++++-------- tests/extract/doc_gen4_test.py | 62 +++++++++++++++++++++++ 3 files changed, 152 insertions(+), 28 deletions(-) create mode 100644 tests/extract/doc_gen4_test.py diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 7008f9a..763476c 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -6,6 +6,7 @@ import logging import os +import re import shutil import subprocess import time @@ -23,6 +24,29 @@ logger = logging.getLogger(__name__) +def _uses_sqlite_docgen(lean_version: str) -> bool: + """Return whether the matching doc-gen4 release writes api-docs.db. + + Doc-gen4's SQLite output landed after `v4.29.0-rc1` and is present starting + with `v4.29.0-rc2`. + """ + match = extract_lean_version(lean_version) + version_match = re.match(r"^v(\d+)\.(\d+)\.(\d+)(?:-rc(\d+))?$", match) + if version_match is None: + return False + + major, minor, patch, rc = version_match.groups() + version = (int(major), int(minor), int(patch)) + if version > (4, 29, 0): + return True + if version < (4, 29, 0): + return False + + if rc is None: + return True + return int(rc) >= 2 + + def _clear_workspace_cache(workspace_path: Path) -> None: """Clear entire Lake cache to force complete rebuild. @@ -227,13 +251,17 @@ async def run_doc_gen4( workspace_path = Path("lean") / package_name logger.info("\n%s\nPackage: %s\n%s", "=" * 50, package_name, "=" * 50) + toolchain = None + ref = None if fresh: - # Packages using doc-gen4 >= v4.29 output to a SQLite database - # (api-docs.db) which handles incremental updates correctly. - # Only packages using the legacy BMP format need a full cache - # clear to avoid stale output files. - api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" - if api_docs_db.exists(): + if setup: + toolchain, ref = _setup_workspace(config) + logger.info("Toolchain: %s, ref: %s", toolchain, ref) + + # Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2. + # The SQLite format handles incremental updates, while legacy BMP + # output requires a cache clear to avoid stale files. + if toolchain and _uses_sqlite_docgen(toolchain): logger.info( "[%s] Skipping cache clear " "(api-docs.db handles incremental updates)", @@ -243,8 +271,9 @@ async def run_doc_gen4( _clear_workspace_cache(workspace_path) if setup: - toolchain, ref = _setup_workspace(config) - logger.info("Toolchain: %s, ref: %s", toolchain, ref) + if toolchain is None or ref is None: + toolchain, ref = _setup_workspace(config) + logger.info("Toolchain: %s, ref: %s", toolchain, ref) _run_lake_for_package(package_name, verbose) diff --git a/tests/extract/__main___test.py b/tests/extract/__main___test.py index 84607c6..aa9ee4d 100644 --- a/tests/extract/__main___test.py +++ b/tests/extract/__main___test.py @@ -50,32 +50,65 @@ class TestDocGen4Step: @pytest.mark.external async def test_run_doc_gen4_step_success(self, temp_directory): """Test successful doc-gen4 execution.""" - lean_directory = temp_directory / "lean" - lean_directory.mkdir() - - # Mock subprocess to avoid actually running lake - with patch("lean_explore.extract.doc_gen4.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.stdout = iter(["Building...\n", "Complete!\n"]) - mock_process.wait.return_value = 0 - mock_popen.return_value = mock_process + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4.subprocess.run" + ) as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="", + stderr="", + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter(["Building...\n", "Complete!\n"]) + mock_process.wait.return_value = 0 + mock_popen.return_value = mock_process - await _run_doc_gen4_step() + await _run_doc_gen4_step() - # Called twice: "lake build" and "lake build LeanExtract:docs" - assert mock_popen.call_count == 2 + assert mock_run.call_count == 2 + # Called twice: "lake build" and "lake build MathExtract:docs" + assert mock_popen.call_count == 2 @pytest.mark.external async def test_run_doc_gen4_step_failure(self): """Test doc-gen4 execution failure.""" - with patch("lean_explore.extract.doc_gen4.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.stdout = iter(["Error!\n"]) - mock_process.wait.return_value = 1 - mock_popen.return_value = mock_process - - with pytest.raises(RuntimeError, match="failed"): - await _run_doc_gen4_step() + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4.subprocess.run" + ) as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="", + stderr="", + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter(["Error!\n"]) + mock_process.wait.return_value = 1 + mock_popen.return_value = mock_process + + with pytest.raises(RuntimeError, match="failed"): + await _run_doc_gen4_step() class TestExtractionStep: diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py new file mode 100644 index 0000000..f8443bf --- /dev/null +++ b/tests/extract/doc_gen4_test.py @@ -0,0 +1,62 @@ +"""Tests for doc-gen4 workspace orchestration.""" + +from unittest.mock import patch + +from lean_explore.extract.doc_gen4 import _uses_sqlite_docgen, run_doc_gen4 + + +class TestDocGen4VersionDetection: + """Tests for doc-gen4 format detection by Lean version.""" + + def test_uses_sqlite_docgen_from_rc2(self): + """Test the SQLite cutoff at v4.29.0-rc2.""" + assert _uses_sqlite_docgen("leanprover/lean4:v4.29.0-rc2") + assert _uses_sqlite_docgen("leanprover/lean4:v4.29.0") + assert not _uses_sqlite_docgen("leanprover/lean4:v4.29.0-rc1") + assert not _uses_sqlite_docgen("leanprover/lean4:v4.28.0") + + +class TestRunDocGen4FreshHandling: + """Tests for fresh-cache handling in run_doc_gen4.""" + + async def test_fresh_legacy_workspace_clears_cache(self): + """Test that legacy doc-gen4 workspaces still clear the cache.""" + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc1", "v4.29.0-rc1"), + ): + with patch( + "lean_explore.extract.doc_gen4._clear_workspace_cache" + ) as mock_clear: + with patch( + "lean_explore.extract.doc_gen4._run_lake_for_package" + ) as mock_run: + await run_doc_gen4(fresh=True) + + mock_clear.assert_called_once() + mock_run.assert_called_once_with("mathlib", False) + + async def test_fresh_sqlite_workspace_skips_cache_clear(self): + """Test that SQLite doc-gen4 workspaces keep the cache.""" + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"), + ): + with patch( + "lean_explore.extract.doc_gen4._clear_workspace_cache" + ) as mock_clear: + with patch( + "lean_explore.extract.doc_gen4._run_lake_for_package" + ) as mock_run: + await run_doc_gen4(fresh=True) + + mock_clear.assert_not_called() + mock_run.assert_called_once_with("mathlib", False) From 3148f473adc71556eccb348e86df90bf1b97966e Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 4 Apr 2026 15:49:41 -0400 Subject: [PATCH 08/22] Correct doc-gen4 SQLite cutoff references --- src/lean_explore/extract/doc_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 7c95072..25fc154 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -4,8 +4,8 @@ Declaration objects ready for database insertion. Supports two doc-gen4 output formats: -- SQLite database (api-docs.db): Used by doc-gen4 >= v4.29.0-rc1 -- BMP JSON files (.bmp): Used by doc-gen4 < v4.29.0-rc1 +- SQLite database (api-docs.db): Used by doc-gen4 >= v4.29.0-rc2 +- BMP JSON files (.bmp): Used by doc-gen4 < v4.29.0-rc2 """ import json @@ -351,7 +351,7 @@ def _parse_declarations_from_sqlite( ) -> list[Declaration]: """Parse declarations from a doc-gen4 SQLite database (api-docs.db). - Doc-gen4 >= v4.29.0-rc1 outputs declaration data to a SQLite database + Doc-gen4 >= v4.29.0-rc2 outputs declaration data to a SQLite database instead of individual BMP JSON files. This function reads that database and produces the same Declaration objects as the BMP parser. @@ -635,7 +635,7 @@ async def _insert_declarations_batch( def _detect_docgen_format(workspace_path: Path) -> str: """Detect which doc-gen4 output format a workspace uses. - Doc-gen4 >= v4.29.0-rc1 writes to a SQLite database (api-docs.db). + Doc-gen4 >= v4.29.0-rc2 writes to a SQLite database (api-docs.db). Earlier versions write individual BMP JSON files to doc-data/. Args: @@ -661,7 +661,7 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N """Extract all declarations from doc-gen4 data and load into database. Automatically detects whether each package uses the newer SQLite format - (api-docs.db from doc-gen4 >= v4.29.0-rc1) or the legacy BMP JSON format. + (api-docs.db from doc-gen4 >= v4.29.0-rc2) or the legacy BMP JSON format. Args: engine: SQLAlchemy async engine for database connection. From 2780662cd2a04a096bff7d7da407ebad2357e4e8 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:05:57 -0400 Subject: [PATCH 09/22] Fix dependency extraction for SQLite-backed packages The SQLite extraction path was hard-coding dependencies=None for every declaration, silently breaking dependency-based search ranking and informalization layering for mathlib/cslib. Add a RenderedCode BLOB parser that deserializes the leansqlite binary format used in name_info.type to extract const references (declaration dependencies) from type signatures. The parser handles the full TaggedText/RenderedCode.Tag/Name encoding including variable-length Nat and nested structures. Also adds the n.type column to the SQLite query so the BLOB is available for parsing. --- src/lean_explore/extract/doc_parser.py | 159 +++++++++++++++- tests/extract/doc_parser_test.py | 254 ++++++++++++++++++++++++- 2 files changed, 409 insertions(+), 4 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 25fc154..bbab127 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -31,6 +31,148 @@ logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# RenderedCode BLOB parser +# +# Doc-gen4 stores declaration type signatures in the `name_info.type` column +# as a binary BLOB using leansqlite's ToBinary serialization format. +# +# The type is RenderedCode = TaggedText RenderedCode.Tag where: +# TaggedText: text(0) String | tag(1) Tag TaggedText | append(2) Array +# Tag: keyword(0) | string(1) | const(2) Name | sort-none(3) +# | sort-type(4) | sort-prop(5) | sort-sort(6) | otherExpr(7) +# Name: anonymous(0) | str(1) Name String | num(2) Name Nat +# +# Encoding primitives (big-endian, leansqlite Classes.lean): +# Nat – variable-length 7-bit chunks, high bit = continuation +# String – Nat(utf8_byte_length) + raw UTF-8 bytes +# Array – Nat(count) + elements +# --------------------------------------------------------------------------- + + +class _BlobReader: + """Minimal reader for leansqlite ToBinary format.""" + + __slots__ = ("_data", "_cursor") + + def __init__(self, data: bytes) -> None: + self._data = data + self._cursor = 0 + + def _read_byte(self) -> int: + if self._cursor >= len(self._data): + raise ValueError("Unexpected end of BLOB data") + value = self._data[self._cursor] + self._cursor += 1 + return value + + def _read_nat(self) -> int: + """Read a variable-length natural number (7-bit chunks, MSB = more).""" + result = 0 + shift = 0 + while True: + byte = self._read_byte() + if byte >= 128: + result |= (byte & 0x7F) << shift + else: + result |= byte << shift + break + shift += 7 + return result + + def _read_string(self) -> str: + byte_length = self._read_nat() + if self._cursor + byte_length > len(self._data): + raise ValueError("String extends past end of BLOB data") + raw = self._data[self._cursor : self._cursor + byte_length] + self._cursor += byte_length + return raw.decode("utf-8") + + def _read_name(self) -> str: + """Read a Lean Name and return its dot-separated string form.""" + tag = self._read_byte() + if tag == 0: # anonymous + return "" + if tag == 1: # str parent s + parent = self._read_name() + component = self._read_string() + return f"{parent}.{component}" if parent else component + if tag == 2: # num parent n + parent = self._read_name() + number = self._read_nat() + return f"{parent}.{number}" if parent else str(number) + raise ValueError(f"Invalid Name tag: {tag}") + + def _skip_name(self) -> None: + """Skip over a Name without allocating strings.""" + tag = self._read_byte() + if tag == 0: + return + if tag == 1: + self._skip_name() + byte_length = self._read_nat() + self._cursor += byte_length + return + if tag == 2: + self._skip_name() + self._read_nat() + return + raise ValueError(f"Invalid Name tag: {tag}") + + +def _extract_names_from_rendered_code(blob: bytes) -> list[str]: + """Extract referenced declaration names from a RenderedCode BLOB. + + Walks the TaggedText tree and collects Lean Names from every + RenderedCode.Tag.const node (tag byte 2). + + Args: + blob: Raw bytes of the RenderedCode BLOB from name_info.type. + + Returns: + De-duplicated list of fully-qualified Lean names referenced in the type. + """ + reader = _BlobReader(blob) + names: list[str] = [] + seen: set[str] = set() + + def walk_tagged_text() -> None: + tag = reader._read_byte() + if tag == 0: # text + reader._read_string() + elif tag == 1: # tag + walk_tag() + walk_tagged_text() + elif tag == 2: # append + count = reader._read_nat() + for _ in range(count): + walk_tagged_text() + else: + raise ValueError(f"Invalid TaggedText tag: {tag}") + + def walk_tag() -> None: + tag = reader._read_byte() + if tag <= 1 or (3 <= tag <= 7): + # keyword(0), string(1), sort-none(3), sort-type(4), + # sort-prop(5), sort-sort(6), otherExpr(7) — no payload + return + if tag == 2: # const + name = reader._read_name() + if name and name not in seen: + names.append(name) + seen.add(name) + return + raise ValueError(f"Invalid RenderedCode.Tag tag: {tag}") + + try: + walk_tagged_text() + except (ValueError, IndexError): + logger.debug("Failed to parse RenderedCode BLOB (%d bytes)", len(blob)) + return [] + + return names + + def _strip_lean_comments(source_text: str) -> str: """Strip Lean comments from source text for comparison. @@ -377,6 +519,7 @@ def _parse_declarations_from_sqlite( n.position, n.kind, n.name, + n.type, n.render, r.start_line, r.end_line, @@ -462,6 +605,20 @@ def _parse_declarations_from_sqlite( progress.update(task, advance=1) continue + # Extract dependency names from the type signature BLOB + type_blob = row["type"] + if type_blob: + dep_names = _extract_names_from_rendered_code( + bytes(type_blob) + ) + # Filter out self-references + dep_names = [ + d for d in dep_names if d != declaration_name + ] + dependencies = dep_names or None + else: + dependencies = None + declarations.append( Declaration( name=declaration_name, @@ -469,7 +626,7 @@ def _parse_declarations_from_sqlite( docstring=row["docstring"], source_text=source_text, source_link=source_link, - dependencies=None, + dependencies=dependencies, ) ) diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index 0afdc67..0d92815 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -12,9 +12,11 @@ from sqlalchemy import select from lean_explore.extract.doc_parser import ( + _BlobReader, _build_package_cache, _construct_source_link, _extract_dependencies_from_html, + _extract_names_from_rendered_code, _extract_source_text, _filter_auto_generated_projections, _insert_declarations_batch, @@ -180,6 +182,158 @@ def test_extract_source_text_from_lake_toolchain(self, temp_directory): assert "Lake.Config.Monad.run" in result +def _encode_nat(n: int) -> bytes: + """Encode a natural number in leansqlite's variable-length format.""" + chunks = [] + while n >= 128: + chunks.append((n & 0x7F) | 0x80) + n >>= 7 + chunks.append(n) + return bytes(chunks) + + +def _encode_string(s: str) -> bytes: + """Encode a string: Nat(utf8_byte_length) + UTF-8 bytes.""" + encoded = s.encode("utf-8") + return _encode_nat(len(encoded)) + encoded + + +def _encode_name(name: str) -> bytes: + """Encode a dotted Lean name (e.g. 'Nat.add') into binary.""" + if not name: + return b"\x00" # anonymous + parts = name.split(".") + result = b"\x00" # start with anonymous + for part in parts: + # Name.str = tag 1 + parent + string + result = b"\x01" + result + _encode_string(part) + return result + + +def _make_const_tag(name: str) -> bytes: + """Build a RenderedCode.Tag.const(name) blob fragment.""" + return b"\x02" + _encode_name(name) + + +def _make_text_node(s: str) -> bytes: + """Build a TaggedText.text(s) blob fragment.""" + return b"\x00" + _encode_string(s) + + +def _make_tag_node(tag_bytes: bytes, inner: bytes) -> bytes: + """Build a TaggedText.tag(tag, inner) blob fragment.""" + return b"\x01" + tag_bytes + inner + + +def _make_append_node(children: list[bytes]) -> bytes: + """Build a TaggedText.append(children) blob fragment.""" + return b"\x02" + _encode_nat(len(children)) + b"".join(children) + + +class TestRenderedCodeBlobParser: + """Tests for RenderedCode BLOB parsing to extract dependency names.""" + + def test_extract_single_const(self): + """Test extracting a single const name from a type BLOB.""" + # TaggedText.tag(Tag.const("Nat"), TaggedText.text("Nat")) + blob = _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_extract_multiple_consts(self): + """Test extracting multiple const names from a type BLOB.""" + # append([tag(const Nat, text Nat), text " → ", tag(const Bool, text Bool)]) + blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Bool"), _make_text_node("Bool")), + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat", "Bool"] + + def test_extract_dotted_name(self): + """Test extracting a dotted name like Nat.add.""" + blob = _make_tag_node( + _make_const_tag("Nat.add"), _make_text_node("Nat.add") + ) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat.add"] + + def test_deduplicates_names(self): + """Test that duplicate const references are deduplicated.""" + blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_skips_non_const_tags(self): + """Test that keyword, string, sort, otherExpr tags are skipped.""" + blob = _make_append_node([ + _make_tag_node(b"\x00", _make_text_node("def")), # keyword + _make_text_node(" "), + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_tag_node(b"\x07", _make_text_node("x")), # otherExpr + ]) + names = _extract_names_from_rendered_code(blob) + assert names == ["Nat"] + + def test_empty_blob_returns_empty(self): + """Test that an empty or invalid BLOB returns empty list.""" + assert _extract_names_from_rendered_code(b"") == [] + assert _extract_names_from_rendered_code(b"\xff") == [] + + def test_text_only_returns_empty(self): + """Test that a BLOB with only text returns no names.""" + blob = _make_text_node("hello world") + names = _extract_names_from_rendered_code(blob) + assert names == [] + + def test_nested_tagged_text(self): + """Test parsing nested tag nodes.""" + # tag(otherExpr, tag(const("List"), text("List"))) + inner = _make_tag_node(_make_const_tag("List"), _make_text_node("List")) + blob = _make_tag_node(b"\x07", inner) # otherExpr wrapping + names = _extract_names_from_rendered_code(blob) + assert names == ["List"] + + def test_blob_reader_nat_encoding(self): + """Test that Nat encoding/decoding matches leansqlite format.""" + reader = _BlobReader(_encode_nat(0)) + assert reader._read_nat() == 0 + + reader = _BlobReader(_encode_nat(127)) + assert reader._read_nat() == 127 + + reader = _BlobReader(_encode_nat(128)) + assert reader._read_nat() == 128 + + reader = _BlobReader(_encode_nat(300)) + assert reader._read_nat() == 300 + + reader = _BlobReader(_encode_nat(100000)) + assert reader._read_nat() == 100000 + + def test_blob_reader_name_roundtrip(self): + """Test that Name encoding produces correct dot-separated output.""" + reader = _BlobReader(_encode_name("Mathlib.Data.Nat.Basic")) + assert reader._read_name() == "Mathlib.Data.Nat.Basic" + + reader = _BlobReader(_encode_name("")) + assert reader._read_name() == "" + + def test_sort_tags_handled(self): + """Test that sort tag variants (3-6) are handled without error.""" + for sort_byte in [3, 4, 5, 6]: + blob = _make_tag_node( + bytes([sort_byte]), _make_text_node("Type") + ) + names = _extract_names_from_rendered_code(blob) + assert names == [] + + class TestSqliteHelpers: """Tests for SQLite-specific parsing helpers.""" @@ -389,7 +543,7 @@ def test_parse_declarations_from_sqlite_filters_non_rendered( 1, "theorem", "Nat.visible", - b"", + _make_text_node("True"), 0, 1, ), @@ -398,7 +552,7 @@ def test_parse_declarations_from_sqlite_filters_non_rendered( 2, "definition", "Nat.hidden", - b"", + _make_text_node("Nat"), 0, 0, ), @@ -437,6 +591,97 @@ def test_parse_declarations_from_sqlite_filters_non_rendered( assert [declaration.name for declaration in declarations] == ["Nat.visible"] assert declarations[0].docstring == "Visible theorem" + def test_parse_declarations_from_sqlite_extracts_dependencies( + self, temp_directory + ): + """Test that SQLite parsing extracts dependencies from type BLOBs.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text("def Nat.myFunc (n : Nat) : Bool := true\n") + + # Build a type BLOB: "Nat → Bool" + type_blob = _make_append_node([ + _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")), + _make_text_node(" → "), + _make_tag_node(_make_const_tag("Bool"), _make_text_node("Bool")), + ]) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4/blob/master/" + "Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.execute( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, "definition", "Nat.myFunc", type_blob, 0, 1), + ) + connection.execute( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, start_utf16, + end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 40, 40), + ) + connection.commit() + connection.close() + + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert len(declarations) == 1 + assert declarations[0].name == "Nat.myFunc" + assert declarations[0].dependencies == ["Nat", "Bool"] + def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( self, temp_directory ): @@ -493,7 +738,10 @@ def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( (module_name, position, kind, name, type, sorried, render) VALUES (?, ?, ?, ?, ?, ?, ?) """, - ("Init.Data.Nat.Basic", 1, "theorem", "Nat.core", b"", 0, 1), + ( + "Init.Data.Nat.Basic", 1, "theorem", "Nat.core", + _make_text_node("True"), 0, 1, + ), ) connection.execute( """ From 1cf309ff290cb501b2c89c978bf9b1cdbf2ecec0 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:08:16 -0400 Subject: [PATCH 10/22] Fix dead source links for core modules in SQLite path The fallback source links for Init/Lean/Std/Lake modules used a 'toolchain' placeholder as the git ref, producing non-functional GitHub URLs. Doc-gen4 itself uses the actual Lean compiler hash for the same purpose. Read the lean-toolchain file from each workspace to get the real version tag (e.g., v4.29.0-rc6) and use it as the git ref. Falls back to 'master' when the toolchain file is unavailable. --- src/lean_explore/extract/doc_parser.py | 35 +++++++++++-- tests/extract/doc_parser_test.py | 70 ++++++++++++++++++++++---- 2 files changed, 93 insertions(+), 12 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index bbab127..98535dc 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -449,11 +449,32 @@ def _extract_source_text( ) +def _read_lean_toolchain_version(workspace_path: Path) -> str | None: + """Read the Lean version from a workspace's lean-toolchain file. + + Args: + workspace_path: Path to the package workspace (e.g., lean/mathlib). + + Returns: + Version string like 'v4.29.0-rc6', or None if not found. + """ + toolchain_file = workspace_path / "lean-toolchain" + if not toolchain_file.exists(): + return None + try: + content = toolchain_file.read_text().strip() + match = re.search(r"v\d+\.\d+\.\d+(?:-rc\d+)?", content) + return match.group() if match else None + except OSError: + return None + + def _construct_source_link( module_name: str, module_source_url: str | None, start_line: int, end_line: int, + lean_version: str | None = None, ) -> str | None: """Construct a GitHub source link from module URL and line range. @@ -462,6 +483,8 @@ def _construct_source_link( module_source_url: GitHub URL to the module file from api-docs.db. start_line: Start line number in the source file. end_line: End line number in the source file. + lean_version: Lean toolchain version (e.g., 'v4.29.0-rc6') used as + the git ref for core module fallback URLs. Returns: GitHub URL with line range fragment, or None if no source URL exists. @@ -469,16 +492,17 @@ def _construct_source_link( if module_source_url: return f"{module_source_url}#L{start_line}-L{end_line}" + git_ref = lean_version or "master" module_path = module_name.replace(".", "/") root = module_name.split(".", 1)[0] if root in {"Init", "Lean", "Std"}: return ( - "https://github.com/leanprover/lean4/blob/toolchain/" + f"https://github.com/leanprover/lean4/blob/{git_ref}/" f"src/lean/{module_path}.lean#L{start_line}-L{end_line}" ) if root == "Lake": return ( - "https://github.com/leanprover/lean4/blob/toolchain/" + f"https://github.com/leanprover/lean4/blob/{git_ref}/" f"src/lake/{module_path}.lean#L{start_line}-L{end_line}" ) @@ -490,6 +514,7 @@ def _parse_declarations_from_sqlite( lean_root: Path, package_cache: dict[str, Path], allowed_module_prefixes: list[str], + lean_version: str | None = None, ) -> list[Declaration]: """Parse declarations from a doc-gen4 SQLite database (api-docs.db). @@ -502,6 +527,7 @@ def _parse_declarations_from_sqlite( lean_root: Root directory of the Lean project. package_cache: Dictionary mapping package names to their directories. allowed_module_prefixes: Module prefixes to extract (e.g., ["Mathlib"]). + lean_version: Lean toolchain version for core module source links. Returns: List of parsed Declaration objects. @@ -579,7 +605,8 @@ def _parse_declarations_from_sqlite( end_line = row["end_line"] source_link = _construct_source_link( - module_name, source_url, start_line, end_line + module_name, source_url, start_line, end_line, + lean_version=lean_version, ) if not source_link: skipped_no_source += 1 @@ -848,6 +875,7 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N if docgen_format == "sqlite": api_docs_path = workspace_path / ".lake" / "build" / "api-docs.db" + lean_version = _read_lean_toolchain_version(workspace_path) logger.info( "[%s] Using SQLite format (api-docs.db)", package_name ) @@ -856,6 +884,7 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N lean_root, package_cache, package_config.module_prefixes, + lean_version=lean_version, ) else: doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index 0d92815..c3f25d6 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -22,6 +22,7 @@ _insert_declarations_batch, _parse_declarations_from_files, _parse_declarations_from_sqlite, + _read_lean_toolchain_version, _read_source_lines, _strip_lean_comments, extract_declarations, @@ -337,26 +338,76 @@ def test_sort_tags_handled(self): class TestSqliteHelpers: """Tests for SQLite-specific parsing helpers.""" - def test_construct_source_link_for_core_module(self): - """Test source links for core modules without a stored source URL.""" - result = _construct_source_link("Init.Data.Nat.Basic", None, 12, 15) + def test_construct_source_link_for_core_module_with_version(self): + """Test source links for core modules use the toolchain version.""" + result = _construct_source_link( + "Init.Data.Nat.Basic", None, 12, 15, + lean_version="v4.29.0-rc6", + ) assert ( result - == "https://github.com/leanprover/lean4/blob/toolchain/" + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" "src/lean/Init/Data/Nat/Basic.lean#L12-L15" ) - def test_construct_source_link_for_lake_module(self): - """Test source links for Lake modules without a stored source URL.""" - result = _construct_source_link("Lake.Config.Monad", None, 7, 9) + def test_construct_source_link_for_lake_module_with_version(self): + """Test source links for Lake modules use the toolchain version.""" + result = _construct_source_link( + "Lake.Config.Monad", None, 7, 9, + lean_version="v4.29.0-rc6", + ) assert ( result - == "https://github.com/leanprover/lean4/blob/toolchain/" + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" "src/lake/Lake/Config/Monad.lean#L7-L9" ) + def test_construct_source_link_falls_back_to_master(self): + """Test that missing lean_version falls back to 'master'.""" + result = _construct_source_link( + "Init.Data.Nat.Basic", None, 12, 15, + ) + + assert ( + result + == "https://github.com/leanprover/lean4/blob/master/" + "src/lean/Init/Data/Nat/Basic.lean#L12-L15" + ) + + def test_construct_source_link_prefers_source_url(self): + """Test that a non-None source_url is used directly.""" + url = "https://github.com/leanprover-community/mathlib4/blob/abc123/Foo.lean" + result = _construct_source_link("Foo.Bar", url, 1, 10) + + assert result == f"{url}#L1-L10" + + def test_read_lean_toolchain_version(self, temp_directory): + """Test reading version from a lean-toolchain file.""" + workspace = temp_directory / "workspace" + workspace.mkdir() + toolchain = workspace / "lean-toolchain" + toolchain.write_text("leanprover/lean4:v4.29.0-rc6\n") + + assert _read_lean_toolchain_version(workspace) == "v4.29.0-rc6" + + def test_read_lean_toolchain_version_release(self, temp_directory): + """Test reading a release version (no -rc suffix).""" + workspace = temp_directory / "workspace" + workspace.mkdir() + toolchain = workspace / "lean-toolchain" + toolchain.write_text("leanprover/lean4:v4.29.0\n") + + assert _read_lean_toolchain_version(workspace) == "v4.29.0" + + def test_read_lean_toolchain_version_missing(self, temp_directory): + """Test returns None when lean-toolchain doesn't exist.""" + workspace = temp_directory / "workspace" + workspace.mkdir() + + assert _read_lean_toolchain_version(workspace) is None + class TestDependencyExtraction: """Tests for dependency extraction from HTML.""" @@ -760,13 +811,14 @@ def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( lean_root, {"lean4": lean_source_dir}, allowed_module_prefixes=["Init"], + lean_version="v4.29.0-rc6", ) assert len(declarations) == 1 assert declarations[0].name == "Nat.core" assert ( declarations[0].source_link - == "https://github.com/leanprover/lean4/blob/toolchain/" + == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/" "src/lean/Init/Data/Nat/Basic.lean#L1-L1" ) From 077cd8e06fe61413bf180ca7b3ee434724977f56 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:10:04 -0400 Subject: [PATCH 11/22] Detect Verso-only docstrings in SQLite extraction The SQLite query only joined declaration_markdown_docstrings, silently dropping docstrings stored in Verso format (declaration_verso_docstrings). Doc-gen4 stores each docstring as either markdown text or a Verso binary BLOB, never both. Add a LEFT JOIN on declaration_verso_docstrings to detect Verso-only cases. Since Verso BLOBs use a complex nested serialization format (Doc.Block/Doc.Inline trees with opaque Dynamic values), full deserialization is deferred. For now, Verso-only docstrings are counted and logged as a warning so the gap is visible rather than silent. --- src/lean_explore/extract/doc_parser.py | 22 +++- tests/extract/doc_parser_test.py | 156 +++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 2 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 98535dc..64685d0 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -538,7 +538,10 @@ def _parse_declarations_from_sqlite( connection.row_factory = sqlite3.Row try: - # Query all declarations with their source ranges and docstrings + # Query declarations with source ranges and both docstring types. + # Doc-gen4 stores docstrings as either markdown text or Verso binary + # BLOBs (never both). We prefer markdown; Verso BLOBs require a + # complex deserializer so we detect but cannot extract them yet. query = """ SELECT n.module_name, @@ -550,12 +553,15 @@ def _parse_declarations_from_sqlite( r.start_line, r.end_line, d.text AS docstring, + v.content AS verso_docstring, m.source_url FROM name_info n JOIN declaration_ranges r ON n.module_name = r.module_name AND n.position = r.position LEFT JOIN declaration_markdown_docstrings d ON n.module_name = d.module_name AND n.position = d.position + LEFT JOIN declaration_verso_docstrings v + ON n.module_name = v.module_name AND n.position = v.position JOIN modules m ON n.module_name = m.name ORDER BY n.module_name, n.position @@ -568,6 +574,7 @@ def _parse_declarations_from_sqlite( skipped_prefix = 0 skipped_constructor = 0 source_errors = 0 + verso_only_docstrings = 0 with Progress( SpinnerColumn(), @@ -646,11 +653,16 @@ def _parse_declarations_from_sqlite( else: dependencies = None + # Use markdown docstring; detect Verso-only cases + docstring = row["docstring"] + if not docstring and row["verso_docstring"]: + verso_only_docstrings += 1 + declarations.append( Declaration( name=declaration_name, module=module_name, - docstring=row["docstring"], + docstring=docstring, source_text=source_text, source_link=source_link, dependencies=dependencies, @@ -668,6 +680,12 @@ def _parse_declarations_from_sqlite( logger.info("Skipped %d .mk constructors", skipped_constructor) if skipped_no_source > 0: logger.info("Skipped %d declarations without source URL", skipped_no_source) + if verso_only_docstrings > 0: + logger.warning( + "%d declarations have Verso-only docstrings " + "(not yet supported, stored as docstring=None)", + verso_only_docstrings, + ) if source_errors > 0: logger.warning( "Could not extract source text for %d declarations", diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index c3f25d6..bd53a70 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -572,6 +572,12 @@ def test_parse_declarations_from_sqlite_filters_non_rendered( text TEXT NOT NULL, PRIMARY KEY (module_name, position) ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); """ ) connection.execute( @@ -692,6 +698,12 @@ def test_parse_declarations_from_sqlite_extracts_dependencies( text TEXT NOT NULL, PRIMARY KEY (module_name, position) ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); """ ) connection.execute( @@ -733,6 +745,144 @@ def test_parse_declarations_from_sqlite_extracts_dependencies( assert declarations[0].name == "Nat.myFunc" assert declarations[0].dependencies == ["Nat", "Bool"] + def test_parse_declarations_from_sqlite_detects_verso_docstrings( + self, temp_directory, caplog + ): + """Test that Verso-only docstrings are detected and logged.""" + lean_root = temp_directory / "lean" + database_path = temp_directory / "api-docs.db" + + source_dir = ( + lean_root / "mathlib" / ".lake" / "packages" / "mathlib4" + ) + source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean" + source_file.parent.mkdir(parents=True) + source_file.write_text( + "def Nat.withVerso : Nat := 0\n" + "def Nat.withMarkdown : Nat := 1\n" + ) + + connection = sqlite3.connect(database_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + kind TEXT, + name TEXT NOT NULL, + type BLOB NOT NULL, + sorried INTEGER NOT NULL, + render INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + start_line INTEGER NOT NULL, + start_column INTEGER NOT NULL, + start_utf16 INTEGER NOT NULL, + end_line INTEGER NOT NULL, + end_column INTEGER NOT NULL, + end_utf16 INTEGER NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_markdown_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + text TEXT NOT NULL, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.execute( + "INSERT INTO modules (name, source_url) VALUES (?, ?)", + ( + "Mathlib.Data.Nat.Basic", + "https://github.com/leanprover-community/mathlib4" + "/blob/master/Mathlib/Data/Nat/Basic.lean", + ), + ) + connection.executemany( + """ + INSERT INTO name_info + (module_name, position, kind, name, type, sorried, render) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + "Mathlib.Data.Nat.Basic", 1, "definition", + "Nat.withVerso", _make_text_node("Nat"), 0, 1, + ), + ( + "Mathlib.Data.Nat.Basic", 2, "definition", + "Nat.withMarkdown", _make_text_node("Nat"), 0, 1, + ), + ], + ) + connection.executemany( + """ + INSERT INTO declaration_ranges + (module_name, position, start_line, start_column, + start_utf16, end_line, end_column, end_utf16) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 27, 27), + ("Mathlib.Data.Nat.Basic", 2, 2, 0, 0, 2, 30, 30), + ], + ) + # Declaration 1: Verso-only docstring (binary BLOB) + connection.execute( + """ + INSERT INTO declaration_verso_docstrings + (module_name, position, content) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 1, b"\x00\x01\x02"), + ) + # Declaration 2: Markdown docstring + connection.execute( + """ + INSERT INTO declaration_markdown_docstrings + (module_name, position, text) + VALUES (?, ?, ?) + """, + ("Mathlib.Data.Nat.Basic", 2, "A markdown docstring"), + ) + connection.commit() + connection.close() + + import logging + + with caplog.at_level(logging.WARNING): + declarations = _parse_declarations_from_sqlite( + database_path, + lean_root, + _build_package_cache(lean_root), + allowed_module_prefixes=["Mathlib"], + ) + + assert len(declarations) == 2 + # Verso-only: docstring is None + verso_decl = next( + d for d in declarations if d.name == "Nat.withVerso" + ) + assert verso_decl.docstring is None + # Markdown: docstring is present + md_decl = next( + d for d in declarations if d.name == "Nat.withMarkdown" + ) + assert md_decl.docstring == "A markdown docstring" + # Warning logged about Verso-only docstrings + assert "1 declarations have Verso-only docstrings" in caplog.text + def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( self, temp_directory ): @@ -777,6 +927,12 @@ def test_parse_declarations_from_sqlite_uses_core_fallback_source_link( text TEXT NOT NULL, PRIMARY KEY (module_name, position) ); + CREATE TABLE declaration_verso_docstrings ( + module_name TEXT NOT NULL, + position INTEGER NOT NULL, + content BLOB NOT NULL, + PRIMARY KEY (module_name, position) + ); """ ) connection.execute( From d812b8248aa25aefdb6732ac27d1f88f1f82d482 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:32:19 -0400 Subject: [PATCH 12/22] Validate api-docs.db before trusting SQLite format detection _detect_docgen_format previously checked only file existence, which would misclassify a zero-byte, corrupt, or incomplete api-docs.db as usable. Now _validate_docgen_sqlite verifies the file is non-empty, opens as valid SQLite, and contains the required tables (name_info, declaration_ranges, modules). On failure the detector falls back to BMP if available, or returns "none". --- src/lean_explore/extract/doc_parser.py | 56 ++++++++++- tests/extract/doc_parser_test.py | 124 +++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 2 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 64685d0..3ee9a27 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -834,21 +834,73 @@ async def _insert_declarations_batch( return inserted_count +_REQUIRED_DOCGEN_TABLES = {"name_info", "declaration_ranges", "modules"} + + +def _validate_docgen_sqlite(database_path: Path) -> bool: + """Check that a doc-gen4 api-docs.db is a valid, usable SQLite database. + + Verifies the file is non-empty, opens as SQLite, and contains the tables + that the extraction pipeline requires. + + Args: + database_path: Path to the api-docs.db file. + + Returns: + True if the database is valid and contains the required tables. + """ + if database_path.stat().st_size == 0: + logger.warning("api-docs.db exists but is empty: %s", database_path) + return False + + try: + connection = sqlite3.connect(str(database_path)) + try: + cursor = connection.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + ) + tables = {row[0] for row in cursor.fetchall()} + finally: + connection.close() + except sqlite3.DatabaseError as error: + logger.warning("api-docs.db is not a valid SQLite file: %s", error) + return False + + missing = _REQUIRED_DOCGEN_TABLES - tables + if missing: + logger.warning( + "api-docs.db is missing required tables %s: %s", + missing, database_path, + ) + return False + + return True + + def _detect_docgen_format(workspace_path: Path) -> str: """Detect which doc-gen4 output format a workspace uses. Doc-gen4 >= v4.29.0-rc2 writes to a SQLite database (api-docs.db). Earlier versions write individual BMP JSON files to doc-data/. + The SQLite file is validated before returning "sqlite" to guard against + zero-byte, corrupt, or incompatible databases left by crashed builds. + Args: workspace_path: Path to the package workspace (e.g., lean/mathlib). Returns: - "sqlite" if api-docs.db exists, "bmp" if BMP files exist, "none" otherwise. + "sqlite" if a valid api-docs.db exists, "bmp" if BMP files exist, + "none" otherwise. """ api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db" if api_docs_db.exists(): - return "sqlite" + if _validate_docgen_sqlite(api_docs_db): + return "sqlite" + logger.warning( + "Invalid api-docs.db at %s, checking for BMP fallback", + api_docs_db, + ) doc_data_dir = workspace_path / ".lake" / "build" / "doc-data" if doc_data_dir.exists(): diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py index bd53a70..08ab60b 100644 --- a/tests/extract/doc_parser_test.py +++ b/tests/extract/doc_parser_test.py @@ -15,6 +15,7 @@ _BlobReader, _build_package_cache, _construct_source_link, + _detect_docgen_format, _extract_dependencies_from_html, _extract_names_from_rendered_code, _extract_source_text, @@ -25,6 +26,7 @@ _read_lean_toolchain_version, _read_source_lines, _strip_lean_comments, + _validate_docgen_sqlite, extract_declarations, ) from lean_explore.extract.types import Declaration @@ -1346,3 +1348,125 @@ def test_single_declaration(self): assert len(filtered) == 1 assert removed_count == 0 + + +class TestDocgenFormatDetection: + """Tests for doc-gen4 output format detection and SQLite validation.""" + + def test_detect_sqlite_with_valid_db(self, temp_directory): + """Test detection returns 'sqlite' for a valid api-docs.db.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + + connection = sqlite3.connect(db_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT); + CREATE TABLE name_info ( + module_name TEXT, position INTEGER, kind TEXT, name TEXT, + type BLOB, sorried INTEGER, render INTEGER, + PRIMARY KEY (module_name, position) + ); + CREATE TABLE declaration_ranges ( + module_name TEXT, position INTEGER, + start_line INTEGER, start_column INTEGER, start_utf16 INTEGER, + end_line INTEGER, end_column INTEGER, end_utf16 INTEGER, + PRIMARY KEY (module_name, position) + ); + """ + ) + connection.close() + + assert _detect_docgen_format(workspace) == "sqlite" + + def test_detect_bmp_format(self, temp_directory): + """Test detection returns 'bmp' when only BMP files exist.""" + workspace = temp_directory / "pkg" + doc_data = workspace / ".lake" / "build" / "doc-data" + doc_data.mkdir(parents=True) + (doc_data / "declaration-data-Foo.bmp").write_text("{}") + + assert _detect_docgen_format(workspace) == "bmp" + + def test_detect_none_when_empty(self, temp_directory): + """Test detection returns 'none' when no doc output exists.""" + workspace = temp_directory / "pkg" + workspace.mkdir(parents=True) + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_to_bmp_for_empty_db(self, temp_directory): + """Test that an empty api-docs.db falls back to BMP if available.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"") + + doc_data = workspace / ".lake" / "build" / "doc-data" + doc_data.mkdir(parents=True) + (doc_data / "declaration-data-Foo.bmp").write_text("{}") + + assert _detect_docgen_format(workspace) == "bmp" + + def test_detect_returns_none_for_empty_db_without_bmp(self, temp_directory): + """Test that an empty api-docs.db with no BMP fallback returns 'none'.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"") + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_for_corrupt_db(self, temp_directory): + """Test that a corrupt (non-SQLite) file falls back gracefully.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + db_path.write_bytes(b"this is not a sqlite database") + + assert _detect_docgen_format(workspace) == "none" + + def test_detect_falls_back_for_db_missing_tables(self, temp_directory): + """Test that a SQLite DB missing required tables falls back.""" + workspace = temp_directory / "pkg" + db_path = workspace / ".lake" / "build" / "api-docs.db" + db_path.parent.mkdir(parents=True) + + connection = sqlite3.connect(db_path) + connection.execute( + "CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT)" + ) + connection.close() + + # Has modules but missing name_info and declaration_ranges + assert _detect_docgen_format(workspace) == "none" + + def test_validate_docgen_sqlite_valid(self, temp_directory): + """Test validation passes for a well-formed database.""" + db_path = temp_directory / "api-docs.db" + connection = sqlite3.connect(db_path) + connection.executescript( + """ + CREATE TABLE modules (name TEXT PRIMARY KEY); + CREATE TABLE name_info (module_name TEXT, position INTEGER); + CREATE TABLE declaration_ranges (module_name TEXT, position INTEGER); + """ + ) + connection.close() + + assert _validate_docgen_sqlite(db_path) is True + + def test_validate_docgen_sqlite_empty_file(self, temp_directory): + """Test validation rejects an empty file.""" + db_path = temp_directory / "api-docs.db" + db_path.write_bytes(b"") + + assert _validate_docgen_sqlite(db_path) is False + + def test_validate_docgen_sqlite_corrupt_file(self, temp_directory): + """Test validation rejects a non-SQLite file.""" + db_path = temp_directory / "api-docs.db" + db_path.write_bytes(b"not a database") + + assert _validate_docgen_sqlite(db_path) is False From 03ff5d061db95b489da5e7f888561c090820cca5 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 11:32:57 -0400 Subject: [PATCH 13/22] Filter non-rendered declarations in SQL instead of Python Move the render=1 check from a Python-side continue into a WHERE clause on the SQLite query. This avoids fetching and iterating rows that will be immediately discarded, reducing work for large packages like mathlib. --- src/lean_explore/extract/doc_parser.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index 3ee9a27..a0ac869 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -549,7 +549,6 @@ def _parse_declarations_from_sqlite( n.kind, n.name, n.type, - n.render, r.start_line, r.end_line, d.text AS docstring, @@ -564,6 +563,7 @@ def _parse_declarations_from_sqlite( ON n.module_name = v.module_name AND n.position = v.position JOIN modules m ON n.module_name = m.name + WHERE n.render = 1 ORDER BY n.module_name, n.position """ rows = connection.execute(query).fetchall() @@ -620,10 +620,6 @@ def _parse_declarations_from_sqlite( progress.update(task, advance=1) continue - if not row["render"]: - progress.update(task, advance=1) - continue - # Extract source text from local files try: source_text = _extract_source_text( From d44a5c0519713b6eb9b76c3e79fc804c26ea192c Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 13:02:05 -0400 Subject: [PATCH 14/22] Handle missing source files gracefully in BMP parser The BMP parser crashed with FileNotFoundError when a source file referenced in a declaration's sourceLink was not present locally (e.g. after a package rename). The SQLite parser already handled this with a try/except. Apply the same pattern to the BMP parser: skip the declaration, count the error, and log a warning at the end. --- src/lean_explore/extract/doc_parser.py | 33 +++++++++++++++++++------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py index a0ac869..2ca2aa4 100644 --- a/src/lean_explore/extract/doc_parser.py +++ b/src/lean_explore/extract/doc_parser.py @@ -712,6 +712,7 @@ def _parse_declarations_from_files( List of parsed Declaration objects. """ declarations = [] + source_errors = 0 with Progress( SpinnerColumn(), @@ -740,23 +741,33 @@ def _parse_declarations_from_files( for declaration_data in data.get("declarations", []): information = declaration_data["info"] - source_text = _extract_source_text( - information["sourceLink"], lean_root, package_cache - ) + declaration_name = information["name"] + + # Skip auto-generated .mk constructors + if declaration_name.endswith(".mk"): + continue + + try: + source_text = _extract_source_text( + information["sourceLink"], lean_root, package_cache + ) + except (FileNotFoundError, ValueError) as error: + source_errors += 1 + if source_errors <= 10: + logger.debug( + "Could not extract source for %s: %s", + declaration_name, error, + ) + continue header_html = declaration_data.get("header", "") dependencies = _extract_dependencies_from_html(header_html) # Filter out self-references from dependencies - declaration_name = information["name"] filtered_dependencies = [ d for d in dependencies if d != declaration_name ] - # Skip auto-generated .mk constructors - if declaration_name.endswith(".mk"): - continue - declarations.append( Declaration( name=declaration_name, @@ -770,6 +781,12 @@ def _parse_declarations_from_files( progress.update(task, advance=1) + if source_errors > 0: + logger.warning( + "Could not extract source text for %d declarations", + source_errors, + ) + return declarations From 4a7586ef68b7e7021a4e13c303434972d57dcc47 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 13:02:22 -0400 Subject: [PATCH 15/22] Pin doc-gen4 to main branch instead of version tags Pinning doc-gen4 to specific release tags (e.g. @ "v4.30.0-rc1") caused transitive dependency conflicts with packages like plausible, breaking lake exe cache get. Using @ "main" lets Lake resolve a compatible version automatically since doc-gen4 main tracks the latest Lean toolchain. Also reorder require statements so doc-gen4 comes before the main package dependency, ensuring the main package's dependency versions take precedence during resolution. --- lean/cslib/lakefile.lean | 6 +++--- lean/flt/lakefile.lean | 6 +++--- lean/formal-conjectures/lakefile.lean | 6 +++--- lean/mathlib/lakefile.lean | 6 +++--- lean/physlean/lakefile.lean | 6 +++--- src/lean_explore/extract/doc_gen4.py | 5 ++--- src/lean_explore/extract/package_utils.py | 16 ++++++++++------ 7 files changed, 27 insertions(+), 24 deletions(-) diff --git a/lean/cslib/lakefile.lean b/lean/cslib/lakefile.lean index e48cda9..3f04e15 100644 --- a/lean/cslib/lakefile.lean +++ b/lean/cslib/lakefile.lean @@ -7,8 +7,8 @@ package «cslib-extractor» where lean_lib «CslibExtract» where roots := #[`CslibExtract] +require «doc-gen4» from git + "https://github.com/leanprover/doc-gen4" @ "main" + require Cslib from git "https://github.com/leanprover/cslib" @ "main" - -require «doc-gen4» from git - "https://github.com/leanprover/doc-gen4" @ "v4.28.0-rc1" diff --git a/lean/flt/lakefile.lean b/lean/flt/lakefile.lean index 412308c..f21999c 100644 --- a/lean/flt/lakefile.lean +++ b/lean/flt/lakefile.lean @@ -7,8 +7,8 @@ package «flt-extractor» where lean_lib «FLTExtract» where roots := #[`FLTExtract] +require «doc-gen4» from git + "https://github.com/leanprover/doc-gen4" @ "main" + require FLT from git "https://github.com/ImperialCollegeLondon/FLT" @ "main" - -require «doc-gen4» from git - "https://github.com/leanprover/doc-gen4" @ "v4.27.0" diff --git a/lean/formal-conjectures/lakefile.lean b/lean/formal-conjectures/lakefile.lean index 032c248..d136505 100644 --- a/lean/formal-conjectures/lakefile.lean +++ b/lean/formal-conjectures/lakefile.lean @@ -4,8 +4,8 @@ open Lake DSL package «formal-conjectures-extractor» where -- Workspace for extracting formal-conjectures documentation +require «doc-gen4» from git + "https://github.com/leanprover/doc-gen4" @ "main" + require «formal_conjectures» from git "https://github.com/google-deepmind/formal-conjectures" @ "main" - -require «doc-gen4» from git - "https://github.com/leanprover/doc-gen4" @ "v4.22.0" diff --git a/lean/mathlib/lakefile.lean b/lean/mathlib/lakefile.lean index f6d3ba3..f2b6f63 100644 --- a/lean/mathlib/lakefile.lean +++ b/lean/mathlib/lakefile.lean @@ -7,8 +7,8 @@ package «mathlib-extractor» where lean_lib «MathExtract» where roots := #[`MathExtract] +require «doc-gen4» from git + "https://github.com/leanprover/doc-gen4" @ "main" + require mathlib from git "https://github.com/leanprover-community/mathlib4.git" - -require «doc-gen4» from git - "https://github.com/leanprover/doc-gen4" @ "v4.28.0-rc1" diff --git a/lean/physlean/lakefile.lean b/lean/physlean/lakefile.lean index b6bbade..14492e1 100644 --- a/lean/physlean/lakefile.lean +++ b/lean/physlean/lakefile.lean @@ -7,8 +7,8 @@ package «physlean-extractor» where lean_lib «PhysExtract» where roots := #[`PhysExtract] +require «doc-gen4» from git + "https://github.com/leanprover/doc-gen4" @ "main" + require PhysLean from git "https://github.com/HEPLean/PhysLean" - -require «doc-gen4» from git - "https://github.com/leanprover/doc-gen4" @ "v4.26.0" diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 763476c..04523b9 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -18,7 +18,7 @@ from lean_explore.extract.package_utils import ( get_extraction_order, get_package_toolchain, - update_lakefile_docgen_version, + strip_lakefile_docgen_version, ) logger = logging.getLogger(__name__) @@ -97,9 +97,8 @@ def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]: toolchain_file = workspace_path / "lean-toolchain" lean_toolchain, git_ref = get_package_toolchain(package_config) - lean_version = extract_lean_version(lean_toolchain) - update_lakefile_docgen_version(lakefile_path, lean_version) + strip_lakefile_docgen_version(lakefile_path) toolchain_file.write_text(lean_toolchain + "\n") return lean_toolchain, git_ref diff --git a/src/lean_explore/extract/package_utils.py b/src/lean_explore/extract/package_utils.py index f5912c0..97e40af 100644 --- a/src/lean_explore/extract/package_utils.py +++ b/src/lean_explore/extract/package_utils.py @@ -81,12 +81,16 @@ def get_package_toolchain(package_configuration: PackageConfig) -> tuple[str, st return toolchain, latest_tag -def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> None: - """Update the doc-gen4 version in a lakefile to match the Lean version. +def strip_lakefile_docgen_version(lakefile_path: Path) -> None: + """Pin doc-gen4 to its main branch instead of a specific version tag. + + Doc-gen4's main branch tracks the latest Lean toolchain. Pinning to a + specific release tag causes transitive dependency conflicts (e.g. plausible + version mismatches) that break ``lake exe cache get``. Using ``@ "main"`` + lets Lake resolve a compatible version automatically. Args: lakefile_path: Path to lakefile.lean - lean_version: Lean version like 'v4.27.0' """ content = lakefile_path.read_text() @@ -95,13 +99,13 @@ def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> No r'"https://github\.com/leanprover/doc-gen4"(?:\s+@\s+"[^"]*")?' ) replacement = ( - f"require «doc-gen4» from git\n" - f' "https://github.com/leanprover/doc-gen4" @ "{lean_version}"' + 'require «doc-gen4» from git\n' + ' "https://github.com/leanprover/doc-gen4" @ "main"' ) new_content = re.sub(pattern, replacement, content) if new_content != content: lakefile_path.write_text(new_content) logger.info( - "Updated doc-gen4 version to %s in %s", lean_version, lakefile_path + "Pinned doc-gen4 to main branch in %s", lakefile_path ) From 16b747417c720f4f1564b9ee79f1e633ccca08f6 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 13:02:31 -0400 Subject: [PATCH 16/22] Update workspace toolchains and manifests Refresh lake-manifest.json and lean-toolchain files across all package workspaces after switching doc-gen4 to main branch. --- lean/cslib/lake-manifest.json | 133 +++++++++++--------- lean/cslib/lean-toolchain | 2 +- lean/flt/lake-manifest.json | 129 ++++++++++--------- lean/flt/lean-toolchain | 2 +- lean/formal-conjectures/lake-manifest.json | 139 +++++++++++---------- lean/formal-conjectures/lean-toolchain | 2 +- lean/mathlib/lake-manifest.json | 129 ++++++++++--------- lean/mathlib/lean-toolchain | 2 +- lean/physlean/lake-manifest.json | 129 ++++++++++--------- lean/physlean/lean-toolchain | 2 +- 10 files changed, 362 insertions(+), 307 deletions(-) diff --git a/lean/cslib/lake-manifest.json b/lean/cslib/lake-manifest.json index 76ba7e0..1a4bb3f 100644 --- a/lean/cslib/lake-manifest.json +++ b/lean/cslib/lake-manifest.json @@ -1,81 +1,41 @@ -{"version": "1.1.0", +{"version": "1.2.0", "packagesDir": ".lake/packages", "packages": - [{"url": "https://github.com/leanprover/doc-gen4", + [{"url": "https://github.com/leanprover/cslib", "type": "git", "subDir": null, "scope": "", - "rev": "7be6082d00bb0cfc1e136505dba7718cd5fcaa84", - "name": "«doc-gen4»", - "manifestFile": "lake-manifest.json", - "inputRev": "v4.28.0-rc1", - "inherited": false, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/leanprover/cslib", - "type": "git", - "subDir": null, - "scope": "", - "rev": "00d68a39497c0954bd52ca5237eea2aa152a5387", + "rev": "48e29fde33e1d1650a1615ba4d1440a1c36a23c3", "name": "Cslib", "manifestFile": "lake-manifest.json", "inputRev": "main", "inherited": false, "configFile": "lakefile.toml"}, - {"url": "https://github.com/leanprover/lean4-cli", + {"url": "https://github.com/leanprover/doc-gen4", "type": "git", "subDir": null, "scope": "", - "rev": "28e0856d4424863a85b18f38868c5420c55f9bae", - "name": "Cli", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/fgdorais/lean4-unicode-basic", - "type": "git", - "subDir": null, - "scope": "", - "rev": "8668e1ab7c987fb8ed1349f14c3b7b60bd5f27b6", - "name": "UnicodeBasic", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/dupuisf/BibtexQuery", - "type": "git", - "subDir": null, - "scope": "", - "rev": "1c5c543d2637aebf90c80aead2d401ae88db13cc", - "name": "BibtexQuery", - "manifestFile": "lake-manifest.json", - "inputRev": "master", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/acmepjz/md4lean", - "type": "git", - "subDir": null, - "scope": "", - "rev": "38ac5945d744903ffcc473ce1030223991b11cf6", - "name": "MD4Lean", + "rev": "2c96309e0de1b916d1a0b74c8119fddc7bf7da51", + "name": "«doc-gen4»", "manifestFile": "lake-manifest.json", "inputRev": "main", - "inherited": true, + "inherited": false, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/mathlib4", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "5352afccd6866369be9de43f5b7ec47203555f44", + "rev": "ae0c6140a8312511dbb5bc265a0146dd418eca8d", "name": "mathlib", "manifestFile": "lake-manifest.json", - "inputRev": "v4.28.0-rc1", + "inputRev": "master", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/plausible", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "7311586e1a56af887b1081d05e80c11b6c41d212", + "rev": "f449eabb8f7e3feef0366856c20e28a6d2c97ee3", "name": "plausible", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -85,7 +45,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "5ce7f0a355f522a952a3d678d696bd563bb4fd28", + "rev": "c5d5b8fe6e5158def25cd28eb94e4141ad97c843", "name": "LeanSearchClient", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -95,7 +55,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "875ad9d88ed684e39c16bdea260e6ecfa15afd60", + "rev": "86503d416c875fdcf3b6b6c54c22581e96c6bda7", "name": "importGraph", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -105,41 +65,92 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "6d65c6e0a25b8a52c13c3adeb63ecde3bfbb6294", + "rev": "82d457fb3bdd9efadbae06608ff337d689efdddf", "name": "proofwidgets", "manifestFile": "lake-manifest.json", - "inputRev": "v0.0.86", + "inputRev": "v0.0.97", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/aesop", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "f08e838d4f9aea519f3cde06260cfb686fd4bab0", + "rev": "f74c7555aaa94eadd7b7bff9170f7983f92aac21", "name": "aesop", "manifestFile": "lake-manifest.json", - "inputRev": "master", + "inputRev": "v4.30.0-rc1", "inherited": true, "configFile": "lakefile.toml"}, {"url": "https://github.com/leanprover-community/quote4", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "23324752757bf28124a518ec284044c8db79fee5", + "rev": "7aa86cb20b8458748dc24d55dab2d7ea01161057", "name": "Qq", "manifestFile": "lake-manifest.json", - "inputRev": "master", + "inputRev": "v4.30.0-rc1", "inherited": true, "configFile": "lakefile.toml"}, {"url": "https://github.com/leanprover-community/batteries", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "cabbb5a025bfbbc50af9184ed2dfdde6ea4f53a7", + "rev": "bf597c77bf9b8e66720d724928207f5911533113", "name": "batteries", "manifestFile": "lake-manifest.json", + "inputRev": "v4.30.0-rc1", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/lean4-cli", + "type": "git", + "subDir": null, + "scope": "leanprover", + "rev": "f7d0ca7c926cdde0562af20394dd25d028b839a5", + "name": "Cli", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.30.0-rc1", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/leansqlite", + "type": "git", + "subDir": null, + "scope": "", + "rev": "4dfd48ca35193382ddf4c64eef5cdd4f6091bef3", + "name": "leansqlite", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/fgdorais/lean4-unicode-basic", + "type": "git", + "subDir": null, + "scope": "", + "rev": "c8bc12a81fd4ee633719f7fd3f587eca7d0e87ab", + "name": "UnicodeBasic", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/dupuisf/BibtexQuery", + "type": "git", + "subDir": null, + "scope": "", + "rev": "5d31b64fb703c5d77f6ef4d1fb958f9bdf1ea539", + "name": "BibtexQuery", + "manifestFile": "lake-manifest.json", + "inputRev": "nightly-testing", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/acmepjz/md4lean", + "type": "git", + "subDir": null, + "scope": "", + "rev": "6a3fb240133bcb7e1a066fdc784b3fdc304e3fc5", + "name": "MD4Lean", + "manifestFile": "lake-manifest.json", "inputRev": "main", "inherited": true, - "configFile": "lakefile.toml"}], + "configFile": "lakefile.lean"}], "name": "«cslib-extractor»", - "lakeDir": ".lake"} + "lakeDir": ".lake", + "fixedToolchain": false} diff --git a/lean/cslib/lean-toolchain b/lean/cslib/lean-toolchain index 4bafde2..2210cba 100644 --- a/lean/cslib/lean-toolchain +++ b/lean/cslib/lean-toolchain @@ -1 +1 @@ -leanprover/lean4:v4.28.0-rc1 +leanprover/lean4:v4.30.0-rc1 diff --git a/lean/flt/lake-manifest.json b/lean/flt/lake-manifest.json index 0faeab2..1a668fe 100644 --- a/lean/flt/lake-manifest.json +++ b/lean/flt/lake-manifest.json @@ -1,65 +1,25 @@ -{"version": "1.1.0", +{"version": "1.2.0", "packagesDir": ".lake/packages", "packages": - [{"url": "https://github.com/leanprover/doc-gen4", + [{"url": "https://github.com/ImperialCollegeLondon/FLT", "type": "git", "subDir": null, "scope": "", - "rev": "01e143329f2f79abbb8559344e836c221ae84cfd", - "name": "«doc-gen4»", - "manifestFile": "lake-manifest.json", - "inputRev": "v4.27.0", - "inherited": false, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/ImperialCollegeLondon/FLT", - "type": "git", - "subDir": null, - "scope": "", - "rev": "782d8b962dd96be0c767b7b99e390e32c52ea583", + "rev": "260e400807ff88e84d008fb66792ec2dfdf83154", "name": "FLT", "manifestFile": "lake-manifest.json", "inputRev": "main", "inherited": false, "configFile": "lakefile.toml"}, - {"url": "https://github.com/leanprover/lean4-cli", + {"url": "https://github.com/leanprover/doc-gen4", "type": "git", "subDir": null, "scope": "", - "rev": "55c37290ff6186e2e965d68cf853a57c0702db82", - "name": "Cli", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/fgdorais/lean4-unicode-basic", - "type": "git", - "subDir": null, - "scope": "", - "rev": "b09d573cbacf5a8e767292e0edeec787a81689ce", - "name": "UnicodeBasic", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/dupuisf/BibtexQuery", - "type": "git", - "subDir": null, - "scope": "", - "rev": "c8a6a4dae7c7f42949a1058427a75c20447e990f", - "name": "BibtexQuery", - "manifestFile": "lake-manifest.json", - "inputRev": "master", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/acmepjz/md4lean", - "type": "git", - "subDir": null, - "scope": "", - "rev": "38ac5945d744903ffcc473ce1030223991b11cf6", - "name": "MD4Lean", + "rev": "2c96309e0de1b916d1a0b74c8119fddc7bf7da51", + "name": "«doc-gen4»", "manifestFile": "lake-manifest.json", "inputRev": "main", - "inherited": true, + "inherited": false, "configFile": "lakefile.lean"}, {"url": "https://github.com/PatrickMassot/checkdecls.git", "type": "git", @@ -75,17 +35,17 @@ "type": "git", "subDir": null, "scope": "", - "rev": "a3a10db0e9d66acbebf76c5e6a135066525ac900", + "rev": "8f9d9cff6bd728b17a24e163c9402775d9e6a365", "name": "mathlib", "manifestFile": "lake-manifest.json", - "inputRev": null, + "inputRev": "master", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/plausible", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "009dc1e6f2feb2c96c081537d80a0905b2c6498f", + "rev": "55c8532eb21ec9f6d565d51d96b8ca50bd1fbef3", "name": "plausible", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -95,7 +55,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "5ce7f0a355f522a952a3d678d696bd563bb4fd28", + "rev": "c5d5b8fe6e5158def25cd28eb94e4141ad97c843", "name": "LeanSearchClient", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -105,7 +65,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "8f497d55985a189cea8020d9dc51260af1e41ad2", + "rev": "85b59af46828c029a9168f2f9c35119bd0721e6e", "name": "importGraph", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -115,17 +75,17 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "c04225ee7c0585effbd933662b3151f01b600e40", + "rev": "be3b2e63b1bbf496c478cef98b86972a37c1417d", "name": "proofwidgets", "manifestFile": "lake-manifest.json", - "inputRev": "v0.0.85", + "inputRev": "v0.0.87", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/aesop", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "cb837cc26236ada03c81837bebe0acd9c70ced7d", + "rev": "f642a64c76df8ba9cb53dba3b919425a0c2aeaf1", "name": "aesop", "manifestFile": "lake-manifest.json", "inputRev": "master", @@ -135,7 +95,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "bd58c9efe2086d56ca361807014141a860ddbf8c", + "rev": "b8f98e9087e02c8553945a2c5abf07cec8e798c3", "name": "Qq", "manifestFile": "lake-manifest.json", "inputRev": "master", @@ -145,11 +105,62 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "b25b36a7caf8e237e7d1e6121543078a06777c8a", + "rev": "495c008c3e3f4fb4256ff5582ddb3abf3198026f", "name": "batteries", "manifestFile": "lake-manifest.json", "inputRev": "main", "inherited": true, - "configFile": "lakefile.toml"}], + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/lean4-cli", + "type": "git", + "subDir": null, + "scope": "leanprover", + "rev": "4f10f47646cb7d5748d6f423f4a07f98f7bbcc9e", + "name": "Cli", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.28.0", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/leansqlite", + "type": "git", + "subDir": null, + "scope": "", + "rev": "4dfd48ca35193382ddf4c64eef5cdd4f6091bef3", + "name": "leansqlite", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/fgdorais/lean4-unicode-basic", + "type": "git", + "subDir": null, + "scope": "", + "rev": "c8bc12a81fd4ee633719f7fd3f587eca7d0e87ab", + "name": "UnicodeBasic", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/dupuisf/BibtexQuery", + "type": "git", + "subDir": null, + "scope": "", + "rev": "5d31b64fb703c5d77f6ef4d1fb958f9bdf1ea539", + "name": "BibtexQuery", + "manifestFile": "lake-manifest.json", + "inputRev": "nightly-testing", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/acmepjz/md4lean", + "type": "git", + "subDir": null, + "scope": "", + "rev": "6a3fb240133bcb7e1a066fdc784b3fdc304e3fc5", + "name": "MD4Lean", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}], "name": "«flt-extractor»", - "lakeDir": ".lake"} + "lakeDir": ".lake", + "fixedToolchain": false} diff --git a/lean/flt/lean-toolchain b/lean/flt/lean-toolchain index 5249182..79ac861 100644 --- a/lean/flt/lean-toolchain +++ b/lean/flt/lean-toolchain @@ -1 +1 @@ -leanprover/lean4:v4.27.0 +leanprover/lean4:v4.30.0-rc1 \ No newline at end of file diff --git a/lean/formal-conjectures/lake-manifest.json b/lean/formal-conjectures/lake-manifest.json index 3f35390..d9f2d91 100644 --- a/lean/formal-conjectures/lake-manifest.json +++ b/lean/formal-conjectures/lake-manifest.json @@ -1,91 +1,51 @@ -{"version": "1.1.0", +{"version": "1.2.0", "packagesDir": ".lake/packages", "packages": - [{"url": "https://github.com/leanprover/doc-gen4", + [{"url": "https://github.com/google-deepmind/formal-conjectures", "type": "git", "subDir": null, "scope": "", - "rev": "58b48e75bd19f785927e06912dea610e5e48f1fa", - "name": "«doc-gen4»", - "manifestFile": "lake-manifest.json", - "inputRev": "v4.22.0", - "inherited": false, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/google-deepmind/formal-conjectures", - "type": "git", - "subDir": null, - "scope": "", - "rev": "f01550cf4f910c7018f593f47979ab1cc9e8c70e", + "rev": "0ab27014f1755dc2812ba4193add6b4b159a0022", "name": "formal_conjectures", "manifestFile": "lake-manifest.json", "inputRev": "main", "inherited": false, "configFile": "lakefile.toml"}, - {"url": "https://github.com/mhuisi/lean4-cli", - "type": "git", - "subDir": null, - "scope": "", - "rev": "6667b921594697980586296511fab6a359e802d1", - "name": "Cli", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/fgdorais/lean4-unicode-basic", - "type": "git", - "subDir": null, - "scope": "", - "rev": "d3195374a885cf2b0bfa66063deb493686029f95", - "name": "UnicodeBasic", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/dupuisf/BibtexQuery", - "type": "git", - "subDir": null, - "scope": "", - "rev": "dbfe2b7630c5f7c5c1cf71e7747ffc0a30337f69", - "name": "BibtexQuery", - "manifestFile": "lake-manifest.json", - "inputRev": "dbfe2b7630c5f7c5c1cf71e7747ffc0a30337f69", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/acmepjz/md4lean", + {"url": "https://github.com/leanprover/doc-gen4", "type": "git", "subDir": null, "scope": "", - "rev": "feac4e0c356b0928657bf3b54fa83ae952f53257", - "name": "MD4Lean", + "rev": "2c96309e0de1b916d1a0b74c8119fddc7bf7da51", + "name": "«doc-gen4»", "manifestFile": "lake-manifest.json", "inputRev": "main", - "inherited": true, + "inherited": false, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/mathlib4", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "79e94a093aff4a60fb1b1f92d9681e407124c2ca", + "rev": "a3a10db0e9d66acbebf76c5e6a135066525ac900", "name": "mathlib", "manifestFile": "lake-manifest.json", - "inputRev": "v4.22.0", + "inputRev": "v4.27.0", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/plausible", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "b100ad4c5d74a464f497aaa8e7c74d86bf39a56f", + "rev": "009dc1e6f2feb2c96c081537d80a0905b2c6498f", "name": "plausible", "manifestFile": "lake-manifest.json", - "inputRev": "v4.22.0", + "inputRev": "main", "inherited": true, "configFile": "lakefile.toml"}, {"url": "https://github.com/leanprover-community/LeanSearchClient", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "99657ad92e23804e279f77ea6dbdeebaa1317b98", + "rev": "5ce7f0a355f522a952a3d678d696bd563bb4fd28", "name": "LeanSearchClient", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -95,51 +55,102 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "eb164a46de87078f27640ee71e6c3841defc2484", + "rev": "8f497d55985a189cea8020d9dc51260af1e41ad2", "name": "importGraph", "manifestFile": "lake-manifest.json", - "inputRev": "v4.22.0", + "inputRev": "main", "inherited": true, "configFile": "lakefile.toml"}, {"url": "https://github.com/leanprover-community/ProofWidgets4", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "1253a071e6939b0faf5c09d2b30b0bfc79dae407", + "rev": "c04225ee7c0585effbd933662b3151f01b600e40", "name": "proofwidgets", "manifestFile": "lake-manifest.json", - "inputRev": "v0.0.68", + "inputRev": "v0.0.85", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/aesop", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "1256a18522728c2eeed6109b02dd2b8f207a2a3c", + "rev": "cb837cc26236ada03c81837bebe0acd9c70ced7d", "name": "aesop", "manifestFile": "lake-manifest.json", - "inputRev": "v4.22.0", + "inputRev": "master", "inherited": true, "configFile": "lakefile.toml"}, {"url": "https://github.com/leanprover-community/quote4", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "917bfa5064b812b7fbd7112d018ea0b4def25ab3", + "rev": "bd58c9efe2086d56ca361807014141a860ddbf8c", "name": "Qq", "manifestFile": "lake-manifest.json", - "inputRev": "v4.22.0", + "inputRev": "master", "inherited": true, "configFile": "lakefile.toml"}, {"url": "https://github.com/leanprover-community/batteries", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "240676e9568c254a69be94801889d4b13f3b249f", + "rev": "b25b36a7caf8e237e7d1e6121543078a06777c8a", "name": "batteries", "manifestFile": "lake-manifest.json", - "inputRev": "v4.22.0", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/lean4-cli", + "type": "git", + "subDir": null, + "scope": "leanprover", + "rev": "55c37290ff6186e2e965d68cf853a57c0702db82", + "name": "Cli", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.27.0", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/leansqlite", + "type": "git", + "subDir": null, + "scope": "", + "rev": "4dfd48ca35193382ddf4c64eef5cdd4f6091bef3", + "name": "leansqlite", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/fgdorais/lean4-unicode-basic", + "type": "git", + "subDir": null, + "scope": "", + "rev": "c8bc12a81fd4ee633719f7fd3f587eca7d0e87ab", + "name": "UnicodeBasic", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/dupuisf/BibtexQuery", + "type": "git", + "subDir": null, + "scope": "", + "rev": "5d31b64fb703c5d77f6ef4d1fb958f9bdf1ea539", + "name": "BibtexQuery", + "manifestFile": "lake-manifest.json", + "inputRev": "nightly-testing", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/acmepjz/md4lean", + "type": "git", + "subDir": null, + "scope": "", + "rev": "6a3fb240133bcb7e1a066fdc784b3fdc304e3fc5", + "name": "MD4Lean", + "manifestFile": "lake-manifest.json", + "inputRev": "main", "inherited": true, - "configFile": "lakefile.toml"}], + "configFile": "lakefile.lean"}], "name": "«formal-conjectures-extractor»", - "lakeDir": ".lake"} + "lakeDir": ".lake", + "fixedToolchain": false} diff --git a/lean/formal-conjectures/lean-toolchain b/lean/formal-conjectures/lean-toolchain index 6ac6d4c..79ac861 100644 --- a/lean/formal-conjectures/lean-toolchain +++ b/lean/formal-conjectures/lean-toolchain @@ -1 +1 @@ -leanprover/lean4:v4.22.0 +leanprover/lean4:v4.30.0-rc1 \ No newline at end of file diff --git a/lean/mathlib/lake-manifest.json b/lean/mathlib/lake-manifest.json index 68cc3c8..d094a2d 100644 --- a/lean/mathlib/lake-manifest.json +++ b/lean/mathlib/lake-manifest.json @@ -1,71 +1,31 @@ -{"version": "1.1.0", +{"version": "1.2.0", "packagesDir": ".lake/packages", "packages": - [{"url": "https://github.com/leanprover/doc-gen4", + [{"url": "https://github.com/leanprover-community/mathlib4.git", "type": "git", "subDir": null, "scope": "", - "rev": "7be6082d00bb0cfc1e136505dba7718cd5fcaa84", - "name": "«doc-gen4»", - "manifestFile": "lake-manifest.json", - "inputRev": "v4.28.0-rc1", - "inherited": false, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/leanprover-community/mathlib4.git", - "type": "git", - "subDir": null, - "scope": "", - "rev": "0e16bffea0a1b2664111fb1cfeb46c94e4a92604", + "rev": "fd68cd664924f41a676da81ded2c886d0488af03", "name": "mathlib", "manifestFile": "lake-manifest.json", "inputRev": null, "inherited": false, "configFile": "lakefile.lean"}, - {"url": "https://github.com/leanprover/lean4-cli", - "type": "git", - "subDir": null, - "scope": "", - "rev": "28e0856d4424863a85b18f38868c5420c55f9bae", - "name": "Cli", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/fgdorais/lean4-unicode-basic", + {"url": "https://github.com/leanprover/doc-gen4", "type": "git", "subDir": null, "scope": "", - "rev": "8668e1ab7c987fb8ed1349f14c3b7b60bd5f27b6", - "name": "UnicodeBasic", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/dupuisf/BibtexQuery", - "type": "git", - "subDir": null, - "scope": "", - "rev": "1c5c543d2637aebf90c80aead2d401ae88db13cc", - "name": "BibtexQuery", - "manifestFile": "lake-manifest.json", - "inputRev": "master", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/acmepjz/md4lean", - "type": "git", - "subDir": null, - "scope": "", - "rev": "38ac5945d744903ffcc473ce1030223991b11cf6", - "name": "MD4Lean", + "rev": "2c96309e0de1b916d1a0b74c8119fddc7bf7da51", + "name": "«doc-gen4»", "manifestFile": "lake-manifest.json", "inputRev": "main", - "inherited": true, + "inherited": false, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/plausible", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "7311586e1a56af887b1081d05e80c11b6c41d212", + "rev": "a3b459a8312125758e51c354b93d54ba620efda6", "name": "plausible", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -75,7 +35,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "5ce7f0a355f522a952a3d678d696bd563bb4fd28", + "rev": "c5d5b8fe6e5158def25cd28eb94e4141ad97c843", "name": "LeanSearchClient", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -85,7 +45,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "875ad9d88ed684e39c16bdea260e6ecfa15afd60", + "rev": "4411c5f89c797401c609b3a946c8874569e69731", "name": "importGraph", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -95,41 +55,92 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "6d65c6e0a25b8a52c13c3adeb63ecde3bfbb6294", + "rev": "82d457fb3bdd9efadbae06608ff337d689efdddf", "name": "proofwidgets", "manifestFile": "lake-manifest.json", - "inputRev": "v0.0.86", + "inputRev": "v0.0.97", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/aesop", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "f08e838d4f9aea519f3cde06260cfb686fd4bab0", + "rev": "f74c7555aaa94eadd7b7bff9170f7983f92aac21", "name": "aesop", "manifestFile": "lake-manifest.json", - "inputRev": "master", + "inputRev": "v4.30.0-rc1", "inherited": true, "configFile": "lakefile.toml"}, {"url": "https://github.com/leanprover-community/quote4", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "23324752757bf28124a518ec284044c8db79fee5", + "rev": "7aa86cb20b8458748dc24d55dab2d7ea01161057", "name": "Qq", "manifestFile": "lake-manifest.json", - "inputRev": "master", + "inputRev": "v4.30.0-rc1", "inherited": true, "configFile": "lakefile.toml"}, {"url": "https://github.com/leanprover-community/batteries", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "cabbb5a025bfbbc50af9184ed2dfdde6ea4f53a7", + "rev": "bf597c77bf9b8e66720d724928207f5911533113", "name": "batteries", "manifestFile": "lake-manifest.json", + "inputRev": "v4.30.0-rc1", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/lean4-cli", + "type": "git", + "subDir": null, + "scope": "leanprover", + "rev": "f7d0ca7c926cdde0562af20394dd25d028b839a5", + "name": "Cli", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.30.0-rc1", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/leansqlite", + "type": "git", + "subDir": null, + "scope": "", + "rev": "4dfd48ca35193382ddf4c64eef5cdd4f6091bef3", + "name": "leansqlite", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/fgdorais/lean4-unicode-basic", + "type": "git", + "subDir": null, + "scope": "", + "rev": "c8bc12a81fd4ee633719f7fd3f587eca7d0e87ab", + "name": "UnicodeBasic", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/dupuisf/BibtexQuery", + "type": "git", + "subDir": null, + "scope": "", + "rev": "5d31b64fb703c5d77f6ef4d1fb958f9bdf1ea539", + "name": "BibtexQuery", + "manifestFile": "lake-manifest.json", + "inputRev": "nightly-testing", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/acmepjz/md4lean", + "type": "git", + "subDir": null, + "scope": "", + "rev": "6a3fb240133bcb7e1a066fdc784b3fdc304e3fc5", + "name": "MD4Lean", + "manifestFile": "lake-manifest.json", "inputRev": "main", "inherited": true, - "configFile": "lakefile.toml"}], + "configFile": "lakefile.lean"}], "name": "«mathlib-extractor»", - "lakeDir": ".lake"} + "lakeDir": ".lake", + "fixedToolchain": false} diff --git a/lean/mathlib/lean-toolchain b/lean/mathlib/lean-toolchain index 4bafde2..2210cba 100644 --- a/lean/mathlib/lean-toolchain +++ b/lean/mathlib/lean-toolchain @@ -1 +1 @@ -leanprover/lean4:v4.28.0-rc1 +leanprover/lean4:v4.30.0-rc1 diff --git a/lean/physlean/lake-manifest.json b/lean/physlean/lake-manifest.json index e0230f9..bda3b9e 100644 --- a/lean/physlean/lake-manifest.json +++ b/lean/physlean/lake-manifest.json @@ -1,81 +1,41 @@ -{"version": "1.1.0", +{"version": "1.2.0", "packagesDir": ".lake/packages", "packages": - [{"url": "https://github.com/leanprover/doc-gen4", + [{"url": "https://github.com/HEPLean/PhysLean", "type": "git", "subDir": null, "scope": "", - "rev": "77ef3eb515ad6bd125c596f0b164349d4a7d5bf5", - "name": "«doc-gen4»", - "manifestFile": "lake-manifest.json", - "inputRev": "v4.26.0", - "inherited": false, - "configFile": "lakefile.lean"}, - {"url": "https://github.com/HEPLean/PhysLean", - "type": "git", - "subDir": null, - "scope": "", - "rev": "7ba876c3f5fcd9f67fa2ee9326dd5d0b1e70079a", + "rev": "9ca1ee1d0cac43391399fcdc9e9fca8c94c17057", "name": "PhysLean", "manifestFile": "lake-manifest.json", "inputRev": null, "inherited": false, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/leanprover/lean4-cli", - "type": "git", - "subDir": null, - "scope": "", - "rev": "933fce7e893f65969714c60cdb4bd8376786044e", - "name": "Cli", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/fgdorais/lean4-unicode-basic", - "type": "git", - "subDir": null, - "scope": "", - "rev": "84b88f7ac9adf382b9668f852cee82487d616792", - "name": "UnicodeBasic", - "manifestFile": "lake-manifest.json", - "inputRev": "main", - "inherited": true, "configFile": "lakefile.lean"}, - {"url": "https://github.com/dupuisf/BibtexQuery", + {"url": "https://github.com/leanprover/doc-gen4", "type": "git", "subDir": null, "scope": "", - "rev": "3ab4379b2b92448717de66b7d3e254ac1487aede", - "name": "BibtexQuery", - "manifestFile": "lake-manifest.json", - "inputRev": "master", - "inherited": true, - "configFile": "lakefile.toml"}, - {"url": "https://github.com/acmepjz/md4lean", - "type": "git", - "subDir": null, - "scope": "", - "rev": "38ac5945d744903ffcc473ce1030223991b11cf6", - "name": "MD4Lean", + "rev": "2c96309e0de1b916d1a0b74c8119fddc7bf7da51", + "name": "«doc-gen4»", "manifestFile": "lake-manifest.json", "inputRev": "main", - "inherited": true, + "inherited": false, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/mathlib4.git", "type": "git", "subDir": null, "scope": "", - "rev": "2df2f0150c275ad53cb3c90f7c98ec15a56a1a67", + "rev": "8a178386ffc0f5fef0b77738bb5449d50efeea95", "name": "mathlib", "manifestFile": "lake-manifest.json", - "inputRev": "v4.26.0", + "inputRev": "v4.29.0", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/plausible", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "160af9e8e7d4ae448f3c92edcc5b6a8522453f11", + "rev": "83e90935a17ca19ebe4b7893c7f7066e266f50d3", "name": "plausible", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -85,7 +45,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "3591c3f664ac3719c4c86e4483e21e228707bfa2", + "rev": "c5d5b8fe6e5158def25cd28eb94e4141ad97c843", "name": "LeanSearchClient", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -95,7 +55,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "e9f31324f15ead11048b1443e62c5deaddd055d2", + "rev": "48d5698bc464786347c1b0d859b18f938420f060", "name": "importGraph", "manifestFile": "lake-manifest.json", "inputRev": "main", @@ -105,17 +65,17 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "b4fb2aa5290ebf61bc5f80a5375ba642f0a49192", + "rev": "3c52dee17f0cd89c1ec14de78920d1bdaa3d26b3", "name": "proofwidgets", "manifestFile": "lake-manifest.json", - "inputRev": "v0.0.83", + "inputRev": "v0.0.95", "inherited": true, "configFile": "lakefile.lean"}, {"url": "https://github.com/leanprover-community/aesop", "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "2f6d238744c4cb07fdc91240feaf5d4221a27931", + "rev": "7152850e7b216a0d409701617721b6e469d34bf6", "name": "aesop", "manifestFile": "lake-manifest.json", "inputRev": "master", @@ -125,7 +85,7 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "9312503909aa8e8bb392530145cc1677a6298574", + "rev": "707efb56d0696634e9e965523a1bbe9ac6ce141d", "name": "Qq", "manifestFile": "lake-manifest.json", "inputRev": "master", @@ -135,11 +95,62 @@ "type": "git", "subDir": null, "scope": "leanprover-community", - "rev": "24241822ef9d3e7f6a3bcc53ad136e12663db8f3", + "rev": "756e3321fd3b02a85ffda19fef789916223e578c", "name": "batteries", "manifestFile": "lake-manifest.json", "inputRev": "main", "inherited": true, - "configFile": "lakefile.toml"}], + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/lean4-cli", + "type": "git", + "subDir": null, + "scope": "leanprover", + "rev": "7802da01beb530bf051ab657443f9cd9bc3e1a29", + "name": "Cli", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.29.0", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/kim-em/leansqlite", + "type": "git", + "subDir": null, + "scope": "", + "rev": "d14544c72b593af6a66131bc34cdab16bf7c0940", + "name": "leansqlite", + "manifestFile": "lake-manifest.json", + "inputRev": "suppress-reducibility-warning", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/fgdorais/lean4-unicode-basic", + "type": "git", + "subDir": null, + "scope": "", + "rev": "9539e34e5cb2d52a6454d9b6218f6b6835cad071", + "name": "UnicodeBasic", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/dupuisf/BibtexQuery", + "type": "git", + "subDir": null, + "scope": "", + "rev": "5d31b64fb703c5d77f6ef4d1fb958f9bdf1ea539", + "name": "BibtexQuery", + "manifestFile": "lake-manifest.json", + "inputRev": "nightly-testing", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/acmepjz/md4lean", + "type": "git", + "subDir": null, + "scope": "", + "rev": "6a3fb240133bcb7e1a066fdc784b3fdc304e3fc5", + "name": "MD4Lean", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.lean"}], "name": "«physlean-extractor»", - "lakeDir": ".lake"} + "lakeDir": ".lake", + "fixedToolchain": false} diff --git a/lean/physlean/lean-toolchain b/lean/physlean/lean-toolchain index e59446d..79ac861 100644 --- a/lean/physlean/lean-toolchain +++ b/lean/physlean/lean-toolchain @@ -1 +1 @@ -leanprover/lean4:v4.26.0 +leanprover/lean4:v4.30.0-rc1 \ No newline at end of file From 95fa0aefff9795c57ee8c3bd61907fd0b9716df3 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 13:28:58 -0400 Subject: [PATCH 17/22] Restore doc-gen4 version pinning to match package toolchain Pinning doc-gen4 to @ "main" breaks packages on older toolchains (e.g. PhysLean at v4.29.0 vs doc-gen4 main at v4.30.0-rc1) due to incompatible olean headers. Restore version-matched pinning so each package gets a doc-gen4 compatible with its Lean toolchain. The require ordering fix (doc-gen4 first, main package last) is kept to avoid transitive dependency conflicts. --- src/lean_explore/extract/doc_gen4.py | 5 +++-- src/lean_explore/extract/package_utils.py | 19 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 04523b9..763476c 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -18,7 +18,7 @@ from lean_explore.extract.package_utils import ( get_extraction_order, get_package_toolchain, - strip_lakefile_docgen_version, + update_lakefile_docgen_version, ) logger = logging.getLogger(__name__) @@ -97,8 +97,9 @@ def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]: toolchain_file = workspace_path / "lean-toolchain" lean_toolchain, git_ref = get_package_toolchain(package_config) + lean_version = extract_lean_version(lean_toolchain) - strip_lakefile_docgen_version(lakefile_path) + update_lakefile_docgen_version(lakefile_path, lean_version) toolchain_file.write_text(lean_toolchain + "\n") return lean_toolchain, git_ref diff --git a/src/lean_explore/extract/package_utils.py b/src/lean_explore/extract/package_utils.py index 97e40af..ab5d5b8 100644 --- a/src/lean_explore/extract/package_utils.py +++ b/src/lean_explore/extract/package_utils.py @@ -81,16 +81,17 @@ def get_package_toolchain(package_configuration: PackageConfig) -> tuple[str, st return toolchain, latest_tag -def strip_lakefile_docgen_version(lakefile_path: Path) -> None: - """Pin doc-gen4 to its main branch instead of a specific version tag. +def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> None: + """Update the doc-gen4 version in a lakefile to match the Lean toolchain. - Doc-gen4's main branch tracks the latest Lean toolchain. Pinning to a - specific release tag causes transitive dependency conflicts (e.g. plausible - version mismatches) that break ``lake exe cache get``. Using ``@ "main"`` - lets Lake resolve a compatible version automatically. + Doc-gen4 releases are tagged to match Lean toolchain versions, so pinning + to the same version ensures compatibility. The doc-gen4 ``require`` must + appear before the main package ``require`` so that the main package's + transitive dependency versions take precedence during resolution. Args: lakefile_path: Path to lakefile.lean + lean_version: Lean version like 'v4.27.0' """ content = lakefile_path.read_text() @@ -99,13 +100,13 @@ def strip_lakefile_docgen_version(lakefile_path: Path) -> None: r'"https://github\.com/leanprover/doc-gen4"(?:\s+@\s+"[^"]*")?' ) replacement = ( - 'require «doc-gen4» from git\n' - ' "https://github.com/leanprover/doc-gen4" @ "main"' + f"require «doc-gen4» from git\n" + f' "https://github.com/leanprover/doc-gen4" @ "{lean_version}"' ) new_content = re.sub(pattern, replacement, content) if new_content != content: lakefile_path.write_text(new_content) logger.info( - "Pinned doc-gen4 to main branch in %s", lakefile_path + "Updated doc-gen4 version to %s in %s", lean_version, lakefile_path ) From 05ed942e678c0945e4ebdeee3d7ed40933f7b2be Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 15:50:51 -0400 Subject: [PATCH 18/22] Clear cache on toolchain change even for SQLite-capable packages The cache-skip optimization for SQLite-format packages assumed oleans were always compatible. When the toolchain version changes between runs (e.g. from v4.30.0-rc1 to v4.29.0), stale oleans cause "incompatible header" build failures. Now the old toolchain is read before setup, and if it differs from the new one, the cache is cleared regardless of SQLite support. --- src/lean_explore/extract/doc_gen4.py | 22 ++++++++++++++---- tests/extract/doc_gen4_test.py | 34 +++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 763476c..6fd5afe 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -254,14 +254,28 @@ async def run_doc_gen4( toolchain = None ref = None if fresh: + # Read the current toolchain before setup potentially overwrites it + old_toolchain_file = workspace_path / "lean-toolchain" + old_toolchain = None + if old_toolchain_file.exists(): + old_toolchain = old_toolchain_file.read_text().strip() + if setup: toolchain, ref = _setup_workspace(config) logger.info("Toolchain: %s, ref: %s", toolchain, ref) - # Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2. - # The SQLite format handles incremental updates, while legacy BMP - # output requires a cache clear to avoid stale files. - if toolchain and _uses_sqlite_docgen(toolchain): + # Doc-gen4 >= v4.29.0-rc2 writes to api-docs.db which handles + # incremental updates, so we can skip the cache clear — but only + # if the toolchain hasn't changed. A toolchain change means all + # oleans are stale and must be rebuilt from scratch. + toolchain_changed = toolchain and toolchain != old_toolchain + if toolchain_changed: + logger.info( + "[%s] Toolchain changed (%s -> %s), clearing cache", + package_name, old_toolchain, toolchain, + ) + _clear_workspace_cache(workspace_path) + elif toolchain and _uses_sqlite_docgen(toolchain): logger.info( "[%s] Skipping cache clear " "(api-docs.db handles incremental updates)", diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index f8443bf..c6b76da 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -1,5 +1,6 @@ """Tests for doc-gen4 workspace orchestration.""" +from pathlib import Path from unittest.mock import patch from lean_explore.extract.doc_gen4 import _uses_sqlite_docgen, run_doc_gen4 @@ -16,11 +17,19 @@ def test_uses_sqlite_docgen_from_rc2(self): assert not _uses_sqlite_docgen("leanprover/lean4:v4.28.0") +def _write_toolchain(version: str) -> None: + """Write a lean-toolchain file for the mathlib workspace.""" + path = Path("lean/mathlib/lean-toolchain") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(version + "\n") + + class TestRunDocGen4FreshHandling: """Tests for fresh-cache handling in run_doc_gen4.""" async def test_fresh_legacy_workspace_clears_cache(self): """Test that legacy doc-gen4 workspaces still clear the cache.""" + _write_toolchain("leanprover/lean4:v4.29.0-rc1") with patch( "lean_explore.extract.doc_gen4.get_extraction_order", return_value=["mathlib"], @@ -41,7 +50,8 @@ async def test_fresh_legacy_workspace_clears_cache(self): mock_run.assert_called_once_with("mathlib", False) async def test_fresh_sqlite_workspace_skips_cache_clear(self): - """Test that SQLite doc-gen4 workspaces keep the cache.""" + """Test that SQLite workspaces with unchanged toolchain skip clear.""" + _write_toolchain("leanprover/lean4:v4.29.0-rc2") with patch( "lean_explore.extract.doc_gen4.get_extraction_order", return_value=["mathlib"], @@ -60,3 +70,25 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): mock_clear.assert_not_called() mock_run.assert_called_once_with("mathlib", False) + + async def test_fresh_sqlite_workspace_clears_on_toolchain_change(self): + """Test that a toolchain version change forces a cache clear.""" + _write_toolchain("leanprover/lean4:v4.29.0-rc2") + with patch( + "lean_explore.extract.doc_gen4.get_extraction_order", + return_value=["mathlib"], + ): + with patch( + "lean_explore.extract.doc_gen4._setup_workspace", + return_value=("leanprover/lean4:v4.30.0-rc1", "v4.30.0-rc1"), + ): + with patch( + "lean_explore.extract.doc_gen4._clear_workspace_cache" + ) as mock_clear: + with patch( + "lean_explore.extract.doc_gen4._run_lake_for_package" + ) as mock_run: + await run_doc_gen4(fresh=True) + + mock_clear.assert_called_once() + mock_run.assert_called_once_with("mathlib", False) From 6bb4bcd3c894cf665b3ba3b99673a543a1f4b8d4 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 16:03:18 -0400 Subject: [PATCH 19/22] Revert toolchain change detection in cache-skip logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The toolchain change detection was an overcorrection for a one-off issue caused by the @ "main" experiment. Lake handles toolchain changes and olean staleness itself — the cache-skip logic only needs to distinguish SQLite (incremental) from BMP (stale files) format. --- src/lean_explore/extract/doc_gen4.py | 22 ++++-------------- tests/extract/doc_gen4_test.py | 34 +--------------------------- 2 files changed, 5 insertions(+), 51 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 6fd5afe..763476c 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -254,28 +254,14 @@ async def run_doc_gen4( toolchain = None ref = None if fresh: - # Read the current toolchain before setup potentially overwrites it - old_toolchain_file = workspace_path / "lean-toolchain" - old_toolchain = None - if old_toolchain_file.exists(): - old_toolchain = old_toolchain_file.read_text().strip() - if setup: toolchain, ref = _setup_workspace(config) logger.info("Toolchain: %s, ref: %s", toolchain, ref) - # Doc-gen4 >= v4.29.0-rc2 writes to api-docs.db which handles - # incremental updates, so we can skip the cache clear — but only - # if the toolchain hasn't changed. A toolchain change means all - # oleans are stale and must be rebuilt from scratch. - toolchain_changed = toolchain and toolchain != old_toolchain - if toolchain_changed: - logger.info( - "[%s] Toolchain changed (%s -> %s), clearing cache", - package_name, old_toolchain, toolchain, - ) - _clear_workspace_cache(workspace_path) - elif toolchain and _uses_sqlite_docgen(toolchain): + # Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2. + # The SQLite format handles incremental updates, while legacy BMP + # output requires a cache clear to avoid stale files. + if toolchain and _uses_sqlite_docgen(toolchain): logger.info( "[%s] Skipping cache clear " "(api-docs.db handles incremental updates)", diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index c6b76da..f8443bf 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -1,6 +1,5 @@ """Tests for doc-gen4 workspace orchestration.""" -from pathlib import Path from unittest.mock import patch from lean_explore.extract.doc_gen4 import _uses_sqlite_docgen, run_doc_gen4 @@ -17,19 +16,11 @@ def test_uses_sqlite_docgen_from_rc2(self): assert not _uses_sqlite_docgen("leanprover/lean4:v4.28.0") -def _write_toolchain(version: str) -> None: - """Write a lean-toolchain file for the mathlib workspace.""" - path = Path("lean/mathlib/lean-toolchain") - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(version + "\n") - - class TestRunDocGen4FreshHandling: """Tests for fresh-cache handling in run_doc_gen4.""" async def test_fresh_legacy_workspace_clears_cache(self): """Test that legacy doc-gen4 workspaces still clear the cache.""" - _write_toolchain("leanprover/lean4:v4.29.0-rc1") with patch( "lean_explore.extract.doc_gen4.get_extraction_order", return_value=["mathlib"], @@ -50,8 +41,7 @@ async def test_fresh_legacy_workspace_clears_cache(self): mock_run.assert_called_once_with("mathlib", False) async def test_fresh_sqlite_workspace_skips_cache_clear(self): - """Test that SQLite workspaces with unchanged toolchain skip clear.""" - _write_toolchain("leanprover/lean4:v4.29.0-rc2") + """Test that SQLite doc-gen4 workspaces keep the cache.""" with patch( "lean_explore.extract.doc_gen4.get_extraction_order", return_value=["mathlib"], @@ -70,25 +60,3 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): mock_clear.assert_not_called() mock_run.assert_called_once_with("mathlib", False) - - async def test_fresh_sqlite_workspace_clears_on_toolchain_change(self): - """Test that a toolchain version change forces a cache clear.""" - _write_toolchain("leanprover/lean4:v4.29.0-rc2") - with patch( - "lean_explore.extract.doc_gen4.get_extraction_order", - return_value=["mathlib"], - ): - with patch( - "lean_explore.extract.doc_gen4._setup_workspace", - return_value=("leanprover/lean4:v4.30.0-rc1", "v4.30.0-rc1"), - ): - with patch( - "lean_explore.extract.doc_gen4._clear_workspace_cache" - ) as mock_clear: - with patch( - "lean_explore.extract.doc_gen4._run_lake_for_package" - ) as mock_run: - await run_doc_gen4(fresh=True) - - mock_clear.assert_called_once() - mock_run.assert_called_once_with("mathlib", False) From c89bae39d4e283e277688cdec6363e6cd0af714a Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 20:00:06 -0400 Subject: [PATCH 20/22] Fix PhysLean extractor build targets --- lean/physlean/PhysExtract.lean | 3 +- lean/physlean/lakefile.lean | 3 +- src/lean_explore/extract/doc_gen4.py | 99 ++++++++++++-------- src/lean_explore/extract/package_registry.py | 2 +- tests/extract/doc_gen4_test.py | 50 +++++++++- 5 files changed, 115 insertions(+), 42 deletions(-) diff --git a/lean/physlean/PhysExtract.lean b/lean/physlean/PhysExtract.lean index bbfe63d..cbfc3a6 100644 --- a/lean/physlean/PhysExtract.lean +++ b/lean/physlean/PhysExtract.lean @@ -1 +1,2 @@ -import PhysLean +import Physlib +import QuantumInfo diff --git a/lean/physlean/lakefile.lean b/lean/physlean/lakefile.lean index 14492e1..b819436 100644 --- a/lean/physlean/lakefile.lean +++ b/lean/physlean/lakefile.lean @@ -4,11 +4,12 @@ open Lake DSL package «physlean-extractor» where -- Workspace for extracting PhysLean documentation +@[default_target] lean_lib «PhysExtract» where roots := #[`PhysExtract] require «doc-gen4» from git - "https://github.com/leanprover/doc-gen4" @ "main" + "https://github.com/leanprover/doc-gen4" @ "v4.29.0" require PhysLean from git "https://github.com/HEPLean/PhysLean" diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 763476c..99ee3f4 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -71,8 +71,8 @@ def _clear_workspace_cache(workspace_path: Path) -> None: shutil.rmtree(lake_dir) -def _get_doc_lib_names(package_name: str) -> list[str]: - """Get the library names to run doc-gen4 on for a package. +def _get_library_names(package_name: str) -> list[str]: + """Get the library names to build and run doc-gen4 on for a package. Some packages have custom extract wrappers, others use upstream libraries directly. """ @@ -86,6 +86,57 @@ def _get_doc_lib_names(package_name: str) -> list[str]: return lib_names.get(package_name, [f"{package_name.title()}Extract"]) +def _run_lake_build_target( + workspace_path: Path, + package_name: str, + target: str, + env: dict[str, str], + allow_failure: bool = False, +) -> bool: + """Run ``lake build `` streaming output to the logger. + + Args: + workspace_path: Path to the Lake workspace directory. + package_name: Name of the package for log messages. + target: The Lake build target to run. + env: Environment variables to pass to the subprocess. + allow_failure: Whether to continue when the target fails. + + Returns: + ``True`` if the target built successfully, otherwise ``False``. + + Raises: + RuntimeError: If the target fails and ``allow_failure`` is ``False``. + """ + logger.info("[%s] Running lake build %s...", package_name, target) + process = subprocess.Popen( + ["lake", "build", target], + cwd=workspace_path, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + ) + if process.stdout: + for line in process.stdout: + logger.info(line.rstrip()) + + returncode = process.wait() + if returncode == 0: + return True + + if allow_failure: + logger.warning( + "[%s] lake build %s failed (continuing with generated docs)", + package_name, + target, + ) + return False + + raise RuntimeError(f"lake build failed for {package_name} target {target}") + + def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]: """Fetch toolchain from GitHub and update lakefile. @@ -179,44 +230,18 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None: if result.returncode != 0: logger.warning("[%s] Cache fetch failed (non-fatal)", package_name) - logger.info("[%s] Running lake build...", package_name) - process = subprocess.Popen( - ["lake", "build"], - cwd=workspace_path, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - env=env, - ) - if process.stdout: - for line in process.stdout: - logger.info(line.rstrip()) - if process.wait() != 0: - raise RuntimeError(f"lake build failed for {package_name}") - - lib_names = _get_doc_lib_names(package_name) + lib_names = _get_library_names(package_name) for lib_name in lib_names: - logger.info("[%s] Running doc-gen4 (%s:docs)...", package_name, lib_name) + _run_lake_build_target(workspace_path, package_name, lib_name, env) - process = subprocess.Popen( - ["lake", "build", f"{lib_name}:docs"], - cwd=workspace_path, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - env=env, + for lib_name in lib_names: + _run_lake_build_target( + workspace_path, + package_name, + f"{lib_name}:docs", + env, + allow_failure=True, ) - if process.stdout: - for line in process.stdout: - logger.info(line.rstrip()) - returncode = process.wait() - if returncode != 0: - logger.warning( - "[%s] doc-gen4 had failures for %s (continuing with generated docs)", - package_name, lib_name, - ) async def run_doc_gen4( diff --git a/src/lean_explore/extract/package_registry.py b/src/lean_explore/extract/package_registry.py index a076e80..f9e0dd4 100644 --- a/src/lean_explore/extract/package_registry.py +++ b/src/lean_explore/extract/package_registry.py @@ -17,7 +17,7 @@ "physlean": PackageConfig( name="physlean", git_url="https://github.com/HEPLean/PhysLean", - module_prefixes=["PhysLean"], + module_prefixes=["Physlib", "QuantumInfo"], version_strategy=VersionStrategy.TAGGED, depends_on=["mathlib"], ), diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index f8443bf..e05339c 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -1,8 +1,13 @@ """Tests for doc-gen4 workspace orchestration.""" -from unittest.mock import patch +from unittest.mock import MagicMock, patch -from lean_explore.extract.doc_gen4 import _uses_sqlite_docgen, run_doc_gen4 +from lean_explore.extract.doc_gen4 import ( + _run_lake_for_package, + _uses_sqlite_docgen, + run_doc_gen4, +) +from lean_explore.extract.package_registry import PACKAGE_REGISTRY class TestDocGen4VersionDetection: @@ -60,3 +65,44 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): mock_clear.assert_not_called() mock_run.assert_called_once_with("mathlib", False) + + +class TestLakeBuildTargets: + """Tests for explicit Lake build target selection.""" + + def test_physlean_builds_wrapper_before_docs(self): + """Test that PhysLean builds the wrapper target before doc generation.""" + with patch("lean_explore.extract.doc_gen4._run_lake_update_with_retry"): + with patch("lean_explore.extract.doc_gen4.subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="", + stderr="", + ) + with patch( + "lean_explore.extract.doc_gen4.subprocess.Popen" + ) as mock_popen: + mock_process = MagicMock() + mock_process.stdout = iter([]) + mock_process.wait.return_value = 0 + mock_popen.return_value = mock_process + + _run_lake_for_package("physlean") + + assert mock_popen.call_args_list[0].args[0] == ["lake", "build", "PhysExtract"] + assert mock_popen.call_args_list[1].args[0] == [ + "lake", + "build", + "PhysExtract:docs", + ] + + +class TestPackageRegistry: + """Tests for package registry metadata.""" + + def test_physlean_uses_upstream_module_roots(self): + """Test that PhysLean filters on the upstream module roots.""" + assert PACKAGE_REGISTRY["physlean"].module_prefixes == [ + "Physlib", + "QuantumInfo", + ] From 268d8147a1d625e43007d4cb90d3e441a9661ea5 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sat, 11 Apr 2026 20:28:51 -0400 Subject: [PATCH 21/22] Allow library build failures in doc-gen4 pipeline Doc-gen4 can fail on individual modules (e.g. timeout computing equational lemmata) without invalidating the rest of the output. The explicit library build step should tolerate these failures like the :docs step already does, rather than aborting the entire pipeline. --- src/lean_explore/extract/doc_gen4.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index 99ee3f4..d915695 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -232,7 +232,9 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None: lib_names = _get_library_names(package_name) for lib_name in lib_names: - _run_lake_build_target(workspace_path, package_name, lib_name, env) + _run_lake_build_target( + workspace_path, package_name, lib_name, env, allow_failure=True, + ) for lib_name in lib_names: _run_lake_build_target( From a222f00155af26b78def1bfe3acc35dba0c08869 Mon Sep 17 00:00:00 2001 From: justincasher Date: Sun, 12 Apr 2026 01:34:09 -0400 Subject: [PATCH 22/22] Skip redundant library build before :docs target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The :docs facet transitively depends on :docInfo, which depends on the library oleans. Running lake build separately before lake build :docs causes Lake to process the doc-gen4 analysis twice — once during the explicit library build, then again as a replay during the :docs step. For mathlib this doubled the wall time (21K jobs instead of ~10K). The PhysLean issue this step was meant to fix is already resolved by the correct imports in PhysExtract.lean; the :docs target alone now builds everything it needs transitively. --- src/lean_explore/extract/doc_gen4.py | 8 +++----- tests/extract/doc_gen4_test.py | 9 ++++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py index d915695..e3e62ae 100644 --- a/src/lean_explore/extract/doc_gen4.py +++ b/src/lean_explore/extract/doc_gen4.py @@ -230,12 +230,10 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None: if result.returncode != 0: logger.warning("[%s] Cache fetch failed (non-fatal)", package_name) + # The :docs facet transitively depends on :docInfo, which depends on the + # library oleans, so a single lake build per library is sufficient. The + # explicit library build beforehand is redundant and doubles the work. lib_names = _get_library_names(package_name) - for lib_name in lib_names: - _run_lake_build_target( - workspace_path, package_name, lib_name, env, allow_failure=True, - ) - for lib_name in lib_names: _run_lake_build_target( workspace_path, diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py index e05339c..b1c8c49 100644 --- a/tests/extract/doc_gen4_test.py +++ b/tests/extract/doc_gen4_test.py @@ -68,10 +68,10 @@ async def test_fresh_sqlite_workspace_skips_cache_clear(self): class TestLakeBuildTargets: - """Tests for explicit Lake build target selection.""" + """Tests for Lake build target selection.""" - def test_physlean_builds_wrapper_before_docs(self): - """Test that PhysLean builds the wrapper target before doc generation.""" + def test_physlean_runs_docs_target(self): + """Test that PhysLean runs the :docs target for its wrapper library.""" with patch("lean_explore.extract.doc_gen4._run_lake_update_with_retry"): with patch("lean_explore.extract.doc_gen4.subprocess.run") as mock_run: mock_run.return_value = MagicMock( @@ -89,8 +89,7 @@ def test_physlean_builds_wrapper_before_docs(self): _run_lake_for_package("physlean") - assert mock_popen.call_args_list[0].args[0] == ["lake", "build", "PhysExtract"] - assert mock_popen.call_args_list[1].args[0] == [ + assert mock_popen.call_args_list[0].args[0] == [ "lake", "build", "PhysExtract:docs",