Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
881559b
Support doc-gen4 SQLite database format for extraction
justincasher Mar 22, 2026
b8a3075
Remove dead constructor kind check in SQLite parser
justincasher Apr 4, 2026
d2c7882
Use %-style logging consistently in new SQLite code
justincasher Apr 4, 2026
9360b49
Log skipped_prefix and skipped_constructor counters
justincasher Apr 4, 2026
6934848
Remove unused module_name parameter from _construct_source_link
justincasher Apr 4, 2026
a2ba5c5
Fix doc data script lint
justincasher Apr 4, 2026
e25c77d
Fix doc-gen4 version handling and tests
justincasher Apr 4, 2026
8224f5d
Correct doc-gen4 SQLite cutoff references
justincasher Apr 4, 2026
954bff5
Fix dependency extraction for SQLite-backed packages
justincasher Apr 11, 2026
73a90db
Fix dead source links for core modules in SQLite path
justincasher Apr 11, 2026
1f7b1d4
Detect Verso-only docstrings in SQLite extraction
justincasher Apr 11, 2026
a611dc0
Validate api-docs.db before trusting SQLite format detection
justincasher Apr 11, 2026
60fa48a
Filter non-rendered declarations in SQL instead of Python
justincasher Apr 11, 2026
c248244
Handle missing source files gracefully in BMP parser
justincasher Apr 11, 2026
35f8402
Pin doc-gen4 to main branch instead of version tags
justincasher Apr 11, 2026
8295e2b
Restore doc-gen4 version pinning to match package toolchain
justincasher Apr 11, 2026
ed58bc0
Clear cache on toolchain change even for SQLite-capable packages
justincasher Apr 11, 2026
65d4efa
Revert toolchain change detection in cache-skip logic
justincasher Apr 11, 2026
4d19e47
Fix PhysLean extractor build targets
justincasher Apr 12, 2026
4ad6b24
Allow library build failures in doc-gen4 pipeline
justincasher Apr 12, 2026
d9ce242
Skip redundant library build before :docs target
justincasher Apr 12, 2026
e3341bd
Build :docInfo instead of :docs for SQLite workspaces
justincasher Apr 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 115 additions & 40 deletions src/lean_explore/extract/doc_gen4.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import logging
import os
import re
import shutil
import subprocess
import time
Expand All @@ -23,6 +24,29 @@
logger = logging.getLogger(__name__)


def _uses_sqlite_docgen(lean_version: str) -> bool:
"""Return whether the matching doc-gen4 release writes api-docs.db.

Doc-gen4's SQLite output landed after `v4.29.0-rc1` and is present starting
with `v4.29.0-rc2`.
"""
match = extract_lean_version(lean_version)
version_match = re.match(r"^v(\d+)\.(\d+)\.(\d+)(?:-rc(\d+))?$", match)
if version_match is None:
return False

major, minor, patch, rc = version_match.groups()
version = (int(major), int(minor), int(patch))
if version > (4, 29, 0):
return True
if version < (4, 29, 0):
return False

if rc is None:
return True
return int(rc) >= 2


def _clear_workspace_cache(workspace_path: Path) -> None:
"""Clear entire Lake cache to force complete rebuild.

Expand All @@ -47,8 +71,8 @@ def _clear_workspace_cache(workspace_path: Path) -> None:
shutil.rmtree(lake_dir)


def _get_doc_lib_names(package_name: str) -> list[str]:
"""Get the library names to run doc-gen4 on for a package.
def _get_library_names(package_name: str) -> list[str]:
"""Get the library names to build and run doc-gen4 on for a package.

Some packages have custom extract wrappers, others use upstream libraries directly.
"""
Expand All @@ -62,6 +86,57 @@ def _get_doc_lib_names(package_name: str) -> list[str]:
return lib_names.get(package_name, [f"{package_name.title()}Extract"])


def _run_lake_build_target(
workspace_path: Path,
package_name: str,
target: str,
env: dict[str, str],
allow_failure: bool = False,
) -> bool:
"""Run ``lake build <target>`` streaming output to the logger.

Args:
workspace_path: Path to the Lake workspace directory.
package_name: Name of the package for log messages.
target: The Lake build target to run.
env: Environment variables to pass to the subprocess.
allow_failure: Whether to continue when the target fails.

Returns:
``True`` if the target built successfully, otherwise ``False``.

Raises:
RuntimeError: If the target fails and ``allow_failure`` is ``False``.
"""
logger.info("[%s] Running lake build %s...", package_name, target)
process = subprocess.Popen(
["lake", "build", target],
cwd=workspace_path,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
env=env,
)
if process.stdout:
for line in process.stdout:
logger.info(line.rstrip())

returncode = process.wait()
if returncode == 0:
return True

if allow_failure:
logger.warning(
"[%s] lake build %s failed (continuing with generated docs)",
package_name,
target,
)
return False

raise RuntimeError(f"lake build failed for {package_name} target {target}")


def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]:
"""Fetch toolchain from GitHub and update lakefile.

Expand Down Expand Up @@ -155,44 +230,27 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None:
if result.returncode != 0:
logger.warning("[%s] Cache fetch failed (non-fatal)", package_name)

logger.info("[%s] Running lake build...", package_name)
process = subprocess.Popen(
["lake", "build"],
cwd=workspace_path,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
env=env,
)
if process.stdout:
for line in process.stdout:
logger.info(line.rstrip())
if process.wait() != 0:
raise RuntimeError(f"lake build failed for {package_name}")
# For SQLite-format doc-gen4 we only need :docInfo, which populates
# api-docs.db and stops there. The :docs facet additionally runs `fromDb`
# to generate all HTML and the search index, which we don't consume.
# Older BMP-format doc-gen4 has no :docInfo facet, so :docs is the only
# option there.
toolchain_file = workspace_path / "lean-toolchain"
lake_target = "docs"
if toolchain_file.is_file():
toolchain = toolchain_file.read_text().strip()
if toolchain and _uses_sqlite_docgen(toolchain):
lake_target = "docInfo"

lib_names = _get_doc_lib_names(package_name)
lib_names = _get_library_names(package_name)
for lib_name in lib_names:
logger.info("[%s] Running doc-gen4 (%s:docs)...", package_name, lib_name)

process = subprocess.Popen(
["lake", "build", f"{lib_name}:docs"],
cwd=workspace_path,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
env=env,
_run_lake_build_target(
workspace_path,
package_name,
f"{lib_name}:{lake_target}",
env,
allow_failure=True,
)
if process.stdout:
for line in process.stdout:
logger.info(line.rstrip())
returncode = process.wait()
if returncode != 0:
logger.warning(
"[%s] doc-gen4 had failures for %s (continuing with generated docs)",
package_name, lib_name,
)


async def run_doc_gen4(
Expand Down Expand Up @@ -227,12 +285,29 @@ async def run_doc_gen4(
workspace_path = Path("lean") / package_name
logger.info("\n%s\nPackage: %s\n%s", "=" * 50, package_name, "=" * 50)

toolchain = None
ref = None
if fresh:
_clear_workspace_cache(workspace_path)
if setup:
toolchain, ref = _setup_workspace(config)
logger.info("Toolchain: %s, ref: %s", toolchain, ref)

# Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2.
# The SQLite format handles incremental updates, while legacy BMP
# output requires a cache clear to avoid stale files.
if toolchain and _uses_sqlite_docgen(toolchain):
logger.info(
"[%s] Skipping cache clear "
"(api-docs.db handles incremental updates)",
package_name,
)
else:
_clear_workspace_cache(workspace_path)

if setup:
toolchain, ref = _setup_workspace(config)
logger.info("Toolchain: %s, ref: %s", toolchain, ref)
if toolchain is None or ref is None:
toolchain, ref = _setup_workspace(config)
logger.info("Toolchain: %s, ref: %s", toolchain, ref)

_run_lake_for_package(package_name, verbose)

Expand Down
Loading
Loading