diff --git a/src/lean_explore/extract/doc_gen4.py b/src/lean_explore/extract/doc_gen4.py
index 621e55a..759227e 100644
--- a/src/lean_explore/extract/doc_gen4.py
+++ b/src/lean_explore/extract/doc_gen4.py
@@ -6,6 +6,7 @@
 
 import logging
 import os
+import re
 import shutil
 import subprocess
 import time
@@ -23,6 +24,29 @@
 logger = logging.getLogger(__name__)
 
 
+def _uses_sqlite_docgen(lean_version: str) -> bool:
+    """Return whether the matching doc-gen4 release writes api-docs.db.
+
+    Doc-gen4's SQLite output landed after `v4.29.0-rc1` and is present starting
+    with `v4.29.0-rc2`.
+    """
+    match = extract_lean_version(lean_version)
+    version_match = re.match(r"^v(\d+)\.(\d+)\.(\d+)(?:-rc(\d+))?$", match)
+    if version_match is None:
+        return False
+
+    major, minor, patch, rc = version_match.groups()
+    version = (int(major), int(minor), int(patch))
+    if version > (4, 29, 0):
+        return True
+    if version < (4, 29, 0):
+        return False
+
+    if rc is None:
+        return True
+    return int(rc) >= 2
+
+
 def _clear_workspace_cache(workspace_path: Path) -> None:
     """Clear entire Lake cache to force complete rebuild.
 
@@ -47,8 +71,8 @@ def _clear_workspace_cache(workspace_path: Path) -> None:
         shutil.rmtree(lake_dir)
 
 
-def _get_doc_lib_names(package_name: str) -> list[str]:
-    """Get the library names to run doc-gen4 on for a package.
+def _get_library_names(package_name: str) -> list[str]:
+    """Get the library names to build and run doc-gen4 on for a package.
 
     Some packages have custom extract wrappers, others use upstream libraries directly.
     """
@@ -62,6 +86,57 @@ def _get_doc_lib_names(package_name: str) -> list[str]:
     return lib_names.get(package_name, [f"{package_name.title()}Extract"])
 
 
+def _run_lake_build_target(
+    workspace_path: Path,
+    package_name: str,
+    target: str,
+    env: dict[str, str],
+    allow_failure: bool = False,
+) -> bool:
+    """Run ``lake build <target>`` streaming output to the logger.
+
+    Args:
+        workspace_path: Path to the Lake workspace directory.
+        package_name: Name of the package for log messages.
+        target: The Lake build target to run.
+        env: Environment variables to pass to the subprocess.
+        allow_failure: Whether to continue when the target fails.
+
+    Returns:
+        ``True`` if the target built successfully, otherwise ``False``.
+
+    Raises:
+        RuntimeError: If the target fails and ``allow_failure`` is ``False``.
+    """
+    logger.info("[%s] Running lake build %s...", package_name, target)
+    process = subprocess.Popen(
+        ["lake", "build", target],
+        cwd=workspace_path,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
+    )
+    if process.stdout:
+        for line in process.stdout:
+            logger.info(line.rstrip())
+
+    returncode = process.wait()
+    if returncode == 0:
+        return True
+
+    if allow_failure:
+        logger.warning(
+            "[%s] lake build %s failed (continuing with generated docs)",
+            package_name,
+            target,
+        )
+        return False
+
+    raise RuntimeError(f"lake build failed for {package_name} target {target}")
+
+
 def _setup_workspace(package_config: PackageConfig) -> tuple[str, str]:
     """Fetch toolchain from GitHub and update lakefile.
 
@@ -155,44 +230,27 @@ def _run_lake_for_package(package_name: str, verbose: bool = False) -> None:
         if result.returncode != 0:
             logger.warning("[%s] Cache fetch failed (non-fatal)", package_name)
 
-    logger.info("[%s] Running lake build...", package_name)
-    process = subprocess.Popen(
-        ["lake", "build"],
-        cwd=workspace_path,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        bufsize=1,
-        env=env,
-    )
-    if process.stdout:
-        for line in process.stdout:
-            logger.info(line.rstrip())
-    if process.wait() != 0:
-        raise RuntimeError(f"lake build failed for {package_name}")
+    # For SQLite-format doc-gen4 we only need :docInfo, which populates
+    # api-docs.db and stops there. The :docs facet additionally runs `fromDb`
+    # to generate all HTML and the search index, which we don't consume.
+    # Older BMP-format doc-gen4 has no :docInfo facet, so :docs is the only
+    # option there.
+    toolchain_file = workspace_path / "lean-toolchain"
+    lake_target = "docs"
+    if toolchain_file.is_file():
+        toolchain = toolchain_file.read_text().strip()
+        if toolchain and _uses_sqlite_docgen(toolchain):
+            lake_target = "docInfo"
 
-    lib_names = _get_doc_lib_names(package_name)
+    lib_names = _get_library_names(package_name)
     for lib_name in lib_names:
-        logger.info("[%s] Running doc-gen4 (%s:docs)...", package_name, lib_name)
-
-        process = subprocess.Popen(
-            ["lake", "build", f"{lib_name}:docs"],
-            cwd=workspace_path,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            text=True,
-            bufsize=1,
-            env=env,
+        _run_lake_build_target(
+            workspace_path,
+            package_name,
+            f"{lib_name}:{lake_target}",
+            env,
+            allow_failure=True,
         )
-        if process.stdout:
-            for line in process.stdout:
-                logger.info(line.rstrip())
-        returncode = process.wait()
-        if returncode != 0:
-            logger.warning(
-                "[%s] doc-gen4 had failures for %s (continuing with generated docs)",
-                package_name, lib_name,
-            )
 
 
 async def run_doc_gen4(
@@ -227,12 +285,29 @@ async def run_doc_gen4(
         workspace_path = Path("lean") / package_name
         logger.info("\n%s\nPackage: %s\n%s", "=" * 50, package_name, "=" * 50)
 
+        toolchain = None
+        ref = None
         if fresh:
-            _clear_workspace_cache(workspace_path)
+            if setup:
+                toolchain, ref = _setup_workspace(config)
+                logger.info("Toolchain: %s, ref: %s", toolchain, ref)
+
+            # Doc-gen4 switched from BMP files to api-docs.db in v4.29.0-rc2.
+            # The SQLite format handles incremental updates, while legacy BMP
+            # output requires a cache clear to avoid stale files.
+            if toolchain and _uses_sqlite_docgen(toolchain):
+                logger.info(
+                    "[%s] Skipping cache clear "
+                    "(api-docs.db handles incremental updates)",
+                    package_name,
+                )
+            else:
+                _clear_workspace_cache(workspace_path)
 
         if setup:
-            toolchain, ref = _setup_workspace(config)
-            logger.info("Toolchain: %s, ref: %s", toolchain, ref)
+            if toolchain is None or ref is None:
+                toolchain, ref = _setup_workspace(config)
+                logger.info("Toolchain: %s, ref: %s", toolchain, ref)
 
         _run_lake_for_package(package_name, verbose)
 
diff --git a/src/lean_explore/extract/doc_parser.py b/src/lean_explore/extract/doc_parser.py
index 707e62a..2ca2aa4 100644
--- a/src/lean_explore/extract/doc_parser.py
+++ b/src/lean_explore/extract/doc_parser.py
@@ -1,12 +1,17 @@
 """Parser for Lean doc-gen4 output files.
 
-This module parses doc-gen4 JSON data and extracts Lean source code
-to produce Declaration objects ready for database insertion.
+This module parses doc-gen4 output and extracts Lean source code to produce
+Declaration objects ready for database insertion.
+
+Supports two doc-gen4 output formats:
+- SQLite database (api-docs.db): Used by doc-gen4 >= v4.29.0-rc2
+- BMP JSON files (.bmp): Used by doc-gen4 < v4.29.0-rc2
 """
 
 import json
 import logging
 import re
+import sqlite3
 from pathlib import Path
 
 from rich.progress import (
@@ -26,6 +31,148 @@
 logger = logging.getLogger(__name__)
 
 
+# ---------------------------------------------------------------------------
+# RenderedCode BLOB parser
+#
+# Doc-gen4 stores declaration type signatures in the `name_info.type` column
+# as a binary BLOB using leansqlite's ToBinary serialization format.
+#
+# The type is  RenderedCode = TaggedText RenderedCode.Tag  where:
+#   TaggedText:  text(0) String | tag(1) Tag TaggedText | append(2) Array
+#   Tag:         keyword(0) | string(1) | const(2) Name | sort-none(3)
+#                | sort-type(4) | sort-prop(5) | sort-sort(6) | otherExpr(7)
+#   Name:        anonymous(0) | str(1) Name String | num(2) Name Nat
+#
+# Encoding primitives (big-endian, leansqlite Classes.lean):
+#   Nat  – variable-length 7-bit chunks, high bit = continuation
+#   String – Nat(utf8_byte_length) + raw UTF-8 bytes
+#   Array  – Nat(count) + elements
+# ---------------------------------------------------------------------------
+
+
+class _BlobReader:
+    """Minimal reader for leansqlite ToBinary format."""
+
+    __slots__ = ("_data", "_cursor")
+
+    def __init__(self, data: bytes) -> None:
+        self._data = data
+        self._cursor = 0
+
+    def _read_byte(self) -> int:
+        if self._cursor >= len(self._data):
+            raise ValueError("Unexpected end of BLOB data")
+        value = self._data[self._cursor]
+        self._cursor += 1
+        return value
+
+    def _read_nat(self) -> int:
+        """Read a variable-length natural number (7-bit chunks, MSB = more)."""
+        result = 0
+        shift = 0
+        while True:
+            byte = self._read_byte()
+            if byte >= 128:
+                result |= (byte & 0x7F) << shift
+            else:
+                result |= byte << shift
+                break
+            shift += 7
+        return result
+
+    def _read_string(self) -> str:
+        byte_length = self._read_nat()
+        if self._cursor + byte_length > len(self._data):
+            raise ValueError("String extends past end of BLOB data")
+        raw = self._data[self._cursor : self._cursor + byte_length]
+        self._cursor += byte_length
+        return raw.decode("utf-8")
+
+    def _read_name(self) -> str:
+        """Read a Lean Name and return its dot-separated string form."""
+        tag = self._read_byte()
+        if tag == 0:  # anonymous
+            return ""
+        if tag == 1:  # str parent s
+            parent = self._read_name()
+            component = self._read_string()
+            return f"{parent}.{component}" if parent else component
+        if tag == 2:  # num parent n
+            parent = self._read_name()
+            number = self._read_nat()
+            return f"{parent}.{number}" if parent else str(number)
+        raise ValueError(f"Invalid Name tag: {tag}")
+
+    def _skip_name(self) -> None:
+        """Skip over a Name without allocating strings."""
+        tag = self._read_byte()
+        if tag == 0:
+            return
+        if tag == 1:
+            self._skip_name()
+            byte_length = self._read_nat()
+            self._cursor += byte_length
+            return
+        if tag == 2:
+            self._skip_name()
+            self._read_nat()
+            return
+        raise ValueError(f"Invalid Name tag: {tag}")
+
+
+def _extract_names_from_rendered_code(blob: bytes) -> list[str]:
+    """Extract referenced declaration names from a RenderedCode BLOB.
+
+    Walks the TaggedText tree and collects Lean Names from every
+    RenderedCode.Tag.const node (tag byte 2).
+
+    Args:
+        blob: Raw bytes of the RenderedCode BLOB from name_info.type.
+
+    Returns:
+        De-duplicated list of fully-qualified Lean names referenced in the type.
+    """
+    reader = _BlobReader(blob)
+    names: list[str] = []
+    seen: set[str] = set()
+
+    def walk_tagged_text() -> None:
+        tag = reader._read_byte()
+        if tag == 0:  # text
+            reader._read_string()
+        elif tag == 1:  # tag
+            walk_tag()
+            walk_tagged_text()
+        elif tag == 2:  # append
+            count = reader._read_nat()
+            for _ in range(count):
+                walk_tagged_text()
+        else:
+            raise ValueError(f"Invalid TaggedText tag: {tag}")
+
+    def walk_tag() -> None:
+        tag = reader._read_byte()
+        if tag <= 1 or (3 <= tag <= 7):
+            # keyword(0), string(1), sort-none(3), sort-type(4),
+            # sort-prop(5), sort-sort(6), otherExpr(7) — no payload
+            return
+        if tag == 2:  # const
+            name = reader._read_name()
+            if name and name not in seen:
+                names.append(name)
+                seen.add(name)
+            return
+        raise ValueError(f"Invalid RenderedCode.Tag tag: {tag}")
+
+    try:
+        walk_tagged_text()
+    except (ValueError, IndexError):
+        logger.debug("Failed to parse RenderedCode BLOB (%d bytes)", len(blob))
+        return []
+
+    return names
+
+
 def _strip_lean_comments(source_text: str) -> str:
     """Strip Lean comments from source text for comparison.
 
@@ -270,6 +417,16 @@ def _extract_source_text(
         package_name.replace("-", "").lower(),
     ]:
         if variant in package_cache:
+            if variant == "lean4" and file_path_string.startswith("src/lean/"):
+                adjusted_path = file_path_string[9:]
+                candidates.append(package_cache[variant] / adjusted_path)
+                continue
+            if variant == "lean4" and file_path_string.startswith("src/lake/"):
+                adjusted_path = file_path_string[9:]
+                candidates.append(
+                    package_cache[variant].parent / "lake" / adjusted_path
+                )
+                continue
             if variant == "lean4" and file_path_string.startswith("src/"):
                 adjusted_path = file_path_string[4:]
             else:
@@ -292,6 +449,251 @@ def _extract_source_text(
     )
 
 
+def _read_lean_toolchain_version(workspace_path: Path) -> str | None:
+    """Read the Lean version from a workspace's lean-toolchain file.
+
+    Args:
+        workspace_path: Path to the package workspace (e.g., lean/mathlib).
+
+    Returns:
+        Version string like 'v4.29.0-rc6', or None if not found.
+    """
+    toolchain_file = workspace_path / "lean-toolchain"
+    if not toolchain_file.exists():
+        return None
+    try:
+        content = toolchain_file.read_text().strip()
+        match = re.search(r"v\d+\.\d+\.\d+(?:-rc\d+)?", content)
+        return match.group() if match else None
+    except OSError:
+        return None
+
+
+def _construct_source_link(
+    module_name: str,
+    module_source_url: str | None,
+    start_line: int,
+    end_line: int,
+    lean_version: str | None = None,
+) -> str | None:
+    """Construct a GitHub source link from module URL and line range.
+
+    Args:
+        module_name: Lean module name from api-docs.db.
+        module_source_url: GitHub URL to the module file from api-docs.db.
+        start_line: Start line number in the source file.
+        end_line: End line number in the source file.
+        lean_version: Lean toolchain version (e.g., 'v4.29.0-rc6') used as
+            the git ref for core module fallback URLs.
+
+    Returns:
+        GitHub URL with line range fragment, or None if no source URL exists.
+    """
+    if module_source_url:
+        return f"{module_source_url}#L{start_line}-L{end_line}"
+
+    git_ref = lean_version or "master"
+    module_path = module_name.replace(".", "/")
+    root = module_name.split(".", 1)[0]
+    if root in {"Init", "Lean", "Std"}:
+        return (
+            f"https://github.com/leanprover/lean4/blob/{git_ref}/"
+            f"src/lean/{module_path}.lean#L{start_line}-L{end_line}"
+        )
+    if root == "Lake":
+        return (
+            f"https://github.com/leanprover/lean4/blob/{git_ref}/"
+            f"src/lake/{module_path}.lean#L{start_line}-L{end_line}"
+        )
+
+    return None
+
+
+def _parse_declarations_from_sqlite(
+    database_path: Path,
+    lean_root: Path,
+    package_cache: dict[str, Path],
+    allowed_module_prefixes: list[str],
+    lean_version: str | None = None,
+) -> list[Declaration]:
+    """Parse declarations from a doc-gen4 SQLite database (api-docs.db).
+
+    Doc-gen4 >= v4.29.0-rc2 outputs declaration data to a SQLite database
+    instead of individual BMP JSON files. This function reads that database
+    and produces the same Declaration objects as the BMP parser.
+
+    Args:
+        database_path: Path to the api-docs.db SQLite database.
+        lean_root: Root directory of the Lean project.
+        package_cache: Dictionary mapping package names to their directories.
+        allowed_module_prefixes: Module prefixes to extract (e.g., ["Mathlib"]).
+        lean_version: Lean toolchain version for core module source links.
+
+    Returns:
+        List of parsed Declaration objects.
+    """
+    declarations = []
+
+    connection = sqlite3.connect(str(database_path))
+    connection.row_factory = sqlite3.Row
+
+    try:
+        # Query declarations with source ranges and both docstring types.
+        # Doc-gen4 stores docstrings as either markdown text or Verso binary
+        # BLOBs (never both). We prefer markdown; Verso BLOBs require a
+        # complex deserializer so we detect but cannot extract them yet.
+        query = """
+            SELECT
+                n.module_name,
+                n.position,
+                n.kind,
+                n.name,
+                n.type,
+                r.start_line,
+                r.end_line,
+                d.text AS docstring,
+                v.content AS verso_docstring,
+                m.source_url
+            FROM name_info n
+            JOIN declaration_ranges r
+                ON n.module_name = r.module_name AND n.position = r.position
+            LEFT JOIN declaration_markdown_docstrings d
+                ON n.module_name = d.module_name AND n.position = d.position
+            LEFT JOIN declaration_verso_docstrings v
+                ON n.module_name = v.module_name AND n.position = v.position
+            JOIN modules m
+                ON n.module_name = m.name
+            WHERE n.render = 1
+            ORDER BY n.module_name, n.position
+        """
+        rows = connection.execute(query).fetchall()
+
+        logger.info("Found %d declarations in api-docs.db", len(rows))
+
+        skipped_no_source = 0
+        skipped_prefix = 0
+        skipped_constructor = 0
+        source_errors = 0
+        verso_only_docstrings = 0
+
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+        ) as progress:
+            task = progress.add_task(
+                "[cyan]Parsing api-docs.db...", total=len(rows)
+            )
+
+            for row in rows:
+                module_name = row["module_name"]
+                declaration_name = row["name"]
+
+                # Filter by module prefix
+                matches_prefix = any(
+                    module_name == prefix or module_name.startswith(prefix + ".")
+                    for prefix in allowed_module_prefixes
+                )
+                if not matches_prefix:
+                    skipped_prefix += 1
+                    progress.update(task, advance=1)
+                    continue
+
+                # Skip auto-generated .mk constructors
+                if declaration_name.endswith(".mk"):
+                    skipped_constructor += 1
+                    progress.update(task, advance=1)
+                    continue
+
+                source_url = row["source_url"]
+                start_line = row["start_line"]
+                end_line = row["end_line"]
+
+                source_link = _construct_source_link(
+                    module_name, source_url, start_line, end_line,
+                    lean_version=lean_version,
+                )
+                if not source_link:
+                    skipped_no_source += 1
+                    progress.update(task, advance=1)
+                    continue
+
+                # Extract source text from local files
+                try:
+                    source_text = _extract_source_text(
+                        source_link, lean_root, package_cache
+                    )
+                except (FileNotFoundError, ValueError) as error:
+                    source_errors += 1
+                    if source_errors <= 10:
+                        logger.debug(
+                            "Could not extract source for %s: %s",
+                            declaration_name, error,
+                        )
+                    progress.update(task, advance=1)
+                    continue
+
+                # Extract dependency names from the type signature BLOB
+                type_blob = row["type"]
+                if type_blob:
+                    dep_names = _extract_names_from_rendered_code(
+                        bytes(type_blob)
+                    )
+                    # Filter out self-references
+                    dep_names = [
+                        d for d in dep_names if d != declaration_name
+                    ]
+                    dependencies = dep_names or None
+                else:
+                    dependencies = None
+
+                # Use markdown docstring; detect Verso-only cases
+                docstring = row["docstring"]
+                if not docstring and row["verso_docstring"]:
+                    verso_only_docstrings += 1
+
+                declarations.append(
+                    Declaration(
+                        name=declaration_name,
+                        module=module_name,
+                        docstring=docstring,
+                        source_text=source_text,
+                        source_link=source_link,
+                        dependencies=dependencies,
+                    )
+                )
+
+                progress.update(task, advance=1)
+
+        if skipped_prefix > 0:
+            logger.info(
+                "Skipped %d declarations outside allowed prefixes",
+                skipped_prefix,
+            )
+        if skipped_constructor > 0:
+            logger.info("Skipped %d .mk constructors", skipped_constructor)
+        if skipped_no_source > 0:
+            logger.info("Skipped %d declarations without source URL", skipped_no_source)
+        if verso_only_docstrings > 0:
+            logger.warning(
+                "%d declarations have Verso-only docstrings "
+                "(not yet supported, stored as docstring=None)",
+                verso_only_docstrings,
+            )
+        if source_errors > 0:
+            logger.warning(
+                "Could not extract source text for %d declarations",
+                source_errors,
+            )
+
+    finally:
+        connection.close()
+
+    return declarations
+
+
 def _parse_declarations_from_files(
     bmp_files: list[Path],
     lean_root: Path,
@@ -310,6 +712,7 @@ def _parse_declarations_from_files(
         List of parsed Declaration objects.
     """
     declarations = []
+    source_errors = 0
 
     with Progress(
         SpinnerColumn(),
@@ -338,23 +741,33 @@ def _parse_declarations_from_files(
 
             for declaration_data in data.get("declarations", []):
                 information = declaration_data["info"]
-                source_text = _extract_source_text(
-                    information["sourceLink"], lean_root, package_cache
-                )
+                declaration_name = information["name"]
+
+                # Skip auto-generated .mk constructors
+                if declaration_name.endswith(".mk"):
+                    continue
+
+                try:
+                    source_text = _extract_source_text(
+                        information["sourceLink"], lean_root, package_cache
+                    )
+                except (FileNotFoundError, ValueError) as error:
+                    source_errors += 1
+                    if source_errors <= 10:
+                        logger.debug(
+                            "Could not extract source for %s: %s",
+                            declaration_name, error,
+                        )
+                    continue
 
                 header_html = declaration_data.get("header", "")
                 dependencies = _extract_dependencies_from_html(header_html)
 
                 # Filter out self-references from dependencies
-                declaration_name = information["name"]
                 filtered_dependencies = [
                     d for d in dependencies if d != declaration_name
                 ]
 
-                # Skip auto-generated .mk constructors
-                if declaration_name.endswith(".mk"):
-                    continue
-
                 declarations.append(
                     Declaration(
                         name=declaration_name,
@@ -368,6 +781,12 @@ def _parse_declarations_from_files(
 
             progress.update(task, advance=1)
 
+    if source_errors > 0:
+        logger.warning(
+            "Could not extract source text for %d declarations",
+            source_errors,
+        )
+
     return declarations
 
 
@@ -428,12 +847,88 @@ async def _insert_declarations_batch(
     return inserted_count
 
 
+_REQUIRED_DOCGEN_TABLES = {"name_info", "declaration_ranges", "modules"}
+
+
+def _validate_docgen_sqlite(database_path: Path) -> bool:
+    """Check that a doc-gen4 api-docs.db is a valid, usable SQLite database.
+
+    Verifies the file is non-empty, opens as SQLite, and contains the tables
+    that the extraction pipeline requires.
+
+    Args:
+        database_path: Path to the api-docs.db file.
+
+    Returns:
+        True if the database is valid and contains the required tables.
+    """
+    if database_path.stat().st_size == 0:
+        logger.warning("api-docs.db exists but is empty: %s", database_path)
+        return False
+
+    try:
+        connection = sqlite3.connect(str(database_path))
+        try:
+            cursor = connection.execute(
+                "SELECT name FROM sqlite_master WHERE type='table'"
+            )
+            tables = {row[0] for row in cursor.fetchall()}
+        finally:
+            connection.close()
+    except sqlite3.DatabaseError as error:
+        logger.warning("api-docs.db is not a valid SQLite file: %s", error)
+        return False
+
+    missing = _REQUIRED_DOCGEN_TABLES - tables
+    if missing:
+        logger.warning(
+            "api-docs.db is missing required tables %s: %s",
+            missing, database_path,
+        )
+        return False
+
+    return True
+
+
+def _detect_docgen_format(workspace_path: Path) -> str:
+    """Detect which doc-gen4 output format a workspace uses.
+
+    Doc-gen4 >= v4.29.0-rc2 writes to a SQLite database (api-docs.db).
+    Earlier versions write individual BMP JSON files to doc-data/.
+
+    The SQLite file is validated before returning "sqlite" to guard against
+    zero-byte, corrupt, or incompatible databases left by crashed builds.
+
+    Args:
+        workspace_path: Path to the package workspace (e.g., lean/mathlib).
+
+    Returns:
+        "sqlite" if a valid api-docs.db exists, "bmp" if BMP files exist,
+        "none" otherwise.
+    """
+    api_docs_db = workspace_path / ".lake" / "build" / "api-docs.db"
+    if api_docs_db.exists():
+        if _validate_docgen_sqlite(api_docs_db):
+            return "sqlite"
+        logger.warning(
+            "Invalid api-docs.db at %s, checking for BMP fallback",
+            api_docs_db,
+        )
+
+    doc_data_dir = workspace_path / ".lake" / "build" / "doc-data"
+    if doc_data_dir.exists():
+        bmp_files = list(doc_data_dir.glob("**/*.bmp"))
+        if bmp_files:
+            return "bmp"
+
+    return "none"
+
+
 async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> None:
     """Extract all declarations from doc-gen4 data and load into database.
 
-    Looks for BMP files in each package's .lake/build/doc-data directory.
-    Extracts only declarations matching the package's configured module_prefixes,
-    ensuring each package's declarations come from its own workspace.
+    Automatically detects whether each package uses the newer SQLite format
+    (api-docs.db from doc-gen4 >= v4.29.0-rc2) or the legacy BMP JSON format.
 
     Args:
         engine: SQLAlchemy async engine for database connection.
@@ -445,21 +940,13 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N
     lean_root = Path("lean")
     all_declarations = []
 
-    # Process each workspace separately with its own package cache
     for package_name in get_extraction_order():
         package_config = PACKAGE_REGISTRY[package_name]
-        doc_data_dir = lean_root / package_name / ".lake" / "build" / "doc-data"
-
-        if not doc_data_dir.exists():
-            logger.warning(
-                "No doc-data directory for %s: %s", package_name, doc_data_dir
-            )
-            continue
-
-        bmp_files = sorted(doc_data_dir.glob("**/*.bmp"))
-        logger.info("Found %d BMP files in %s", len(bmp_files), package_name)
+        workspace_path = lean_root / package_name
+        docgen_format = _detect_docgen_format(workspace_path)
 
-        if not bmp_files:
+        if docgen_format == "none":
+            logger.warning("No doc-gen4 output found for %s", package_name)
             continue
 
         # Build workspace-specific package cache to avoid version mismatches
@@ -469,9 +956,31 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N
             package_name, len(package_cache),
         )
 
-        declarations = _parse_declarations_from_files(
-            bmp_files, lean_root, package_cache, package_config.module_prefixes
-        )
+        if docgen_format == "sqlite":
+            api_docs_path = workspace_path / ".lake" / "build" / "api-docs.db"
+            lean_version = _read_lean_toolchain_version(workspace_path)
+            logger.info(
+                "[%s] Using SQLite format (api-docs.db)", package_name
+            )
+            declarations = _parse_declarations_from_sqlite(
+                api_docs_path,
+                lean_root,
+                package_cache,
+                package_config.module_prefixes,
+                lean_version=lean_version,
+            )
+        else:
+            doc_data_dir = workspace_path / ".lake" / "build" / "doc-data"
+            bmp_files = sorted(doc_data_dir.glob("**/*.bmp"))
+            logger.info(
+                "[%s] Using BMP format (%d files)",
+                package_name, len(bmp_files),
+            )
+            declarations = _parse_declarations_from_files(
+                bmp_files, lean_root, package_cache,
+                package_config.module_prefixes,
+            )
+
         logger.info(
             "Extracted %d declarations from %s (prefixes: %s)",
             len(declarations), package_name, package_config.module_prefixes,
@@ -479,7 +988,9 @@ async def extract_declarations(engine: AsyncEngine, batch_size: int = 1000) -> N
         all_declarations.extend(declarations)
 
     if not all_declarations:
-        raise FileNotFoundError("No declarations extracted from any package workspace")
+        raise FileNotFoundError(
+            "No declarations extracted from any package workspace"
+        )
 
     logger.info("Total declarations extracted: %d", len(all_declarations))
 
diff --git a/src/lean_explore/extract/package_registry.py b/src/lean_explore/extract/package_registry.py
index a076e80..f9e0dd4 100644
--- a/src/lean_explore/extract/package_registry.py
+++ b/src/lean_explore/extract/package_registry.py
@@ -17,7 +17,7 @@
     "physlean": PackageConfig(
         name="physlean",
         git_url="https://github.com/HEPLean/PhysLean",
-        module_prefixes=["PhysLean"],
+        module_prefixes=["Physlib", "QuantumInfo"],
         version_strategy=VersionStrategy.TAGGED,
         depends_on=["mathlib"],
     ),
diff --git a/src/lean_explore/extract/package_utils.py b/src/lean_explore/extract/package_utils.py
index f5912c0..ab5d5b8 100644
--- a/src/lean_explore/extract/package_utils.py
+++ b/src/lean_explore/extract/package_utils.py
@@ -82,7 +82,12 @@ def get_package_toolchain(package_configuration: PackageConfig) -> tuple[str, st
 
 
 def update_lakefile_docgen_version(lakefile_path: Path, lean_version: str) -> None:
-    """Update the doc-gen4 version in a lakefile to match the Lean version.
+    """Update the doc-gen4 version in a lakefile to match the Lean toolchain.
+
+    Doc-gen4 releases are tagged to match Lean toolchain versions, so pinning
+    to the same version ensures compatibility. The doc-gen4 ``require`` must
+    appear before the main package ``require`` so that the main package's
+    transitive dependency versions take precedence during resolution.
 
     Args:
         lakefile_path: Path to lakefile.lean
diff --git a/tests/extract/__main___test.py b/tests/extract/__main___test.py
index 84607c6..aa9ee4d 100644
--- a/tests/extract/__main___test.py
+++ b/tests/extract/__main___test.py
@@ -50,32 +50,65 @@ class TestDocGen4Step:
     @pytest.mark.external
     async def test_run_doc_gen4_step_success(self, temp_directory):
         """Test successful doc-gen4 execution."""
-        lean_directory = temp_directory / "lean"
-        lean_directory.mkdir()
-
-        # Mock subprocess to avoid actually running lake
-        with patch("lean_explore.extract.doc_gen4.subprocess.Popen") as mock_popen:
-            mock_process = MagicMock()
-            mock_process.stdout = iter(["Building...\n", "Complete!\n"])
-            mock_process.wait.return_value = 0
-            mock_popen.return_value = mock_process
+        with patch(
+            "lean_explore.extract.doc_gen4.get_extraction_order",
+            return_value=["mathlib"],
+        ):
+            with patch(
+                "lean_explore.extract.doc_gen4._setup_workspace",
+                return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"),
+            ):
+                with patch(
+                    "lean_explore.extract.doc_gen4.subprocess.run"
+                ) as mock_run:
+                    mock_run.return_value = MagicMock(
+                        returncode=0,
+                        stdout="",
+                        stderr="",
+                    )
+                    with patch(
+                        "lean_explore.extract.doc_gen4.subprocess.Popen"
+                    ) as mock_popen:
+                        mock_process = MagicMock()
+                        mock_process.stdout = iter(["Building...\n", "Complete!\n"])
+                        mock_process.wait.return_value = 0
+                        mock_popen.return_value = mock_process
 
-            await _run_doc_gen4_step()
+                        await _run_doc_gen4_step()
 
-            # Called twice: "lake build" and "lake build LeanExtract:docs"
-            assert mock_popen.call_count == 2
+        assert mock_run.call_count == 2
+        # Called twice: "lake build" and "lake build MathExtract:docs"
+        assert mock_popen.call_count == 2
 
     @pytest.mark.external
     async def test_run_doc_gen4_step_failure(self):
         """Test doc-gen4 execution failure."""
-        with patch("lean_explore.extract.doc_gen4.subprocess.Popen") as mock_popen:
-            mock_process = MagicMock()
-            mock_process.stdout = iter(["Error!\n"])
-            mock_process.wait.return_value = 1
-            mock_popen.return_value = mock_process
-
-            with pytest.raises(RuntimeError, match="failed"):
-                await _run_doc_gen4_step()
+        with patch(
+            "lean_explore.extract.doc_gen4.get_extraction_order",
+            return_value=["mathlib"],
+        ):
+            with patch(
+                "lean_explore.extract.doc_gen4._setup_workspace",
+                return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"),
+            ):
+                with patch(
+                    "lean_explore.extract.doc_gen4.subprocess.run"
+                ) as mock_run:
+                    mock_run.return_value = MagicMock(
+                        returncode=0,
+                        stdout="",
+                        stderr="",
+                    )
+                    with patch(
+                        "lean_explore.extract.doc_gen4.subprocess.Popen"
+                    ) as mock_popen:
+                        mock_process = MagicMock()
+                        mock_process.stdout = iter(["Error!\n"])
+                        mock_process.wait.return_value = 1
+                        mock_popen.return_value = mock_process
+
+                        with pytest.raises(RuntimeError, match="failed"):
+                            await _run_doc_gen4_step()
 
 
 class TestExtractionStep:
diff --git a/tests/extract/doc_gen4_test.py b/tests/extract/doc_gen4_test.py
new file mode 100644
index 0000000..ac4fa77
--- /dev/null
+++ b/tests/extract/doc_gen4_test.py
@@ -0,0 +1,124 @@
+"""Tests for doc-gen4 workspace orchestration."""
+
+from unittest.mock import MagicMock, patch
+
+from lean_explore.extract.doc_gen4 import (
+    _run_lake_for_package,
+    _uses_sqlite_docgen,
+    run_doc_gen4,
+)
+from lean_explore.extract.package_registry import PACKAGE_REGISTRY
+
+
+class TestDocGen4VersionDetection:
+    """Tests for doc-gen4 format detection by Lean version."""
+
+    def test_uses_sqlite_docgen_from_rc2(self):
+        """Test the SQLite cutoff at v4.29.0-rc2."""
+        assert _uses_sqlite_docgen("leanprover/lean4:v4.29.0-rc2")
+        assert _uses_sqlite_docgen("leanprover/lean4:v4.29.0")
+        assert not _uses_sqlite_docgen("leanprover/lean4:v4.29.0-rc1")
+        assert not _uses_sqlite_docgen("leanprover/lean4:v4.28.0")
+
+
+class TestRunDocGen4FreshHandling:
+    """Tests for fresh-cache handling in run_doc_gen4."""
+
+    async def test_fresh_legacy_workspace_clears_cache(self):
+        """Test that legacy doc-gen4 workspaces still clear the cache."""
+        with patch(
+            "lean_explore.extract.doc_gen4.get_extraction_order",
+            return_value=["mathlib"],
+        ):
+            with patch(
+                "lean_explore.extract.doc_gen4._setup_workspace",
+                return_value=("leanprover/lean4:v4.29.0-rc1", "v4.29.0-rc1"),
+            ):
+                with patch(
+                    "lean_explore.extract.doc_gen4._clear_workspace_cache"
+                ) as mock_clear:
+                    with patch(
+                        "lean_explore.extract.doc_gen4._run_lake_for_package"
+                    ) as mock_run:
+                        await run_doc_gen4(fresh=True)
+
+        mock_clear.assert_called_once()
+        mock_run.assert_called_once_with("mathlib", False)
+
+    async def test_fresh_sqlite_workspace_skips_cache_clear(self):
+        """Test that SQLite doc-gen4 workspaces keep the cache."""
+        with patch(
+            "lean_explore.extract.doc_gen4.get_extraction_order",
+            return_value=["mathlib"],
+        ):
+            with patch(
+                "lean_explore.extract.doc_gen4._setup_workspace",
+                return_value=("leanprover/lean4:v4.29.0-rc2", "v4.29.0-rc2"),
+            ):
+                with patch(
+                    "lean_explore.extract.doc_gen4._clear_workspace_cache"
+                ) as mock_clear:
+                    with patch(
+                        "lean_explore.extract.doc_gen4._run_lake_for_package"
+                    ) as mock_run:
+                        await run_doc_gen4(fresh=True)
+
+        mock_clear.assert_not_called()
+        mock_run.assert_called_once_with("mathlib", False)
+
+
+class TestLakeBuildTargets:
+    """Tests for Lake build target selection."""
+
+    def _run_and_capture_target(
+        self, tmp_path, package_name, library_name, toolchain
+    ):
+        workspace_path = tmp_path / "lean" / package_name
+        workspace_path.mkdir(parents=True)
+        (workspace_path / "lean-toolchain").write_text(toolchain + "\n")
+
+        with patch("lean_explore.extract.doc_gen4.Path") as mock_path:
+            mock_path.side_effect = lambda *args: tmp_path.joinpath(*args)
+            with patch("lean_explore.extract.doc_gen4._run_lake_update_with_retry"):
+                with patch(
+                    "lean_explore.extract.doc_gen4.subprocess.run"
+                ) as mock_run:
+                    mock_run.return_value = MagicMock(
+                        returncode=0, stdout="", stderr=""
+                    )
+                    with patch(
+                        "lean_explore.extract.doc_gen4.subprocess.Popen"
+                    ) as mock_popen:
+                        mock_process = MagicMock()
+                        mock_process.stdout = iter([])
+                        mock_process.wait.return_value = 0
+                        mock_popen.return_value = mock_process
+
+                        _run_lake_for_package(package_name)
+
+        return mock_popen.call_args_list[0].args[0]
+
+    def test_sqlite_workspace_uses_docinfo_target(self, tmp_path):
+        """SQLite-format workspaces should build :docInfo, skipping HTML gen."""
+        args = self._run_and_capture_target(
+            tmp_path, "physlean", "PhysExtract", "leanprover/lean4:v4.29.0-rc2"
+        )
+        assert args == ["lake", "build", "PhysExtract:docInfo"]
+
+    def test_legacy_workspace_uses_docs_target(self, tmp_path):
+        """Pre-SQLite workspaces have no :docInfo facet, must build :docs."""
+        args = self._run_and_capture_target(
+            tmp_path, "physlean", "PhysExtract", "leanprover/lean4:v4.29.0-rc1"
+        )
+        assert args == ["lake", "build", "PhysExtract:docs"]
+
+
+class TestPackageRegistry:
+    """Tests for package registry metadata."""
+
+    def test_physlean_uses_upstream_module_roots(self):
+        """Test that PhysLean filters on the upstream module roots."""
+        assert PACKAGE_REGISTRY["physlean"].module_prefixes == [
+            "Physlib",
+            "QuantumInfo",
+        ]
diff --git a/tests/extract/doc_parser_test.py b/tests/extract/doc_parser_test.py
index 68a0842..08ab60b 100644
--- a/tests/extract/doc_parser_test.py
+++ b/tests/extract/doc_parser_test.py
@@ -5,20 +5,28 @@
 """
 
 import json
+import sqlite3
 from unittest.mock import AsyncMock, patch
 
 import pytest
 from sqlalchemy import select
 
 from lean_explore.extract.doc_parser import (
+    _BlobReader,
     _build_package_cache,
+    _construct_source_link,
+    _detect_docgen_format,
     _extract_dependencies_from_html,
+    _extract_names_from_rendered_code,
     _extract_source_text,
     _filter_auto_generated_projections,
     _insert_declarations_batch,
     _parse_declarations_from_files,
+    _parse_declarations_from_sqlite,
+    _read_lean_toolchain_version,
     _read_source_lines,
     _strip_lean_comments,
+    _validate_docgen_sqlite,
     extract_declarations,
 )
 from lean_explore.extract.types import Declaration
@@ -155,6 +163,253 @@ def test_extract_source_text_file_not_found(self, temp_directory):
         with pytest.raises(FileNotFoundError):
             _extract_source_text(source_link, lean_root, package_cache)
 
+    def test_extract_source_text_from_lake_toolchain(self, temp_directory):
+        """Test extracting source text from the Lake toolchain path."""
+        lean_root = temp_directory / "lean"
+        toolchain_root = temp_directory / "toolchain" / "src"
+        lean_source_dir = toolchain_root / "lean"
+        lake_source_dir = toolchain_root / "lake"
+        lean_source_dir.mkdir(parents=True)
+        lake_file = lake_source_dir / "Lake" / "Config" / "Monad.lean"
+        lake_file.parent.mkdir(parents=True)
+        lake_file.write_text("def Lake.Config.Monad.run := 1\n")
+
+        package_cache = {"lean4": lean_source_dir}
+        source_link = (
+            "https://github.com/leanprover/lean4/blob/toolchain/"
+            "src/lake/Lake/Config/Monad.lean#L1-L1"
+        )
+
+        result = _extract_source_text(source_link, lean_root, package_cache)
+
+        assert "Lake.Config.Monad.run" in result
+
+
+def _encode_nat(n: int) -> bytes:
+    """Encode a natural number in leansqlite's variable-length format."""
+    chunks = []
+    while n >= 128:
+        chunks.append((n & 0x7F) | 0x80)
+        n >>= 7
+    chunks.append(n)
+    return bytes(chunks)
+
+
+def _encode_string(s: str) -> bytes:
+    """Encode a string: Nat(utf8_byte_length) + UTF-8 bytes."""
+    encoded = s.encode("utf-8")
+    return _encode_nat(len(encoded)) + encoded
+
+
+def _encode_name(name: str) -> bytes:
+    """Encode a dotted Lean name (e.g. 'Nat.add') into binary."""
+    if not name:
+        return b"\x00"  # anonymous
+    parts = name.split(".")
+    result = b"\x00"  # start with anonymous
+    for part in parts:
+        # Name.str = tag 1 + parent + string
+        result = b"\x01" + result + _encode_string(part)
+    return result
+
+
+def _make_const_tag(name: str) -> bytes:
+    """Build a RenderedCode.Tag.const(name) blob fragment."""
+    return b"\x02" + _encode_name(name)
+
+
+def _make_text_node(s: str) -> bytes:
+    """Build a TaggedText.text(s) blob fragment."""
+    return b"\x00" + _encode_string(s)
+
+
+def _make_tag_node(tag_bytes: bytes, inner: bytes) -> bytes:
+    """Build a TaggedText.tag(tag, inner) blob fragment."""
+    return b"\x01" + tag_bytes + inner
+
+
+def _make_append_node(children: list[bytes]) -> bytes:
+    """Build a TaggedText.append(children) blob fragment."""
+    return b"\x02" + _encode_nat(len(children)) + b"".join(children)
+
+
+class TestRenderedCodeBlobParser:
+    """Tests for RenderedCode BLOB parsing to extract dependency names."""
+
+    def test_extract_single_const(self):
+        """Test extracting a single const name from a type BLOB."""
+        # TaggedText.tag(Tag.const("Nat"), TaggedText.text("Nat"))
+        blob = _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat"))
+        names = _extract_names_from_rendered_code(blob)
+        assert names == ["Nat"]
+
+    def test_extract_multiple_consts(self):
+        """Test extracting multiple const names from a type BLOB."""
+        # append([tag(const Nat, text Nat), text " → ", tag(const Bool, text Bool)])
+        blob = _make_append_node([
+            _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")),
+            _make_text_node(" → "),
+            _make_tag_node(_make_const_tag("Bool"), _make_text_node("Bool")),
+        ])
+        names = _extract_names_from_rendered_code(blob)
+        assert names == ["Nat", "Bool"]
+
+    def test_extract_dotted_name(self):
+        """Test extracting a dotted name like Nat.add."""
+        blob = _make_tag_node(
+            _make_const_tag("Nat.add"), _make_text_node("Nat.add")
+        )
+        names = _extract_names_from_rendered_code(blob)
+        assert names == ["Nat.add"]
+
+    def test_deduplicates_names(self):
+        """Test that duplicate const references are deduplicated."""
+        blob = _make_append_node([
+            _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")),
+            _make_text_node(" → "),
+            _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")),
+        ])
+        names = _extract_names_from_rendered_code(blob)
+        assert names == ["Nat"]
+
+    def test_skips_non_const_tags(self):
+        """Test that keyword, string, sort, otherExpr tags are skipped."""
+        blob = _make_append_node([
+            _make_tag_node(b"\x00", _make_text_node("def")),       # keyword
+            _make_text_node(" "),
+            _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")),
+            _make_tag_node(b"\x07", _make_text_node("x")),         # otherExpr
+        ])
+        names = _extract_names_from_rendered_code(blob)
+        assert names == ["Nat"]
+
+    def test_empty_blob_returns_empty(self):
+        """Test that an empty or invalid BLOB returns empty list."""
+        assert _extract_names_from_rendered_code(b"") == []
+        assert _extract_names_from_rendered_code(b"\xff") == []
+
+    def test_text_only_returns_empty(self):
+        """Test that a BLOB with only text returns no names."""
+        blob = _make_text_node("hello world")
+        names = _extract_names_from_rendered_code(blob)
+        assert names == []
+
+    def test_nested_tagged_text(self):
+        """Test parsing nested tag nodes."""
+        # tag(otherExpr, tag(const("List"), text("List")))
+        inner = _make_tag_node(_make_const_tag("List"), _make_text_node("List"))
+        blob = _make_tag_node(b"\x07", inner)  # otherExpr wrapping
+        names = _extract_names_from_rendered_code(blob)
+        assert names == ["List"]
+
+    def test_blob_reader_nat_encoding(self):
+        """Test that Nat encoding/decoding matches leansqlite format."""
+        reader = _BlobReader(_encode_nat(0))
+        assert reader._read_nat() == 0
+
+        reader = _BlobReader(_encode_nat(127))
+        assert reader._read_nat() == 127
+
+        reader = _BlobReader(_encode_nat(128))
+        assert reader._read_nat() == 128
+
+        reader = _BlobReader(_encode_nat(300))
+        assert reader._read_nat() == 300
+
+        reader = _BlobReader(_encode_nat(100000))
+        assert reader._read_nat() == 100000
+
+    def test_blob_reader_name_roundtrip(self):
+        """Test that Name encoding produces correct dot-separated output."""
+        reader = _BlobReader(_encode_name("Mathlib.Data.Nat.Basic"))
+        assert reader._read_name() == "Mathlib.Data.Nat.Basic"
+
+        reader = _BlobReader(_encode_name(""))
+        assert reader._read_name() == ""
+
+    def test_sort_tags_handled(self):
+        """Test that sort tag variants (3-6) are handled without error."""
+        for sort_byte in [3, 4, 5, 6]:
+            blob = _make_tag_node(
+                bytes([sort_byte]), _make_text_node("Type")
+            )
+            names = _extract_names_from_rendered_code(blob)
+            assert names == []
+
+
+class TestSqliteHelpers:
+    """Tests for SQLite-specific parsing helpers."""
+
+    def test_construct_source_link_for_core_module_with_version(self):
+        """Test source links for core modules use the toolchain version."""
+        result = _construct_source_link(
+            "Init.Data.Nat.Basic", None, 12, 15,
+            lean_version="v4.29.0-rc6",
+        )
+
+        assert (
+            result
+            == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/"
+            "src/lean/Init/Data/Nat/Basic.lean#L12-L15"
+        )
+
+    def test_construct_source_link_for_lake_module_with_version(self):
+        """Test source links for Lake modules use the toolchain version."""
+        result = _construct_source_link(
+            "Lake.Config.Monad", None, 7, 9,
+            lean_version="v4.29.0-rc6",
+        )
+
+        assert (
+            result
+            == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/"
+            "src/lake/Lake/Config/Monad.lean#L7-L9"
+        )
+
+    def test_construct_source_link_falls_back_to_master(self):
+        """Test that missing lean_version falls back to 'master'."""
+        result = _construct_source_link(
+            "Init.Data.Nat.Basic", None, 12, 15,
+        )
+
+        assert (
+            result
+            == "https://github.com/leanprover/lean4/blob/master/"
+            "src/lean/Init/Data/Nat/Basic.lean#L12-L15"
+        )
+
+    def test_construct_source_link_prefers_source_url(self):
+        """Test that a non-None source_url is used directly."""
+        url = "https://github.com/leanprover-community/mathlib4/blob/abc123/Foo.lean"
+        result = _construct_source_link("Foo.Bar", url, 1, 10)
+
+        assert result == f"{url}#L1-L10"
+
+    def test_read_lean_toolchain_version(self, temp_directory):
+        """Test reading version from a lean-toolchain file."""
+        workspace = temp_directory / "workspace"
+        workspace.mkdir()
+        toolchain = workspace / "lean-toolchain"
+        toolchain.write_text("leanprover/lean4:v4.29.0-rc6\n")
+
+        assert _read_lean_toolchain_version(workspace) == "v4.29.0-rc6"
+
+    def test_read_lean_toolchain_version_release(self, temp_directory):
+        """Test reading a release version (no -rc suffix)."""
+        workspace = temp_directory / "workspace"
+        workspace.mkdir()
+        toolchain = workspace / "lean-toolchain"
+        toolchain.write_text("leanprover/lean4:v4.29.0\n")
+
+        assert _read_lean_toolchain_version(workspace) == "v4.29.0"
+
+    def test_read_lean_toolchain_version_missing(self, temp_directory):
+        """Test returns None when lean-toolchain doesn't exist."""
+        workspace = temp_directory / "workspace"
+        workspace.mkdir()
+
+        assert _read_lean_toolchain_version(workspace) is None
+
 
 class TestDependencyExtraction:
     """Tests for dependency extraction from HTML."""
@@ -273,6 +528,458 @@ def test_parse_declarations_filters_packages(self, temp_directory):
 
         assert len(declarations) == 0
 
+    def test_parse_declarations_from_sqlite_filters_non_rendered(
+        self, temp_directory
+    ):
+        """Test SQLite parsing only keeps rendered declarations."""
+        lean_root = temp_directory / "lean"
+        database_path = temp_directory / "api-docs.db"
+
+        source_dir = lean_root / "mathlib" / ".lake" / "packages" / "mathlib4"
+        source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean"
+        source_file.parent.mkdir(parents=True)
+        source_file.write_text(
+            "theorem Nat.visible : True := trivial\n"
+            "def Nat.hidden : Nat := 0\n"
+        )
+
+        connection = sqlite3.connect(database_path)
+        connection.executescript(
+            """
+            CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT);
+            CREATE TABLE name_info (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              kind TEXT,
+              name TEXT NOT NULL,
+              type BLOB NOT NULL,
+              sorried INTEGER NOT NULL,
+              render INTEGER NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_ranges (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              start_line INTEGER NOT NULL,
+              start_column INTEGER NOT NULL,
+              start_utf16 INTEGER NOT NULL,
+              end_line INTEGER NOT NULL,
+              end_column INTEGER NOT NULL,
+              end_utf16 INTEGER NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_markdown_docstrings (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              text TEXT NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_verso_docstrings (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              content BLOB NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            """
+        )
+        connection.execute(
+            "INSERT INTO modules (name, source_url) VALUES (?, ?)",
+            (
+                "Mathlib.Data.Nat.Basic",
+                "https://github.com/leanprover-community/mathlib4/blob/master/"
+                "Mathlib/Data/Nat/Basic.lean",
+            ),
+        )
+        connection.executemany(
+            """
+            INSERT INTO name_info
+              (module_name, position, kind, name, type, sorried, render)
+            VALUES (?, ?, ?, ?, ?, ?, ?)
+            """,
+            [
+                (
+                    "Mathlib.Data.Nat.Basic",
+                    1,
+                    "theorem",
+                    "Nat.visible",
+                    _make_text_node("True"),
+                    0,
+                    1,
+                ),
+                (
+                    "Mathlib.Data.Nat.Basic",
+                    2,
+                    "definition",
+                    "Nat.hidden",
+                    _make_text_node("Nat"),
+                    0,
+                    0,
+                ),
+            ],
+        )
+        connection.executemany(
+            """
+            INSERT INTO declaration_ranges
+              (module_name, position, start_line, start_column, start_utf16,
+               end_line, end_column, end_utf16)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            """,
+            [
+                ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 32, 32),
+                ("Mathlib.Data.Nat.Basic", 2, 2, 0, 0, 2, 22, 22),
+            ],
+        )
+        connection.execute(
+            """
+            INSERT INTO declaration_markdown_docstrings
+              (module_name, position, text)
+            VALUES (?, ?, ?)
+            """,
+            ("Mathlib.Data.Nat.Basic", 1, "Visible theorem"),
+        )
+        connection.commit()
+        connection.close()
+
+        declarations = _parse_declarations_from_sqlite(
+            database_path,
+            lean_root,
+            _build_package_cache(lean_root),
+            allowed_module_prefixes=["Mathlib"],
+        )
+
+        assert [declaration.name for declaration in declarations] == ["Nat.visible"]
+        assert declarations[0].docstring == "Visible theorem"
+
+    def test_parse_declarations_from_sqlite_extracts_dependencies(
+        self, temp_directory
+    ):
+        """Test that SQLite parsing extracts dependencies from type BLOBs."""
+        lean_root = temp_directory / "lean"
+        database_path = temp_directory / "api-docs.db"
+
+        source_dir = lean_root / "mathlib" / ".lake" / "packages" / "mathlib4"
+        source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean"
+        source_file.parent.mkdir(parents=True)
+        source_file.write_text("def Nat.myFunc (n : Nat) : Bool := true\n")
+
+        # Build a type BLOB: "Nat → Bool"
+        type_blob = _make_append_node([
+            _make_tag_node(_make_const_tag("Nat"), _make_text_node("Nat")),
+            _make_text_node(" → "),
+            _make_tag_node(_make_const_tag("Bool"), _make_text_node("Bool")),
+        ])
+
+        connection = sqlite3.connect(database_path)
+        connection.executescript(
+            """
+            CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT);
+            CREATE TABLE name_info (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              kind TEXT,
+              name TEXT NOT NULL,
+              type BLOB NOT NULL,
+              sorried INTEGER NOT NULL,
+              render INTEGER NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_ranges (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              start_line INTEGER NOT NULL,
+              start_column INTEGER NOT NULL,
+              start_utf16 INTEGER NOT NULL,
+              end_line INTEGER NOT NULL,
+              end_column INTEGER NOT NULL,
+              end_utf16 INTEGER NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_markdown_docstrings (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              text TEXT NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_verso_docstrings (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              content BLOB NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            """
+        )
+        connection.execute(
+            "INSERT INTO modules (name, source_url) VALUES (?, ?)",
+            (
+                "Mathlib.Data.Nat.Basic",
+                "https://github.com/leanprover-community/mathlib4/blob/master/"
+                "Mathlib/Data/Nat/Basic.lean",
+            ),
+        )
+        connection.execute(
+            """
+            INSERT INTO name_info
+              (module_name, position, kind, name, type, sorried, render)
+            VALUES (?, ?, ?, ?, ?, ?, ?)
+            """,
+            ("Mathlib.Data.Nat.Basic", 1, "definition", "Nat.myFunc", type_blob, 0, 1),
+        )
+        connection.execute(
+            """
+            INSERT INTO declaration_ranges
+              (module_name, position, start_line, start_column, start_utf16,
+               end_line, end_column, end_utf16)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            """,
+            ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 40, 40),
+        )
+        connection.commit()
+        connection.close()
+
+        declarations = _parse_declarations_from_sqlite(
+            database_path,
+            lean_root,
+            _build_package_cache(lean_root),
+            allowed_module_prefixes=["Mathlib"],
+        )
+
+        assert len(declarations) == 1
+        assert declarations[0].name == "Nat.myFunc"
+        assert declarations[0].dependencies == ["Nat", "Bool"]
+
+    def test_parse_declarations_from_sqlite_detects_verso_docstrings(
+        self, temp_directory, caplog
+    ):
+        """Test that Verso-only docstrings are detected and logged."""
+        lean_root = temp_directory / "lean"
+        database_path = temp_directory / "api-docs.db"
+
+        source_dir = (
+            lean_root / "mathlib" / ".lake" / "packages" / "mathlib4"
+        )
+        source_file = source_dir / "Mathlib" / "Data" / "Nat" / "Basic.lean"
+        source_file.parent.mkdir(parents=True)
+        source_file.write_text(
+            "def Nat.withVerso : Nat := 0\n"
+            "def Nat.withMarkdown : Nat := 1\n"
+        )
+
+        connection = sqlite3.connect(database_path)
+        connection.executescript(
+            """
+            CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT);
+            CREATE TABLE name_info (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              kind TEXT,
+              name TEXT NOT NULL,
+              type BLOB NOT NULL,
+              sorried INTEGER NOT NULL,
+              render INTEGER NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_ranges (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              start_line INTEGER NOT NULL,
+              start_column INTEGER NOT NULL,
+              start_utf16 INTEGER NOT NULL,
+              end_line INTEGER NOT NULL,
+              end_column INTEGER NOT NULL,
+              end_utf16 INTEGER NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_markdown_docstrings (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              text TEXT NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_verso_docstrings (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              content BLOB NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            """
+        )
+        connection.execute(
+            "INSERT INTO modules (name, source_url) VALUES (?, ?)",
+            (
+                "Mathlib.Data.Nat.Basic",
+                "https://github.com/leanprover-community/mathlib4"
+                "/blob/master/Mathlib/Data/Nat/Basic.lean",
+            ),
+        )
+        connection.executemany(
+            """
+            INSERT INTO name_info
+              (module_name, position, kind, name, type, sorried, render)
+            VALUES (?, ?, ?, ?, ?, ?, ?)
+            """,
+            [
+                (
+                    "Mathlib.Data.Nat.Basic", 1, "definition",
+                    "Nat.withVerso", _make_text_node("Nat"), 0, 1,
+                ),
+                (
+                    "Mathlib.Data.Nat.Basic", 2, "definition",
+                    "Nat.withMarkdown", _make_text_node("Nat"), 0, 1,
+                ),
+            ],
+        )
+        connection.executemany(
+            """
+            INSERT INTO declaration_ranges
+              (module_name, position, start_line, start_column,
+               start_utf16, end_line, end_column, end_utf16)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            """,
+            [
+                ("Mathlib.Data.Nat.Basic", 1, 1, 0, 0, 1, 27, 27),
+                ("Mathlib.Data.Nat.Basic", 2, 2, 0, 0, 2, 30, 30),
+            ],
+        )
+        # Declaration 1: Verso-only docstring (binary BLOB)
+        connection.execute(
+            """
+            INSERT INTO declaration_verso_docstrings
+              (module_name, position, content)
+            VALUES (?, ?, ?)
+            """,
+            ("Mathlib.Data.Nat.Basic", 1, b"\x00\x01\x02"),
+        )
+        # Declaration 2: Markdown docstring
+        connection.execute(
+            """
+            INSERT INTO declaration_markdown_docstrings
+              (module_name, position, text)
+            VALUES (?, ?, ?)
+            """,
+            ("Mathlib.Data.Nat.Basic", 2, "A markdown docstring"),
+        )
+        connection.commit()
+        connection.close()
+
+        import logging
+
+        with caplog.at_level(logging.WARNING):
+            declarations = _parse_declarations_from_sqlite(
+                database_path,
+                lean_root,
+                _build_package_cache(lean_root),
+                allowed_module_prefixes=["Mathlib"],
+            )
+
+        assert len(declarations) == 2
+        # Verso-only: docstring is None
+        verso_decl = next(
+            d for d in declarations if d.name == "Nat.withVerso"
+        )
+        assert verso_decl.docstring is None
+        # Markdown: docstring is present
+        md_decl = next(
+            d for d in declarations if d.name == "Nat.withMarkdown"
+        )
+        assert md_decl.docstring == "A markdown docstring"
+        # Warning logged about Verso-only docstrings
+        assert "1 declarations have Verso-only docstrings" in caplog.text
+
+    def test_parse_declarations_from_sqlite_uses_core_fallback_source_link(
+        self, temp_directory
+    ):
+        """Test SQLite parsing for core modules without a stored source URL."""
+        lean_root = temp_directory / "lean"
+        database_path = temp_directory / "api-docs.db"
+
+        toolchain_root = temp_directory / "toolchain" / "src"
+        lean_source_dir = toolchain_root / "lean"
+        init_file = lean_source_dir / "Init" / "Data" / "Nat" / "Basic.lean"
+        init_file.parent.mkdir(parents=True)
+        init_file.write_text("theorem Nat.core : True := trivial\n")
+
+        connection = sqlite3.connect(database_path)
+        connection.executescript(
+            """
+            CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT);
+            CREATE TABLE name_info (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              kind TEXT,
+              name TEXT NOT NULL,
+              type BLOB NOT NULL,
+              sorried INTEGER NOT NULL,
+              render INTEGER NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_ranges (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              start_line INTEGER NOT NULL,
+              start_column INTEGER NOT NULL,
+              start_utf16 INTEGER NOT NULL,
+              end_line INTEGER NOT NULL,
+              end_column INTEGER NOT NULL,
+              end_utf16 INTEGER NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_markdown_docstrings (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              text TEXT NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_verso_docstrings (
+              module_name TEXT NOT NULL,
+              position INTEGER NOT NULL,
+              content BLOB NOT NULL,
+              PRIMARY KEY (module_name, position)
+            );
+            """
+        )
+        connection.execute(
+            "INSERT INTO modules (name, source_url) VALUES (?, ?)",
+            ("Init.Data.Nat.Basic", None),
+        )
+        connection.execute(
+            """
+            INSERT INTO name_info
+              (module_name, position, kind, name, type, sorried, render)
+            VALUES (?, ?, ?, ?, ?, ?, ?)
+            """,
+            (
+                "Init.Data.Nat.Basic", 1, "theorem", "Nat.core",
+                _make_text_node("True"), 0, 1,
+            ),
+        )
+        connection.execute(
+            """
+            INSERT INTO declaration_ranges
+              (module_name, position, start_line, start_column, start_utf16,
+               end_line, end_column, end_utf16)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            """,
+            ("Init.Data.Nat.Basic", 1, 1, 0, 0, 1, 31, 31),
+        )
+        connection.commit()
+        connection.close()
+
+        declarations = _parse_declarations_from_sqlite(
+            database_path,
+            lean_root,
+            {"lean4": lean_source_dir},
+            allowed_module_prefixes=["Init"],
+            lean_version="v4.29.0-rc6",
+        )
+
+        assert len(declarations) == 1
+        assert declarations[0].name == "Nat.core"
+        assert (
+            declarations[0].source_link
+            == "https://github.com/leanprover/lean4/blob/v4.29.0-rc6/"
+            "src/lean/Init/Data/Nat/Basic.lean#L1-L1"
+        )
+
 
 class TestDeclarationInsertion:
     """Tests for database insertion."""
@@ -641,3 +1348,125 @@ def test_single_declaration(self):
 
         assert len(filtered) == 1
         assert removed_count == 0
+
+
+class TestDocgenFormatDetection:
+    """Tests for doc-gen4 output format detection and SQLite validation."""
+
+    def test_detect_sqlite_with_valid_db(self, temp_directory):
+        """Test detection returns 'sqlite' for a valid api-docs.db."""
+        workspace = temp_directory / "pkg"
+        db_path = workspace / ".lake" / "build" / "api-docs.db"
+        db_path.parent.mkdir(parents=True)
+
+        connection = sqlite3.connect(db_path)
+        connection.executescript(
+            """
+            CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT);
+            CREATE TABLE name_info (
+              module_name TEXT, position INTEGER, kind TEXT, name TEXT,
+              type BLOB, sorried INTEGER, render INTEGER,
+              PRIMARY KEY (module_name, position)
+            );
+            CREATE TABLE declaration_ranges (
+              module_name TEXT, position INTEGER,
+              start_line INTEGER, start_column INTEGER, start_utf16 INTEGER,
+              end_line INTEGER, end_column INTEGER, end_utf16 INTEGER,
+              PRIMARY KEY (module_name, position)
+            );
+            """
+        )
+        connection.close()
+
+        assert _detect_docgen_format(workspace) == "sqlite"
+
+    def test_detect_bmp_format(self, temp_directory):
+        """Test detection returns 'bmp' when only BMP files exist."""
+        workspace = temp_directory / "pkg"
+        doc_data = workspace / ".lake" / "build" / "doc-data"
+        doc_data.mkdir(parents=True)
+        (doc_data / "declaration-data-Foo.bmp").write_text("{}")
+
+        assert _detect_docgen_format(workspace) == "bmp"
+
+    def test_detect_none_when_empty(self, temp_directory):
+        """Test detection returns 'none' when no doc output exists."""
+        workspace = temp_directory / "pkg"
+        workspace.mkdir(parents=True)
+
+        assert _detect_docgen_format(workspace) == "none"
+
+    def test_detect_falls_back_to_bmp_for_empty_db(self, temp_directory):
+        """Test that an empty api-docs.db falls back to BMP if available."""
+        workspace = temp_directory / "pkg"
+        db_path = workspace / ".lake" / "build" / "api-docs.db"
+        db_path.parent.mkdir(parents=True)
+        db_path.write_bytes(b"")
+
+        doc_data = workspace / ".lake" / "build" / "doc-data"
+        doc_data.mkdir(parents=True)
+        (doc_data / "declaration-data-Foo.bmp").write_text("{}")
+
+        assert _detect_docgen_format(workspace) == "bmp"
+
+    def test_detect_returns_none_for_empty_db_without_bmp(self, temp_directory):
+        """Test that an empty api-docs.db with no BMP fallback returns 'none'."""
+        workspace = temp_directory / "pkg"
+        db_path = workspace / ".lake" / "build" / "api-docs.db"
+        db_path.parent.mkdir(parents=True)
+        db_path.write_bytes(b"")
+
+        assert _detect_docgen_format(workspace) == "none"
+
+    def test_detect_falls_back_for_corrupt_db(self, temp_directory):
+        """Test that a corrupt (non-SQLite) file falls back gracefully."""
+        workspace = temp_directory / "pkg"
+        db_path = workspace / ".lake" / "build" / "api-docs.db"
+        db_path.parent.mkdir(parents=True)
+        db_path.write_bytes(b"this is not a sqlite database")
+
+        assert _detect_docgen_format(workspace) == "none"
+
+    def test_detect_falls_back_for_db_missing_tables(self, temp_directory):
+        """Test that a SQLite DB missing required tables falls back."""
+        workspace = temp_directory / "pkg"
+        db_path = workspace / ".lake" / "build" / "api-docs.db"
+        db_path.parent.mkdir(parents=True)
+
+        connection = sqlite3.connect(db_path)
+        connection.execute(
+            "CREATE TABLE modules (name TEXT PRIMARY KEY, source_url TEXT)"
+        )
+        connection.close()
+
+        # Has modules but missing name_info and declaration_ranges
+        assert _detect_docgen_format(workspace) == "none"
+
+    def test_validate_docgen_sqlite_valid(self, temp_directory):
+        """Test validation passes for a well-formed database."""
+        db_path = temp_directory / "api-docs.db"
+        connection = sqlite3.connect(db_path)
+        connection.executescript(
+            """
+            CREATE TABLE modules (name TEXT PRIMARY KEY);
+            CREATE TABLE name_info (module_name TEXT, position INTEGER);
+            CREATE TABLE declaration_ranges (module_name TEXT, position INTEGER);
+            """
+        )
+        connection.close()
+
+        assert _validate_docgen_sqlite(db_path) is True
+
+    def test_validate_docgen_sqlite_empty_file(self, temp_directory):
+        """Test validation rejects an empty file."""
+        db_path = temp_directory / "api-docs.db"
+        db_path.write_bytes(b"")
+
+        assert _validate_docgen_sqlite(db_path) is False
+
+    def test_validate_docgen_sqlite_corrupt_file(self, temp_directory):
+        """Test validation rejects a non-SQLite file."""
+        db_path = temp_directory / "api-docs.db"
+        db_path.write_bytes(b"not a database")
+
+        assert _validate_docgen_sqlite(db_path) is False