From f018b9f720333064829d62942910eaed9c07fa99 Mon Sep 17 00:00:00 2001
From: dslovinsky
Date: Fri, 13 Feb 2026 17:45:36 -0500
Subject: [PATCH] fix: crawl sitemap index to resolve actual page URLs

The new docs site uses a sitemapindex at sitemap.xml that references
child sitemaps (sitemap-0.xml, etc.) instead of listing pages directly.
This recursively follows the index to collect all page URLs.

Co-Authored-By: Claude
---
 scripts/generate-metadata.ts | 55 +++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 16 deletions(-)

diff --git a/scripts/generate-metadata.ts b/scripts/generate-metadata.ts
index bc4e4faff..3b020c592 100644
--- a/scripts/generate-metadata.ts
+++ b/scripts/generate-metadata.ts
@@ -1,4 +1,3 @@
-// Cursor-generated
 import * as fs from "fs";
 import * as path from "path";
 
@@ -7,6 +6,43 @@ const OUTPUT_FILE = path.join(API_SPECS_DIR, "metadata.json");
 const API_SPECS_URL = "https://dev-docs.alchemy.com";
 const DOCS_URL = "https://www.alchemy.com/docs";
 
+function extractLocs(xml: string): string[] {
+  return (
+    xml
+      .match(/<loc>(.*?)<\/loc>/g)
+      ?.map((tag) => tag.replace(/<\/?loc>/g, "")) || []
+  );
+}
+
+function isSitemapIndex(xml: string): boolean {
+  return xml.includes("<sitemapindex");
+}
+
+async function fetchSitemapUrls(sitemapUrl: string): Promise<string[]> {
+  const response = await fetch(sitemapUrl);
+  if (!response.ok) {
+    throw new Error(
+      `Failed to fetch ${sitemapUrl}: ${response.status} ${response.statusText}`,
+    );
+  }
+
+  const xml = await response.text();
+
+  if (isSitemapIndex(xml)) {
+    const childSitemapUrls = extractLocs(xml);
+    console.info(
+      `Found sitemap index with ${childSitemapUrls.length} sitemap(s)`,
+    );
+
+    const results = await Promise.all(
+      childSitemapUrls.map((url) => fetchSitemapUrls(url)),
+    );
+    return results.flat();
+  }
+
+  return extractLocs(xml);
+}
+
 (async () => {
   try {
     const files: string[] = [];
@@ -27,22 +63,9 @@ const DOCS_URL = "https://www.alchemy.com/docs";
 
     traverse(API_SPECS_DIR);
 
-    // Fetch and parse sitemap
-    const sitemapResponse = await fetch(`${DOCS_URL}/sitemap.xml`);
-
-    if (!sitemapResponse.ok) {
-      throw new Error(`Failed to fetch sitemap: ${sitemapResponse.statusText}`);
-    }
-
-    const sitemapXml = await sitemapResponse.text();
-
-    // Extract URLs using regex and remove host
-    const urls =
-      sitemapXml
-        .match(/<loc>(.*?)<\/loc>/g)
-        ?.map((url) => url.replace(/<\/?loc>/g, "")) || [];
+    const urls = await fetchSitemapUrls(`${DOCS_URL}/sitemap.xml`);
+    console.info(`Collected ${urls.length} page URL(s) from sitemap`);
 
-    // Write to file
     fs.writeFileSync(OUTPUT_FILE, JSON.stringify({ files, urls }, null, 2));
     console.info(`Successfully generated metadata file at ${OUTPUT_FILE}`);
   } catch (error) {