From 1324b32f0a28b324a44f4f1150ae38e5f2b11d6a Mon Sep 17 00:00:00 2001 From: Denys Kuchma Date: Mon, 11 May 2026 21:33:45 +0300 Subject: [PATCH 1/2] new doc-collector --- bin/explorbot-cli.ts | 2 + boat/doc-collector/bin/doc-collector-cli.ts | 5 + boat/doc-collector/package.json | 24 ++ boat/doc-collector/src/ai/documentarian.ts | 184 +++++++++ boat/doc-collector/src/cli.ts | 119 ++++++ boat/doc-collector/src/config.ts | 162 ++++++++ boat/doc-collector/src/docbot.ts | 391 ++++++++++++++++++ boat/doc-collector/src/docs-renderer.ts | 187 +++++++++ boat/doc-collector/src/path-filter.ts | 46 +++ boat/doc-collector/src/research-navigation.ts | 90 ++++ bun.lock | 6 +- docs/commands.md | 32 +- docs/doc-collector.md | 140 +++++++ package.json | 2 +- src/ai/researcher.ts | 3 +- src/ai/researcher/coordinates.ts | 2 +- src/ai/researcher/parser.ts | 3 + src/config.ts | 16 +- src/explorbot.ts | 1 + src/state-manager.ts | 4 + src/utils/url-matcher.ts | 7 +- src/utils/web-element.ts | 5 +- tests/unit/config.test.ts | 42 ++ tests/unit/doc-collector.test.ts | 247 +++++++++++ tests/unit/research-parser.test.ts | 15 + tests/unit/url-matcher.test.ts | 16 + 26 files changed, 1738 insertions(+), 13 deletions(-) create mode 100644 boat/doc-collector/bin/doc-collector-cli.ts create mode 100644 boat/doc-collector/package.json create mode 100644 boat/doc-collector/src/ai/documentarian.ts create mode 100644 boat/doc-collector/src/cli.ts create mode 100644 boat/doc-collector/src/config.ts create mode 100644 boat/doc-collector/src/docbot.ts create mode 100644 boat/doc-collector/src/docs-renderer.ts create mode 100644 boat/doc-collector/src/path-filter.ts create mode 100644 boat/doc-collector/src/research-navigation.ts create mode 100644 docs/doc-collector.md create mode 100644 tests/unit/config.test.ts create mode 100644 tests/unit/doc-collector.test.ts diff --git a/bin/explorbot-cli.ts b/bin/explorbot-cli.ts index d19469c..5499742 100755 --- a/bin/explorbot-cli.ts +++ b/bin/explorbot-cli.ts 
@@ -823,6 +823,8 @@ program }); import { createApiCommands } from '../boat/api-tester/src/cli.ts'; +import { createDocsCommands } from '../boat/doc-collector/src/cli.ts'; program.addCommand(createApiCommands('api')); +program.addCommand(createDocsCommands('docs')); program.parse(); diff --git a/boat/doc-collector/bin/doc-collector-cli.ts b/boat/doc-collector/bin/doc-collector-cli.ts new file mode 100644 index 0000000..1502211 --- /dev/null +++ b/boat/doc-collector/bin/doc-collector-cli.ts @@ -0,0 +1,5 @@ +#!/usr/bin/env bun +import { createDocsCommands } from '../src/cli.ts'; + +const program = createDocsCommands('doc-collector'); +program.parse(); diff --git a/boat/doc-collector/package.json b/boat/doc-collector/package.json new file mode 100644 index 0000000..c081616 --- /dev/null +++ b/boat/doc-collector/package.json @@ -0,0 +1,24 @@ +{ + "name": "doc-collector", + "version": "1.0.0", + "description": "AI-powered website documentation collector", + "type": "module", + "bin": { + "doc-collector": "./bin/doc-collector-cli.ts" + }, + "scripts": { + "format": "biome format --write .", + "lint:fix": "biome lint --write .", + "check:fix": "biome check --write ." 
+ }, + "dependencies": { + "ai": "^6.0.6", + "commander": "^14.0.1", + "dedent": "^1.6.0", + "zod": "^4.1.8" + }, + "devDependencies": { + "@biomejs/biome": "^1.5.3", + "typescript": "^5.0.0" + } +} diff --git a/boat/doc-collector/src/ai/documentarian.ts b/boat/doc-collector/src/ai/documentarian.ts new file mode 100644 index 0000000..8396d3a --- /dev/null +++ b/boat/doc-collector/src/ai/documentarian.ts @@ -0,0 +1,184 @@ +import dedent from 'dedent'; +import { z } from 'zod'; +import type { AIProvider } from '../../../../src/ai/provider.ts'; +import type { WebPageState } from '../../../../src/state-manager.ts'; +import type { DocbotConfig } from '../config.ts'; + +class Documentarian { + private provider: AIProvider; + private config: DocbotConfig; + + constructor(provider: AIProvider, config: DocbotConfig = {}) { + this.provider = provider; + this.config = config; + } + + async document(state: WebPageState, research: string): Promise { + try { + return await this.generateDocumentation(state, research); + } catch (error) { + if (!this.shouldRetryWithSanitizedResearch(error)) { + throw error; + } + + return this.generateDocumentation(state, this.sanitizeResearch(research), true); + } + } + + private getSystemPrompt(): string { + const customPrompt = this.config.docs?.prompt; + let promptSuffix = ''; + if (customPrompt) { + promptSuffix = customPrompt; + } + + return dedent` + + You are a product analyst preparing functional website documentation from UI research. + + + + Convert exploratory UI research into a precise spec of what users can do on the current page. + Distinguish proven capabilities from assumptions. + Prefer accuracy over coverage. + + + + Only list capabilities that are grounded in the provided page research. + Put actions into "can" only when there is direct evidence in the page context. + Put actions into "might" only when the UI strongly suggests a capability but proof is incomplete. + Describe each action from the end-user perspective. 
+ Be explicit about scope: + - one item + - list of items + - bulk operations + - all items + - page-level + Avoid implementation details, selectors, and QA wording. + Avoid duplicate actions with different phrasing. + + + ${promptSuffix} + `; + } + + private buildPrompt(state: WebPageState, research: string, simplified = false): string { + const headings = [state.h1, state.h2, state.h3, state.h4].filter(Boolean).join(' | '); + const links = (state.links || []) + .slice(0, 50) + .map((link) => `- ${link.title}: ${link.url}`) + .join('\n'); + const simplificationNote = simplified + ? dedent` + + The research text was simplified because the original formatting was noisy. + Ignore malformed table syntax and rely only on clear, repeated signals. + Prefer fewer actions over speculative coverage. + + ` + : ''; + + return dedent` + + URL: ${state.url} + Title: ${state.title || ''} + Headings: ${headings} + + + + ${links} + + + + ${research} + + + ${simplificationNote} + + + Return structured data. + summary: short page purpose statement. + can: actions you are 100% sure are available on page. + might: actions that look possible but are not fully proven. + For each action provide: + - action: concise user-facing capability phrased as "user can ..." 
+ - scope: one of one item, list of items, bulk operations, all items, page-level + - evidence: short reason based on visible UI or research + + `; + } + + private async generateDocumentation(state: WebPageState, research: string, simplified = false): Promise { + const messages = [ + { + role: 'system' as const, + content: this.getSystemPrompt(), + }, + { + role: 'user' as const, + content: this.buildPrompt(state, research, simplified), + }, + ]; + + const response = await this.provider.generateObject(messages, pageDocumentationSchema, undefined, { + agentName: 'documentarian', + }); + + return response.object as PageDocumentation; + } + + private shouldRetryWithSanitizedResearch(error: unknown): boolean { + const message = error instanceof Error ? error.message : String(error); + return message.includes('Failed to generate JSON') || message.includes('failed_generation'); + } + + private sanitizeResearch(research: string): string { + const lines = research.split('\n'); + const sanitized: string[] = []; + + for (const line of lines) { + if (!line.trim()) { + sanitized.push(line); + continue; + } + + if (!line.includes('|')) { + sanitized.push(line); + continue; + } + + const pipeCount = (line.match(/\|/g) || []).length; + if (pipeCount < 2) { + continue; + } + + if (line.includes('|------')) { + sanitized.push(line); + continue; + } + + if (line.trim().startsWith('|') && pipeCount >= 4) { + sanitized.push(line); + } + } + + return sanitized.join('\n'); + } +} + +const capabilitySchema = z.object({ + action: z.string(), + scope: z.enum(['one item', 'list of items', 'bulk operations', 'all items', 'page-level']), + evidence: z.string(), +}); + +const pageDocumentationSchema = z.object({ + summary: z.string(), + can: z.array(capabilitySchema), + might: z.array(capabilitySchema), +}); + +type PageDocumentation = z.infer; + +export { Documentarian }; +export type { PageDocumentation }; diff --git a/boat/doc-collector/src/cli.ts b/boat/doc-collector/src/cli.ts new file 
mode 100644 index 0000000..40a84c0 --- /dev/null +++ b/boat/doc-collector/src/cli.ts @@ -0,0 +1,119 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { Command } from 'commander'; +import { setPreserveConsoleLogs } from '../../../src/utils/logger.ts'; +import { DocBot, type DocbotOptions } from './docbot.ts'; + +function buildOptions(options: any): DocbotOptions { + let session = options.session; + if (options.session === true) { + session = 'output/session.json'; + } + + return { + verbose: options.verbose || options.debug, + config: options.config, + path: options.path, + show: options.show, + headless: options.headless, + incognito: options.incognito, + session, + docsConfig: options.docsConfig, + }; +} + +function addCommonOptions(cmd: Command): Command { + return cmd + .option('-v, --verbose', 'Enable verbose logging') + .option('--debug', 'Enable debug logging') + .option('-c, --config ', 'Path to explorbot configuration file') + .option('--docs-config ', 'Path to doc collector configuration file') + .option('-p, --path ', 'Working directory path') + .option('-s, --show', 'Show browser window') + .option('--headless', 'Run browser in headless mode') + .option('--incognito', 'Run without recording experiences') + .option('--session [file]', 'Save/restore browser session from file'); +} + +export function createDocsCommands(name = 'docs'): Command { + const cmd = new Command(name); + cmd.description('AI-powered website documentation collector'); + + addCommonOptions(cmd.command('collect ').description('Crawl pages and generate documentation spec').option('--max-pages ', 'Maximum number of pages to document')).action(async (startPath, options) => { + setPreserveConsoleLogs(true); + + try { + const bot = new DocBot({ + ...buildOptions(options), + startUrl: startPath, + }); + await bot.start(); + + let maxPages: number | undefined; + if (options.maxPages) { + maxPages = Number.parseInt(options.maxPages, 10); + } + + const result = await 
bot.collect(startPath, { maxPages }); + + console.log(`\nDocumented ${result.pages.length} page(s)`); + console.log(`Skipped ${result.skipped.length} page(s)`); + console.log(`Spec index: ${result.indexPath}`); + console.log(`Pages dir: ${path.join(result.outputDir, 'pages')}`); + + await bot.stop(); + process.exit(0); + } catch (error) { + console.error('Failed:', error instanceof Error ? error.message : 'Unknown error'); + process.exit(1); + } + }); + + cmd + .command('init') + .description('Initialize doc collector configuration') + .option('-f, --force', 'Overwrite existing config file') + .option('-p, --path ', 'Working directory for initialization') + .action(async (options) => { + const originalCwd = process.cwd(); + if (options.path) { + const resolvedPath = path.resolve(options.path); + fs.mkdirSync(resolvedPath, { recursive: true }); + process.chdir(resolvedPath); + console.log(`Working in: ${resolvedPath}`); + } + + const configPath = path.resolve('docbot.config.ts'); + if (fs.existsSync(configPath) && !options.force) { + console.log(`Config file already exists: ${configPath}`); + console.log('Use --force to overwrite.'); + process.exit(1); + } + + const configContent = `export default { + docs: { + maxPages: 100, + output: 'docs', + screenshot: true, + collapseDynamicPages: true, + scope: 'site', + includePaths: [], + excludePaths: [], + deniedPathSegments: ['callback', 'callbacks', 'logout', 'signout', 'sign_out', 'destroy', 'delete', 'remove'], + minCanActions: 1, + minInteractiveElements: 3, + // prompt: 'Add domain-specific documentation guidance here', + }, + }; + `; + + fs.writeFileSync(configPath, configContent, 'utf8'); + console.log(`Created: ${configPath}`); + + if (process.cwd() !== originalCwd) { + process.chdir(originalCwd); + } + }); + + return cmd; +} diff --git a/boat/doc-collector/src/config.ts b/boat/doc-collector/src/config.ts new file mode 100644 index 0000000..79ed513 --- /dev/null +++ b/boat/doc-collector/src/config.ts @@ -0,0 
+1,162 @@ +import { existsSync, readFileSync } from 'node:fs'; +import path, { resolve } from 'node:path'; +import { parseEnv } from 'node:util'; +import { ConfigParser } from '../../../src/config.ts'; + +class DocbotConfigParser { + private static instance: DocbotConfigParser; + private config: DocbotConfig | null = null; + private configPath: string | null = null; + + private constructor() {} + + static getInstance(): DocbotConfigParser { + if (!DocbotConfigParser.instance) { + DocbotConfigParser.instance = new DocbotConfigParser(); + } + return DocbotConfigParser.instance; + } + + static loadEnv(filePath: string): void { + const resolved = resolve(filePath); + if (!existsSync(resolved)) return; + Object.assign(process.env, parseEnv(readFileSync(resolved, 'utf8'))); + } + + async loadConfig(options?: { config?: string; path?: string }): Promise { + if (this.config && !options?.config && !options?.path) { + return this.config; + } + + const originalCwd = process.cwd(); + if (options?.path) { + process.chdir(resolve(options.path)); + } + + DocbotConfigParser.loadEnv('.env'); + + try { + const resolvedPath = options?.config || this.findConfigFile(); + if (!resolvedPath) { + this.config = this.mergeWithDefaults({}); + this.configPath = null; + return this.config; + } + + const configModule = await this.loadConfigModule(resolvedPath); + const loadedConfig = configModule.default || configModule; + this.config = this.mergeWithDefaults(loadedConfig || {}); + this.configPath = resolvedPath; + return this.config; + } finally { + if (options?.path && originalCwd !== process.cwd()) { + process.chdir(originalCwd); + } + } + } + + getConfig(): DocbotConfig { + if (this.config) { + return this.config; + } + return this.mergeWithDefaults({}); + } + + getConfigPath(): string | null { + return this.configPath; + } + + getOutputDir(): string { + const outputDir = ConfigParser.getInstance().getOutputDir(); + const docsOutput = this.getConfig().docs?.output; + if (!docsOutput) { + 
return path.join(outputDir, 'docs'); + } + return path.join(outputDir, docsOutput); + } + + private findConfigFile(): string | null { + const possiblePaths = ['docbot.config.js', 'docbot.config.mjs', 'docbot.config.ts']; + + for (const candidate of possiblePaths) { + const fullPath = resolve(process.cwd(), candidate); + if (existsSync(fullPath)) { + return fullPath; + } + } + + return null; + } + + private async loadConfigModule(configPath: string): Promise { + const ext = configPath.split('.').pop(); + + if (ext === 'ts') { + try { + return await import(configPath); + } catch { + const require = (await import('node:module')).createRequire(import.meta.url); + return require(configPath); + } + } + + if (ext === 'js' || ext === 'mjs') { + return await import(configPath); + } + + return JSON.parse(readFileSync(configPath, 'utf8')); + } + + private mergeWithDefaults(config: Partial): DocbotConfig { + return this.deepMerge( + { + docs: { + maxPages: 100, + output: 'docs', + screenshot: true, + collapseDynamicPages: true, + scope: 'site', + includePaths: [], + excludePaths: [], + deniedPathSegments: ['callback', 'callbacks', 'logout', 'signout', 'sign_out', 'destroy', 'delete', 'remove'], + minCanActions: 1, + minInteractiveElements: 3, + }, + }, + config + ); + } + + private deepMerge(target: any, source: any): any { + const result = { ...target }; + + for (const key in source) { + if (source[key] && typeof source[key] === 'object' && !Array.isArray(source[key]) && source[key].constructor === Object) { + result[key] = this.deepMerge(result[key] || {}, source[key]); + continue; + } + result[key] = source[key]; + } + + return result; + } +} + +interface DocbotConfig { + docs?: { + maxPages?: number; + output?: string; + screenshot?: boolean; + prompt?: string; + collapseDynamicPages?: boolean; + scope?: 'site' | 'section' | 'subtree'; + includePaths?: string[]; + excludePaths?: string[]; + deniedPathSegments?: string[]; + minCanActions?: number; + minInteractiveElements?: 
number; + }; +} + +export { DocbotConfigParser }; +export type { DocbotConfig }; diff --git a/boat/doc-collector/src/docbot.ts b/boat/doc-collector/src/docbot.ts new file mode 100644 index 0000000..70d03f0 --- /dev/null +++ b/boat/doc-collector/src/docbot.ts @@ -0,0 +1,391 @@ +import { existsSync, mkdirSync, writeFileSync } from 'node:fs'; +import path from 'node:path'; +import { ExplorBot, type ExplorBotOptions } from '../../../src/explorbot.ts'; +import type { Link, WebPageState } from '../../../src/state-manager.ts'; +import { normalizeUrl } from '../../../src/state-manager.ts'; +import { sanitizeFilename } from '../../../src/utils/strings.ts'; +import { tag } from '../../../src/utils/logger.ts'; +import { Documentarian, type PageDocumentation } from './ai/documentarian.ts'; +import { type DocbotConfig, DocbotConfigParser } from './config.ts'; +import { type DocumentedPage, renderPageDocumentation, renderSpecIndex, type SkippedPage } from './docs-renderer.ts'; +import { getDocPageKey, shouldCrawlDocPath } from './path-filter.ts'; +import { extractResearchNavigationTargets } from './research-navigation.ts'; + +class DocBot { + private explorBot: ExplorBot; + private configParser: DocbotConfigParser; + private config: DocbotConfig = {}; + private documentarian!: Documentarian; + private options: DocbotOptions; + private scopeRoot = '/'; + + constructor(options: DocbotOptions = {}) { + this.options = options; + const baseUrl = this.extractAbsoluteBaseUrl(options.startUrl || '/'); + this.explorBot = new ExplorBot({ + baseUrl, + verbose: options.verbose, + config: options.config, + path: options.path, + show: options.show, + headless: options.headless, + incognito: options.incognito, + session: options.session, + }); + this.configParser = DocbotConfigParser.getInstance(); + } + + async start(): Promise { + await this.explorBot.start(); + this.config = await this.configParser.loadConfig({ + config: this.options.docsConfig, + path: this.options.path, + }); + 
this.documentarian = new Documentarian(this.explorBot.getProvider(), this.config); + this.ensureDirectory(this.configParser.getOutputDir()); + this.ensureDirectory(this.getPagesDir()); + } + + async stop(): Promise { + await this.explorBot.stop(); + } + + async collect(startPath: string, opts: CollectOptions = {}): Promise { + const effectiveStartPath = this.normalizeStartPath(startPath); + this.scopeRoot = this.getScopeRoot(effectiveStartPath); + const effectiveMaxPages = this.getMaxPages(opts.maxPages); + const queue: string[] = []; + const queued = new Set(); + const documented = new Set(); + const pages: DocumentedPage[] = []; + const skipped: SkippedPage[] = []; + const baseUrl = this.explorBot.getConfig().playwright.url; + + this.enqueuePath(effectiveStartPath, queue, queued); + + while (queue.length > 0 && pages.length < effectiveMaxPages) { + const target = queue.shift(); + if (!target) { + continue; + } + + const targetKey = this.getPageKey(target); + if (documented.has(targetKey)) { + continue; + } + + const stateManager = this.explorBot.getExplorer().getStateManager(); + if (stateManager.hasVisitedState(target)) { + continue; + } + + try { + tag('info').log(`Collecting docs for ${this.toDisplayUrl(target, baseUrl)}`); + await this.explorBot.visit(target); + + if (stateManager.isInDeadLoop()) { + tag('warning').log('Dead loop detected during docs crawl, stopping collection'); + skipped.push({ + url: target, + reason: 'dead loop detected during crawl', + }); + break; + } + + const state = this.explorBot.getCurrentState(); + if (!state) { + skipped.push({ + url: target, + reason: 'page state was not captured after navigation', + }); + continue; + } + + const pageKey = this.getPageKey(state.url || target); + if (documented.has(pageKey)) { + continue; + } + + const research = await this.explorBot.agentResearcher().research(state, { + screenshot: this.shouldUseScreenshots(), + force: true, + }); + const documentation = await this.documentarian.document(state, 
research); + const lowSignalReason = this.getLowSignalReason(documentation, research); + if (lowSignalReason) { + skipped.push({ + url: state.url, + reason: lowSignalReason, + }); + documented.add(pageKey); + continue; + } + const filePath = this.savePageDocumentation(state, documentation); + + pages.push({ + url: state.url, + title: state.title || '', + summary: documentation.summary, + canCount: documentation.can.length, + mightCount: documentation.might.length, + canActions: documentation.can.map((item) => item.action), + mightActions: documentation.might.map((item) => item.action), + filePath, + }); + documented.add(pageKey); + + const nextPaths = this.extractNextPaths(state, baseUrl, research); + for (const nextPath of nextPaths) { + if (documented.has(this.getPageKey(nextPath))) { + continue; + } + if (stateManager.hasVisitedState(nextPath)) { + continue; + } + this.enqueuePath(nextPath, queue, queued); + } + } catch (error) { + const reason = error instanceof Error ? error.message : String(error); + tag('warning').log(`Skipping ${target}: ${reason}`); + skipped.push({ + url: target, + reason, + }); + } + } + + const indexPath = this.saveIndex(effectiveStartPath, pages, skipped, effectiveMaxPages); + + return { + pages, + skipped, + indexPath, + outputDir: this.configParser.getOutputDir(), + }; + } + + private getMaxPages(override?: number): number { + if (override && override > 0) { + return override; + } + + const configured = this.config.docs?.maxPages; + if (configured && configured > 0) { + return configured; + } + + return 100; + } + + private shouldUseScreenshots(): boolean { + const screenshot = this.config.docs?.screenshot; + if (screenshot === false) { + return false; + } + return true; + } + + private extractNextPaths(state: WebPageState, baseUrl: string, research: string): string[] { + const paths: string[] = []; + const seen = new Set(); + + for (const link of state.links || []) { + const nextPath = this.resolveLink(link, baseUrl); + if 
(!nextPath) { + continue; + } + if (!shouldCrawlDocPath(nextPath, this.config)) { + continue; + } + if (!this.isInScope(nextPath)) { + continue; + } + if (seen.has(nextPath)) { + continue; + } + seen.add(nextPath); + paths.push(nextPath); + } + + for (const target of extractResearchNavigationTargets(state, research)) { + if (!shouldCrawlDocPath(target, this.config)) { + continue; + } + if (!this.isInScope(target)) { + continue; + } + if (seen.has(target)) { + continue; + } + seen.add(target); + paths.push(target); + } + + return paths; + } + + private resolveLink(link: Link, baseUrl: string): string | null { + let resolved: URL; + + try { + resolved = new URL(link.url, baseUrl); + } catch { + return null; + } + + const base = new URL(baseUrl); + if (resolved.origin !== base.origin) { + return null; + } + + const pathName = resolved.pathname || '/'; + return `${pathName}${resolved.search}${resolved.hash}`; + } + + private toDisplayUrl(target: string, baseUrl: string): string { + try { + return new URL(target, baseUrl).toString(); + } catch { + return target; + } + } + + private enqueuePath(inputPath: string, queue: string[], queued: Set): void { + const normalized = normalizeUrl(inputPath); + const pageKey = this.getPageKey(inputPath); + if (queued.has(pageKey)) { + return; + } + queued.add(pageKey); + if (!inputPath.startsWith('/')) { + queue.push(`/${normalized}`); + return; + } + queue.push(inputPath); + } + + private getPageKey(pageUrl: string): string { + return getDocPageKey(pageUrl, this.config); + } + + private normalizeStartPath(startPath: string): string { + try { + const parsed = new URL(startPath); + return `${parsed.pathname || '/'}${parsed.search}${parsed.hash}`; + } catch { + return startPath; + } + } + + private extractAbsoluteBaseUrl(startPath: string): string | undefined { + try { + const parsed = new URL(startPath); + return parsed.origin; + } catch { + return undefined; + } + } + + private isInScope(target: string): boolean { + const normalized = 
this.normalizeStartPath(target); + const scope = this.config.docs?.scope || 'site'; + + if (scope === 'site') { + return true; + } + + if (scope === 'subtree') { + return normalized === this.scopeRoot || normalized.startsWith(`${this.scopeRoot}/`); + } + + if (scope === 'section') { + return normalized === this.scopeRoot || normalized.startsWith(`${this.scopeRoot}/`) || normalized.startsWith(`${this.scopeRoot}-`); + } + + return true; + } + + private getScopeRoot(startPath: string): string { + const normalized = this.normalizeStartPath(startPath); + const parts = normalized.split('/').filter(Boolean); + if (parts.length === 0) { + return '/'; + } + if (parts.length >= 4) { + return `/${parts.slice(0, 4).join('/')}`; + } + return `/${parts.join('/')}`; + } + + private getLowSignalReason(documentation: PageDocumentation, research: string): string | null { + const minCanActions = this.config.docs?.minCanActions ?? 1; + const minInteractiveElements = this.config.docs?.minInteractiveElements ?? 
3; + + if (documentation.can.length >= minCanActions) { + return null; + } + + const interactiveCount = this.countInteractiveElements(research); + if (interactiveCount >= minInteractiveElements) { + return null; + } + + return `low-signal page: only ${documentation.can.length} proven actions and ${interactiveCount} interactive elements`; + } + + private countInteractiveElements(research: string): number { + const matches = [...research.matchAll(/\((\d+) elements?\)/g)]; + return matches.reduce((sum, match) => sum + Number.parseInt(match[1], 10), 0); + } + + private savePageDocumentation(state: WebPageState, documentation: PageDocumentation): string { + const pagePath = this.getPageFilePath(state.url); + writeFileSync(pagePath, renderPageDocumentation(state, documentation), 'utf8'); + return pagePath; + } + + private saveIndex(startPath: string, pages: DocumentedPage[], skipped: SkippedPage[], maxPages: number): string { + const indexPath = path.join(this.configParser.getOutputDir(), 'spec.md'); + writeFileSync(indexPath, renderSpecIndex(this.configParser.getOutputDir(), startPath, pages, skipped, maxPages), 'utf8'); + return indexPath; + } + + private getPagesDir(): string { + return path.join(this.configParser.getOutputDir(), 'pages'); + } + + private getPageFilePath(pageUrl: string): string { + const normalized = normalizeUrl(pageUrl || '/'); + const baseName = sanitizeFilename(normalized || 'root'); + if (baseName) { + return path.join(this.getPagesDir(), `${baseName}.md`); + } + return path.join(this.getPagesDir(), 'root.md'); + } + + private ensureDirectory(dirPath: string): void { + if (existsSync(dirPath)) { + return; + } + mkdirSync(dirPath, { recursive: true }); + } +} + +interface DocbotOptions extends ExplorBotOptions { + docsConfig?: string; + startUrl?: string; +} + +interface CollectOptions { + maxPages?: number; +} + +interface CollectionResult { + pages: DocumentedPage[]; + skipped: SkippedPage[]; + indexPath: string; + outputDir: string; +} + 
+export { DocBot }; +export type { DocbotOptions, CollectOptions, CollectionResult, DocumentedPage, SkippedPage }; diff --git a/boat/doc-collector/src/docs-renderer.ts b/boat/doc-collector/src/docs-renderer.ts new file mode 100644 index 0000000..0be4736 --- /dev/null +++ b/boat/doc-collector/src/docs-renderer.ts @@ -0,0 +1,187 @@ +import path from 'node:path'; +import type { WebPageState } from '../../../src/state-manager.ts'; +import type { PageDocumentation } from './ai/documentarian.ts'; + +function renderPageDocumentation(state: WebPageState, documentation: PageDocumentation): string { + const lines: string[] = []; + lines.push(`# ${state.url}`); + lines.push(''); + + if (state.title) { + lines.push(`Title: ${normalizeInlineText(state.title)}`); + lines.push(''); + } + + lines.push('## Purpose'); + lines.push(''); + lines.push(ensureSentence(documentation.summary)); + lines.push(''); + lines.push('## User Can'); + lines.push(''); + + if (documentation.can.length === 0) { + lines.push('- No proven actions were identified from the collected research.'); + lines.push(''); + } + + for (const item of documentation.can) { + lines.push(`- ${normalizeAction(item.action)} -> ${item.scope}`); + lines.push(` Proof: ${ensureSentence(item.evidence)}`); + } + + if (documentation.can.length > 0) { + lines.push(''); + } + + lines.push('## User Might'); + lines.push(''); + + if (documentation.might.length === 0) { + lines.push('- No assumption-based actions were identified.'); + lines.push(''); + } + + for (const item of documentation.might) { + lines.push(`- ${normalizeAction(item.action, 'might')} -> ${item.scope}`); + lines.push(` Signal: ${ensureSentence(item.evidence)}`); + } + + if (documentation.might.length > 0) { + lines.push(''); + } + + return `${lines.join('\n').trimEnd()}\n`; +} + +function renderSpecIndex(outputDir: string, startPath: string, pages: DocumentedPage[], skipped: SkippedPage[], maxPages: number): string { + const lines: string[] = []; + lines.push('# 
Website Spec'); + lines.push(''); + lines.push('## Overview'); + lines.push(''); + lines.push(`Start page: ${startPath}`); + lines.push(`Pages documented: ${pages.length}`); + lines.push(`Pages skipped: ${skipped.length}`); + lines.push(`Max pages: ${maxPages}`); + lines.push(''); + lines.push('## Pages'); + lines.push(''); + + if (pages.length === 0) { + lines.push('- No pages were documented.'); + lines.push(''); + } + + for (const page of pages) { + const relativeFile = path.relative(outputDir, page.filePath).replaceAll('\\', '/'); + lines.push(`### [${page.url}](${relativeFile})`); + lines.push(''); + lines.push(`Purpose: ${ensureSentence(page.summary)}`); + lines.push(`Proven actions: ${page.canCount}`); + lines.push(`Possible actions: ${page.mightCount}`); + if (page.title) { + lines.push(`Title: ${normalizeInlineText(page.title)}`); + } + lines.push(''); + + if (page.canActions.length > 0) { + lines.push('User Can:'); + for (const action of page.canActions) { + lines.push(`- ${normalizeAction(action, 'can')}`); + } + lines.push(''); + } + + if (page.mightActions.length > 0) { + lines.push('User Might:'); + for (const action of page.mightActions) { + lines.push(`- ${normalizeAction(action, 'might')}`); + } + lines.push(''); + } + } + + if (skipped.length > 0) { + lines.push('## Skipped'); + lines.push(''); + + for (const page of skipped) { + lines.push(`- ${page.url}. 
Reason: ${ensureSentence(page.reason)}`); + } + + lines.push(''); + } + + return `${lines.join('\n').trimEnd()}\n`; +} + +function normalizeAction(action: string, kind: 'can' | 'might' = 'can'): string { + const trimmed = normalizeInlineText(action); + if (!trimmed) { + return 'user can interact with this page'; + } + + const normalized = ensureSentence(trimmed).slice(0, -1); + const lower = normalized.toLowerCase(); + + if (kind === 'can') { + if (lower.startsWith('user can ')) { + return normalized; + } + if (lower.startsWith('can ')) { + return `user can ${normalized.slice(4)}`; + } + if (lower.startsWith('user might ')) { + return `user can ${normalized.slice(11)}`; + } + return `user can ${normalized}`; + } + + if (lower.startsWith('user might ')) { + return normalized; + } + if (lower.startsWith('might ')) { + return `user might ${normalized.slice(6)}`; + } + if (lower.startsWith('user can ')) { + return `user might ${normalized.slice(9)}`; + } + if (lower.startsWith('can ')) { + return `user might ${normalized.slice(4)}`; + } + return `user might ${normalized}`; +} + +function ensureSentence(text: string): string { + const trimmed = normalizeInlineText(text); + if (!trimmed) { + return ''; + } + if (/[.!?]$/.test(trimmed)) { + return trimmed; + } + return `${trimmed}.`; +} + +function normalizeInlineText(text: string): string { + return text.normalize('NFKC').replace(/\s+/g, ' ').trim(); +} + +interface DocumentedPage { + url: string; + title: string; + summary: string; + canCount: number; + mightCount: number; + canActions: string[]; + mightActions: string[]; + filePath: string; +} + +interface SkippedPage { + url: string; + reason: string; +} + +export { renderPageDocumentation, renderSpecIndex, ensureSentence, normalizeAction }; +export type { DocumentedPage, SkippedPage }; diff --git a/boat/doc-collector/src/path-filter.ts b/boat/doc-collector/src/path-filter.ts new file mode 100644 index 0000000..a2fda2c --- /dev/null +++ 
b/boat/doc-collector/src/path-filter.ts @@ -0,0 +1,46 @@ +import { normalizeUrl } from '../../../src/state-manager.ts'; +import { matchesUrl, generalizeUrl } from '../../../src/utils/url-matcher.ts'; +import type { DocbotConfig } from './config.ts'; + +const DEFAULT_DENIED_PATH_SEGMENTS = ['callback', 'callbacks', 'logout', 'signout', 'sign_out', 'destroy', 'delete', 'remove']; + +export function shouldCrawlDocPath(nextPath: string, config: DocbotConfig = {}): boolean { + const parsed = new URL(nextPath, 'http://localhost'); + const segments = parsed.pathname + .split('/') + .map((segment) => segment.trim().toLowerCase()) + .filter(Boolean); + const normalizedPath = parsed.pathname || '/'; + + const includePaths = config.docs?.includePaths || []; + if (includePaths.length > 0) { + return includePaths.some((pattern) => matchesUrl(pattern, normalizedPath)); + } + + const excludePaths = config.docs?.excludePaths || []; + if (excludePaths.some((pattern) => matchesUrl(pattern, normalizedPath))) { + return false; + } + + if (segments.length === 0) { + return true; + } + + const terminalActions = new Set((config.docs?.deniedPathSegments || DEFAULT_DENIED_PATH_SEGMENTS).map((segment) => segment.trim().toLowerCase()).filter(Boolean)); + if (segments.some((segment) => terminalActions.has(segment))) { + return false; + } + + return true; +} + +export function getDocPageKey(pageUrl: string, config: DocbotConfig = {}): string { + const normalized = normalizeUrl(pageUrl || '/'); + const path = normalized.startsWith('/') ? 
normalized : `/${normalized}`; + + if (config.docs?.collapseDynamicPages === false) { + return normalizeUrl(path); + } + + return normalizeUrl(generalizeUrl(path)); +} diff --git a/boat/doc-collector/src/research-navigation.ts b/boat/doc-collector/src/research-navigation.ts new file mode 100644 index 0000000..e130a0d --- /dev/null +++ b/boat/doc-collector/src/research-navigation.ts @@ -0,0 +1,90 @@ +import type { WebPageState } from '../../../src/state-manager.ts'; +import { parseResearchSections, type ResearchElement } from '../../../src/ai/researcher/parser.ts'; + +const OPEN_API_TAG_SELECTOR_PATTERN = /api-\d+\/tag\/([a-z0-9-]+)(?:["'#/\]\s]|$)/i; +const OPEN_API_NAVIGATION_SECTION_KEYWORDS = ['navigation', 'menu']; + +export function extractResearchNavigationTargets(state: WebPageState, research: string): string[] { + const currentUrl = state.url || '/'; + const sections = parseResearchSections(research); + const targets: string[] = []; + const seen = new Set(); + + for (const section of sections) { + const sectionName = section.name.toLowerCase(); + if (!OPEN_API_NAVIGATION_SECTION_KEYWORDS.some((keyword) => sectionName.includes(keyword))) { + continue; + } + + for (const element of section.elements) { + const target = extractNavigationTarget(currentUrl, element); + if (!target || seen.has(target)) { + continue; + } + + seen.add(target); + targets.push(target); + } + } + + return targets; +} + +function extractNavigationTarget(currentUrl: string, element: ResearchElement): string | null { + const openApiTagFromCss = extractOpenApiTagHashFromCss(element.css); + if (openApiTagFromCss) { + return buildSamePageHashTarget(currentUrl, openApiTagFromCss); + } + + if (!currentUrl.includes('#tag/')) { + return null; + } + + const inferredOpenApiTag = inferOpenApiTagSlugFromLabel(element.name); + if (!inferredOpenApiTag) { + return null; + } + + return buildSamePageHashTarget(currentUrl, `tag/${inferredOpenApiTag}`); +} + +function extractOpenApiTagHashFromCss(css: 
string | null): string | null { + if (!css) { + return null; + } + + const normalizedSelector = css.replaceAll('\\/', '/'); + const match = normalizedSelector.match(OPEN_API_TAG_SELECTOR_PATTERN); + if (!match?.[1]) { + return null; + } + + return `tag/${match[1].toLowerCase()}`; +} + +function inferOpenApiTagSlugFromLabel(name: string): string | null { + const cleanedLabel = name + .replace(/^'+|'+$/g, '') + .replace(/\(expanded\)|\(collapsed\)|open group|close group|show more/gi, '') + .trim(); + + if (!cleanedLabel.includes('/')) { + return null; + } + + const slug = cleanedLabel + .split('/') + .map((part) => part.trim().toLowerCase()) + .filter(Boolean) + .join('-') + .replace(/[^a-z0-9-]+/g, '-') + .replace(/-+/g, '-') + .replace(/^-|-$/g, ''); + + return slug || null; +} + +function buildSamePageHashTarget(currentUrl: string, hashPath: string): string { + const [baseWithSearch] = currentUrl.split('#'); + return `${baseWithSearch}#${hashPath}`; +} diff --git a/bun.lock b/bun.lock index 55e37da..613fe7a 100644 --- a/bun.lock +++ b/bun.lock @@ -22,7 +22,7 @@ "@opentelemetry/sdk-trace-base": "^2.2.0", "@opentelemetry/semantic-conventions": "^1.38.0", "@scalar/openapi-parser": "^0.25.6", - "@testomatio/reporter": "^2.7.9-beta.2-markdown", + "@testomatio/reporter": "^2.7.9-beta.3-markdown", "ai": "^6.0.6", "axe-core": "^4.11.1", "bash-tool": "^1.3.15", @@ -934,7 +934,7 @@ "@testing-library/react": ["@testing-library/react@16.3.0", "", { "dependencies": { "@babel/runtime": "^7.12.5" }, "peerDependencies": { "@testing-library/dom": "^10.0.0", "@types/react": "^18.0.0 || ^19.0.0", "@types/react-dom": "^18.0.0 || ^19.0.0", "react": "^18.0.0 || ^19.0.0", "react-dom": "^18.0.0 || ^19.0.0" }, "optionalPeers": ["@types/react", "@types/react-dom"] }, "sha512-kFSyxiEDwv1WLl2fgsq6pPBbw5aWKrsY2/noi1Id0TK0UParSF62oFQFGHXIyaG4pp2tEub/Zlel+fjjZILDsw=="], - "@testomatio/reporter": ["@testomatio/reporter@2.7.9-beta.2-markdown", "", { "dependencies": { "@aws-sdk/client-s3": 
"^3.279.0", "@aws-sdk/lib-storage": "^3.279.0", "@cucumber/cucumber": "^10.9.0", "@octokit/rest": "^21.1.1", "callsite-record": "^4.1.4", "commander": "^12", "cross-spawn": "^7.0.3", "csv-writer": "^1.6.0", "debug": "4.3.4", "dotenv": "^16.0.1", "fast-xml-parser": "^5.3.4", "file-url": "3.0.0", "filesize": "^10.1.6", "gaxios": ">=6.0 || >=7.0.0-rc.4 || <8", "glob": "^10.3", "handlebars": "^4.7.8", "has-flag": "^5.0.1", "humanize-duration": "^3.27.3", "is-valid-path": "^0.1.1", "js-yaml": "^4.1.1", "json-cycle": "^1.3.0", "lodash.memoize": "^4.1.2", "lodash.merge": "^4.6.2", "minimatch": "^10.2.4", "picocolors": "^1.0.1", "pretty-ms": "^7.0.1", "promise-retry": "^2.0.1", "strip-ansi": "7.1.0", "uuid": "^9.0.0" }, "bin": { "report-xml": "src/bin/reportXml.js", "start-test-run": "src/bin/startTest.js", "upload-artifacts": "src/bin/uploadArtifacts.js", "reporter": "src/bin/cli.js" } }, "sha512-RBZAN/Je4FwDUkCv07BWgbkhllwcesd3C7dNGgUhbJ/5c+qITInZuf7GAGMPLwCJJm3JJjkfVcLkX+Injw5ppA=="], + "@testomatio/reporter": ["@testomatio/reporter@2.7.9-beta.3-markdown", "", { "dependencies": { "@aws-sdk/client-s3": "^3.279.0", "@aws-sdk/lib-storage": "^3.279.0", "@cucumber/cucumber": "^10.9.0", "@octokit/rest": "^21.1.1", "callsite-record": "^4.1.4", "commander": "^12", "cross-spawn": "^7.0.3", "csv-writer": "^1.6.0", "debug": "4.3.4", "dotenv": "^16.0.1", "fast-xml-parser": "^5.3.4", "file-url": "3.0.0", "filesize": "^10.1.6", "gaxios": ">=6.0 || >=7.0.0-rc.4 || <8", "glob": "^10.3", "handlebars": "^4.7.8", "has-flag": "^5.0.1", "humanize-duration": "^3.27.3", "is-valid-path": "^0.1.1", "js-yaml": "^4.1.1", "json-cycle": "^1.3.0", "lodash.memoize": "^4.1.2", "lodash.merge": "^4.6.2", "marked": "^14.1.4", "minimatch": "^10.2.4", "picocolors": "^1.0.1", "pretty-ms": "^7.0.1", "promise-retry": "^2.0.1", "strip-ansi": "7.1.0", "uuid": "^9.0.0" }, "bin": { "report-xml": "src/bin/reportXml.js", "start-test-run": "src/bin/startTest.js", "upload-artifacts": "src/bin/uploadArtifacts.js", 
"reporter": "src/bin/cli.js" } }, "sha512-up5EWx9WV9AX+jYwStCXaXyPOH4DDTtsiJ218KVDTUVpAnim563dEm6DCgYznMJelmahKs1/Bi+J8Duyu7JfOg=="], "@tokenizer/inflate": ["@tokenizer/inflate@0.4.1", "", { "dependencies": { "debug": "^4.4.3", "token-types": "^6.1.1" } }, "sha512-2mAv+8pkG6GIZiF1kNg1jAjh27IDxEPKwdGul3snfztFerfPGI1LjDezZp3i7BElXompqEtPmoPx6c2wgtWsOA=="], @@ -2848,6 +2848,8 @@ "@testomatio/reporter/js-yaml": ["js-yaml@4.1.1", "", { "dependencies": { "argparse": "^2.0.1" }, "bin": { "js-yaml": "bin/js-yaml.js" } }, "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA=="], + "@testomatio/reporter/marked": ["marked@14.1.4", "", { "bin": { "marked": "bin/marked.js" } }, "sha512-vkVZ8ONmUdPnjCKc5uTRvmkRbx4EAi2OkTOXmfTDhZz3OFqMNBM1oTTWwTr4HY4uAEojhzPf+Fy8F1DWa3Sndg=="], + "@testomatio/reporter/strip-ansi": ["strip-ansi@7.1.0", "", { "dependencies": { "ansi-regex": "^6.0.1" } }, "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ=="], "@types/jsdom/parse5": ["parse5@7.3.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw=="], diff --git a/docs/commands.md b/docs/commands.md index 3aa791c..bb4f79c 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -23,7 +23,7 @@ CLI commands run headless by default, execute the task, and exit. TUI commands r ## Common Options -These options are available on all CLI commands (`start`, `explore`, `plan`, `drill`, `research`, `context`): +These options are available on all CLI commands (`start`, `explore`, `plan`, `drill`, `research`, `context`, `docs collect`): | Option | Description | |--------|-------------| @@ -274,6 +274,36 @@ Navigate to a URI or state using AI assistance. The Navigator agent figures out how to reach the destination. 
+## Documentation Collection + +### `explorbot docs collect <path>` + +Crawl pages and generate a documentation spec with `Purpose`, `User Can`, and `User Might` sections for each documented page. + +```bash +explorbot docs collect /users/sign_in +explorbot docs collect /docs/openapi#tag/project-analytics-tags --max-pages 20 +explorbot docs collect https://teleportal.ua/ua/serials/stb/kod --path explorbot-testing --show --session --max-pages 20 +``` + +Output is written to: + +- `output/docs/spec.md` +- `output/docs/pages/*.md` + +Use `docbot.config.*` to control crawl scope, path filters, dynamic-page collapsing, and low-signal page skipping. + +See [Documentation Collection](./doc-collector.md) for full configuration, crawl modes, and examples. + +### `explorbot docs init` + +Create a starter `docbot.config.ts` file. + +```bash +explorbot docs init +explorbot docs init --path explorbot-testing +``` + ## Test Rerun ### `/runs [file]` diff --git a/docs/doc-collector.md b/docs/doc-collector.md new file mode 100644 index 0000000..2059246 --- /dev/null +++ b/docs/doc-collector.md @@ -0,0 +1,140 @@ +# Documentation Collection + +`doc-collector` crawls pages and generates a lightweight spec: + +- `output/docs/spec.md` +- `output/docs/pages/*.md` +- `output/research/*.md` + +Each page is summarized as: + +- `Purpose` +- `User Can` +- `User Might` + +## Commands + +### `explorbot docs collect <path>` + +Start from a relative path or a full URL: + +```bash +explorbot docs collect /users/sign_in +explorbot docs collect /docs/openapi#tag/project-analytics-tags --max-pages 20 +explorbot docs collect https://teleportal.ua/ua/serials/stb/kod --path explorbot-testing --show --session --max-pages 20 +``` + +Supported options: + +| Option | Description | +|--------|-------------| +| `--max-pages <n>` | Limit documented pages | +| `-c, --config <path>` | Path to `explorbot.config.*` | +| `--docs-config <path>` | Path to `docbot.config.*` | +| `-p, --path <path>` | Working directory | +| `-s, --show` | Show browser 
window | +| `--headless` | Run headless | +| `--incognito` | Do not record experiences | +| `--session [file]` | Save or restore browser session | +| `-v, --verbose` | Verbose logging | +| `--debug` | Debug logging | + +If you pass a full URL, its origin is used as the runtime base URL for that run. + +### `explorbot docs init` + +Create a starter `docbot.config.ts`: + +```bash +explorbot docs init +explorbot docs init --path explorbot-testing +explorbot docs init --path explorbot-testing --force +``` + +### Standalone CLI + +```bash +bun boat/doc-collector/bin/doc-collector-cli.ts collect /users/sign_in --max-pages 20 +``` + +## Config + +The collector loads `docbot.config.js`, `docbot.config.mjs`, or `docbot.config.ts`. If none exists, defaults are used. + +```ts +export default { + docs: { + maxPages: 100, + output: 'docs', + screenshot: true, + collapseDynamicPages: true, + scope: 'site', + includePaths: [], + excludePaths: [], + deniedPathSegments: ['callback', 'callbacks', 'logout', 'signout', 'sign_out', 'destroy', 'delete', 'remove'], + minCanActions: 1, + minInteractiveElements: 3, + // prompt: 'Add domain-specific guidance here', + }, +}; +``` + +| Option | Default | Description | +|--------|---------|-------------| +| `maxPages` | `100` | Maximum pages to document | +| `output` | `'docs'` | Output folder inside `output/` | +| `screenshot` | `true` | Allow screenshot-assisted research | +| `prompt` | unset | Extra instructions for the Documentarian | +| `collapseDynamicPages` | `true` | Collapse dynamic URLs like `/users/123` and `/users/456` into one crawl key | +| `scope` | `'site'` | Crawl breadth mode | +| `includePaths` | `[]` | Only allow matching paths | +| `excludePaths` | `[]` | Exclude matching paths | +| `deniedPathSegments` | built-in list | Block terminal or destructive endpoints | +| `minCanActions` | `1` | Minimum proven actions before a page is considered low-signal | +| `minInteractiveElements` | `3` | Minimum interactive elements before 
a page is considered low-signal | + +## Scope Modes + +### `site` + +Crawl across the whole current origin. + +### `subtree` + +Stay inside the starting path and its descendants. + +Start page: + +```text +/ua/serials/stb/kod +``` + +Allowed: + +- `/ua/serials/stb/kod` +- `/ua/serials/stb/kod/2026` +- `/ua/serials/stb/kod/2025/seriya-12` + +Blocked: + +- `/ua/serials` +- `/ua/show` +- `/ua/person/...` + +### `section` + +Softer boundary than `subtree`: keep the same scope root, its descendants, and closely related slug variations. + +## Notes + +- same-origin only +- visited pages are tracked through the state manager +- dead loops are stopped +- next targets are discovered from links and research navigation +- low-signal pages can be skipped + +## Related Docs + +- [commands.md](./commands.md) - terminal command reference +- [configuration.md](./configuration.md) - main Explorbot configuration +- [researcher.md](./researcher.md) - researcher behavior diff --git a/package.json b/package.json index 120d2d8..baf45dd 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,7 @@ "bin": { "explorbot": "./dist/bin/explorbot-cli.js" }, - "files": ["dist/", "src/**/*.ts", "src/**/*.tsx", "bin/**/*.ts", "boat/api-tester/src/**/*.ts", "rules/", "assets/sample-files/"], + "files": ["dist/", "src/**/*.ts", "src/**/*.tsx", "bin/**/*.ts", "boat/api-tester/src/**/*.ts", "boat/doc-collector/src/**/*.ts", "boat/doc-collector/bin/**/*.ts", "boat/doc-collector/package.json", "rules/", "assets/sample-files/"], "scripts": { "build": "bun run src/index.tsx build && bun run build:bin", "build:bin": "bun build bin/explorbot-cli.ts --outdir bin --target node --external commander --format esm", diff --git a/src/ai/researcher.ts b/src/ai/researcher.ts index be503f8..d6708ae 100644 --- a/src/ai/researcher.ts +++ b/src/ai/researcher.ts @@ -121,7 +121,8 @@ export class Researcher extends ResearcherBase implements Agent { const sessionName = `researcher: ${state.url}`; return 
Observability.run(sessionName, { tags: ['researcher'], sessionId: stateHash }, async () => { - tag('info').log(`Researching ${state.url} to understand the context...`); + const displayUrl = state.fullUrl || state.url; + tag('info').log(`Researching ${displayUrl} to understand the context...`); setActivity(`${this.emoji} Researching...`, 'action'); await this.ensureNavigated(state.url, screenshot && this.provider.hasVision()); diff --git a/src/ai/researcher/coordinates.ts b/src/ai/researcher/coordinates.ts index b4d9b43..9a45f04 100644 --- a/src/ai/researcher/coordinates.ts +++ b/src/ai/researcher/coordinates.ts @@ -198,7 +198,7 @@ export function WithCoordinates(Base: T) { const eidxWithoutCoords: string[] = []; for (const section of sections) { for (const el of section.elements) { - if (el.eidx && !el.coordinates) eidxWithoutCoords.push(el.eidx); + if (el.eidx && /^e\d+$/i.test(el.eidx) && !el.coordinates) eidxWithoutCoords.push(el.eidx); } } if (eidxWithoutCoords.length === 0) return; diff --git a/src/ai/researcher/parser.ts b/src/ai/researcher/parser.ts index 0bfb09f..f6cab4d 100644 --- a/src/ai/researcher/parser.ts +++ b/src/ai/researcher/parser.ts @@ -64,6 +64,9 @@ export function mapRowToElement(row: Record): ResearchElement | let eidxRaw = (colMap.eidx || '').trim(); if (eidxRaw && /^\d+$/.test(eidxRaw)) eidxRaw = `e${eidxRaw}`; + if (eidxRaw && !/^e\d+$/i.test(eidxRaw)) { + eidxRaw = ''; + } const aria = parseAriaLocator(colMap.aria || '-'); diff --git a/src/config.ts b/src/config.ts index f1b65d2..105677e 100644 --- a/src/config.ts +++ b/src/config.ts @@ -266,6 +266,7 @@ export class ConfigParser { private static instance: ConfigParser; private config: ExplorbotConfig | null = null; private configPath: string | null = null; + private runtimeBaseUrlOverride: string | null = null; private constructor() {} @@ -285,8 +286,9 @@ export class ConfigParser { public async loadConfig(options?: { config?: string; path?: string; + baseUrl?: string; }): Promise { - if 
(this.config && !options?.config && !options?.path) { + if (this.config && !options?.config && !options?.path && this.runtimeBaseUrlOverride === (options?.baseUrl || null)) { return this.config; } @@ -317,7 +319,8 @@ export class ConfigParser { throw new Error('Configuration file is empty or invalid'); } - this.config = this.resolveConfig(loadedConfig as ExplorbotConfig); + this.config = this.resolveConfig(loadedConfig as ExplorbotConfig, options); + this.runtimeBaseUrlOverride = options?.baseUrl || null; this.configPath = resolvedPath; log(`Configuration loaded from: ${resolvedPath}`); @@ -372,6 +375,7 @@ export class ConfigParser { if (ConfigParser.instance) { ConfigParser.instance.config = null; ConfigParser.instance.configPath = null; + ConfigParser.instance.runtimeBaseUrlOverride = null; } } @@ -455,11 +459,17 @@ export class ConfigParser { } } - private resolveConfig(config: ExplorbotConfig): ExplorbotConfig { + private resolveConfig(config: ExplorbotConfig, options?: { baseUrl?: string }): ExplorbotConfig { if (config.web?.url && !config.playwright?.url) { config.playwright = config.playwright || { browser: 'chromium', url: '' }; config.playwright.url = config.web.url; } + + if (options?.baseUrl) { + config.playwright = config.playwright || { browser: 'chromium', url: '' }; + config.playwright.url = options.baseUrl; + } + return config; } diff --git a/src/explorbot.ts b/src/explorbot.ts index e377b5b..e6ea64a 100644 --- a/src/explorbot.ts +++ b/src/explorbot.ts @@ -34,6 +34,7 @@ import { sanitizeFilename } from './utils/strings.ts'; export interface ExplorBotOptions { from?: string; + baseUrl?: string; verbose?: boolean; config?: string; path?: string; diff --git a/src/state-manager.ts b/src/state-manager.ts index 0fbb4a2..c9dc246 100644 --- a/src/state-manager.ts +++ b/src/state-manager.ts @@ -547,6 +547,10 @@ export class StateManager { } export function normalizeUrl(url: string): string { + if (url.startsWith('/')) { + return url.replace(/^\/+/, 
'').replace(/\/+$/g, ''); + } + try { const parsed = new URL(url, 'http://localhost'); const path = parsed.pathname.replace(/^\/+|\/+$/g, ''); diff --git a/src/utils/url-matcher.ts b/src/utils/url-matcher.ts index de73d06..7198a3c 100644 --- a/src/utils/url-matcher.ts +++ b/src/utils/url-matcher.ts @@ -82,10 +82,13 @@ export function matchesUrl(pattern: string, path: string): boolean { } export function extractStatePath(url: string): string { - if (url.startsWith('/')) return url; + if (url.startsWith('/')) { + return `/${url.replace(/^\/+/, '')}`; + } try { const urlObj = new URL(url); - return `${urlObj.pathname}${urlObj.search}${urlObj.hash}`; + const normalizedPathname = `/${urlObj.pathname.replace(/^\/+/, '')}`; + return `${normalizedPathname}${urlObj.search}${urlObj.hash}`; } catch { return url; } diff --git a/src/utils/web-element.ts b/src/utils/web-element.ts index 7e0e593..83beb39 100644 --- a/src/utils/web-element.ts +++ b/src/utils/web-element.ts @@ -122,7 +122,8 @@ export class WebElement { } static async fromEidxList(page: any, eidxList: string[]): Promise { - if (eidxList.length === 0) return []; + const validEidxList = eidxList.filter((eidx) => /^e\d+$/i.test(eidx)); + if (validEidxList.length === 0) return []; const rawList: RawElementData[] = await page.evaluate( ([list, extractFnStr, config]: [string[], string, ElementExtractionConfig]) => { @@ -136,7 +137,7 @@ export class WebElement { } return results; }, - [eidxList, getElementDataExtractorSource(), ELEMENT_EXTRACTION_CONFIG] as [string[], string, ElementExtractionConfig] + [validEidxList, getElementDataExtractorSource(), ELEMENT_EXTRACTION_CONFIG] as [string[], string, ElementExtractionConfig] ); return rawList.map((d) => WebElement.fromRawData(d)); diff --git a/tests/unit/config.test.ts b/tests/unit/config.test.ts new file mode 100644 index 0000000..1bfb587 --- /dev/null +++ b/tests/unit/config.test.ts @@ -0,0 +1,42 @@ +import { beforeEach, describe, expect, it } from 'bun:test'; +import { 
ConfigParser } from '../../src/config.ts'; + +describe('ConfigParser runtime baseUrl overrides', () => { + beforeEach(() => { + ConfigParser.resetForTesting(); + }); + + it('reloads config when runtime baseUrl override changes', async () => { + const parser = ConfigParser.getInstance(); + const originalLoadConfigModule = (parser as any).loadConfigModule; + const originalFindConfigFile = (parser as any).findConfigFile; + + (parser as any).findConfigFile = () => '/virtual/explorbot.config.ts'; + (parser as any).loadConfigModule = async () => ({ + default: { + playwright: { + url: 'https://default.example.com', + browser: 'chromium', + }, + ai: { + model: { modelId: 'test-model', provider: 'test' }, + config: {}, + }, + }, + }); + + try { + const first = await parser.loadConfig({ baseUrl: 'https://one.example.com' }); + const second = await parser.loadConfig({ baseUrl: 'https://two.example.com' }); + const fallback = await parser.loadConfig(); + + expect(first.playwright.url).toBe('https://one.example.com'); + expect(second.playwright.url).toBe('https://two.example.com'); + expect(fallback.playwright.url).toBe('https://default.example.com'); + } finally { + (parser as any).loadConfigModule = originalLoadConfigModule; + (parser as any).findConfigFile = originalFindConfigFile; + ConfigParser.resetForTesting(); + } + }); +}); diff --git a/tests/unit/doc-collector.test.ts b/tests/unit/doc-collector.test.ts new file mode 100644 index 0000000..ee30b7b --- /dev/null +++ b/tests/unit/doc-collector.test.ts @@ -0,0 +1,247 @@ +import { describe, expect, it } from 'bun:test'; +import { DocBot } from '../../boat/doc-collector/src/docbot.ts'; +import { Documentarian } from '../../boat/doc-collector/src/ai/documentarian.ts'; +import { normalizeAction, renderPageDocumentation, renderSpecIndex } from '../../boat/doc-collector/src/docs-renderer.ts'; +import { getDocPageKey, shouldCrawlDocPath } from '../../boat/doc-collector/src/path-filter.ts'; +import { 
extractResearchNavigationTargets } from '../../boat/doc-collector/src/research-navigation.ts'; + +describe('doc-collector path filter', () => { + it('allows regular documentation pages', () => { + expect(shouldCrawlDocPath('/users/sign_in')).toBe(true); + expect(shouldCrawlDocPath('/users/sign_up')).toBe(true); + expect(shouldCrawlDocPath('/users/password/new')).toBe(true); + expect(shouldCrawlDocPath('/users/sso')).toBe(true); + expect(shouldCrawlDocPath('/users/auth/google_oauth2')).toBe(true); + }); + + it('skips callback and destructive endpoints', () => { + expect(shouldCrawlDocPath('/users/auth/github/callback')).toBe(false); + expect(shouldCrawlDocPath('/logout')).toBe(false); + }); + + it('supports config-driven include and exclude path policies', () => { + expect( + shouldCrawlDocPath('/admin/users', { + docs: { + excludePaths: ['/admin/*'], + }, + }) + ).toBe(false); + + expect( + shouldCrawlDocPath('/admin/users', { + docs: { + includePaths: ['/admin/*'], + }, + }) + ).toBe(true); + }); + + it('generalizes dynamic pages into one crawl key by default', () => { + expect(getDocPageKey('/users/123')).toBe(getDocPageKey('/users/456')); + expect(getDocPageKey('/users/123/edit')).toBe(getDocPageKey('/users/456/edit')); + }); + + it('can keep dynamic pages separate when configured', () => { + expect( + getDocPageKey('/users/123', { + docs: { + collapseDynamicPages: false, + }, + }) + ).toBe('users/123'); + }); +}); + +describe('doc-collector research navigation', () => { + it('extracts openapi tag targets from navigation and menu sections', () => { + const research = ` +## Navigation + +| Element | Type | ARIA | CSS | +|------|------|------|------| +| 'Project / Analytics / Tags' | button | { role: 'button', text: 'Project / Analytics / Tags Open Group' } | 'button[id="api-1/tag/project-analytics-tags"]' | +| 'Project / Analytics / Labels' | button | { role: 'button', text: 'Project / Analytics / Labels Open Group' } | 'button:has-text("Project / Analytics / 
Labels")' | +| 'Shows linked issues from jira statistics for a project' | button | { role: 'button', text: 'Shows linked issues from jira statistics for a project' } | 'button:has-text("Shows linked issues")' | + +## Menu + +| Element | Type | ARIA | CSS | +|------|------|------|------| +| 'Show More' | button | { role: 'button', text: 'Show all Project / Analytics / Jira endpoints' } | 'button[id="api-1/tag/project-analytics-jira"]' | +`; + + expect( + extractResearchNavigationTargets( + { + url: '/docs/openapi#tag/project-analytics-tests', + }, + research + ) + ).toEqual([ + '/docs/openapi#tag/project-analytics-tags', + '/docs/openapi#tag/project-analytics-labels', + '/docs/openapi#tag/project-analytics-jira', + ]); + }); +}); + +describe('doc-collector renderer', () => { + it('renders page documentation in spec format', () => { + const markdown = renderPageDocumentation( + { + url: '/users/sign_in', + title: 'Testomat.io', + }, + { + summary: 'Sign in page for existing users', + can: [ + { + action: 'user can sign in with email and password', + scope: 'page-level', + evidence: 'Email and password fields plus submit button are visible', + }, + ], + might: [ + { + action: 'use social login', + scope: 'one item', + evidence: 'OAuth buttons are shown in the form', + }, + ], + } + ); + + expect(markdown).toContain('## Purpose'); + expect(markdown).toContain('- user can sign in with email and password -> page-level'); + expect(markdown).toContain('Proof: Email and password fields plus submit button are visible.'); + expect(markdown).toContain('- user might use social login -> one item'); + expect(markdown).toContain('Signal: OAuth buttons are shown in the form.'); + }); + + it('renders aggregate spec index with skipped pages', () => { + const markdown = renderSpecIndex( + 'D:/project/output/docs', + '/users/sign_in', + [ + { + url: '/users/sign_in', + title: 'Testomat.io', + summary: 'Sign in page', + canCount: 7, + mightCount: 1, + canActions: ['user can sign in with 
email and password'], + mightActions: ['user might use social login'], + filePath: 'D:/project/output/docs/pages/users_sign_in.md', + }, + ], + [ + { + url: '/users/auth/google_oauth2', + reason: 'redirected into external auth flow', + }, + ], + 20 + ); + + expect(markdown).toContain('## Overview'); + expect(markdown).toContain('### [/users/sign_in](pages/users_sign_in.md)'); + expect(markdown).toContain('Proven actions: 7'); + expect(markdown).toContain('User Can:'); + expect(markdown).toContain('- user can sign in with email and password'); + expect(markdown).toContain('User Might:'); + expect(markdown).toContain('- user might use social login'); + expect(markdown).toContain('## Skipped'); + expect(markdown).toContain('/users/auth/google_oauth2. Reason: redirected into external auth flow.'); + }); + + it('normalizes might-actions without duplicating prefixes', () => { + expect(normalizeAction('user might be able to submit the login form by pressing Enter', 'might')).toBe('user might be able to submit the login form by pressing Enter'); + expect(normalizeAction('user can submit the login form by pressing Enter', 'might')).toBe('user might submit the login form by pressing Enter'); + }); + +}); + +describe('doc-collector scope and signal', () => { + it('keeps subtree scope around the start page', () => { + const bot = new DocBot(); + (bot as any).config = { docs: { scope: 'subtree' } }; + (bot as any).scopeRoot = '/ua/serials/stb/kod'; + + expect((bot as any).isInScope('/ua/serials/stb/kod/2026')).toBe(true); + expect((bot as any).isInScope('/ua/serials/stb/kod/2026/seriya-1')).toBe(true); + expect((bot as any).isInScope('/ua/person/actor')).toBe(false); + expect((bot as any).isInScope('/ua/faq')).toBe(false); + }); + + it('marks pages with weak docs and few controls as low-signal', () => { + const bot = new DocBot(); + (bot as any).config = { docs: { minCanActions: 1, minInteractiveElements: 3 } }; + + expect( + (bot as any).getLowSignalReason( + { summary: 'The 
page currently loads with no visible content.', can: [], might: [] }, + '* Content (0 elements) `main`\n\nChars: 120' + ) + ).toContain('low-signal page'); + }); + + it('keeps pages with proven actions out of low-signal skip', () => { + const bot = new DocBot(); + (bot as any).config = { docs: { minCanActions: 1, minInteractiveElements: 3 } }; + + expect( + (bot as any).getLowSignalReason( + { summary: 'Serial details page.', can: [{ action: 'watch episode', scope: 'one item', evidence: 'episode links visible' }], might: [] }, + '* Episodes (10 elements) `.tp-show__list`\n\nChars: 1200' + ) + ).toBeNull(); + }); +}); + +describe('documentarian fallback', () => { + it('retries with sanitized research after JSON generation failure', async () => { + const calls: string[] = []; + const provider = { + async generateObject(messages: Array<{ role: string; content: string }>) { + calls.push(messages[1].content); + if (calls.length === 1) { + throw new Error('Failed to generate JSON. Please adjust your prompt. 
See failed_generation for more details.'); + } + return { + object: { + summary: 'Episode page', + can: [ + { + action: 'user can watch the episode', + scope: 'one item', + evidence: 'Video player is visible', + }, + ], + might: [], + }, + }; + }, + } as any; + + const documentarian = new Documentarian(provider, {}); + const result = await documentarian.document( + { + url: '/ua/serials/stb/kod', + title: 'K.O.D.', + }, + `## Content + +| Element | Type | ARIA | CSS | Coordinates | +|------|------|------|------|------| +| 'Play button' | link | { role: 'link', text: 'play' } | 'a.about-project__play' | (468, 537) | +| 'Broken row' | link | - | 2026' } | 'a[href="/ua/serials/stb/kod/2026"]' | +` + ); + + expect(result.summary).toBe('Episode page'); + expect(result.can).toHaveLength(1); + expect(calls).toHaveLength(2); + expect(calls[1]).toContain(''); + }); +}); diff --git a/tests/unit/research-parser.test.ts b/tests/unit/research-parser.test.ts index 960beb0..9cd9cb7 100644 --- a/tests/unit/research-parser.test.ts +++ b/tests/unit/research-parser.test.ts @@ -144,4 +144,19 @@ describe('formatResearchSummary', () => { const summary = formatResearchSummary(md); expect(summary).toContain('* Section (1 element)'); }); + + it('drops invalid non-eidx values from the eidx column', () => { + const md = dedent` + ## Navigation + + | Element | Type | ARIA | CSS | eidx | + |------|------|------|------|------| + | 'Year link' | link | { role: 'link', text: '2026' } | 'a[href*="/kod/2026"]' | 'a[href*="/kod/2026"]' | + | 'Episode link' | link | { role: 'link', text: 'Episode 1' } | 'a[href*="/episode-1"]' | 42 | + `; + + const sections = parseResearchSections(md); + expect(sections[0].elements[0].eidx).toBeNull(); + expect(sections[0].elements[1].eidx).toBe('e42'); + }); }); diff --git a/tests/unit/url-matcher.test.ts b/tests/unit/url-matcher.test.ts index 5229476..71128fb 100644 --- a/tests/unit/url-matcher.test.ts +++ b/tests/unit/url-matcher.test.ts @@ -1,5 +1,6 @@ import { 
beforeEach, describe, expect, it } from 'bun:test'; import { ConfigParser } from '../../src/config'; +import { normalizeUrl } from '../../src/state-manager'; import { extractStatePath, generalizeSegment, generalizeUrl, hasDynamicUrlSegment, isDynamicSegment, matchesUrl } from '../../src/utils/url-matcher'; describe('url-matcher', () => { @@ -166,12 +167,27 @@ describe('url-matcher', () => { expect(extractStatePath('/dashboard')).toBe('/dashboard'); }); + it('collapses repeated leading slashes for path-like URLs', () => { + expect(extractStatePath('///series/page/57/')).toBe('/series/page/57/'); + }); + it('strips host from absolute URL, keeps hash', () => { expect(extractStatePath('https://example.com/page#section')).toBe('/page#section'); }); + it('collapses repeated leading slashes in absolute URL paths', () => { + expect(extractStatePath('https://example.com///series/page/57/')).toBe('/series/page/57/'); + }); + it('returns original string when URL is unparseable', () => { expect(extractStatePath('not a url')).toBe('not a url'); }); }); + + describe('normalizeUrl', () => { + it('treats repeated leading slashes as a relative path, not a protocol-relative URL', () => { + expect(normalizeUrl('///series/page/57/')).toBe('series/page/57'); + expect(normalizeUrl('/series/page/57/')).toBe('series/page/57'); + }); + }); }); From e1a355a3297d8e1bd22444b52fb5cdf1437381e2 Mon Sep 17 00:00:00 2001 From: Denys Kuchma Date: Mon, 11 May 2026 21:44:15 +0300 Subject: [PATCH 2/2] fix format --- tests/unit/doc-collector.test.ts | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/tests/unit/doc-collector.test.ts b/tests/unit/doc-collector.test.ts index ee30b7b..25f3b37 100644 --- a/tests/unit/doc-collector.test.ts +++ b/tests/unit/doc-collector.test.ts @@ -78,11 +78,7 @@ describe('doc-collector research navigation', () => { }, research ) - ).toEqual([ - '/docs/openapi#tag/project-analytics-tags', - '/docs/openapi#tag/project-analytics-labels', - 
'/docs/openapi#tag/project-analytics-jira', - ]); + ).toEqual(['/docs/openapi#tag/project-analytics-tags', '/docs/openapi#tag/project-analytics-labels', '/docs/openapi#tag/project-analytics-jira']); }); }); @@ -159,7 +155,6 @@ describe('doc-collector renderer', () => { expect(normalizeAction('user might be able to submit the login form by pressing Enter', 'might')).toBe('user might be able to submit the login form by pressing Enter'); expect(normalizeAction('user can submit the login form by pressing Enter', 'might')).toBe('user might submit the login form by pressing Enter'); }); - }); describe('doc-collector scope and signal', () => { @@ -178,24 +173,14 @@ describe('doc-collector scope and signal', () => { const bot = new DocBot(); (bot as any).config = { docs: { minCanActions: 1, minInteractiveElements: 3 } }; - expect( - (bot as any).getLowSignalReason( - { summary: 'The page currently loads with no visible content.', can: [], might: [] }, - '* Content (0 elements) `main`\n\nChars: 120' - ) - ).toContain('low-signal page'); + expect((bot as any).getLowSignalReason({ summary: 'The page currently loads with no visible content.', can: [], might: [] }, '* Content (0 elements) `main`\n\nChars: 120')).toContain('low-signal page'); }); it('keeps pages with proven actions out of low-signal skip', () => { const bot = new DocBot(); (bot as any).config = { docs: { minCanActions: 1, minInteractiveElements: 3 } }; - expect( - (bot as any).getLowSignalReason( - { summary: 'Serial details page.', can: [{ action: 'watch episode', scope: 'one item', evidence: 'episode links visible' }], might: [] }, - '* Episodes (10 elements) `.tp-show__list`\n\nChars: 1200' - ) - ).toBeNull(); + expect((bot as any).getLowSignalReason({ summary: 'Serial details page.', can: [{ action: 'watch episode', scope: 'one item', evidence: 'episode links visible' }], might: [] }, '* Episodes (10 elements) `.tp-show__list`\n\nChars: 1200')).toBeNull(); }); });