Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
"zod": "^4.3.6"
},
"devDependencies": {
"concurrently": "^10.0.0",
"@commitlint/cli": "^20.4.4",
"@commitlint/config-conventional": "^20.4.4",
"@playwright/test": "^1.58.2",
Expand All @@ -50,9 +49,11 @@
"@types/sanitize-filename": "^1.1.28",
"@vitest/coverage-v8": "^4.0.18",
"@vitest/ui": "^4.0.18",
"concurrently": "^10.0.0",
"esbuild": "^0.27.4",
"husky": "^9.1.7",
"lint-staged": "^17.0.5",
"madge": "^8.0.0",
"markdown-toc": "^1.2.0",
"msw": "^2.12.10",
"oxfmt": "^0.32.0",
Expand Down
840 changes: 840 additions & 0 deletions pnpm-lock.yaml

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions src/ai/cross-encoder.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/**
 * Reranker stub: exposes the reranking entry point but applies no scoring,
 * so the passage order is left exactly as received.
 */
export class CrossEncoderReranker {
  /**
   * Returns the passages in their incoming order.
   *
   * @param _query - Search query; not used by this implementation.
   * @param passages - Candidate passages.
   * @returns The very same `passages` array, unmodified.
   */
  async rerank(_query: string, passages: string[]): Promise<string[]> {
    const ranked = passages
    return ranked
  }
}
16 changes: 16 additions & 0 deletions src/ai/rag-types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/** A fact string paired with a list of numeric citations. */
export interface ExtractedFact {
/** Text of the fact. */
fact: string
// NOTE(review): presumably indices or ids of the cited sources — confirm against the producer.
citations: number[]
}

/**
 * Immutable value object bundling the original query, the HyDE document and
 * the pipeline mode, plus the search limit derived from that mode.
 */
export class PipelinePlan {
  constructor(
    public readonly originalQuery: string,
    public readonly hydeDocument: string,
    public readonly mode: string
  ) {}

  /** Search limit for this plan: 50 when `mode` is exactly `'exhaustive'`, 20 for any other mode. */
  get searchLimit(): number {
    if (this.mode === 'exhaustive') {
      return 50
    }
    return 20
  }
}
116 changes: 116 additions & 0 deletions src/export/export-orchestrator.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import { join } from 'node:path'
import { writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'node:fs'
import { pathToFileURL } from 'node:url'
import { type Config } from '../utils/config.js'
import type { ExtractedConversation } from '../scraper/conversation-extractor.js'
import { sanitizeFilename, sanitizeSpaceName } from './sanitizer.js'
import { type ExportStrategy } from '../exporters/export.strategy.js'
import { logger } from '../utils/logger.js'

/**
 * Writes extracted conversations to disk with every export strategy enabled in the config.
 *
 * Strategies are discovered at runtime from the sibling `exporters` directory
 * (`*.strategy.ts` / `*.strategy.js` modules exposing a default export) and kept only
 * when their `name` is listed in `config.exportStrategies`. Call `initialize()` before
 * exporting: until then no strategy is registered and `exportConversation` writes nothing.
 */
export class ExportOrchestrator {
  /** Thrown by `exportConversation` when none of the registered strategies produced a file. */
  static readonly WriteError = class extends Error {
    constructor(message: string) {
      super(message)
      this.name = 'FileWriteError'
    }
  }

  private strategies: ExportStrategy[] = []

  /**
   * @param config - Application config. `exportDir` is created immediately if it does not exist.
   */
  constructor(private readonly config: Config) {
    this.ensureRootExportDirectoryExists()
  }

  /** Discovers and registers the configured export strategies. Safe to call more than once. */
  async initialize(): Promise<void> {
    await this.initializeStrategies()
  }

  /**
   * Rebuilds the strategy list from the exporters directory, falling back to the
   * built-in markdown strategy whenever nothing could be registered.
   */
  private async initializeStrategies(): Promise<void> {
    // Start from a clean slate so a repeated initialize() cannot register strategies twice.
    this.strategies = []

    const strategiesDir = join(import.meta.dirname, '..', 'exporters')
    if (existsSync(strategiesDir)) {
      await this.discoverStrategies(strategiesDir)
    } else {
      // Keep going instead of returning: the markdown fallback below is a static import
      // and may still resolve even when the directory cannot be scanned.
      logger.warn(`Exporters directory not found: ${strategiesDir}`)
    }

    // Surface names that were configured but matched no module (typically a typo);
    // the total-miss case is already covered by the fallback warning below.
    if (this.strategies.length > 0) {
      const registeredNames = new Set(this.strategies.map((registered) => registered.name))
      const unresolved = this.config.exportStrategies.filter((name) => !registeredNames.has(name))
      if (unresolved.length > 0) {
        const quoted = unresolved.map((name) => `"${name}"`).join(', ')
        logger.warn(`Configured export strategies not found: ${quoted}`)
      }
    }

    if (this.strategies.length === 0) {
      logger.warn('No active export strategies found. Defaulting to markdown.')
      try {
        const markdownStrategy = (await import('../exporters/markdown.strategy.js')).default
        this.strategies.push(markdownStrategy)
      } catch (e) {
        logger.error('Failed to load default markdown strategy', e)
      }
    }
  }

  /** True for file names that follow the `<name>.strategy.ts|js` module convention. */
  private static isStrategyModule(file: string): boolean {
    // A name with either suffix can never end in `.d.ts`, so no declaration-file check is needed.
    return file.endsWith('.strategy.ts') || file.endsWith('.strategy.js')
  }

  /**
   * Imports every strategy module in `strategiesDir` and registers those enabled in the config.
   * A module that fails to load is logged and skipped; it never aborts discovery.
   */
  private async discoverStrategies(strategiesDir: string): Promise<void> {
    for (const file of readdirSync(strategiesDir)) {
      if (!ExportOrchestrator.isStrategyModule(file)) continue

      try {
        const moduleUrl = pathToFileURL(join(strategiesDir, file)).href
        const strategyModule = await import(moduleUrl)
        const strategy = strategyModule.default as ExportStrategy

        // Ignore modules whose default export does not look like a strategy.
        if (!strategy || !strategy.name || typeof strategy.format !== 'function') continue
        if (!this.config.exportStrategies.includes(strategy.name)) continue

        // The same strategy can be present as both .ts and .js (sources next to build
        // output); registering it twice would write every conversation twice.
        if (this.strategies.some((registered) => registered.name === strategy.name)) {
          logger.debug(`Skipping duplicate export strategy "${strategy.name}" from ${file}`)
          continue
        }

        this.strategies.push(strategy)
        logger.debug(`Registered export strategy: ${strategy.name}`)
      } catch (error) {
        logger.error(`Failed to load export strategy ${file}: ${error}`)
      }
    }
  }

  /**
   * Exports one conversation with every registered strategy.
   *
   * A failing strategy is logged and skipped so the remaining ones still run.
   *
   * @param conversation - The conversation to serialize.
   * @returns Paths of the files that were written, in strategy order.
   * @throws ExportOrchestrator.WriteError when strategies are registered but none wrote a file.
   */
  async exportConversation(conversation: ExtractedConversation): Promise<string[]> {
    const writtenFiles: string[] = []

    for (const strategy of this.strategies) {
      try {
        writtenFiles.push(this.writeWithStrategy(strategy, conversation))
      } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error)
        logger.error(`Failed to export with ${strategy.name} for ${conversation.id}: ${errorMessage}`)
      }
    }

    if (writtenFiles.length === 0 && this.strategies.length > 0) {
      throw new ExportOrchestrator.WriteError(
        `Failed to write conversation ${conversation.id} with any strategy.`
      )
    }

    return writtenFiles
  }

  /**
   * Formats and writes the conversation to `<outputDir>/<space>/<title> (<id)><ext>`.
   *
   * @returns The path of the written file.
   * @throws Error when the file is missing or empty after the write.
   */
  private writeWithStrategy(strategy: ExportStrategy, conversation: ExtractedConversation): string {
    const spaceSpecificDirectory = join(
      strategy.outputDir(this.config),
      sanitizeSpaceName(conversation.spaceName)
    )
    if (!existsSync(spaceSpecificDirectory)) {
      mkdirSync(spaceSpecificDirectory, { recursive: true })
    }

    const safeFileTitle = sanitizeFilename(conversation.title)
    const fileName = `${safeFileTitle} (${conversation.id})${strategy.fileExtension}`
    const destinationFilePath = join(spaceSpecificDirectory, fileName)

    writeFileSync(destinationFilePath, strategy.format(conversation), 'utf-8')

    // Verify the write: a missing or zero-byte file counts as a failed export.
    if (!existsSync(destinationFilePath) || statSync(destinationFilePath).size === 0) {
      throw new Error(`Exported file is missing or empty: ${destinationFilePath}`)
    }

    return destinationFilePath
  }

  /** Creates `config.exportDir` (including parents) when it does not exist yet. */
  private ensureRootExportDirectoryExists(): void {
    if (!existsSync(this.config.exportDir)) {
      mkdirSync(this.config.exportDir, { recursive: true })
    }
  }
}
30 changes: 30 additions & 0 deletions src/exporters/custom.strategy.ts.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copy this file to src/exporters/csv.strategy.ts
// Then add "csv" to EXPORT_STRATEGIES in your .env
//
// The `name` field here MUST match what you put in EXPORT_STRATEGIES.

import type { ExportStrategy } from './export.strategy.js'
import type { ExtractedConversation } from '../scraper/conversation-extractor.js'
import type { Config } from '../utils/config.js'

/** Example strategy: serializes a conversation as a two-column CSV (role, content). */
const exporter: ExportStrategy = {
  name: 'csv',
  fileExtension: '.csv',
  // outputDir: decides where the files are written.
  // Returning config.exportDir shares the default exports folder;
  // return any other path to keep this format in its own directory.
  outputDir: (config: Config): string => config.exportDir,
  // format: turns the fully extracted conversation into the file's text.
  // The result is written to: outputDir / spaceName / title (id).csv
  format(conversation: ExtractedConversation): string {
    const lines = ['role,content']
    for (const message of conversation.messages) {
      // CSV escaping: double every embedded quote, then wrap the field in quotes.
      const escapedContent = message.content.replace(/"/g, '""')
      lines.push(`${message.role},"${escapedContent}"`)
    }
    return lines.join('\n')
  },
}

export default exporter
12 changes: 12 additions & 0 deletions src/exporters/export.strategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import type { ExtractedConversation } from '../scraper/conversation-extractor.js'
import type { Config } from '../utils/config.js'

/**
 * Contract for a pluggable export format.
 * A strategy module provides an object of this shape as its default export.
 */
export interface ExportStrategy {
/** Must match exactly what you put in EXPORT_STRATEGIES */
name: string
/** Appended as-is to the output file name, so include the leading dot (e.g. '.md', '.csv'). */
fileExtension: string
/** Where to write output files. Return config.exportDir as the safe default. */
outputDir(config: Config): string
/** Serialize the conversation. Return a string (UTF-8). */
format(conversation: ExtractedConversation): string
}
21 changes: 21 additions & 0 deletions src/exporters/markdown.strategy.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import type { ExportStrategy } from './export.strategy.js'
import type { ExtractedConversation } from '../scraper/conversation-extractor.js'
import type { Config } from '../utils/config.js'

/** Built-in strategy: renders a conversation as markdown with a title and metadata header. */
const exporter: ExportStrategy = {
  name: 'markdown',
  fileExtension: '.md',
  /** Writes into the shared export directory. */
  outputDir(config: Config): string {
    return config.exportDir
  },
  /**
   * Builds `# title`, a Space/ID/Date metadata block, then the conversation content.
   *
   * @param conversation - The conversation to render.
   * @returns The complete markdown document.
   */
  format(conversation: ExtractedConversation): string {
    const { timestamp } = conversation
    // toISOString() throws RangeError on an Invalid Date (e.g. built from an unparsable
    // raw timestamp); degrade to a placeholder rather than losing the whole export.
    const isoDate = Number.isNaN(timestamp.getTime()) ? 'Unknown' : timestamp.toISOString()

    const headerTitle = `# ${conversation.title}\n\n`
    const metadataBlock =
      `**Space:** ${conversation.spaceName} \n` +
      `**ID:** ${conversation.id} \n` +
      `**Date:** ${isoDate} \n\n`
    return headerTitle + metadataBlock + conversation.content
  },
}

export default exporter
42 changes: 33 additions & 9 deletions src/scraper/conversation-extractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,19 @@ import { waitStrategy } from '../utils/wait-strategy.js'
import { ApiDiagnosticsWriter } from '../utils/api-diagnostics.js'
import { type Config } from '../utils/config.js'

/** One turn of a conversation: either the user's question or the assistant's answer. */
export interface ConversationMessage {
/** Who produced the message. */
role: 'user' | 'assistant'
/** Message text. */
content: string
}

/** Result of parsing one conversation thread, in both rendered and structured form. */
export interface ExtractedConversation {
id: string
/** Hash computed over the thread's validated API entries. */
contentHash: string
title: string
/** Title of the collection the thread belongs to; the extractor falls back to 'General'. */
spaceName: string
/** Built from the API's raw timestamp, or the current time when none is present. */
timestamp: Date
/** Markdown rendering of `messages`. */
content: string
/** User/assistant messages parsed from the thread entries, in entry order. */
messages: ConversationMessage[]
}

export class ConversationExtractor {
Expand Down Expand Up @@ -333,10 +339,11 @@ export class ConversationExtractor {
const spaceName = firstEntry.collection_info?.title ?? collectionTitleFromData ?? 'General'
const timestamp = this.extractTimestamp(firstEntry, apiData)
const contentHash = this.hashEntries(validatedEntries)
const markdownContent = this.convertEntriesToMarkdown(validatedEntries, title)
const messages = this.parseMessages(validatedEntries, title)
const markdownContent = this.convertMessagesToMarkdown(messages)

if (!markdownContent) {
logger.warn(`Thread has empty content after formatting: ${conversationUrl}`)
if (!markdownContent && messages.length === 0) {
logger.warn(`Thread has no content or messages: ${conversationUrl}`)
return null
}

Expand All @@ -347,6 +354,7 @@ export class ConversationExtractor {
timestamp,
content: markdownContent,
contentHash,
messages,
}
} catch (error) {
errorBus.emitError('Failed to parse conversation data.', error)
Expand Down Expand Up @@ -376,13 +384,17 @@ export class ConversationExtractor {
return rawTimestamp ? new Date(rawTimestamp) : new Date()
}

private convertEntriesToMarkdown(entries: unknown[], threadTitle: string): string {
let markdown = ''
private parseMessages(entries: unknown[], threadTitle: string): ConversationMessage[] {
const messages: ConversationMessage[] = []
const typedEntries = entries as any[]

for (let i = 0; i < typedEntries.length; i++) {
const entry = typedEntries[i]
let question = entry.query_str ?? (i === 0 ? threadTitle : 'Follow‑up')
const question = entry.query_str ?? (i === 0 ? threadTitle : 'Follow‑up')

if (question) {
messages.push({ role: 'user', content: question })
}

let answer = ''
for (const block of entry.blocks ?? []) {
Expand All @@ -391,11 +403,23 @@ export class ConversationExtractor {
}
}

if (question) markdown += `## ${question}\n\n`
if (answer) markdown += `${answer.trim()}\n\n`
markdown += '---\n\n'
if (answer.trim()) {
messages.push({ role: 'assistant', content: answer.trim() })
}
}

return messages
}

/**
 * Renders messages as markdown: each user message becomes a `## ` heading, every
 * other message is emitted as body text followed by a `---` rule. Leading and
 * trailing whitespace of the final document is trimmed.
 *
 * @param messages - Messages in conversation order.
 * @returns The markdown document, or an empty string when there are no messages.
 */
private convertMessagesToMarkdown(messages: ConversationMessage[]): string {
  const sections = messages.map((message) =>
    message.role === 'user' ? `## ${message.content}\n\n` : `${message.content}\n\n---\n\n`
  )
  return sections.join('').trim()
}
}
4 changes: 2 additions & 2 deletions src/utils/api-diagnostics.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import fs from 'node:fs/promises'
import path from 'node:path'
import { join } from 'node:path'
import { logger } from './logger.js'
import type { Config } from './config.js'

Expand All @@ -26,7 +26,7 @@ export class ApiDiagnosticsWriter {
}

await fs.mkdir(this.DEBUG_DIRECTORY, { recursive: true })
const diagnosticLogPath = path.join(this.DEBUG_DIRECTORY, this.DIAGNOSTICS_FILENAME)
const diagnosticLogPath = join(this.DEBUG_DIRECTORY, this.DIAGNOSTICS_FILENAME)

const entryAsJsonLine = JSON.stringify(diagnosticEntry) + '\n'
await fs.appendFile(diagnosticLogPath, entryAsJsonLine, 'utf8')
Expand Down
7 changes: 6 additions & 1 deletion src/utils/config.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { config as loadEnv } from 'dotenv'
import { existsSync, mkdirSync } from 'node:fs'
import { dirname, join } from 'node:path'
import { join, dirname } from 'node:path'
import { z } from 'zod'
import { logger } from './logger.js'

Expand All @@ -24,6 +24,10 @@ const configSchema = z.object({
.transform((val) => val === 'true'),
headless: z.union([z.boolean(), z.literal('new')]),
debug: z.boolean(),
exportStrategies: z
.string()
.optional()
.transform((val) => (val ? val.split(',').map((s) => s.trim()) : ['markdown'])),
})

export type Config = z.infer<typeof configSchema>
Expand Down Expand Up @@ -61,6 +65,7 @@ function parseEnvConfig(): Config {
enableVectorSearch: process.env['ENABLE_VECTOR_SEARCH'],
headless: headless,
debug: process.env['DEBUG'] === 'true',
exportStrategies: process.env['EXPORT_STRATEGIES'],
}

const result = configSchema.safeParse(rawConfig)
Expand Down
6 changes: 3 additions & 3 deletions test/unit/api-diagnostics.unit.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { describe, it, expect, vi, beforeEach } from 'vitest'
import { ApiDiagnosticsWriter } from '../../src/utils/api-diagnostics.js'
import fs from 'node:fs/promises'
import path from 'node:path'
import { join } from 'node:path'

vi.mock('node:fs/promises')

Expand All @@ -25,7 +25,7 @@ describe('ApiDiagnosticsWriter (Unit)', () => {

expect(fs.mkdir).toHaveBeenCalledWith('debug', { recursive: true })
expect(fs.appendFile).toHaveBeenCalledWith(
path.join('debug', 'api-diagnostics.jsonl'),
join('debug', 'api-diagnostics.jsonl'),
expect.stringContaining('"url":"http://test.com"'),
'utf8'
)
Expand All @@ -42,7 +42,7 @@ describe('ApiDiagnosticsWriter (Unit)', () => {
await writer.writeFailure(entry)

expect(fs.appendFile).toHaveBeenCalledWith(
path.join('debug', 'api-diagnostics.jsonl'),
join('debug', 'api-diagnostics.jsonl'),
expect.stringContaining('"zodErrorPaths":["entries.0.title"]'),
'utf8'
)
Expand Down
Loading