diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index c7e8cd8..2426576 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -320,6 +320,12 @@ jobs: run: npm test -- --testPathPattern="wikipedia-download" --testTimeout=120000 timeout-minutes: 10 + - name: Run GitHub repository capture live integration tests (issue #5) + env: + GITHUB_REPOSITORY_INTEGRATION: 'true' + run: npm test -- --testPathPattern="github-readme" --testTimeout=120000 + timeout-minutes: 10 + - name: Build Docker image for e2e tests run: docker compose build timeout-minutes: 10 diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 628ad86..f53c264 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -182,6 +182,13 @@ jobs: run: cargo test --test integration wikipedia_download::live -- --nocapture timeout-minutes: 10 + - name: Run GitHub repository capture live integration tests (issue #5) + working-directory: rust + env: + GITHUB_REPOSITORY_INTEGRATION: '1' + run: cargo test --test integration github_repository::live -- --nocapture + timeout-minutes: 10 + # Build check - only runs when Rust code changes build: name: Rust - Build diff --git a/js/.changeset/github-repository-capture.md b/js/.changeset/github-repository-capture.md new file mode 100644 index 0000000..e99c0d4 --- /dev/null +++ b/js/.changeset/github-repository-capture.md @@ -0,0 +1,5 @@ +--- +'@link-assistant/web-capture': patch +--- + +Add compact GitHub repository capture for txt and markdown output, including repository metadata, the root file tree, and README content. diff --git a/js/README.md b/js/README.md index e3ed22e..5692896 100644 --- a/js/README.md +++ b/js/README.md @@ -52,6 +52,10 @@ web-capture https://example.com --format html -o page.html # Capture raw paste text web-capture https://xpaste.pro/p/t4q0Lsp0 --format txt -o paste.txt +# Capture a GitHub repository as compact text or Markdown +web-capture https://github.com/link-assistant/web-capture --format txt -o repository.txt +web-capture https://github.com/link-assistant/web-capture --format markdown -o repository.md + # Take a PNG screenshot web-capture https://example.com --format png -o screenshot.png @@ -153,6 +157,11 @@ containing `index.md`, `xpaste-pro-.md`, and `xpaste-pro-.txt`. Canonical `/p/`, `/p//raw`, `/ru/p/`, and `/en/p/` URLs are normalized before capture. +For plain GitHub repository URLs such as `https://github.com/owner/repo`, +`/markdown` returns a compact repository snapshot with repository metadata, the +root file tree, and README content. GitHub subpages continue through the regular +HTML-to-Markdown conversion path. + | Parameter | Required | Description | Default | | ------------------- | -------- | ------------------------------------------------------------------------- | -------- | | `url` | Yes | URL to fetch | - | @@ -171,6 +180,9 @@ Returns raw text content as a `.txt` attachment. xpaste.pro paste URLs are normalized to their `/raw` endpoint, including localized `/ru/p/` and `/en/p/` URLs. +Plain GitHub repository URLs return a compact `.txt` snapshot with repository +metadata, the root file tree, and README content. + | Parameter | Required | Description | Default | | --------- | -------- | ------------ | ------- | | `url` | Yes | URL to fetch | - | @@ -490,6 +502,9 @@ with environment variables: # Download the Wikipedia page (markdown + image) in every supported engine WIKIPEDIA_INTEGRATION=true npm test -- --testPathPattern="wikipedia-download" +# Download a GitHub repository page as compact txt/markdown, original HTML, and screenshots +GITHUB_REPOSITORY_INTEGRATION=true npm test -- --testPathPattern="github-readme" + # Habr articles and public Google Docs live suites HABR_INTEGRATION=true npm test -- --testPathPattern="habr-article" GDOCS_INTEGRATION=true npm test -- --testPathPattern="gdocs-public-doc" diff --git a/js/bin/web-capture.js b/js/bin/web-capture.js index 0d16497..54fa8c8 100755 --- a/js/bin/web-capture.js +++ b/js/bin/web-capture.js @@ -614,6 +614,12 @@ async function captureUrl(url, options) { captureGoogleDocWithBrowserOrFallback, selectGoogleDocsCaptureMethod, } = await import('../src/gdocs.js'); + const { + fetchGithubRepositorySnapshot, + formatGithubRepositoryMarkdown, + formatGithubRepositoryText, + isGithubRepositoryUrl, + } = await import('../src/github.js'); const normalizedFormat = format.toLowerCase(); log.debug(() => ({ @@ -921,15 +927,22 @@ async function captureUrl(url, options) { try { if (normalizedFormat === 'txt' || normalizedFormat === 'text') { - const response = await fetch(normalizeUrlForTextContent(absoluteUrl)); - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - const contentType = response.headers.get('content-type') || 'text/plain'; - if (!contentType.includes('text/')) { - throw new Error(`Expected text content, got ${contentType}`); + let text; + if (isGithubRepositoryUrl(absoluteUrl)) { + const snapshot = await fetchGithubRepositorySnapshot(absoluteUrl); + text = formatGithubRepositoryText(snapshot); + } else { + const response = await fetch(normalizeUrlForTextContent(absoluteUrl)); + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + const contentType = + response.headers.get('content-type') || 'text/plain'; + if (!contentType.includes('text/')) { + throw new Error(`Expected text content, got ${contentType}`); + } + text = await response.text(); } - const text = await response.text(); const output = explicitOutput === '-' ? null @@ -1151,17 +1164,27 @@ async function captureUrl(url, options) { console.error(`Archive saved to: ${outPath}`); } else if (normalizedFormat === 'markdown' || normalizedFormat === 'md') { // Markdown format — enhanced conversion is now the default - const html = await fetchHtml(absoluteUrl); - const { convertHtmlToMarkdownEnhanced } = await import('../src/lib.js'); - const result = convertHtmlToMarkdownEnhanced(html, absoluteUrl, { - extractLatex: options.extractLatex, - extractMetadata: options.extractMetadata, - postProcess: options.postProcess, - detectCodeLanguage: options.detectCodeLanguage, - contentSelector: options.contentSelector, - bodySelector: options.bodySelector, - }); - const markdown = result.markdown; + let markdown; + if ( + isGithubRepositoryUrl(absoluteUrl) && + !options.contentSelector && + !options.bodySelector + ) { + const snapshot = await fetchGithubRepositorySnapshot(absoluteUrl); + markdown = formatGithubRepositoryMarkdown(snapshot); + } else { + const html = await fetchHtml(absoluteUrl); + const { convertHtmlToMarkdownEnhanced } = await import('../src/lib.js'); + const result = convertHtmlToMarkdownEnhanced(html, absoluteUrl, { + extractLatex: options.extractLatex, + extractMetadata: options.extractMetadata, + postProcess: options.postProcess, + detectCodeLanguage: options.detectCodeLanguage, + contentSelector: options.contentSelector, + bodySelector: options.bodySelector, + }); + markdown = result.markdown; + } const output = explicitOutput === '-' diff --git a/js/src/github.js b/js/src/github.js new file mode 100644 index 0000000..261db52 --- /dev/null +++ b/js/src/github.js @@ -0,0 +1,285 @@ +import fetch from 'node-fetch'; +import { URL } from 'node:url'; + +const GITHUB_API_BASE = 'https://api.github.com'; +const GITHUB_USER_AGENT = 'web-capture'; + +export function parseGithubRepositoryUrl(url) { + try { + const parsed = new URL(url); + const host = parsed.hostname.toLowerCase(); + if (host !== 'github.com' && host !== 'www.github.com') { + return null; + } + + const parts = parsed.pathname.split('/').filter(Boolean); + if (parts.length !== 2) { + return null; + } + + const [owner, repo] = parts; + if (!owner || !repo) { + return null; + } + + return { + owner, + repo, + fullName: `${owner}/${repo}`, + htmlUrl: `https://github.com/${owner}/${repo}`, + }; + } catch { + return null; + } +} + +export function isGithubRepositoryUrl(url) { + return Boolean(parseGithubRepositoryUrl(url)); +} + +export function getGithubRepositoryTextFilename(url) { + const parsed = parseGithubRepositoryUrl(url); + return parsed ? `${parsed.owner}-${parsed.repo}.txt` : null; +} + +export async function fetchGithubRepositorySnapshot(url) { + const parsed = parseGithubRepositoryUrl(url); + if (!parsed) { + throw new Error(`Not a GitHub repository URL: ${url}`); + } + + const repository = await fetchGithubJson( + `${GITHUB_API_BASE}/repos/${encodePath(parsed.owner)}/${encodePath(parsed.repo)}` + ); + const defaultBranch = repository.default_branch || 'main'; + + const [readme, tree] = await Promise.all([ + fetchGithubReadme(parsed, defaultBranch), + fetchGithubRootTree(parsed, defaultBranch), + ]); + + return { + sourceUrl: parsed.htmlUrl, + repository, + defaultBranch, + readme, + tree, + }; +} + +export function formatGithubRepositoryMarkdown(snapshot) { + const { repository, defaultBranch, readme, tree, sourceUrl } = snapshot; + const lines = [ + `# ${repository.full_name}`, + '', + repository.description ? `> ${repository.description}` : null, + '', + '## Repository', + '', + `- URL: ${repository.html_url || sourceUrl}`, + `- Default branch: \`${defaultBranch}\``, + repository.language ? `- Primary language: ${repository.language}` : null, + numberLine('Stars', repository.stargazers_count), + numberLine('Forks', repository.forks_count), + numberLine('Open issues', repository.open_issues_count), + repository.license?.spdx_id + ? `- License: ${repository.license.spdx_id}` + : null, + topicsLine(repository.topics), + '', + '## Files', + '', + ].filter((line) => line !== null); + + if (tree.length > 0) { + for (const item of tree) { + const label = item.type === 'dir' ? `${item.name}/` : item.name; + const suffix = + item.type === 'file' && typeof item.size === 'number' + ? ` (${formatBytes(item.size)})` + : ''; + lines.push(`- [${label}](${item.html_url})${suffix}`); + } + } else { + lines.push('- No root files returned by the GitHub API.'); + } + + lines.push('', `## ${readme?.path || 'README'}`, ''); + if (readme?.content) { + lines.push(readme.content.trimEnd(), ''); + } else { + lines.push('README content was not returned by the GitHub API.', ''); + } + + return lines.join('\n'); +} + +export function formatGithubRepositoryText(snapshot) { + const { repository, defaultBranch, readme, tree, sourceUrl } = snapshot; + const lines = [ + `Repository: ${repository.full_name}`, + repository.description ? `Description: ${repository.description}` : null, + `URL: ${repository.html_url || sourceUrl}`, + `Default branch: ${defaultBranch}`, + repository.language ? `Primary language: ${repository.language}` : null, + plainNumberLine('Stars', repository.stargazers_count), + plainNumberLine('Forks', repository.forks_count), + plainNumberLine('Open issues', repository.open_issues_count), + repository.license?.spdx_id + ? `License: ${repository.license.spdx_id}` + : null, + repository.topics?.length + ? `Topics: ${repository.topics.join(', ')}` + : null, + '', + 'Files:', + ].filter((line) => line !== null); + + if (tree.length > 0) { + for (const item of tree) { + const label = item.type === 'dir' ? `${item.name}/` : item.name; + const suffix = + item.type === 'file' && typeof item.size === 'number' + ? ` (${formatBytes(item.size)})` + : ''; + lines.push(`- ${label}${suffix}`); + } + } else { + lines.push('- No root files returned by the GitHub API.'); + } + + lines.push('', `${readme?.path || 'README'}:`, ''); + if (readme?.content) { + lines.push(readme.content.trimEnd(), ''); + } else { + lines.push('README content was not returned by the GitHub API.', ''); + } + + return lines.join('\n'); +} + +async function fetchGithubReadme(parsed, defaultBranch) { + const readme = await fetchGithubJson( + `${GITHUB_API_BASE}/repos/${encodePath(parsed.owner)}/${encodePath(parsed.repo)}/readme?ref=${encodeURIComponent(defaultBranch)}`, + { optional: true } + ); + if (!readme) { + return null; + } + + let content = null; + if (readme.content && readme.encoding === 'base64') { + content = Buffer.from( + readme.content.replace(/\s+/g, ''), + 'base64' + ).toString('utf8'); + } else if (readme.download_url) { + content = await fetchGithubText(readme.download_url, { optional: true }); + } + + return { + name: readme.name || 'README', + path: readme.path || readme.name || 'README', + htmlUrl: readme.html_url, + content, + }; +} + +async function fetchGithubRootTree(parsed, defaultBranch) { + const tree = await fetchGithubJson( + `${GITHUB_API_BASE}/repos/${encodePath(parsed.owner)}/${encodePath(parsed.repo)}/contents?ref=${encodeURIComponent(defaultBranch)}`, + { optional: true } + ); + if (!Array.isArray(tree)) { + return []; + } + + return tree + .map((item) => { + const pathKind = item.type === 'dir' ? 'tree' : 'blob'; + return { + name: item.name, + path: item.path, + type: item.type, + size: item.size, + html_url: + item.html_url || + `https://github.com/${parsed.owner}/${parsed.repo}/${pathKind}/${defaultBranch}/${item.path}`, + }; + }) + .sort((a, b) => { + if (a.type !== b.type) { + return a.type === 'dir' ? -1 : 1; + } + return a.name.localeCompare(b.name); + }); +} + +async function fetchGithubJson(url, options = {}) { + const response = await fetch(url, { + headers: githubHeaders('application/vnd.github+json'), + }); + if (options.optional && response.status === 404) { + return null; + } + if (!response.ok) { + throw new Error(`GitHub API ${response.status}: ${response.statusText}`); + } + return response.json(); +} + +async function fetchGithubText(url, options = {}) { + const response = await fetch(url, { + headers: githubHeaders('text/plain'), + }); + if (options.optional && response.status === 404) { + return null; + } + if (!response.ok) { + throw new Error(`GitHub raw ${response.status}: ${response.statusText}`); + } + return response.text(); +} + +function githubHeaders(accept) { + const headers = { + Accept: accept, + 'User-Agent': GITHUB_USER_AGENT, + 'X-GitHub-Api-Version': '2022-11-28', + }; + const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN; + if (token) { + headers.Authorization = `Bearer ${token}`; + } + return headers; +} + +function encodePath(value) { + return encodeURIComponent(value); +} + +function numberLine(label, value) { + return typeof value === 'number' + ? `- ${label}: ${value.toLocaleString()}` + : null; +} + +function plainNumberLine(label, value) { + return typeof value === 'number' + ? `${label}: ${value.toLocaleString()}` + : null; +} + +function topicsLine(topics) { + return topics?.length ? `- Topics: ${topics.join(', ')}` : null; +} + +function formatBytes(size) { + if (size < 1024) { + return `${size} B`; + } + if (size < 1024 * 1024) { + return `${(size / 1024).toFixed(1)} KB`; + } + return `${(size / (1024 * 1024)).toFixed(1)} MB`; +} diff --git a/js/src/markdown.js b/js/src/markdown.js index 319d9e6..2aaa3a2 100644 --- a/js/src/markdown.js +++ b/js/src/markdown.js @@ -8,6 +8,11 @@ import { normalizeUrlForTextPage, scopeHtmlForMarkdown, } from './lib.js'; +import { + fetchGithubRepositorySnapshot, + formatGithubRepositoryMarkdown, + isGithubRepositoryUrl, +} from './github.js'; import { convertWithKreuzberg, isKreuzbergAvailable } from './kreuzberg.js'; import { applyImageMode } from './extract-images.js'; import archiver from 'archiver'; @@ -39,6 +44,21 @@ export async function markdownHandler(req, res) { try { const pageUrl = normalizeUrlForTextPage(url); + if ( + format === 'text' && + !req.query.contentSelector && + !req.query.bodySelector && + isGithubRepositoryUrl(pageUrl) + ) { + const snapshot = await fetchGithubRepositorySnapshot(pageUrl); + let markdown = formatGithubRepositoryMarkdown(snapshot); + const result = await applyImageMode(markdown, { + mode: embedImages ? 'embed' : 'default', + }); + markdown = result.markdown; + return await sendMarkdownResponse(res, url, markdown); + } + const html = await fetchHtml(pageUrl); if (converter === 'kreuzberg') { diff --git a/js/src/txt.js b/js/src/txt.js index a5293cf..11649fc 100644 --- a/js/src/txt.js +++ b/js/src/txt.js @@ -1,6 +1,12 @@ import fetch from 'node-fetch'; import { URL } from 'node:url'; import { getTextPasteFilename, normalizeUrlForTextContent } from './lib.js'; +import { + fetchGithubRepositorySnapshot, + formatGithubRepositoryText, + getGithubRepositoryTextFilename, + isGithubRepositoryUrl, +} from './github.js'; export async function txtHandler(req, res) { const url = req.query.url; @@ -9,6 +15,12 @@ export async function txtHandler(req, res) { } try { + if (isGithubRepositoryUrl(url)) { + const snapshot = await fetchGithubRepositorySnapshot(url); + const text = formatGithubRepositoryText(snapshot); + return sendTextResponse(res, url, text); + } + // Normalize URL to get text content (e.g., xpaste.pro -> xpaste.pro/raw) const textUrl = normalizeUrlForTextContent(url); @@ -27,20 +39,28 @@ export async function txtHandler(req, res) { const text = await response.text(); - // Set appropriate headers for text file download - res.setHeader('Content-Type', 'text/plain; charset=utf-8'); - res.setHeader( - 'Content-Disposition', - `attachment; filename="${getFilenameFromUrl(url)}"` - ); - res.send(text); + sendTextResponse(res, url, text); } catch (err) { console.error('Text fetch error:', err); res.status(500).send('Error fetching text content'); } } +function sendTextResponse(res, url, text) { + res.setHeader('Content-Type', 'text/plain; charset=utf-8'); + res.setHeader( + 'Content-Disposition', + `attachment; filename="${getFilenameFromUrl(url)}"` + ); + res.send(text); +} + function getFilenameFromUrl(url) { + const githubRepositoryFilename = getGithubRepositoryTextFilename(url); + if (githubRepositoryFilename) { + return githubRepositoryFilename; + } + const textPasteFilename = getTextPasteFilename(url); if (textPasteFilename) { return textPasteFilename; diff --git a/js/tests/integration/api-endpoints.test.js b/js/tests/integration/api-endpoints.test.js index 21215d7..5cec360 100644 --- a/js/tests/integration/api-endpoints.test.js +++ b/js/tests/integration/api-endpoints.test.js @@ -25,6 +25,50 @@ const MOCK_HTML = ` test `; +function mockGithubRepository(owner = 'octocat', repo = 'Hello-World') { + const readme = '# Hello World\n\nThis README came from GitHub.'; + nock('https://api.github.com') + .get(`/repos/${owner}/${repo}`) + .reply(200, { + full_name: `${owner}/${repo}`, + description: 'A friendly test repository', + html_url: `https://github.com/${owner}/${repo}`, + default_branch: 'master', + language: 'JavaScript', + stargazers_count: 42, + forks_count: 7, + open_issues_count: 3, + license: { spdx_id: 'MIT' }, + topics: ['demo', 'capture'], + }) + .get(`/repos/${owner}/${repo}/readme`) + .query({ ref: 'master' }) + .reply(200, { + name: 'README.md', + path: 'README.md', + encoding: 'base64', + content: Buffer.from(readme, 'utf8').toString('base64'), + html_url: `https://github.com/${owner}/${repo}/blob/master/README.md`, + }) + .get(`/repos/${owner}/${repo}/contents`) + .query({ ref: 'master' }) + .reply(200, [ + { + name: 'src', + path: 'src', + type: 'dir', + html_url: `https://github.com/${owner}/${repo}/tree/master/src`, + }, + { + name: 'README.md', + path: 'README.md', + type: 'file', + size: readme.length, + html_url: `https://github.com/${owner}/${repo}/blob/master/README.md`, + }, + ]); +} + afterEach(() => { nock.cleanAll(); }); @@ -70,6 +114,44 @@ describe('API Endpoint Tests', () => { expect(res.text).toContain('Hello world'); }); + it('returns a compact GitHub repository markdown snapshot', async () => { + mockGithubRepository(); + + const res = await request(app) + .get('/markdown') + .query({ url: 'https://github.com/octocat/Hello-World' }) + .expect(200); + + expect(res.headers['content-type']).toMatch(/text\/markdown/); + expect(res.text).toContain('# octocat/Hello-World'); + expect(res.text).toContain('## Repository'); + expect(res.text).toContain('## Files'); + expect(res.text).toContain('- [src/]('); + expect(res.text).toContain('## README.md'); + expect(res.text).toContain('This README came from GitHub.'); + expect(res.text).not.toContain('Skip to content'); + expect(nock.isDone()).toBe(true); + }); + + it('returns a compact GitHub repository markdown snapshot for kreuzberg text format', async () => { + mockGithubRepository(); + + const res = await request(app) + .get('/markdown') + .query({ + url: 'https://github.com/octocat/Hello-World', + converter: 'kreuzberg', + format: 'text', + }) + .expect(200); + + expect(res.headers['content-type']).toMatch(/text\/markdown/); + expect(res.text).toContain('# octocat/Hello-World'); + expect(res.text).toContain('## Files'); + expect(res.text).toContain('This README came from GitHub.'); + expect(nock.isDone()).toBe(true); + }); + it('rejects unsupported converter names', async () => { await request(app) .get('/markdown') @@ -133,6 +215,28 @@ describe('API Endpoint Tests', () => { }); }); + describe('GET /txt', () => { + it('returns a compact GitHub repository plain-text snapshot', async () => { + mockGithubRepository(); + + const res = await request(app) + .get('/txt') + .query({ url: 'https://github.com/octocat/Hello-World' }) + .expect(200); + + expect(res.headers['content-type']).toMatch(/text\/plain/); + expect(res.headers['content-disposition']).toContain( + 'filename="octocat-Hello-World.txt"' + ); + expect(res.text).toContain('Repository: octocat/Hello-World'); + expect(res.text).toContain('Files:'); + expect(res.text).toContain('- src/'); + expect(res.text).toContain('README.md:'); + expect(res.text).toContain('This README came from GitHub.'); + expect(nock.isDone()).toBe(true); + }); + }); + describe('GET /archive', () => { it('returns a ZIP archive with remote images (markdown format)', async () => { nock('https://example.com').get('/archive-test').reply(200, MOCK_HTML, { diff --git a/js/tests/integration/github-readme.test.js b/js/tests/integration/github-readme.test.js new file mode 100644 index 0000000..ce5a243 --- /dev/null +++ b/js/tests/integration/github-readme.test.js @@ -0,0 +1,137 @@ +/** + * Integration tests for capturing GitHub repository pages (issue #5). + * + * Live tests are gated behind GITHUB_REPOSITORY_INTEGRATION so default/offline + * runs stay deterministic. The mocked API endpoint tests cover the default + * behavior without network access; this suite proves the real GitHub repository + * page can still be captured as compact text/markdown, original HTML, and PNG + * screenshots in every supported JavaScript browser engine. + */ + +import { jest } from '@jest/globals'; +import { createBrowser } from '../../src/browser.js'; +import { fetchHtml } from '../../src/lib.js'; +import { retry } from '../../src/retry.js'; +import { + fetchGithubRepositorySnapshot, + formatGithubRepositoryMarkdown, + formatGithubRepositoryText, + parseGithubRepositoryUrl, +} from '../../src/github.js'; + +const GITHUB_REPOSITORY_URL = + process.env.GITHUB_REPOSITORY_URL || + 'https://github.com/link-assistant/web-capture'; +const PNG_SIGNATURE = Buffer.from([ + 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, +]); + +const SKIP_LIVE = + !process.env.GITHUB_REPOSITORY_INTEGRATION || + process.env.GITHUB_REPOSITORY_INTEGRATION === 'false'; +const describeIfLive = SKIP_LIVE ? describe.skip : describe; + +jest.setTimeout(120000); + +async function navigateWithRetry(page, url) { + await retry( + async () => { + await page.goto(url, { + waitUntil: 'domcontentloaded', + timeout: 30000, + }); + }, + { + retries: 3, + baseDelay: 2000, + onRetry: (err, attempt, delay) => { + console.log( + `Navigation retry ${attempt} for ${url} after ${delay}ms: ${err.message}` + ); + }, + } + ); + await page.waitForTimeout(5000); +} + +describeIfLive('GitHub repository capture', () => { + it('downloads compact repository text and markdown with README plus file tree', async () => { + const parsed = parseGithubRepositoryUrl(GITHUB_REPOSITORY_URL); + expect(parsed).not.toBeNull(); + + const snapshot = await retry( + () => fetchGithubRepositorySnapshot(GITHUB_REPOSITORY_URL), + { + retries: 3, + baseDelay: 2000, + } + ); + + const text = formatGithubRepositoryText(snapshot); + const markdown = formatGithubRepositoryMarkdown(snapshot); + + expect(text).toContain(`Repository: ${parsed.fullName}`); + expect(text).toContain('Files:'); + expect(text).toMatch(/README/i); + expect(text.length).toBeGreaterThan(500); + + expect(markdown).toContain(`# ${parsed.fullName}`); + expect(markdown).toContain('## Files'); + expect(markdown).toMatch(/## .*README/i); + expect(markdown.length).toBeGreaterThan(500); + expect(markdown).not.toMatch(/ { + const parsed = parseGithubRepositoryUrl(GITHUB_REPOSITORY_URL); + expect(parsed).not.toBeNull(); + + const html = await retry(() => fetchHtml(GITHUB_REPOSITORY_URL), { + retries: 3, + baseDelay: 2000, + }); + + expect(html).toMatch(/ { + describeIfLive(`${engine}`, () => { + let browser; + + beforeAll(async () => { + browser = await createBrowser(engine); + }); + + afterAll(async () => { + if (browser) { + await browser.close(); + } + }); + + it('captures the GitHub repository page as a PNG screenshot', async () => { + const page = await browser.newPage(); + await page.setExtraHTTPHeaders({ + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Charset': 'utf-8', + }); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + ); + await page.setViewport({ width: 1280, height: 800 }); + await navigateWithRetry(page, GITHUB_REPOSITORY_URL); + + const screenshot = await page.screenshot({ type: 'png' }); + + expect(screenshot).toBeInstanceOf(Buffer); + expect(screenshot.length).toBeGreaterThan(1000); + expect(screenshot.slice(0, 8)).toEqual(PNG_SIGNATURE); + }); + }); + } +); diff --git a/js/tests/unit/github.test.js b/js/tests/unit/github.test.js new file mode 100644 index 0000000..f71f93a --- /dev/null +++ b/js/tests/unit/github.test.js @@ -0,0 +1,97 @@ +import { + formatGithubRepositoryMarkdown, + formatGithubRepositoryText, + getGithubRepositoryTextFilename, + isGithubRepositoryUrl, + parseGithubRepositoryUrl, +} from '../../src/github.js'; + +const SNAPSHOT = { + sourceUrl: 'https://github.com/octocat/Hello-World', + defaultBranch: 'master', + repository: { + full_name: 'octocat/Hello-World', + description: 'A friendly test repository', + html_url: 'https://github.com/octocat/Hello-World', + default_branch: 'master', + language: 'JavaScript', + stargazers_count: 42, + forks_count: 7, + open_issues_count: 3, + license: { spdx_id: 'MIT' }, + topics: ['demo', 'capture'], + }, + tree: [ + { + name: 'src', + path: 'src', + type: 'dir', + html_url: 'https://github.com/octocat/Hello-World/tree/master/src', + }, + { + name: 'README.md', + path: 'README.md', + type: 'file', + size: 37, + html_url: 'https://github.com/octocat/Hello-World/blob/master/README.md', + }, + ], + readme: { + path: 'README.md', + content: '# Hello World\n\nThis is the README.', + }, +}; + +describe('GitHub repository URLs', () => { + it('detects plain GitHub repository pages', () => { + expect( + parseGithubRepositoryUrl('https://github.com/octocat/Hello-World') + ).toEqual({ + owner: 'octocat', + repo: 'Hello-World', + fullName: 'octocat/Hello-World', + htmlUrl: 'https://github.com/octocat/Hello-World', + }); + expect( + isGithubRepositoryUrl('https://github.com/octocat/Hello-World') + ).toBe(true); + expect( + isGithubRepositoryUrl('https://github.com/octocat/Hello-World/issues') + ).toBe(false); + expect( + isGithubRepositoryUrl('https://example.com/octocat/Hello-World') + ).toBe(false); + }); + + it('derives repository text filenames', () => { + expect( + getGithubRepositoryTextFilename('https://github.com/octocat/Hello-World') + ).toBe('octocat-Hello-World.txt'); + }); +}); + +describe('GitHub repository snapshot formatting', () => { + it('formats a compact markdown snapshot with metadata, file tree, and README', () => { + const markdown = formatGithubRepositoryMarkdown(SNAPSHOT); + + expect(markdown).toContain('# octocat/Hello-World'); + expect(markdown).toContain('> A friendly test repository'); + expect(markdown).toContain('- Default branch: `master`'); + expect(markdown).toContain('- [src/]('); + expect(markdown).toContain('- [README.md]('); + expect(markdown).toContain('## README.md'); + expect(markdown).toContain('# Hello World'); + }); + + it('formats a compact plain-text snapshot with metadata, file tree, and README', () => { + const text = formatGithubRepositoryText(SNAPSHOT); + + expect(text).toContain('Repository: octocat/Hello-World'); + expect(text).toContain('Description: A friendly test repository'); + expect(text).toContain('Files:'); + expect(text).toContain('- src/'); + expect(text).toContain('- README.md'); + expect(text).toContain('README.md:'); + expect(text).toContain('This is the README.'); + }); +}); diff --git a/rust/Cargo.lock b/rust/Cargo.lock index dfbfe14..c0282b1 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -3630,7 +3630,7 @@ dependencies = [ [[package]] name = "web-capture" -version = "0.3.25" +version = "0.3.26" dependencies = [ "anyhow", "async-tungstenite", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 83e660a..13c1161 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "web-capture" -version = "0.3.25" +version = "0.3.26" edition = "2021" description = "CLI and microservice to render web pages as HTML, Markdown, or PNG" license = "Unlicense" diff --git a/rust/README.md b/rust/README.md index 7c79819..57209f4 100644 --- a/rust/README.md +++ b/rust/README.md @@ -52,6 +52,10 @@ web-capture https://example.com --format html # Capture raw paste text web-capture https://xpaste.pro/p/t4q0Lsp0 --format txt -o paste.txt +# Capture a GitHub repository as compact text or Markdown +web-capture https://github.com/link-assistant/web-capture --format txt -o repository.txt +web-capture https://github.com/link-assistant/web-capture --format markdown -o repository.md + # Take a screenshot web-capture https://example.com --format png -o screenshot.png @@ -89,6 +93,11 @@ Markdown stays under 1500 lines. Larger paste pages return a ZIP containing `index.md`, `xpaste-pro-.md`, and `xpaste-pro-.txt`. Canonical, localized, and `/raw` paste URLs are normalized before capture. +For plain GitHub repository URLs such as `https://github.com/owner/repo`, +`/markdown` and `/txt` return compact repository snapshots with repository +metadata, the root file tree, and README content. GitHub subpages continue +through the regular capture path. + ### Search Endpoint ``` @@ -208,6 +217,10 @@ web-capture https://example.com -f html -o page.html # Raw paste text web-capture https://xpaste.pro/p/t4q0Lsp0 -f txt -o paste.txt +# GitHub repository snapshot +web-capture https://github.com/link-assistant/web-capture -f markdown -o repository.md +web-capture https://github.com/link-assistant/web-capture -f txt -o repository.txt + # Google Docs live editor model web-capture https://docs.google.com/document/d/DOC_ID/edit --capture browser @@ -305,6 +318,9 @@ with environment variables: # Download the Wikipedia page (markdown + image) via the browser engine WIKIPEDIA_INTEGRATION=1 cargo test --test integration wikipedia_download::live -- --nocapture +# Download a GitHub repository page as compact txt/markdown, original HTML, and screenshot +GITHUB_REPOSITORY_INTEGRATION=1 cargo test --test integration github_repository::live -- --nocapture + # Public Google Docs live suite GDOCS_INTEGRATION=1 cargo test --test integration gdocs_public_doc::live -- --nocapture ``` diff --git a/rust/src/github.rs b/rust/src/github.rs new file mode 100644 index 0000000..0d62e4b --- /dev/null +++ b/rust/src/github.rs @@ -0,0 +1,551 @@ +//! GitHub repository-page capture helpers. +//! +//! Plain repository pages are mostly application HTML. For text and markdown +//! output, the compact content users expect is available more reliably through +//! the GitHub REST API: repository details, the root file listing, and README. + +use anyhow::{anyhow, Context}; +use base64::{engine::general_purpose::STANDARD, Engine as _}; +use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, AUTHORIZATION, USER_AGENT}; +use reqwest::StatusCode; +use serde::de::DeserializeOwned; +use serde::Deserialize; +use url::Url; + +const GITHUB_API_BASE: &str = "https://api.github.com"; +const GITHUB_USER_AGENT: &str = "web-capture"; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GithubRepositoryUrl { + pub owner: String, + pub repo: String, + pub full_name: String, + pub html_url: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GithubRepositoryMetadata { + pub full_name: String, + pub html_url: String, + pub description: Option, + pub language: Option, + pub stargazers_count: u64, + pub forks_count: u64, + pub open_issues_count: u64, + pub license_spdx_id: Option, + pub topics: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GithubReadme { + pub name: String, + pub path: String, + pub html_url: Option, + pub content: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GithubTreeEntry { + pub name: String, + pub path: String, + pub kind: String, + pub size: Option, + pub html_url: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GithubRepositorySnapshot { + pub source_url: String, + pub repository: GithubRepositoryMetadata, + pub default_branch: String, + pub readme: Option, + pub tree: Vec, +} + +#[derive(Debug, Deserialize)] +struct RepositoryApiResponse { + full_name: String, + html_url: String, + description: Option, + default_branch: Option, + language: Option, + stargazers_count: Option, + forks_count: Option, + open_issues_count: Option, + license: Option, + topics: Option>, +} + +#[derive(Debug, Deserialize)] +struct RepositoryLicenseApiResponse { + spdx_id: Option, +} + +#[derive(Debug, Deserialize)] +struct ReadmeApiResponse { + name: Option, + path: Option, + html_url: Option, + download_url: Option, + content: Option, + encoding: Option, +} + +#[derive(Debug, Deserialize)] +struct ContentsApiResponse { + name: String, + path: String, + #[serde(rename = "type")] + kind: String, + size: Option, + html_url: Option, +} + +/// Parse a plain GitHub repository URL. +/// +/// URLs for subpages such as `/issues`, `/tree/...`, or `/blob/...` are not +/// treated as repository snapshots because those pages have their own capture +/// semantics. +#[must_use] +pub fn parse_github_repository_url(url: &str) -> Option { + let parsed = Url::parse(url).ok()?; + let host = parsed.host_str()?.to_ascii_lowercase(); + if host != "github.com" && host != "www.github.com" { + return None; + } + + let parts: Vec<_> = parsed + .path_segments()? + .filter(|segment| !segment.is_empty()) + .collect(); + if parts.len() != 2 { + return None; + } + + let owner = parts[0].to_string(); + let repo = parts[1].to_string(); + if owner.is_empty() || repo.is_empty() { + return None; + } + + Some(GithubRepositoryUrl { + full_name: format!("{owner}/{repo}"), + html_url: format!("https://github.com/{owner}/{repo}"), + owner, + repo, + }) +} + +#[must_use] +pub fn is_github_repository_url(url: &str) -> bool { + parse_github_repository_url(url).is_some() +} + +#[must_use] +pub fn github_repository_text_filename(url: &str) -> Option { + parse_github_repository_url(url).map(|repo| format!("{}-{}.txt", repo.owner, repo.repo)) +} + +pub async fn fetch_github_repository_snapshot( + url: &str, +) -> anyhow::Result { + let parsed = parse_github_repository_url(url) + .ok_or_else(|| anyhow!("Not a GitHub repository URL: {url}"))?; + + let repository: RepositoryApiResponse = fetch_github_json(&format!( + "{GITHUB_API_BASE}/repos/{}/{}", + parsed.owner, parsed.repo + )) + .await? + .ok_or_else(|| anyhow!("Repository was not returned by the GitHub API"))?; + + let default_branch = repository + .default_branch + .clone() + .unwrap_or_else(|| "main".to_string()); + + let (readme, tree) = tokio::try_join!( + fetch_github_readme(&parsed, &default_branch), + fetch_github_root_tree(&parsed, &default_branch) + )?; + + Ok(GithubRepositorySnapshot { + source_url: parsed.html_url, + repository: GithubRepositoryMetadata { + full_name: repository.full_name, + html_url: repository.html_url, + description: repository.description, + language: repository.language, + stargazers_count: repository.stargazers_count.unwrap_or_default(), + forks_count: repository.forks_count.unwrap_or_default(), + open_issues_count: repository.open_issues_count.unwrap_or_default(), + license_spdx_id: repository.license.and_then(|license| license.spdx_id), + topics: repository.topics.unwrap_or_default(), + }, + default_branch, + readme, + tree, + }) +} + +#[must_use] +pub fn format_github_repository_markdown(snapshot: &GithubRepositorySnapshot) -> String { + let mut lines = vec![ + format!("# {}", snapshot.repository.full_name), + String::new(), + ]; + if let Some(description) = &snapshot.repository.description { + lines.push(format!("> {description}")); + lines.push(String::new()); + } + + lines.extend([ + "## Repository".to_string(), + String::new(), + format!("- URL: {}", repository_url(snapshot)), + format!("- Default branch: `{}`", snapshot.default_branch), + ]); + push_optional_line( + &mut lines, + snapshot + .repository + .language + .as_ref() + .map(|language| format!("- Primary language: {language}")), + ); + lines.push(format!("- Stars: {}", snapshot.repository.stargazers_count)); + lines.push(format!("- Forks: {}", snapshot.repository.forks_count)); + lines.push(format!( + "- Open issues: {}", + snapshot.repository.open_issues_count + )); + push_optional_line( + &mut lines, + snapshot + .repository + .license_spdx_id + .as_ref() + .map(|license| format!("- License: {license}")), + ); + if !snapshot.repository.topics.is_empty() { + lines.push(format!( + "- Topics: {}", + snapshot.repository.topics.join(", ") + )); + } + + lines.extend([String::new(), "## Files".to_string(), String::new()]); + append_tree_markdown(&mut lines, &snapshot.tree); + + let readme_path = snapshot + .readme + .as_ref() + .map_or("README", |readme| readme.path.as_str()); + lines.extend([String::new(), format!("## {readme_path}"), String::new()]); + append_readme_content(&mut lines, snapshot.readme.as_ref()); + + lines.join("\n") +} + +#[must_use] +pub fn format_github_repository_text(snapshot: &GithubRepositorySnapshot) -> String { + let mut lines = vec![format!("Repository: {}", snapshot.repository.full_name)]; + if let Some(description) = &snapshot.repository.description { + lines.push(format!("Description: {description}")); + } + lines.extend([ + format!("URL: {}", repository_url(snapshot)), + format!("Default branch: {}", snapshot.default_branch), + ]); + push_optional_line( + &mut lines, + snapshot + .repository + .language + .as_ref() + .map(|language| format!("Primary language: {language}")), + ); + lines.push(format!("Stars: {}", snapshot.repository.stargazers_count)); + lines.push(format!("Forks: {}", snapshot.repository.forks_count)); + lines.push(format!( + "Open issues: {}", + snapshot.repository.open_issues_count + )); + push_optional_line( + &mut lines, + snapshot + .repository + .license_spdx_id + .as_ref() + .map(|license| format!("License: {license}")), + ); + if !snapshot.repository.topics.is_empty() { + lines.push(format!("Topics: {}", snapshot.repository.topics.join(", "))); + } + + lines.extend([String::new(), "Files:".to_string()]); + append_tree_text(&mut lines, &snapshot.tree); + + let readme_path = snapshot + .readme + .as_ref() + .map_or("README", |readme| readme.path.as_str()); + lines.extend([String::new(), format!("{readme_path}:"), String::new()]); + append_readme_content(&mut lines, snapshot.readme.as_ref()); + + lines.join("\n") +} + +async fn fetch_github_readme( + parsed: &GithubRepositoryUrl, + default_branch: &str, +) -> anyhow::Result> { + let readme: Option = fetch_optional_github_json(&format!( + "{GITHUB_API_BASE}/repos/{}/{}/readme?ref={default_branch}", + parsed.owner, parsed.repo + )) + .await?; + + let Some(readme) = readme else { + return Ok(None); + }; + + let content = if readme.encoding.as_deref() == Some("base64") { + readme + .content + .as_deref() + .map(decode_base64_text) + .transpose()? + } else if let Some(download_url) = readme.download_url.as_deref() { + fetch_optional_github_text(download_url).await? + } else { + None + }; + + let name = readme.name.unwrap_or_else(|| "README".to_string()); + let path = readme.path.unwrap_or_else(|| name.clone()); + Ok(Some(GithubReadme { + name, + path, + html_url: readme.html_url, + content, + })) +} + +async fn fetch_github_root_tree( + parsed: &GithubRepositoryUrl, + default_branch: &str, +) -> anyhow::Result> { + let contents: Option> = fetch_optional_github_json(&format!( + "{GITHUB_API_BASE}/repos/{}/{}/contents?ref={default_branch}", + parsed.owner, parsed.repo + )) + .await?; + + let mut tree: Vec<_> = contents + .unwrap_or_default() + .into_iter() + .map(|item| { + let html_url = item.html_url.unwrap_or_else(|| { + let kind = if item.kind == "dir" { "tree" } else { "blob" }; + format!( + "https://github.com/{}/{}/{kind}/{default_branch}/{}", + parsed.owner, parsed.repo, item.path + ) + }); + GithubTreeEntry { + name: item.name, + path: item.path, + kind: item.kind, + size: item.size, + html_url, + } + }) + .collect(); + tree.sort_by( + |a, b| match (a.kind.as_str() == "dir", b.kind.as_str() == "dir") { + (true, false) => std::cmp::Ordering::Less, + (false, true) => std::cmp::Ordering::Greater, + _ => a.name.cmp(&b.name), + }, + ); + Ok(tree) +} + +async fn fetch_github_json(url: &str) -> anyhow::Result> +where + T: DeserializeOwned, +{ + fetch_github_json_with_optional_not_found(url, false).await +} + +async fn fetch_optional_github_json(url: &str) -> anyhow::Result> +where + T: DeserializeOwned, +{ + fetch_github_json_with_optional_not_found(url, true).await +} + +async fn fetch_github_json_with_optional_not_found( + url: &str, + optional: bool, +) -> anyhow::Result> +where + T: DeserializeOwned, +{ + let response = reqwest::Client::new() + .get(url) + .headers(github_headers("application/vnd.github+json")) + .send() + .await + .with_context(|| format!("Requesting {url}"))?; + if optional && response.status() == StatusCode::NOT_FOUND { + return Ok(None); + } + let status = response.status(); + let body = response + .text() + .await + .with_context(|| format!("Reading response body from {url}"))?; + if !status.is_success() { + anyhow::bail!("GitHub API {status}: {body}"); + } + Ok(Some(serde_json::from_str(&body).with_context(|| { + format!("Parsing GitHub JSON from {url}") + })?)) +} + +async fn fetch_optional_github_text(url: &str) -> anyhow::Result> { + let response = reqwest::Client::new() + .get(url) + .headers(github_headers("text/plain")) + .send() + .await + .with_context(|| format!("Requesting {url}"))?; + if response.status() == StatusCode::NOT_FOUND { + return Ok(None); + } + let status = response.status(); + let text = response + .text() + .await + .with_context(|| format!("Reading text response from {url}"))?; + if !status.is_success() { + anyhow::bail!("GitHub raw {status}: {text}"); + } + Ok(Some(text)) +} + +fn github_headers(accept: &str) -> HeaderMap { + let mut headers = HeaderMap::new(); + headers.insert( + ACCEPT, + HeaderValue::from_str(accept).unwrap_or_else(|_| HeaderValue::from_static("*/*")), + ); + headers.insert(USER_AGENT, HeaderValue::from_static(GITHUB_USER_AGENT)); + headers.insert( + "X-GitHub-Api-Version", + HeaderValue::from_static("2022-11-28"), + ); + if let Ok(token) = std::env::var("GITHUB_TOKEN").or_else(|_| std::env::var("GH_TOKEN")) { + if let Ok(value) = HeaderValue::from_str(&format!("Bearer {token}")) { + headers.insert(AUTHORIZATION, value); + } + } + headers +} + +fn decode_base64_text(content: &str) -> anyhow::Result { + let stripped: String = content.chars().filter(|ch| !ch.is_whitespace()).collect(); + let bytes = STANDARD + .decode(stripped) + .context("Decoding GitHub README base64 content")?; + Ok(String::from_utf8_lossy(&bytes).into_owned()) +} + +fn repository_url(snapshot: &GithubRepositorySnapshot) -> &str { + if snapshot.repository.html_url.is_empty() { + &snapshot.source_url + } else { + &snapshot.repository.html_url + } +} + +fn push_optional_line(lines: &mut Vec, line: Option) { + if let Some(line) = line { + lines.push(line); + } +} + +fn append_tree_markdown(lines: &mut Vec, tree: &[GithubTreeEntry]) { + if tree.is_empty() { + lines.push("- No root files returned by the GitHub API.".to_string()); + return; + } + + for item in tree { + let label = if item.kind == "dir" { + format!("{}/", item.name) + } else { + item.name.clone() + }; + let suffix = if item.kind == "file" { + item.size + .map_or_else(String::new, |size| format!(" ({})", format_bytes(size))) + } else { + String::new() + }; + lines.push(format!("- [{label}]({}){suffix}", item.html_url)); + } +} + +fn append_tree_text(lines: &mut Vec, tree: &[GithubTreeEntry]) { + if tree.is_empty() { + lines.push("- No root files returned by the GitHub API.".to_string()); + return; + } + + for item in tree { + let label = if item.kind == "dir" { + format!("{}/", item.name) + } else { + item.name.clone() + }; + let suffix = if item.kind == "file" { + item.size + .map_or_else(String::new, |size| format!(" ({})", format_bytes(size))) + } else { + String::new() + }; + lines.push(format!("- {label}{suffix}")); + } +} + +fn append_readme_content(lines: &mut Vec, readme: Option<&GithubReadme>) { + if let Some(content) = readme.and_then(|readme| readme.content.as_deref()) { + lines.push(content.trim_end().to_string()); + } else { + lines.push("README content was not returned by the GitHub API.".to_string()); + } + lines.push(String::new()); +} + +fn format_bytes(size: u64) -> String { + if size < 1024 { + return format!("{size} B"); + } + if size < 1024 * 1024 { + return format_scaled_bytes(size, 1024, "KB"); + } + format_scaled_bytes(size, 1024 * 1024, "MB") +} + +fn format_scaled_bytes(size: u64, unit: u64, suffix: &str) -> String { + let mut whole = size / unit; + let mut tenth = ((size % unit) * 10 + unit / 2) / unit; + if tenth == 10 { + whole += 1; + tenth = 0; + } + format!("{whole}.{tenth} {suffix}") +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 29214cb..ebf4f74 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -40,6 +40,7 @@ pub mod browser; pub mod extract_images; pub mod figures; pub mod gdocs; +pub mod github; pub mod html; pub mod kreuzberg; pub mod latex; diff --git a/rust/src/main.rs b/rust/src/main.rs index af647a4..b89b700 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -517,27 +517,28 @@ async fn markdown_handler(Query(params): Query) -> Response { .as_deref() .unwrap_or("html2md") .to_ascii_lowercase(); - if converter != "html2md" && converter != "kreuzberg" { - return (StatusCode::BAD_REQUEST, "Unsupported `converter` parameter").into_response(); - } - let format = params .format .as_deref() .unwrap_or("text") .to_ascii_lowercase(); - if format != "text" && format != "json" { - return (StatusCode::BAD_REQUEST, "Unsupported `format` parameter").into_response(); - } - if format == "json" && converter != "kreuzberg" { - return ( - StatusCode::BAD_REQUEST, - "`format=json` is only supported with `converter=kreuzberg`", - ) - .into_response(); + if let Some(response) = validate_markdown_query(&converter, &format) { + return response; } let page_url = web_capture::xpaste::normalize_url_for_text_page(&url); + let has_selector = params.content_selector.is_some() || params.body_selector.is_some(); + if let Some(response) = maybe_github_repository_markdown_response( + &page_url, + &format, + has_selector, + params.embed_images, + ) + .await + { + return response; + } + let html = match fetch_html(&page_url).await { Ok(html) => html, Err(e) => { @@ -565,12 +566,11 @@ async fn markdown_handler(Query(params): Query) -> Response { } }; - let mode = if params.embed_images { - ImageMode::Embed - } else { - ImageMode::Default - }; - if let Ok(image_result) = apply_image_mode(&result.content, mode, Some(&page_url)) { + if let Ok(image_result) = apply_image_mode( + &result.content, + server_markdown_image_mode(params.embed_images), + Some(&page_url), + ) { result.content = image_result.markdown; } @@ -597,12 +597,11 @@ async fn markdown_handler(Query(params): Query) -> Response { // Route through the unified chokepoint. The server returns a single // response body, so only Default (strip base64) and Embed apply. - let mode = if params.embed_images { - ImageMode::Embed - } else { - ImageMode::Default - }; - if let Ok(result) = apply_image_mode(&markdown, mode, Some(&page_url)) { + if let Ok(result) = apply_image_mode( + &markdown, + server_markdown_image_mode(params.embed_images), + Some(&page_url), + ) { markdown = result.markdown; } @@ -613,6 +612,70 @@ async fn markdown_handler(Query(params): Query) -> Response { markdown_response(markdown) } +fn validate_markdown_query(converter: &str, format: &str) -> Option { + if converter != "html2md" && converter != "kreuzberg" { + return Some( + (StatusCode::BAD_REQUEST, "Unsupported `converter` parameter").into_response(), + ); + } + if format != "text" && format != "json" { + return Some((StatusCode::BAD_REQUEST, "Unsupported `format` parameter").into_response()); + } + if format == "json" && converter != "kreuzberg" { + return Some( + ( + StatusCode::BAD_REQUEST, + "`format=json` is only supported with `converter=kreuzberg`", + ) + .into_response(), + ); + } + None +} + +async fn maybe_github_repository_markdown_response( + page_url: &str, + format: &str, + has_selector: bool, + embed_images: bool, +) -> Option { + if format != "text" || has_selector || !web_capture::github::is_github_repository_url(page_url) + { + return None; + } + + let snapshot = match web_capture::github::fetch_github_repository_snapshot(page_url).await { + Ok(snapshot) => snapshot, + Err(e) => { + error!("Failed to fetch GitHub repository snapshot: {}", e); + return Some( + ( + StatusCode::INTERNAL_SERVER_ERROR, + "Error fetching GitHub repository snapshot", + ) + .into_response(), + ); + } + }; + let mut markdown = web_capture::github::format_github_repository_markdown(&snapshot); + if let Ok(result) = apply_image_mode( + &markdown, + server_markdown_image_mode(embed_images), + Some(page_url), + ) { + markdown = result.markdown; + } + Some(markdown_response(markdown)) +} + +const fn server_markdown_image_mode(embed_images: bool) -> ImageMode { + if embed_images { + ImageMode::Embed + } else { + ImageMode::Default + } +} + /// Text download endpoint handler async fn txt_handler(Query(params): Query) -> Response { let url = match normalize_url(¶ms.url) { @@ -636,6 +699,13 @@ async fn txt_handler(Query(params): Query) -> Response { } async fn fetch_text_content(url: &str) -> anyhow::Result { + if web_capture::github::is_github_repository_url(url) { + let snapshot = web_capture::github::fetch_github_repository_snapshot(url).await?; + return Ok(web_capture::github::format_github_repository_text( + &snapshot, + )); + } + let text_url = web_capture::xpaste::normalize_url_for_text_content(url); let response = reqwest::get(&text_url).await?; if !response.status().is_success() { @@ -653,7 +723,8 @@ async fn fetch_text_content(url: &str) -> anyhow::Result { } fn text_response(url: &str, text: String) -> Response { - let filename = web_capture::xpaste::filename_for_text_url(url); + let filename = web_capture::github::github_repository_text_filename(url) + .unwrap_or_else(|| web_capture::xpaste::filename_for_text_url(url)); let mut response = (StatusCode::OK, text).into_response(); response.headers_mut().insert( header::CONTENT_TYPE, @@ -1198,6 +1269,36 @@ async fn capture_url( } } "markdown" | "md" => { + if args.content_selector.is_none() + && args.body_selector.is_none() + && web_capture::github::is_github_repository_url(&absolute_url) + { + let snapshot = + web_capture::github::fetch_github_repository_snapshot(&absolute_url).await?; + let markdown = web_capture::github::format_github_repository_markdown(&snapshot); + + let is_stdout = output.is_some_and(|p| p.as_os_str() == "-"); + let derived; + let effective_output = if is_stdout { + None + } else if let Some(path) = output { + Some(path.clone()) + } else { + derived = derive_output_path(&absolute_url, "md", &args.data_dir); + Some(derived) + }; + if let Some(ref path) = effective_output { + let markdown = + process_output_markdown(markdown, args, path, "GitHub repository Markdown") + .await?; + write_text_capture_to_path(&markdown, path, "GitHub repository Markdown") + .await?; + } else { + print!("{markdown}"); + } + return Ok(()); + } + let html = capture_html_content(&absolute_url, args).await?; // Enhanced conversion is now the default diff --git a/rust/tests/integration/github_repository.rs b/rust/tests/integration/github_repository.rs new file mode 100644 index 0000000..108165f --- /dev/null +++ b/rust/tests/integration/github_repository.rs @@ -0,0 +1,143 @@ +//! Integration tests for capturing GitHub repository pages (issue #5). +//! +//! Live tests are gated behind `GITHUB_REPOSITORY_INTEGRATION` so default and +//! offline runs stay deterministic. They verify that a real GitHub repository +//! can be captured as compact text/markdown, original HTML, and a PNG screenshot +//! through the Rust implementation. + +const DEFAULT_REPOSITORY_URL: &str = "https://github.com/link-assistant/web-capture"; + +fn repository_url() -> String { + std::env::var("GITHUB_REPOSITORY_URL").unwrap_or_else(|_| DEFAULT_REPOSITORY_URL.to_string()) +} + +fn live_enabled() -> bool { + matches!( + std::env::var("GITHUB_REPOSITORY_INTEGRATION").as_deref(), + Ok("1" | "true" | "TRUE") + ) +} + +fn chrome_available() -> bool { + std::env::var_os("WEB_CAPTURE_CHROME_PATH").is_some() + || [ + "google-chrome", + "google-chrome-stable", + "chromium", + "chromium-browser", + "chrome", + ] + .iter() + .any(|candidate| { + std::process::Command::new(candidate) + .arg("--version") + .output() + .is_ok() + }) +} + +#[test] +fn detects_default_github_repository_url() { + assert!(web_capture::github::is_github_repository_url( + DEFAULT_REPOSITORY_URL + )); + assert!(!web_capture::github::is_github_repository_url( + "https://github.com/link-assistant/web-capture/issues" + )); +} + +#[tokio::test] +async fn live_download_repository_as_text_and_markdown() { + if !live_enabled() { + eprintln!( + "Skipping live GitHub repository text/markdown test; set GITHUB_REPOSITORY_INTEGRATION=1 to enable." + ); + return; + } + + let url = repository_url(); + let parsed = web_capture::github::parse_github_repository_url(&url) + .expect("GITHUB_REPOSITORY_URL should be a plain repository URL"); + let snapshot = web_capture::github::fetch_github_repository_snapshot(&url) + .await + .expect("fetch GitHub repository snapshot"); + let text = web_capture::github::format_github_repository_text(&snapshot); + let markdown = web_capture::github::format_github_repository_markdown(&snapshot); + + assert!(text.contains(&format!("Repository: {}", parsed.full_name))); + assert!(text.contains("Files:")); + assert!(text.to_ascii_lowercase().contains("readme")); + assert!( + text.len() > 500, + "expected substantial text, got {} bytes", + text.len() + ); + + assert!(markdown.contains(&format!("# {}", parsed.full_name))); + assert!(markdown.contains("## Files")); + assert!(markdown.to_ascii_lowercase().contains("readme")); + assert!( + markdown.len() > 500, + "expected substantial markdown, got {} bytes", + markdown.len() + ); + assert!( + !markdown.to_ascii_lowercase().contains(" tags" + ); +} + +#[tokio::test] +async fn live_download_repository_as_original_html() { + if !live_enabled() { + eprintln!( + "Skipping live GitHub repository HTML test; set GITHUB_REPOSITORY_INTEGRATION=1 to enable." + ); + return; + } + + let url = repository_url(); + let parsed = web_capture::github::parse_github_repository_url(&url) + .expect("GITHUB_REPOSITORY_URL should be a plain repository URL"); + let html = web_capture::fetch_html(&url) + .await + .expect("fetch GitHub repository HTML"); + + assert!(html.to_ascii_lowercase().contains(" 1000, + "expected substantial HTML, got {} bytes", + html.len() + ); +} + +#[tokio::test] +async fn live_capture_repository_as_png() { + if !live_enabled() { + eprintln!( + "Skipping live GitHub repository screenshot test; set GITHUB_REPOSITORY_INTEGRATION=1 to enable." + ); + return; + } + if !chrome_available() { + eprintln!("Skipping live GitHub repository screenshot test because Chrome/Chromium is not installed"); + return; + } + + let screenshot = web_capture::capture_screenshot(&repository_url()) + .await + .expect("capture GitHub repository screenshot"); + + assert!( + screenshot.len() > 1000, + "expected a non-trivial PNG, got {} bytes", + screenshot.len() + ); + assert_eq!( + &screenshot[..8], + b"\x89PNG\r\n\x1a\n", + "expected a valid PNG signature" + ); +} diff --git a/rust/tests/integration/mod.rs b/rust/tests/integration/mod.rs index 392ca63..9ce105a 100644 --- a/rust/tests/integration/mod.rs +++ b/rust/tests/integration/mod.rs @@ -8,6 +8,7 @@ mod figures; mod gdocs; mod gdocs_image_parity; mod gdocs_public_doc; +mod github_repository; mod heading_numbering; mod html2md_br_in_list_item; mod html2md_ol_numbering; diff --git a/rust/tests/unit/github.rs b/rust/tests/unit/github.rs new file mode 100644 index 0000000..70e82ac --- /dev/null +++ b/rust/tests/unit/github.rs @@ -0,0 +1,105 @@ +use web_capture::github::{ + format_github_repository_markdown, format_github_repository_text, + github_repository_text_filename, is_github_repository_url, parse_github_repository_url, + GithubReadme, GithubRepositoryMetadata, GithubRepositorySnapshot, GithubTreeEntry, +}; + +#[test] +fn detects_plain_github_repository_pages() { + let parsed = parse_github_repository_url("https://github.com/octocat/Hello-World").unwrap(); + + assert_eq!(parsed.owner, "octocat"); + assert_eq!(parsed.repo, "Hello-World"); + assert_eq!(parsed.full_name, "octocat/Hello-World"); + assert_eq!(parsed.html_url, "https://github.com/octocat/Hello-World"); + assert!(is_github_repository_url( + "https://github.com/octocat/Hello-World" + )); + assert!(!is_github_repository_url( + "https://github.com/octocat/Hello-World/issues" + )); + assert!(!is_github_repository_url( + "https://example.com/octocat/Hello-World" + )); +} + +#[test] +fn derives_repository_text_filenames() { + assert_eq!( + github_repository_text_filename("https://github.com/octocat/Hello-World"), + Some("octocat-Hello-World.txt".to_string()) + ); +} + +#[test] +fn formats_compact_markdown_snapshot_with_metadata_files_and_readme() { + let snapshot = fixture_snapshot(); + + let markdown = format_github_repository_markdown(&snapshot); + + assert!(markdown.contains("# octocat/Hello-World")); + assert!(markdown.contains("> A friendly test repository")); + assert!(markdown.contains("- Default branch: `master`")); + assert!(markdown.contains("- [src/](")); + assert!(markdown.contains("- [README.md](")); + assert!(markdown.contains("## README.md")); + assert!(markdown.contains("# Hello World")); +} + +#[test] +fn formats_compact_text_snapshot_with_metadata_files_and_readme() { + let snapshot = fixture_snapshot(); + + let text = format_github_repository_text(&snapshot); + + assert!(text.contains("Repository: octocat/Hello-World")); + assert!(text.contains("Description: A friendly test repository")); + assert!(text.contains("Files:")); + assert!(text.contains("- src/")); + assert!(text.contains("- README.md")); + assert!(text.contains("README.md:")); + assert!(text.contains("This is the README.")); +} + +fn fixture_snapshot() -> GithubRepositorySnapshot { + GithubRepositorySnapshot { + source_url: "https://github.com/octocat/Hello-World".to_string(), + default_branch: "master".to_string(), + repository: GithubRepositoryMetadata { + full_name: "octocat/Hello-World".to_string(), + html_url: "https://github.com/octocat/Hello-World".to_string(), + description: Some("A friendly test repository".to_string()), + language: Some("JavaScript".to_string()), + stargazers_count: 42, + forks_count: 7, + open_issues_count: 3, + license_spdx_id: Some("MIT".to_string()), + topics: vec!["demo".to_string(), "capture".to_string()], + }, + tree: vec![ + GithubTreeEntry { + name: "src".to_string(), + path: "src".to_string(), + kind: "dir".to_string(), + size: None, + html_url: "https://github.com/octocat/Hello-World/tree/master/src".to_string(), + }, + GithubTreeEntry { + name: "README.md".to_string(), + path: "README.md".to_string(), + kind: "file".to_string(), + size: Some(37), + html_url: "https://github.com/octocat/Hello-World/blob/master/README.md" + .to_string(), + }, + ], + readme: Some(GithubReadme { + name: "README.md".to_string(), + path: "README.md".to_string(), + html_url: Some( + "https://github.com/octocat/Hello-World/blob/master/README.md".to_string(), + ), + content: Some("# Hello World\n\nThis is the README.".to_string()), + }), + } +} diff --git a/rust/tests/unit/mod.rs b/rust/tests/unit/mod.rs index 5ede78a..d82405a 100644 --- a/rust/tests/unit/mod.rs +++ b/rust/tests/unit/mod.rs @@ -1,4 +1,5 @@ mod extract_images; +mod github; mod html; mod latex; mod lib_api;