link-assistant · konard · Jun 5, 2026 · Oct 25, 2025 · Oct 25, 2025 · Oct 25, 2025
diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml
@@ -320,6 +320,12 @@ jobs:
         run: npm test -- --testPathPattern="wikipedia-download" --testTimeout=120000
         timeout-minutes: 10
 
+      - name: Run GitHub repository capture live integration tests (issue #5)
+        env:
+          GITHUB_REPOSITORY_INTEGRATION: 'true'
+        run: npm test -- --testPathPattern="github-readme" --testTimeout=120000
+        timeout-minutes: 10
+
       - name: Build Docker image for e2e tests
         run: docker compose build
         timeout-minutes: 10

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -182,6 +182,13 @@ jobs:
         run: cargo test --test integration wikipedia_download::live -- --nocapture
         timeout-minutes: 10
 
+      - name: Run GitHub repository capture live integration tests (issue #5)
+        working-directory: rust
+        env:
+          GITHUB_REPOSITORY_INTEGRATION: '1'
+        run: cargo test --test integration github_repository::live -- --nocapture
+        timeout-minutes: 10
+
   # Build check - only runs when Rust code changes
   build:
     name: Rust - Build

diff --git a/js/.changeset/github-repository-capture.md b/js/.changeset/github-repository-capture.md
@@ -0,0 +1,5 @@
+---
+'@link-assistant/web-capture': patch
+---
+
+Add compact GitHub repository capture for txt and markdown output, including repository metadata, the root file tree, and README content.
diff --git a/js/README.md b/js/README.md
@@ -52,6 +52,10 @@ web-capture https://example.com --format html -o page.html
 # Capture raw paste text
 web-capture https://xpaste.pro/p/t4q0Lsp0 --format txt -o paste.txt
 
+# Capture a GitHub repository as compact text or Markdown
+web-capture https://github.com/link-assistant/web-capture --format txt -o repository.txt
+web-capture https://github.com/link-assistant/web-capture --format markdown -o repository.md
+
 # Take a PNG screenshot
 web-capture https://example.com --format png -o screenshot.png
 
@@ -153,6 +157,11 @@ containing `index.md`, `xpaste-pro-<id>.md`, and `xpaste-pro-<id>.txt`.
 Canonical `/p/<id>`, `/p/<id>/raw`, `/ru/p/<id>`, and `/en/p/<id>` URLs are
 normalized before capture.
 
+For plain GitHub repository URLs such as `https://github.com/owner/repo`,
+`/markdown` returns a compact repository snapshot with repository metadata, the
+root file tree, and README content. GitHub subpages continue through the regular
+HTML-to-Markdown conversion path.
+
 | Parameter           | Required | Description                                                               | Default  |
 | ------------------- | -------- | ------------------------------------------------------------------------- | -------- |
 | `url`               | Yes      | URL to fetch                                                              | -        |
@@ -171,6 +180,9 @@ Returns raw text content as a `.txt` attachment. xpaste.pro paste URLs are
 normalized to their `/raw` endpoint, including localized `/ru/p/<id>` and
 `/en/p/<id>` URLs.
 
+For plain GitHub repository URLs, this endpoint returns a compact `.txt`
+snapshot with repository metadata, the root file tree, and README content.
+
 | Parameter | Required | Description  | Default |
 | --------- | -------- | ------------ | ------- |
 | `url`     | Yes      | URL to fetch | -       |
@@ -490,6 +502,9 @@ with environment variables:
 # Download the Wikipedia page (markdown + image) in every supported engine
 WIKIPEDIA_INTEGRATION=true npm test -- --testPathPattern="wikipedia-download"
 
+# Download a GitHub repository page as compact txt/markdown, original HTML, and screenshots
+GITHUB_REPOSITORY_INTEGRATION=true npm test -- --testPathPattern="github-readme"
+
 # Habr articles and public Google Docs live suites
 HABR_INTEGRATION=true npm test -- --testPathPattern="habr-article"
 GDOCS_INTEGRATION=true npm test -- --testPathPattern="gdocs-public-doc"

diff --git a/js/bin/web-capture.js b/js/bin/web-capture.js
@@ -29,7 +29,7 @@

 // Create configuration using lino-arguments pattern
 const config = makeConfig({
  yargs: ({ yargs, getenv }) =>
    yargs
      .usage(
        'web-capture - Capture web pages as HTML, Markdown, or PNG\n\nUsage:\n  web-capture --serve [--port <port>]       Start as API server\n  web-capture <url> [options]               Capture a URL to file/stdout'
@@ -560,7 +560,7 @@
  }
 }

 async function captureUrl(url, options) {
  const {
    format,
    output: explicitOutput,
@@ -614,6 +614,12 @@
     captureGoogleDocWithBrowserOrFallback,
     selectGoogleDocsCaptureMethod,
   } = await import('../src/gdocs.js');
+  const {
+    fetchGithubRepositorySnapshot,
+    formatGithubRepositoryMarkdown,
+    formatGithubRepositoryText,
+    isGithubRepositoryUrl,
+  } = await import('../src/github.js');
 
   const normalizedFormat = format.toLowerCase();
   log.debug(() => ({
@@ -834,7 +840,7 @@
            archive.append(htmlContent, {
              name: 'document.html',
            });
            for (const img of archiveResult.images) {
              archive.append(img.data, { name: `images/${img.filename}` });
            }
            await archive.finalize();
@@ -857,7 +863,7 @@
            archive.append(htmlContent2, {
              name: 'document.html',
            });
            for (const img of archiveResult.images) {
              archive.append(img.data, { name: `images/${img.filename}` });
            }
            await archive.finalize();
@@ -921,15 +927,22 @@
 
   try {
     if (normalizedFormat === 'txt' || normalizedFormat === 'text') {
-      const response = await fetch(normalizeUrlForTextContent(absoluteUrl));
-      if (!response.ok) {
-        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
-      }
-      const contentType = response.headers.get('content-type') || 'text/plain';
-      if (!contentType.includes('text/')) {
-        throw new Error(`Expected text content, got ${contentType}`);
+      let text;
+      if (isGithubRepositoryUrl(absoluteUrl)) {
+        const snapshot = await fetchGithubRepositorySnapshot(absoluteUrl);
+        text = formatGithubRepositoryText(snapshot);
+      } else {
+        const response = await fetch(normalizeUrlForTextContent(absoluteUrl));
+        if (!response.ok) {
+          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+        }
+        const contentType =
+          response.headers.get('content-type') || 'text/plain';
+        if (!contentType.includes('text/')) {
+          throw new Error(`Expected text content, got ${contentType}`);
+        }
+        text = await response.text();
       }
-      const text = await response.text();
       const output =
         explicitOutput === '-'
           ? null
@@ -1151,17 +1164,27 @@
       console.error(`Archive saved to: ${outPath}`);
     } else if (normalizedFormat === 'markdown' || normalizedFormat === 'md') {
       // Markdown format — enhanced conversion is now the default
-      const html = await fetchHtml(absoluteUrl);
-      const { convertHtmlToMarkdownEnhanced } = await import('../src/lib.js');
-      const result = convertHtmlToMarkdownEnhanced(html, absoluteUrl, {
-        extractLatex: options.extractLatex,
-        extractMetadata: options.extractMetadata,
-        postProcess: options.postProcess,
-        detectCodeLanguage: options.detectCodeLanguage,
-        contentSelector: options.contentSelector,
-        bodySelector: options.bodySelector,
-      });
-      const markdown = result.markdown;
+      let markdown;
+      if (
+        isGithubRepositoryUrl(absoluteUrl) &&
+        !options.contentSelector &&
+        !options.bodySelector
+      ) {
+        const snapshot = await fetchGithubRepositorySnapshot(absoluteUrl);
+        markdown = formatGithubRepositoryMarkdown(snapshot);
+      } else {
+        const html = await fetchHtml(absoluteUrl);
+        const { convertHtmlToMarkdownEnhanced } = await import('../src/lib.js');
+        const result = convertHtmlToMarkdownEnhanced(html, absoluteUrl, {
+          extractLatex: options.extractLatex,
+          extractMetadata: options.extractMetadata,
+          postProcess: options.postProcess,
+          detectCodeLanguage: options.detectCodeLanguage,
+          contentSelector: options.contentSelector,
+          bodySelector: options.bodySelector,
+        });
+        markdown = result.markdown;
+      }
 
       const output =
         explicitOutput === '-'