diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 3401174..a2937b2 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -77,6 +77,7 @@ web-capture/ │ ├── index.js # Express server entry point │ ├── browser.js # Browser abstraction layer (Puppeteer & Playwright) │ ├── html.js # HTML endpoint handler +│ ├── txt.js # Text endpoint handler │ ├── markdown.js # Markdown endpoint handler │ ├── image.js # Screenshot endpoint handler │ ├── stream.js # Stream endpoint handler (stub) @@ -133,6 +134,7 @@ web-capture/ **Endpoints**: - `GET /html?url=` - Raw HTML or Puppeteer-rendered HTML +- `GET /txt?url=` - Raw text fetch/download - `GET /markdown?url=` - Markdown conversion - `GET /image?url=` - PNG screenshot - `GET /stream?url=` - Stream handler (future) @@ -197,26 +199,42 @@ flowchart TD ```javascript const browser = await puppeteer.launch({ - args: ['--no-sandbox', '--disable-setuid-sandbox'], + args: ["--no-sandbox", "--disable-setuid-sandbox"], }); await page.setViewport({ width: 1280, height: 800 }); await page.goto(url, { - waitUntil: 'networkidle0', + waitUntil: "networkidle0", timeout: 30000, }); ``` --- -### 3. Markdown Handler (`src/markdown.js`) +### 3. Text Handler (`src/txt.js`) + +**Purpose**: Fetch text resources and return them as `.txt` downloads + +**Process**: + +1. Normalize paste-like URLs such as `https://xpaste.pro/p/` to their raw text endpoint +2. Fetch the text resource +3. Reject non-text responses +4. Return as `text/plain` with an attachment filename + +--- + +### 4. Markdown Handler (`src/markdown.js`) **Purpose**: Convert web pages to clean, readable Markdown **Process**: 1. Fetch HTML (via `fetchHtml`) -2. Convert to Markdown (via `convertHtmlToMarkdown`) -3. Return as `text/markdown` +2. Normalize xpaste.pro paste URLs so `/p/`, localized paste pages, and `/raw` URLs capture the visual paste page +3. Reorder page landmarks so headers, main content, and footers follow visual order before conversion +4. 
Convert to Markdown (via `convertHtmlToMarkdown`) +5. For xpaste.pro pastes, append raw paste text inline when the result stays under 1500 lines, or return a ZIP with `index.md`, page Markdown, and raw `.txt` when larger +6. Return as `text/markdown` or `application/zip` **Output**: Clean Markdown suitable for: @@ -227,7 +245,7 @@ await page.goto(url, { --- -### 4. Image Handler (`src/image.js`) +### 5. Image Handler (`src/image.js`) **Purpose**: Capture PNG screenshots of web pages @@ -261,8 +279,8 @@ sequenceDiagram **Response Headers**: ```javascript -res.set('Content-Type', 'image/png'); -res.set('Content-Disposition', 'inline; filename="screenshot.png"'); +res.set("Content-Type", "image/png"); +res.set("Content-Disposition", 'inline; filename="screenshot.png"'); ``` --- @@ -321,7 +339,7 @@ observer.observe(document.body, { childList: true, subtree: true }); ```javascript const charsetMatch = html.match(/]+charset=["']?([^"'>\s]+)/i); -const currentCharset = charsetMatch ? charsetMatch[1].toLowerCase() : 'utf-8'; +const currentCharset = charsetMatch ? 
charsetMatch[1].toLowerCase() : "utf-8"; ``` --- @@ -400,9 +418,9 @@ The abstraction layer automatically handles differences between engines: **Using Puppeteer (default)**: ```javascript -const browser = await createBrowser('puppeteer'); +const browser = await createBrowser("puppeteer"); const page = await browser.newPage(); -await page.goto('https://example.com'); +await page.goto("https://example.com"); const html = await page.content(); await browser.close(); ``` @@ -410,9 +428,9 @@ await browser.close(); **Using Playwright**: ```javascript -const browser = await createBrowser('playwright'); +const browser = await createBrowser("playwright"); const page = await browser.newPage(); -await page.goto('https://example.com'); +await page.goto("https://example.com"); const html = await page.content(); await browser.close(); ``` @@ -462,6 +480,33 @@ curl "http://localhost:3000/html?url=https://example.com&engine=playwright" --- +### GET /txt + +**Description**: Fetch and return text content as a `.txt` attachment + +**Parameters**: + +- `url` (required): Target URL + +**Response**: + +- `Content-Type: text/plain; charset=utf-8` +- `Content-Disposition: attachment; filename="...txt"` +- Body: Text content + +**Example**: + +```bash +curl "http://localhost:3000/txt?url=https://xpaste.pro/p/t4q0Lsp0" > paste.txt +``` + +**Behavior**: + +- xpaste.pro paste URLs normalize to `/raw`, including `/ru/p/` and `/en/p/` +- Non-text responses are rejected + +--- + ### GET /markdown **Description**: Convert web page to Markdown @@ -474,6 +519,7 @@ curl "http://localhost:3000/html?url=https://example.com&engine=playwright" - `Content-Type: text/markdown` - Body: Markdown content +- xpaste.pro paste pages include raw paste text inline under 1500 lines, or return `application/zip` with `index.md`, page Markdown, and raw `.txt` files when larger **Example**: @@ -556,12 +602,12 @@ ENTRYPOINT ["node", "bin/web-capture.js", "--serve"] ### Docker Compose ```yaml -version: '3.8' +version: 
"3.8" services: web-capture: build: . ports: - - '3000:3000' + - "3000:3000" restart: unless-stopped environment: - PORT=3000 @@ -586,15 +632,15 @@ docker run -p 3000:3000 web-capture ```javascript // examples/js/markdown_download.js -import fetch from 'node-fetch'; -import fs from 'fs'; +import fetch from "node-fetch"; +import fs from "fs"; -const url = 'https://example.com'; +const url = "https://example.com"; const response = await fetch( - `http://localhost:3000/markdown?url=${encodeURIComponent(url)}` + `http://localhost:3000/markdown?url=${encodeURIComponent(url)}`, ); const markdown = await response.text(); -fs.writeFileSync('downloaded.md', markdown); +fs.writeFileSync("downloaded.md", markdown); ``` ### Python @@ -654,14 +700,14 @@ yarn test:all ```javascript export default { - testEnvironment: 'node', + testEnvironment: "node", transform: { - '^.+\\.m?js$': 'babel-jest', + "^.+\\.m?js$": "babel-jest", }, moduleNameMapper: { - '^(\\.{1,2}/.*)\\.js$': '$1', + "^(\\.{1,2}/.*)\\.js$": "$1", }, - testMatch: ['**/tests/**/*.test.js'], + testMatch: ["**/tests/**/*.test.js"], }; ``` @@ -700,9 +746,9 @@ export default { ```javascript args: [ - '--no-sandbox', // Required for Docker - '--disable-setuid-sandbox', // Required for Docker - '--disable-dev-shm-usage', // Avoid shared memory issues + "--no-sandbox", // Required for Docker + "--disable-setuid-sandbox", // Required for Docker + "--disable-dev-shm-usage", // Avoid shared memory issues ]; ``` @@ -926,15 +972,15 @@ export async function myEndpointHandler(req, res) { 2. Register in `src/index.js`: ```javascript -import { myEndpointHandler } from './myEndpoint.js'; -app.get('/my-endpoint', myEndpointHandler); +import { myEndpointHandler } from "./myEndpoint.js"; +app.get("/my-endpoint", myEndpointHandler); ``` 3. 
Add tests in `tests/unit/`: ```javascript // tests/unit/myEndpoint.test.js -test('myEndpoint works', async () => { +test("myEndpoint works", async () => { // Test implementation }); ``` diff --git a/README.md b/README.md index 0d78e54..bb440d9 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ A CLI and microservice to fetch URLs and render them as: - **Markdown**: Clean HTML-to-Markdown conversion with image extraction - **HTML**: Rendered page content +- **Plain text**: Raw text downloads for paste-like URLs such as xpaste.pro - **PNG/JPEG screenshot**: Viewport or full-page capture - **ZIP archive**: Markdown/HTML + locally downloaded images - **PDF**: Print-quality document @@ -50,7 +51,8 @@ Both implementations expose the same API: | Endpoint | Description | | --------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | | `GET /html?url=` | Get rendered HTML content | -| `GET /markdown?url=` | Get Markdown conversion with the default converter | +| `GET /txt?url=` | Get raw text content, with xpaste.pro paste URLs normalized to `/raw` | +| `GET /markdown?url=` | Get Markdown conversion; xpaste.pro pastes include raw text inline under 1500 lines or as a ZIP when larger | | `GET /markdown?url=&converter=kreuzberg` | High-performance Markdown conversion via [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) | | `GET /markdown?url=&converter=kreuzberg&format=json` | Structured result with metadata, tables, images, and warnings | | `GET /image?url=` | Get PNG screenshot | @@ -74,6 +76,9 @@ web-capture https://example.com -o - # Capture as HTML web-capture https://example.com --format html +# Capture raw paste text +web-capture https://xpaste.pro/p/t4q0Lsp0 --format txt -o paste.txt + # Take a screenshot web-capture https://example.com --format png -o screenshot.png @@ -89,24 +94,24 @@ web-capture --serve --port 8080 ## CLI Options 
-| Option | Short | Description | Default | -| ------------------------ | ----- | --------------------------------------------------------------------------------------- | --------------------- | -| `--serve` | `-s` | Start as HTTP API server | - | -| `--port` | `-p` | Port to listen on | 3000 | -| `--format` | `-f` | Output format: `markdown`/`md`, `html`, `image`/`png`, `jpeg`, `pdf`, `docx`, `archive` | `markdown` | -| `--output` | `-o` | Output file path. Use `-o -` for stdout | auto-derived from URL | -| `--data-dir` | | Base directory for auto-derived output paths | `./data/web-capture` | -| `--engine` | `-e` | Browser engine (JS only): `puppeteer`, `playwright` | `puppeteer` | -| `--embed-images` | | Keep images inline as base64 data URIs (self-contained file) | `false` | -| `--no-extract-images` | | Alias for `--embed-images` | `false` | -| `--extract-images[=DIR]` | | Extract images to `DIR/images/` (or next to the output) and download remote images | - | -| `--keep-original-links` | | Keep remote image URLs as direct links (the default markdown behavior) | `false` | -| `--images-dir` | | Subdirectory name for extracted images | `images` | -| `--archive` | | Create archive: `zip` (default), `7z`, `tar.gz`, `tar` | - | -| `--extract-latex` | | Extract LaTeX formulas | `true` | -| `--extract-metadata` | | Extract article metadata | `true` | -| `--post-process` | | Apply post-processing | `true` | -| `--detect-code-language` | | Detect code block languages | `true` | +| Option | Short | Description | Default | +| ------------------------ | ----- | ----------------------------------------------------------------------------------------------------- | --------------------- | +| `--serve` | `-s` | Start as HTTP API server | - | +| `--port` | `-p` | Port to listen on | 3000 | +| `--format` | `-f` | Output format: `markdown`/`md`, `html`, `txt`/`text`, `image`/`png`, `jpeg`, `pdf`, `docx`, `archive` | `markdown` | +| `--output` | `-o` | Output file path. 
Use `-o -` for stdout | auto-derived from URL | +| `--data-dir` | | Base directory for auto-derived output paths | `./data/web-capture` | +| `--engine` | `-e` | Browser engine (JS only): `puppeteer`, `playwright` | `puppeteer` | +| `--embed-images` | | Keep images inline as base64 data URIs (self-contained file) | `false` | +| `--no-extract-images` | | Alias for `--embed-images` | `false` | +| `--extract-images[=DIR]` | | Extract images to `DIR/images/` (or next to the output) and download remote images | - | +| `--keep-original-links` | | Keep remote image URLs as direct links (the default markdown behavior) | `false` | +| `--images-dir` | | Subdirectory name for extracted images | `images` | +| `--archive` | | Create archive: `zip` (default), `7z`, `tar.gz`, `tar` | - | +| `--extract-latex` | | Extract LaTeX formulas | `true` | +| `--extract-metadata` | | Extract article metadata | `true` | +| `--post-process` | | Apply post-processing | `true` | +| `--detect-code-language` | | Detect code block languages | `true` | ## Image Handling @@ -143,22 +148,23 @@ All flags can be controlled via environment variables: Both implementations expose the same API: -| Endpoint | Description | -| --------------------------------------------------------- | --------------------------------------------------------- | -| `GET /html?url=` | Get rendered HTML content | -| `GET /markdown?url=` | Get Markdown (original links kept, base64 stripped) | -| `GET /markdown?url=&converter=kreuzberg` | Get Markdown with the high-performance converter | -| `GET /markdown?url=&converter=kreuzberg&format=json` | Get structured Markdown conversion data | -| `GET /markdown?url=&embedImages=true` | Get Markdown with base64 images inline | -| `GET /markdown?url=&keepOriginalLinks=false` | Get Markdown with all images stripped | -| `GET /image?url=` | Get PNG screenshot | -| `GET /archive?url=` | ZIP archive with markdown + images extracted to `images/` | -| `GET /archive?url=&keepOriginalLinks=true` | 
ZIP archive keeping original remote image URLs | -| `GET /archive?url=&embedImages=true` | ZIP archive with base64 images inline | -| `GET /pdf?url=` | PDF with embedded images | -| `GET /docx?url=` | DOCX with embedded images | -| `GET /fetch?url=` | Proxy fetch content | -| `GET /stream?url=` | Stream content | +| Endpoint | Description | +| --------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | +| `GET /html?url=` | Get rendered HTML content | +| `GET /txt?url=` | Get raw text content, including normalized xpaste.pro raw paste text | +| `GET /markdown?url=` | Get Markdown; xpaste.pro pastes include raw text inline under 1500 lines or as a ZIP when larger | +| `GET /markdown?url=&converter=kreuzberg` | Get Markdown with the high-performance converter | +| `GET /markdown?url=&converter=kreuzberg&format=json` | Get structured Markdown conversion data | +| `GET /markdown?url=&embedImages=true` | Get Markdown with base64 images inline | +| `GET /markdown?url=&keepOriginalLinks=false` | Get Markdown with all images stripped | +| `GET /image?url=` | Get PNG screenshot | +| `GET /archive?url=` | ZIP archive with markdown + images extracted to `images/` | +| `GET /archive?url=&keepOriginalLinks=true` | ZIP archive keeping original remote image URLs | +| `GET /archive?url=&embedImages=true` | ZIP archive with base64 images inline | +| `GET /pdf?url=` | PDF with embedded images | +| `GET /docx?url=` | DOCX with embedded images | +| `GET /fetch?url=` | Proxy fetch content | +| `GET /stream?url=` | Stream content | ## Docker @@ -206,7 +212,9 @@ web-capture/ │ ├── scripts/ # Shared build/release scripts │ ├── *.mjs # JavaScript-specific scripts +│ ├── xpaste/ # xpaste fixture capture/regeneration helpers │ └── rust-*.mjs # Rust-specific scripts +├── tests/xpaste/data/ # Shared xpaste HTML/text/markdown/screenshot fixtures │ ├── .github/workflows/ │ ├── js.yml # JavaScript 
CI/CD @@ -240,6 +248,7 @@ cargo fmt # Format code ## Features - **Markdown Conversion**: Clean HTML-to-Markdown with LaTeX extraction, metadata, and code language detection +- **Plain Text Capture**: `/txt` endpoint and `--format txt` output for text resources and xpaste.pro raw paste URLs - **Image Extraction**: Base64 data URI images extracted to files with content-hash filenames - **HTML Rendering**: Fetch and render HTML with JavaScript support via headless browsers - **High-Performance Conversion**: Optional [kreuzberg html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) backend with structured metadata, table, and image results diff --git a/js/.changeset/xpaste-text-capture.md b/js/.changeset/xpaste-text-capture.md new file mode 100644 index 0000000..766fb64 --- /dev/null +++ b/js/.changeset/xpaste-text-capture.md @@ -0,0 +1,5 @@ +--- +'@link-assistant/web-capture': patch +--- + +Add xpaste.pro raw text capture and large-paste markdown archive handling. diff --git a/js/README.md b/js/README.md index 7e1d1eb..e3ed22e 100644 --- a/js/README.md +++ b/js/README.md @@ -10,6 +10,7 @@ A CLI and microservice to fetch URLs and render them as: - **Markdown**: Converted from HTML with image extraction (default) - **HTML**: Rendered page content +- **Plain text**: Raw text downloads for paste-like URLs such as xpaste.pro - **PNG/JPEG screenshot**: Viewport or full-page capture with theme support - **ZIP archive**: Markdown + locally downloaded images - **PDF**: Print-quality document with embedded images @@ -48,6 +49,9 @@ web-capture https://example.com -o - # Capture as HTML web-capture https://example.com --format html -o page.html +# Capture raw paste text +web-capture https://xpaste.pro/p/t4q0Lsp0 --format txt -o paste.txt + # Take a PNG screenshot web-capture https://example.com --format png -o screenshot.png @@ -141,6 +145,14 @@ GET /markdown?url= Converts the HTML content of the specified URL to Markdown format. 
By default, original remote image URLs are preserved and base64 data URIs are stripped (clean single-file output). Use `keepOriginalLinks=false` to strip all images, or `embedImages=true` to keep base64 images inline. +For xpaste.pro paste URLs, `/markdown` fetches the visual paste page, keeps +visible header, language, metadata, paste body, and footer text in visual order, +and appends the raw paste text as a fenced block named `xpaste-pro-<id>.txt` +when the final Markdown stays under 1500 lines. Larger paste pages return a ZIP +containing `index.md`, `xpaste-pro-<id>.md`, and `xpaste-pro-<id>.txt`. +Canonical `/p/<id>`, `/p/<id>/raw`, `/ru/p/<id>`, and `/en/p/<id>` URLs are +normalized before capture. + | Parameter | Required | Description | Default | | ------------------- | -------- | ------------------------------------------------------------------------- | -------- | | `url` | Yes | URL to fetch | - | @@ -149,6 +161,20 @@ Converts the HTML content of the specified URL to Markdown format. By default, o | `embedImages` | No | Keep base64 images inline (`true`/`false`) | `false` | | `keepOriginalLinks` | No | Keep original remote URLs, strip base64 | `true` | +### Text Endpoint + +``` +GET /txt?url= +``` + +Returns raw text content as a `.txt` attachment. xpaste.pro paste URLs are +normalized to their `/raw` endpoint, including localized `/ru/p/<id>` and +`/en/p/<id>` URLs.
+ +| Parameter | Required | Description | Default | +| --------- | -------- | ------------ | ------- | +| `url` | Yes | URL to fetch | - | + ### Image Endpoint ``` @@ -334,6 +360,7 @@ web-capture [options] | ----------------- | ---------------------------------------- | | `markdown` / `md` | Markdown conversion (default) | | `html` | Rendered HTML | +| `txt` / `text` | Raw text download | | `image` / `png` | PNG screenshot (lossless) | | `jpeg` | JPEG screenshot (configurable quality) | | `pdf` | PDF with embedded images | diff --git a/js/bin/web-capture.js b/js/bin/web-capture.js index 06aec67..0d16497 100755 --- a/js/bin/web-capture.js +++ b/js/bin/web-capture.js @@ -78,7 +78,7 @@ const config = makeConfig({ alias: 'f', type: 'string', description: - 'Output format: markdown, md, html, image, png, jpeg, pdf, docx, archive', + 'Output format: markdown, md, html, txt, text, image, png, jpeg, pdf, docx, archive', default: 'markdown', }) .option('theme', { @@ -289,7 +289,7 @@ const config = makeConfig({ 'Capture private Google Doc with API token' ) .epilogue( - 'API Endpoints (in server mode):\n GET /html?url=&engine= Get rendered HTML\n GET /markdown?url= Get Markdown conversion\n GET /image?url=&format=png|jpeg&theme=light|dark Screenshot\n GET /archive?url=&localImages=true&documentFormat=markdown|html ZIP archive\n GET /pdf?url=&theme=light|dark PDF with embedded images\n GET /docx?url= DOCX with embedded images\n GET /fetch?url= Proxy fetch\n GET /stream?url= Streaming proxy' + 'API Endpoints (in server mode):\n GET /html?url=&engine= Get rendered HTML\n GET /markdown?url= Get Markdown conversion\n GET /txt?url= Get text content\n GET /image?url=&format=png|jpeg&theme=light|dark Screenshot\n GET /archive?url=&localImages=true&documentFormat=markdown|html ZIP archive\n GET /pdf?url=&theme=light|dark PDF with embedded images\n GET /docx?url= DOCX with embedded images\n GET /fetch?url= Proxy fetch\n GET /stream?url= Streaming proxy' ) .strict(), lenv: { @@ -322,6 
+322,7 @@ async function startServer(port) { console.log('Available endpoints:'); console.log(` GET /html?url= - Render page as HTML`); console.log(` GET /markdown?url= - Convert page to Markdown`); + console.log(` GET /txt?url= - Fetch text content`); console.log(` GET /image?url= - Screenshot (PNG/JPEG)`); console.log( ` GET /archive?url= - ZIP archive with markdown + images` @@ -598,8 +599,12 @@ async function captureUrl(url, options) { } // Import required modules - const { fetchHtml, convertToUtf8, convertRelativeUrls } = - await import('../src/lib.js'); + const { + fetchHtml, + convertToUtf8, + convertRelativeUrls, + normalizeUrlForTextContent, + } = await import('../src/lib.js'); const { createBrowser } = await import('../src/browser.js'); const { isGoogleDocsUrl, @@ -915,7 +920,28 @@ async function captureUrl(url, options) { } try { - if (normalizedFormat === 'jpeg') { + if (normalizedFormat === 'txt' || normalizedFormat === 'text') { + const response = await fetch(normalizeUrlForTextContent(absoluteUrl)); + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + const contentType = response.headers.get('content-type') || 'text/plain'; + if (!contentType.includes('text/')) { + throw new Error(`Expected text content, got ${contentType}`); + } + const text = await response.text(); + const output = + explicitOutput === '-' + ? 
null + : explicitOutput || deriveOutputPath(absoluteUrl, 'txt', dataDir); + if (output) { + fs.mkdirSync(path.dirname(output), { recursive: true }); + fs.writeFileSync(output, text, 'utf-8'); + console.error(`Text saved to: ${output}`); + } else { + process.stdout.write(text); + } + } else if (normalizedFormat === 'jpeg') { // JPEG screenshot const { createBrowser } = await import('../src/browser.js'); const { dismissPopups, scrollToLoadContent } = @@ -1160,6 +1186,8 @@ async function captureUrl(url, options) { normalizedFormat === 'screenshot' ) { // Image/screenshot format + const { dismissPopups, scrollToLoadContent } = + await import('../src/popups.js'); const browser = await createBrowser(engine); try { const page = await browser.newPage(); @@ -1177,8 +1205,12 @@ async function captureUrl(url, options) { }); // Wait for 5 seconds after page load await new Promise((resolve) => setTimeout(resolve, 5000)); + if (fullPage) { + await scrollToLoadContent(page); + } + await dismissPopups(page); - const buffer = await page.screenshot({ type: 'png' }); + const buffer = await page.screenshot({ type: 'png', fullPage }); if (explicitOutput) { fs.writeFileSync(explicitOutput, buffer); diff --git a/js/jest.config.mjs b/js/jest.config.mjs index d688d1b..c417bb0 100644 --- a/js/jest.config.mjs +++ b/js/jest.config.mjs @@ -13,6 +13,7 @@ export default { '**/tests/mock/**/*.test.js', '**/tests/e2e/**/*.test.js', '**/tests/integration/**/*.test.js', + '**/tests/xpaste/**/*.test.js', ], setupFiles: ['./tests/jest.setup.mjs'], }; diff --git a/js/src/index.js b/js/src/index.js index 260d61f..e79b5f3 100644 --- a/js/src/index.js +++ b/js/src/index.js @@ -5,6 +5,7 @@ import { markdownHandler } from './markdown.js'; import { imageHandler } from './image.js'; import { streamHandler } from './stream.js'; import { fetchHandler } from './fetch.js'; +import { txtHandler } from './txt.js'; import { archiveHandler } from './archive.js'; import { pdfHandler } from './pdf.js'; import { 
docxHandler } from './docx.js'; @@ -22,6 +23,7 @@ app.get('/markdown', markdownHandler); app.get('/image', imageHandler); app.get('/stream', streamHandler); app.get('/fetch', fetchHandler); +app.get('/txt', txtHandler); app.get('/archive', archiveHandler); app.get('/pdf', pdfHandler); app.get('/docx', docxHandler); diff --git a/js/src/lib.js b/js/src/lib.js index 226ed90..ed561d2 100644 --- a/js/src/lib.js +++ b/js/src/lib.js @@ -19,6 +19,95 @@ export async function fetchHtml(url) { return response.text(); } +export function isTextPasteUrl(url) { + const parsed = parseTextPasteUrl(url); + return Boolean(parsed); +} + +export function normalizeUrlForTextContent(url) { + const parsed = parseTextPasteUrl(url); + if (!parsed) { + return url; + } + return `https://${parsed.host}${parsed.pathPrefix}/p/${parsed.pasteId}/raw`; +} + +export function normalizeUrlForTextPage(url) { + const parsed = parseTextPasteUrl(url); + if (!parsed) { + return url; + } + return `https://${parsed.host}${parsed.pathPrefix}/p/${parsed.pasteId}`; +} + +export function getTextPasteId(url) { + return parseTextPasteUrl(url)?.pasteId || null; +} + +export function getTextPasteFilename(url) { + const pasteId = getTextPasteId(url); + return pasteId ? `xpaste-pro-${pasteId}.txt` : null; +} + +export function appendTextPasteMarkdownAttachment(markdown, url, rawText) { + const textFilename = getTextPasteFilename(url) || 'download.txt'; + const normalizedRawText = normalizeMarkdownAttachmentText(rawText); + const fence = markdownFenceFor(normalizedRawText); + const attachmentText = normalizedRawText.endsWith('\n') + ? 
normalizedRawText + : `${normalizedRawText}\n`; + + return [ + markdown.trimEnd(), + '', + `## ${textFilename}`, + '', + `${fence}text`, + `${attachmentText}${fence}`, + '', + ].join('\n'); +} + +function normalizeMarkdownAttachmentText(text) { + return String(text).replace(/\r\n/g, '\n').replace(/\r/g, '\n'); +} + +function markdownFenceFor(text) { + const runs = text.match(/`{3,}/g) || []; + const length = runs.reduce((max, run) => Math.max(max, run.length + 1), 3); + return '`'.repeat(length); +} + +function parseTextPasteUrl(url) { + try { + const parsed = new URL(url); + const host = parsed.hostname.toLowerCase(); + if (host !== 'xpaste.pro' && host !== 'www.xpaste.pro') { + return null; + } + + const parts = parsed.pathname.split('/').filter(Boolean); + const index = parts.indexOf('p'); + let pathPrefix = ''; + if (index === -1 || !parts[index + 1]) { + return null; + } + if (index === 1 && ['en', 'ru'].includes(parts[0])) { + pathPrefix = `/${parts[0]}`; + } else if (index !== 0) { + return null; + } + const tail = parts.slice(index + 2); + if (tail.length > 1 || (tail[0] && tail[0] !== 'raw')) { + return null; + } + + return { host: 'xpaste.pro', pathPrefix, pasteId: parts[index + 1] }; + } catch { + return null; + } +} + export function convertHtmlToMarkdown(html, baseUrl) { // Ensure all URLs are absolute before Markdown conversion if (baseUrl) { @@ -27,6 +116,8 @@ export function convertHtmlToMarkdown(html, baseUrl) { // Load HTML into Cheerio const $ = cheerio.load(html); + reorderVisualLayoutElements($); + // Remove