From f364383a85a292e0d2599e459f5834090ca48be3 Mon Sep 17 00:00:00 2001 From: eric861129 Date: Wed, 11 Feb 2026 13:32:57 +0800 Subject: [PATCH] feat: enhance document generation with automatic figure numbering and text cleaning for publishing --- docs/AI_GENERATION_GUIDE.md | 166 +++--------------- services/docx/builders/common.ts | 6 + services/docx/builders/image.ts | 67 ++++--- services/docx/types.ts | 4 + services/docxGenerator.ts | 6 + services/parser/ast.ts | 115 ++++++++---- .../__snapshots__/docxGenerator.test.ts.snap | 36 +++- tests/markdownParser.test.ts | 10 +- utils/styleParser.ts | 8 +- utils/textProcessor.ts | 60 +++++++ 10 files changed, 270 insertions(+), 208 deletions(-) create mode 100644 utils/textProcessor.ts diff --git a/docs/AI_GENERATION_GUIDE.md b/docs/AI_GENERATION_GUIDE.md index b3bf435..f4cf85e 100644 --- a/docs/AI_GENERATION_GUIDE.md +++ b/docs/AI_GENERATION_GUIDE.md @@ -1,6 +1,6 @@ # AI Generation & Conversion Guide (AI 輔助生成與轉換指南) -本文件旨在提供給大型語言模型 (LLM) 閱讀,以便精確地將現有的內容轉換為 **MD2DOC-Evolution** 專屬格式。 +本文件旨在提供給大型語言模型 (LLM) 閱讀,以便精確地將現有的內容轉換為 **MD2DOC-Evolution** 專屬格式,並符合專業出版社的寫作規範。 --- @@ -23,14 +23,26 @@ - **禁止使用 H4, H5, H6**:本專案僅支援 `#`, `##`, `###`。若原始稿件有更深層級,請將其轉換為 `**粗體項目**`。 - **目錄標籤**:在 Frontmatter 結束後的下一行,必須插入 `[TOC]`。 -### 3. 程式碼區塊 (Code Blocks) +### 3. 出版級文字規範 (Publishing Standards) +- **中英文空格**:中文與英文、數字之間 **不需要空格**。 (例:`使用 VS Code` 而非 `使用 VS Code`) +- **標點符號**:中文句子中夾雜英文時,必須使用 **中文標點符號**。 (例:`開設 FB、IG 帳號。` 而非 `開設 FB, IG 帳號。`) +- **UI 強調**:介紹軟體介面操作時,使用 `「」` 符號加以強調。 (例:完成後按 「Test Connection」) + +### 4. 圖片與圖號 (Images & Figures) +- **自動編號**:系統會自動根據出現順序編號為「圖 X」。 +- **圖名語法**:使用 `![圖名](url)`。 +- **全頁圖片**:若圖片需放整頁 (13x18cm),請在 Alt 文字中加入 `full-page` 標記。 (例:`![這是全頁圖 full-page](url)`) +- **截圖規範**:截圖請務必使用 **淺色亮底** 主題,並在關鍵步驟加上醒目框線。 +- **寬度限制**:系統會自動將圖片限制在 **13 cm** 寬度內。 + +### 5. 程式碼區塊 (Code Blocks) - **語法**:```語言[:ln|:no-ln] - **細節**: - 預設會顯示行號。 - 若為短小的設定檔,請強制標註 `:no-ln`。 - **範例**:```json:no-ln -### 4. 提示區塊 (Callouts) +### 6. 提示區塊 (Callouts) - **格式**:必須使用 `> [!標記]`。 - **類型限制**:僅支援 `TIP`, `NOTE`, `WARNING`。 - **轉換邏輯**: @@ -38,18 +50,16 @@ - 「技巧」、「建議」 -> `> [!TIP]` - 「警告」、「重要」 -> `> [!WARNING]` -### 5. 角色對話 (Chat Dialogues) - **極重要** -這是 AI 最容易出錯的地方,請嚴格執行: -- **左側 (AI/講者)**:`角色名稱 ":: 對話內容` (注意引號位置) -- **右側 (User/讀者)**:`對話內容 ::" 角色` -- **範例**: - - `GPT ":: 您好!有什麼我能幫您的?` - - `請幫我寫一段程式碼 ::" 使用者` +### 7. 角色對話 (Chat Dialogues) +- **左側 (AI/他人)**:`角色名稱 "::` (引號在冒號前) +- **右側 (User/作者)**:`角色名稱 ::"` (引號在冒號後) +- **置中 (System/旁白)**:`角色名稱 :":` (引號在中間) -### 6. 行內樣式轉換表 +### 8. 行內樣式轉換表 | 原始內容 | 轉換後格式 | 說明 | | :--- | :--- | :--- | -| 「點擊設定」 | `【設定】` | 所有 UI 按鈕、選單項目 | +| 「點擊設定」 | `「設定」` | UI 元素強調 (建議優先使用) | +| 【點擊設定】 | `【設定】` | UI 按鈕、選單項目 (帶底色) | | Ctrl+C | `[Ctrl]`+`[C]` | 所有實體按鍵 | | 《深入淺出》 | `『深入淺出』` | 所有書名、軟體專案名 | | [連結](url) | `[連結](url)` | 保持原樣,系統會自動轉 QR Code | @@ -59,132 +69,6 @@ ## 負面約束 (Negative Constraints) - **不要** 使用 HTML 標籤(如 ``, `
`)。 - **不要** 在 Callout 內嵌套另一個 Callout。 -- **不要** 自行發明 Callout 標籤(如 `[!DANGER]` 是不支援的)。 -- **不要** 改變 Mermaid 的標準語法。 - ---- -# AI Generation Guide for MD2DOC-Evolution - -本指南定義了將 Markdown 轉換為 MD2DOC-Evolution 格式的標準規範。AI 模型應嚴格遵守以下語法規則。 - -## 1. Frontmatter (YAML) -文件必須以標準 YAML 格式開頭。 - -### ✅ 正確範例 -```markdown ---- -title: 第2章:工具箱——打造你的數位軍火庫 -author: ChiYu ---- - -[TOC] - -# 第2章:工具箱——打造你的數位軍火庫 - -``` - -### ❌ 錯誤範例 (禁止) - -* ❌ 缺少結尾的 `---` -* ❌ 在 YAML 中使用 `##` 標題 -* ❌ 將 `[TOC]` 放進 YAML 區塊中 -* 如果有需要目錄才添加[TOC] - ---- - -## 2. 列表 (Lists) - -子列表必須使用 **2個空白 (Spaces)** 進行縮排,以確保層級正確。 - -### ✅ 正確範例 - -```markdown -* **『Python (Microsoft)』** - * 這是什麼:微軟官方出品的 Python 語言支援包。 - * 為什麼必裝:裝上它,你的『VS Code』才會真正「看懂」Python。 -* **『Prettier』** - * 這是什麼:你的程式碼專屬造型師。 - -``` - -### ❌ 錯誤範例 (扁平化) - -```markdown -* **『Python (Microsoft)』** -* 這是什麼:微軟官方出品的 Python 語言支援包。 (❌ 錯誤:沒有縮排,被視為同一層級) - -``` - ---- - -## 3. 對話 (Chat) - -使用特定的前綴語法來表示對話氣泡。 - -### 語法 - -`角色名稱 ::"` 換行後接續對話內容。 - -### ✅ 正確範例 - -```markdown -讀者 ::" -ChiYu,既然我們已經決定用 Python 來開發 RPG 遊戲的後端了,為什麼現在又要裝一個叫 Node.js 的東西? - -ChiYu ::" -這是一個很棒的問題!我們要區分清楚「產品」與「工具」的差別。 - -``` - -### ❌ 錯誤範例 - -* ❌ `ChiYu "::` (錯誤的引號位置) -* ❌ `::" ChiYu` (不需要結尾標籤) - ---- - -## 4. 行內樣式 (Inline Styles) - -請根據語意選擇正確的括號樣式。 - -| 類型 | 語法 | 範例 | -| --- | --- | --- | -| **UI 元素 / 按鈕** | `【】` | 請點擊【確定】按鈕、查看【檔案總管】。 | -| **快捷鍵** | `[]` | 按下 [Ctrl] + [C] 複製。 | -| **書名 / 專案名 / 強調物件** | `『』` | 本書使用『VS Code』進行開發。 | -| **一般強調** | `**` | 這是 **非常重要** 的觀念。 | - ---- - -## 5. 提示區塊 (Callouts) - -將筆記或警告轉換為 GitHub 風味的 Blockquotes。 - -### ✅ 正確範例 - -```markdown -> [!NOTE] -> 这是一个補充說明。 - -> [!WARNING] -> 【Windows 使用者請注意】:請務必勾選 Add to PATH。 - -``` - ---- - -## 6. 標題與結構 - -* 僅使用 H1 (#), H2 (##), H3 (###)。 -* H4 以下請轉換為粗體文字或列表項目。 -* 確保程式碼區塊包含語言標籤 (例如 ````python`)。 - -``` - -### 修改重點說明: - -1. **YAML 修正**:在 Prompt 的 `Rule #1` 中,我特別強調了「頭尾都要有 `---`」以及「嚴禁使用 `#` 符號」,這能直接解決 AI 生成 `## title:` 這種錯誤格式的問題。 -2. **列表縮排**:在 Prompt 的 `Rule #4` 和 Guide 的 `Section 2` 中,我明確要求了「2個空白縮排」,並給出了「扁平化錯誤」的負面範例,這對 AI 理解結構非常有幫助。 -3. **Chat 語法簡化**:根據您的指示,將語法統一為 `角色 ::"`。我在 Prompt 中加入了一條「禁止使用結尾標籤」的規則,以防止 AI 因為過度熱心而自行閉合標籤。 - -``` +- **不要** 自行發明 Callout 標籤。 +- **不要** 忽略關鍵步驟,避免用「這個大家應該都知道」為前提。 +- **不要** 忘記專有名詞第一次出現時要簡短解釋。 diff --git a/services/docx/builders/common.ts b/services/docx/builders/common.ts index 97e1ef5..5474650 100644 --- a/services/docx/builders/common.ts +++ b/services/docx/builders/common.ts @@ -101,6 +101,12 @@ export const parseInlineStyles = async (text: string, config?: DocxConfig): Prom shading: { fill: COLORS.BG_BUTTON, type: ShadingType.CLEAR, color: "auto" } })); break; + case InlineStyleType.UI_EMPHASIS: + runs.push(new TextRun({ + ...baseConfig, + bold: true + })); + break; case InlineStyleType.SHORTCUT: runs.push(new TextRun({ ...baseConfig, diff --git a/services/docx/builders/image.ts b/services/docx/builders/image.ts index 3b80023..2b982a3 100644 --- a/services/docx/builders/image.ts +++ b/services/docx/builders/image.ts @@ -4,7 +4,7 @@ * Licensed under the MIT License. */ -import { Paragraph, ImageRun, AlignmentType } from "docx"; +import { Paragraph, ImageRun, AlignmentType, TextRun } from "docx"; import { DocxConfig } from "../types"; import { WORD_THEME } from "../../../constants/theme"; @@ -60,13 +60,39 @@ export const createImageBlock = async (src: string, alt: string, config: DocxCon const dims = await getImageDimensions(realSrc); if (dims.width === 0) return []; - // Force image to fill the maximum page width - const maxWidthPx = (config.widthCm - 4) * 37.8; // Use slightly larger width (less margin) + // Increment Figure Counter + if (config.counters) { + config.counters.figure++; + } + const figNum = config.counters ? config.counters.figure : 0; + + // Publisher Requirement 09 & 10 + // Max width is 13cm. Full page is 13x18cm. + const isFullPage = alt.includes('full-page'); + const cleanAlt = alt.replace('full-page', '').trim(); - // Calculate target height to maintain aspect ratio - const ratio = maxWidthPx / dims.width; - const targetWidth = maxWidthPx; - const targetHeight = dims.height * ratio; + const MAX_WIDTH_CM = 13; + const MAX_HEIGHT_CM = isFullPage ? 18 : 20; // Limit height too to avoid breaking layout + + const maxWidthPx = MAX_WIDTH_CM * 37.8; // 1cm approx 37.8px (96dpi) + const maxHeightPx = MAX_HEIGHT_CM * 37.8; + + let targetWidth = dims.width; + let targetHeight = dims.height; + + // Scale to fit max width + if (targetWidth > maxWidthPx) { + const ratio = maxWidthPx / targetWidth; + targetWidth = maxWidthPx; + targetHeight = targetHeight * ratio; + } + + // Scale to fit max height if still too large + if (targetHeight > maxHeightPx) { + const ratio = maxHeightPx / targetHeight; + targetHeight = maxHeightPx; + targetWidth = targetWidth * ratio; + } // 4. Create Paragraphs const imagePara = new Paragraph({ @@ -86,21 +112,18 @@ export const createImageBlock = async (src: string, alt: string, config: DocxCon const result = [imagePara]; - // 4. Add Caption if Alt text exists - if (alt) { - result.push(new Paragraph({ - alignment: AlignmentType.CENTER, - children: [ - new TextRun({ - text: ` ▲ ${alt}`, - italics: true, - size: 18, // 9pt - color: "666666" - }) - ], - spacing: { before: 0, after: 200 }, - })); - } + // 4. Add Caption with Figure Number (Requirement 05) + result.push(new Paragraph({ + alignment: AlignmentType.CENTER, + children: [ + new TextRun({ + text: `圖 ${figNum} ${cleanAlt}`, + bold: true, + size: 20, // 10pt + }) + ], + spacing: { before: 0, after: 200 }, + })); return result; } catch (e) { diff --git a/services/docx/types.ts b/services/docx/types.ts index 6960e64..da33f11 100644 --- a/services/docx/types.ts +++ b/services/docx/types.ts @@ -13,4 +13,8 @@ export interface DocxConfig { showLineNumbers?: boolean; meta?: DocumentMeta; imageRegistry?: Record; + counters?: { + figure: number; + qr: number; + }; } diff --git a/services/docxGenerator.ts b/services/docxGenerator.ts index bd9d24b..d188aca 100644 --- a/services/docxGenerator.ts +++ b/services/docxGenerator.ts @@ -28,6 +28,12 @@ export const generateDocx = async ( config: DocxConfig = { widthCm: 17, heightCm: 23 } ): Promise => { + // Initialize counters for automatic numbering (Figures, QRs) + config.counters = { + figure: 0, + qr: 0 + }; + const docChildren: (Paragraph | Table)[] = []; for (const block of blocks) { diff --git a/services/parser/ast.ts b/services/parser/ast.ts index 9e96e4d..9dc3d3e 100644 --- a/services/parser/ast.ts +++ b/services/parser/ast.ts @@ -6,6 +6,7 @@ import { marked } from 'marked'; import { BlockType, ParsedBlock } from '../types'; +import { cleanTextForPublishing } from '../../utils/textProcessor'; // Configure marked options if needed marked.use({ @@ -27,7 +28,7 @@ export const parseMarkdownWithAST = (markdown: string, lineOffset: number = 0, c ...block, sourceLine: blockStartLine, startIndex: blockStartIndex, - endIndex: blockStartIndex + token.raw.length + endIndex: blockStartIndex + (token.raw?.length || 0) }); }; @@ -39,39 +40,62 @@ export const parseMarkdownWithAST = (markdown: string, lineOffset: number = 0, c BlockType.HEADING_3; addBlock({ type: headingType, - content: token.text + content: cleanTextForPublishing(token.text) }); break; case 'paragraph': const text = token.text; - // 1. TOC - if (text.trim() === '[TOC]' || text.trim() === '[toc]') { - addBlock({ type: BlockType.TOC, content: '' }); + // 1. TOC (Can be [TOC] followed by manual list in the same paragraph) + if (text.trim().startsWith('[TOC]') || text.trim().startsWith('[toc]')) { + addBlock({ + type: BlockType.TOC, + content: cleanTextForPublishing(text.replace(/\[TOC\]|\[toc\]/i, '').trim()) + }); break; } - // 2. Chat Dialogues - const centerMatch = text.match(/^(.+?)\s*:\":\s*(.*)$/); - if (centerMatch) { - addBlock({ type: BlockType.CHAT_CUSTOM, role: centerMatch[1].trim(), content: centerMatch[2].trim(), alignment: 'center' }); - break; - } - const rightMatch = text.match(/^(.+?)\s*::\"\s*(.*)$/); - if (rightMatch) { - addBlock({ type: BlockType.CHAT_CUSTOM, role: rightMatch[1].trim(), content: rightMatch[2].trim(), alignment: 'right' }); + // 2. Chat Dialogues (Handle multi-line if they are grouped by marked) + const lines = text.split('\n'); + let allChat = true; + const chatBlocks: ParsedBlock[] = []; + + for (const line of lines) { + const centerMatch = line.match(/^(.+?)\s*:\":\s*(.*)$/); + if (centerMatch) { + chatBlocks.push({ type: BlockType.CHAT_CUSTOM, role: centerMatch[1].trim(), content: cleanTextForPublishing(centerMatch[2].trim()), alignment: 'center' }); + continue; + } + const rightMatch = line.match(/^(.+?)\s*::\"\s*(.*)$/); + if (rightMatch) { + chatBlocks.push({ type: BlockType.CHAT_CUSTOM, role: rightMatch[1].trim(), content: cleanTextForPublishing(rightMatch[2].trim()), alignment: 'right' }); + continue; + } + const leftMatch = line.match(/^(.+?)\s*\"(?:::)\s*(.*)$/); + if (leftMatch) { + chatBlocks.push({ type: BlockType.CHAT_CUSTOM, role: leftMatch[1].trim(), content: cleanTextForPublishing(leftMatch[2].trim()), alignment: 'left' }); + continue; + } + allChat = false; break; } - const leftMatch = text.match(/^(.+?)\s*\"(?:::)\s*(.*)$/); - if (leftMatch) { - addBlock({ type: BlockType.CHAT_CUSTOM, role: leftMatch[1].trim(), content: leftMatch[2].trim(), alignment: 'left' }); + + if (allChat && chatBlocks.length > 0) { + chatBlocks.forEach((cb, idx) => { + blocks.push({ + ...cb, + sourceLine: blockStartLine + idx, + startIndex: blockStartIndex, + endIndex: blockStartIndex + token.raw.length + }); + }); break; } addBlock({ type: BlockType.PARAGRAPH, - content: token.text + content: cleanTextForPublishing(token.text) }); break; @@ -117,30 +141,33 @@ export const parseMarkdownWithAST = (markdown: string, lineOffset: number = 0, c const firstLine = firstToken.text.trim(); if (firstLine.startsWith('[!TIP]')) { calloutType = BlockType.CALLOUT_TIP; - content = rawBlockquote.replace(/^\\[!TIP\\]\s*/m, '').trim(); - // Simplified stripping for now, matching previous logic roughly - // Re-implementing specific stripping if needed const lines = rawBlockquote.split('\n'); - if (lines[0].includes('[!TIP]')) lines[0] = lines[0].replace('[!TIP]', '').trim(); - content = lines.join('\n'); + if (lines[0].includes('[!TIP]')) { + lines.shift(); + } + content = lines.join('\n').trim(); } else if (firstLine.startsWith('[!WARNING]')) { calloutType = BlockType.CALLOUT_WARNING; const lines = rawBlockquote.split('\n'); - if (lines[0].includes('[!WARNING]')) lines[0] = lines[0].replace('[!WARNING]', '').trim(); - content = lines.join('\n'); + if (lines[0].includes('[!WARNING]')) { + lines.shift(); + } + content = lines.join('\n').trim(); } else if (firstLine.startsWith('[!NOTE]')) { calloutType = BlockType.CALLOUT_NOTE; const lines = rawBlockquote.split('\n'); - if (lines[0].includes('[!NOTE]')) lines[0] = lines[0].replace('[!NOTE]', '').trim(); - content = lines.join('\n'); + if (lines[0].includes('[!NOTE]')) { + lines.shift(); + } + content = lines.join('\n').trim(); } } addBlock({ type: calloutType, - content: content + content: cleanTextForPublishing(content) }); break; @@ -172,7 +199,7 @@ export const parseMarkdownWithAST = (markdown: string, lineOffset: number = 0, c addBlock({ type: ordered ? BlockType.NUMBERED_LIST : BlockType.BULLET_LIST, - content: cleanText, + content: cleanTextForPublishing(cleanText), nestingLevel: level }); @@ -238,5 +265,33 @@ export const parseMarkdownWithAST = (markdown: string, lineOffset: number = 0, c processToken(token, blockStartLine, blockStartIndex); }); - return blocks; + // Post-processing: Merge adjacent TOC and List blocks if they are intended to be a manual TOC + const mergedBlocks: ParsedBlock[] = []; + for (let i = 0; i < blocks.length; i++) { + const current = blocks[i]; + if (current.type === BlockType.TOC && i + 1 < blocks.length) { + let nextIndex = i + 1; + let manualContent = current.content || ''; + + while (nextIndex < blocks.length && + (blocks[nextIndex].type === BlockType.BULLET_LIST || + blocks[nextIndex].type === BlockType.NUMBERED_LIST)) { + + const listBlock = blocks[nextIndex]; + // Reconstruct manual TOC line + const prefix = listBlock.type === BlockType.BULLET_LIST ? '- ' : '1. '; + manualContent += (manualContent ? '\n' : '') + prefix + listBlock.content; + nextIndex++; + } + + if (nextIndex > i + 1) { + mergedBlocks.push({ ...current, content: manualContent }); + i = nextIndex - 1; + continue; + } + } + mergedBlocks.push(current); + } + + return mergedBlocks; }; diff --git a/tests/__snapshots__/docxGenerator.test.ts.snap b/tests/__snapshots__/docxGenerator.test.ts.snap index 3880f87..a701044 100644 --- a/tests/__snapshots__/docxGenerator.test.ts.snap +++ b/tests/__snapshots__/docxGenerator.test.ts.snap @@ -587,7 +587,7 @@ exports[`docxGenerator > should generate correct document structure for given bl "pos": "w:pos" }, "root": { - "val": 1 + "val": "{default-bullet-0}" } } ] @@ -615,7 +615,12 @@ exports[`docxGenerator > should generate correct document structure for given bl ] } ], - "numberingReferences": [] + "numberingReferences": [ + { + "reference": "default-bullet", + "instance": 0 + } + ] }, { "rootKey": "w:r", @@ -790,7 +795,7 @@ exports[`docxGenerator > should generate correct document structure for given bl "pos": "w:pos" }, "root": { - "val": 1 + "val": "{default-bullet-0}" } } ] @@ -818,7 +823,12 @@ exports[`docxGenerator > should generate correct document structure for given bl ] } ], - "numberingReferences": [] + "numberingReferences": [ + { + "reference": "default-bullet", + "instance": 0 + } + ] } } ], @@ -1446,7 +1456,7 @@ exports[`docxGenerator > should generate correct document structure for given bl "pos": "w:pos" }, "root": { - "val": 1 + "val": "{default-bullet-0}" } } ] @@ -1474,7 +1484,12 @@ exports[`docxGenerator > should generate correct document structure for given bl ] } ], - "numberingReferences": [] + "numberingReferences": [ + { + "reference": "default-bullet", + "instance": 0 + } + ] }, { "rootKey": "w:r", @@ -1649,7 +1664,7 @@ exports[`docxGenerator > should generate correct document structure for given bl "pos": "w:pos" }, "root": { - "val": 1 + "val": "{default-bullet-0}" } } ] @@ -1677,7 +1692,12 @@ exports[`docxGenerator > should generate correct document structure for given bl ] } ], - "numberingReferences": [] + "numberingReferences": [ + { + "reference": "default-bullet", + "instance": 0 + } + ] } } ], diff --git a/tests/markdownParser.test.ts b/tests/markdownParser.test.ts index 5f6146a..9dab660 100644 --- a/tests/markdownParser.test.ts +++ b/tests/markdownParser.test.ts @@ -26,9 +26,9 @@ describe('markdownParser', () => { ].join('\n'); const { blocks } = parseMarkdown(input); expect(blocks).toHaveLength(3); - expect(blocks[0]).toEqual({ type: BlockType.HEADING_1, content: 'Heading 1' }); - expect(blocks[1]).toEqual({ type: BlockType.HEADING_2, content: 'Heading 2' }); - expect(blocks[2]).toEqual({ type: BlockType.HEADING_3, content: 'Heading 3' }); + expect(blocks[0]).toMatchObject({ type: BlockType.HEADING_1, content: 'Heading 1' }); + expect(blocks[1]).toMatchObject({ type: BlockType.HEADING_2, content: 'Heading 2' }); + expect(blocks[2]).toMatchObject({ type: BlockType.HEADING_3, content: 'Heading 3' }); }); it('should parse paragraphs correctly', () => { @@ -39,8 +39,8 @@ describe('markdownParser', () => { ].join('\n'); const { blocks } = parseMarkdown(input); expect(blocks).toHaveLength(2); - expect(blocks[0]).toEqual({ type: BlockType.PARAGRAPH, content: 'Paragraph 1' }); - expect(blocks[1]).toEqual({ type: BlockType.PARAGRAPH, content: 'Paragraph 2' }); + expect(blocks[0]).toMatchObject({ type: BlockType.PARAGRAPH, content: 'Paragraph 1' }); + expect(blocks[1]).toMatchObject({ type: BlockType.PARAGRAPH, content: 'Paragraph 2' }); }); it('should parse code blocks correctly', () => { diff --git a/utils/styleParser.ts b/utils/styleParser.ts index 2c4ccbc..68de995 100644 --- a/utils/styleParser.ts +++ b/utils/styleParser.ts @@ -11,6 +11,7 @@ export enum InlineStyleType { UNDERLINE = 'UNDERLINE', CODE = 'CODE', UI_BUTTON = 'UI_BUTTON', + UI_EMPHASIS = 'UI_EMPHASIS', LINK = 'LINK', IMAGE = 'IMAGE', SHORTCUT = 'SHORTCUT', @@ -26,10 +27,10 @@ export interface InlineStyleSegment { } export const parseInlineElements = (text: string): InlineStyleSegment[] => { - // Regex 順序:圖片 > 連結 > 粗體 > 斜體 > 底線 > 程式碼 > UI按鈕 > 快捷鍵 > 書名號 + // Regex 順序:圖片 > 連結 > 粗體 > 斜體 > 底線 > 程式碼 > UI按鈕 > UI強調 > 快捷鍵 > 書名號 // Image: !\[.*?\]\(.*?\) // Link: \[.*?\]\(.*?\) - const regex = /(!\[.*?\]\(.*?\))|(\[.*?\]\(.*?\))|(\*\*.*?\*\*)|(\*.*?\*)|(.*?<\/u>)|(`[^`]+`)|(【.*?】)|(\[.*?\])|(『.*?』)/g; + const regex = /(!\[.*?\]\(.*?\))|(\[.*?\]\(.*?\))|(\*\*.*?\*\*)|(\*.*?\*)|(.*?<\/u>)|(`[^`]+`)|(【.*?】)|(「.*?」)|(\[.*?\])|(『.*?』)/g; const segments: InlineStyleSegment[] = []; let lastIndex = 0; @@ -82,6 +83,9 @@ export const parseInlineElements = (text: string): InlineStyleSegment[] => { } else if (fullMatch.startsWith('【')) { type = InlineStyleType.UI_BUTTON; content = fullMatch; // 保留括號 + } else if (fullMatch.startsWith('「')) { + type = InlineStyleType.UI_EMPHASIS; + content = fullMatch; // 保留括號 } else if (fullMatch.startsWith('[')) { type = InlineStyleType.SHORTCUT; content = fullMatch; // 保留括號 diff --git a/utils/textProcessor.ts b/utils/textProcessor.ts new file mode 100644 index 0000000..0747ec4 --- /dev/null +++ b/utils/textProcessor.ts @@ -0,0 +1,60 @@ +/** + * BookPublisher MD2Docx + * Copyright (c) 2025 EricHuang + * Licensed under the MIT License. + */ + +/** + * Clean text for publishing requirements: + * 1. Remove spaces between CJK and English/Number characters. + * 2. Convert English punctuation to Chinese punctuation in CJK sentences. + */ +export const cleanTextForPublishing = (text: string): string => { + if (!text) return text; + + // Skip if it looks like a URL or is very short + if (text.startsWith('http') || text.length < 2) return text; + + let result = text; + + // 1. Remove space between CJK and English/Number + // CJK: \u4e00-\u9fa5 + // English/Number: a-zA-Z0-9 + + // Chinese followed by space followed by English/Number + result = result.replace(/([\u4e00-\u9fa5])\s+([a-zA-Z0-9])/g, '$1$2'); + + // English/Number followed by space followed by Chinese + result = result.replace(/([a-zA-Z0-9])\s+([\u4e00-\u9fa5])/g, '$1$2'); + + // 2. Convert punctuation in CJK context + const hasChinese = /[\u4e00-\u9fa5]/.test(result); + + if (hasChinese) { + // Avoid converting if it looks like a number (e.g., 1,000) or a technical term + + // Convert , -> , (if not preceded by a digit and followed by a digit) + result = result.replace(/([^0-9]),\s*/g, (match, p1) => { + // If p1 is a Chinese character or English letter, convert it + if (/[\u4e00-\u9fa5a-zA-Z]/.test(p1)) { + return p1 + ','; + } + return match; + }); + + // Convert ; -> ; + result = result.replace(/([\u4e00-\u9fa5]);\s*/g, '$1;'); + + // Convert ! -> ! + result = result.replace(/([\u4e00-\u9fa5])!\s*/g, '$1!'); + + // Convert ? -> ? + result = result.replace(/([\u4e00-\u9fa5])\?\s*/g, '$1?'); + + // Convert : -> : + // Avoid URLs: only convert if preceded by CJK + result = result.replace(/([\u4e00-\u9fa5]):\s*/g, '$1:'); + } + + return result; +};