From 61541b999d45f16210da2255b91c2add1295e852 Mon Sep 17 00:00:00 2001 From: Aryan Mane <146648718+ARYAN-9099@users.noreply.github.com> Date: Thu, 2 Jul 2026 19:48:26 +0530 Subject: [PATCH 1/2] Improve ChyoaParser: recursive tree walk with nested TOC EPUB output --- package.json | 3 +- plugin/js/parsers/ChyoaParser.js | 387 ++++++++++++++++++++++++++++++- readme.md | 1 + 3 files changed, 379 insertions(+), 12 deletions(-) diff --git a/package.json b/package.json index 1f86ae9be..fbf984dd2 100644 --- a/package.json +++ b/package.json @@ -114,7 +114,8 @@ { "name": "thanhlouis"}, { "name": "ltmerletti"}, { "name": "fnx4"}, - { "name": "Fox6935"} + { "name": "Fox6935"}, + { "name": "ARYAN-9099" } ], "license": "GPL-3.0-only", "bugs": { diff --git a/plugin/js/parsers/ChyoaParser.js b/plugin/js/parsers/ChyoaParser.js index 7813b4304..53c4b7f8a 100644 --- a/plugin/js/parsers/ChyoaParser.js +++ b/plugin/js/parsers/ChyoaParser.js @@ -1,34 +1,399 @@ +/* + Parser for https://chyoa.com + Full recursive branching-story support with nested TOC. + + Architecture note + ----------------- + CHYOA is a "Choose Your Own Adventure" site – stories are *trees*, not lists. + The standard WebToEpub flow (flat chapter list → linear download → pack) doesn't + fit. We therefore override three key methods: + + getChapterUrls() – does the full recursive tree-walk *here*, accumulating + content into this.chyoaItems as it goes. Returns a flat + URL list so the UI table still populates correctly. + + fetchContent() – no-op (everything was already fetched above). + + epubItemSupplier() – builds ChyoaEpubItems with correct TOC *depth* values + so EpubPacker produces a proper nested Table of Contents. +*/ "use strict"; parserFactory.register("chyoa.com", () => new ChyoaParser()); +// --------------------------------------------------------------------------- +// ChyoaEpubItem +// A custom EpubItem that carries a tree-depth for the nested TOC. +// --------------------------------------------------------------------------- +class ChyoaEpubItem extends EpubItem { + /** + * @param {string} sourceUrl + * @param {string} title Chapter title (shown in TOC) + * @param {number} index Epub item index (determines filename) + * @param {number} tocDepth Nesting level: 0 = root, 1 = first branch, … + * @param {Node[]} nodes Array of DOM nodes to write into the XHTML file + */ + constructor(sourceUrl, title, index, tocDepth, nodes) { + super(sourceUrl); + super.setIndex(index); + this.chapterTitle = title || ""; + this.tocDepth = tocDepth; + this.nodes = nodes; + } + + // EpubItem.chapterInfo() is what EpubPacker iterates to build toc.ncx / nav. + *chapterInfo() { + if (this.chapterTitle) { + yield { + depth: this.tocDepth, + title: this.chapterTitle, + src: this.getZipHref() + }; + } + } +} + +// --------------------------------------------------------------------------- +// ChyoaParser +// --------------------------------------------------------------------------- class ChyoaParser extends Parser { constructor() { super(); + // visitedUrls maps normalized URL → epub-item index. + // Tracks what we've already scraped to handle circular links. + this.visitedUrls = new Map(); + // Flat ordered list; each entry = { sourceUrl, title, index, depth, nodes } + this.chyoaItems = []; + this._nextIndex = 0; } - async getChapterUrls(dom) { - return [...dom.querySelectorAll("div.story-map-content a.title")] - .map(a => util.hyperLinkToChapter(a)); + // ========================================================================= + // STEP 1 – called by the framework after the first page is loaded. + // We walk the entire story tree here. + // ========================================================================= + async getChapterUrls(dom, chapterUrlsUI) { + // Reset in case parser is reused + this.visitedUrls = new Map(); + this.chyoaItems = []; + this._nextIndex = 0; + + let rootUrl = dom.baseURI; + await this._walkBranch(dom, rootUrl, 0, chapterUrlsUI); + + // Return flat list for the UI table (framework only needs sourceUrl + title) + return this.chyoaItems.map(item => ({ + sourceUrl: item.sourceUrl, + title: item.title, + isIncludeable: true + })); } - findContent(dom) { - return dom.querySelector("div.layout-content-wrapper"); + // ========================================================================= + // Recursive tree-walker + // Mirrors parser.getlinksfromsite() / scrape_links() from chyoa.py + // ========================================================================= + async _walkBranch(dom, url, depth, chapterUrlsUI) { + let normUrl = util.normalizeUrlForCompare(url); + + // Guard: already visited (circular reference) – stop here + if (this.visitedUrls.has(normUrl)) { + return; + } + + let index = this._nextIndex++; + this.visitedUrls.set(normUrl, index); + + // ── Extract content nodes for this chapter ── + let title = this._scrapeChapterTitle(dom); + let nodes = this._scrapeContentNodes(dom, url); + + this.chyoaItems.push({ sourceUrl: url, title, index, depth, nodes }); + + // Show live progress in the UI list while we're still walking + if (chapterUrlsUI && typeof chapterUrlsUI.showTocProgress === "function") { + chapterUrlsUI.showTocProgress([{ sourceUrl: url, title }]); + } + + // ── Follow choice links ── + let choices = this._scrapeChoiceLinks(dom); + for (let { href } of choices) { + let childNorm = util.normalizeUrlForCompare(href); + if (this.visitedUrls.has(childNorm)) { + // Already in tree – circular link; skip to avoid infinite recursion + continue; + } + try { + await this.rateLimitDelay(); + let childDom = (await HttpClient.wrapFetch(href)).responseXML; + await this._walkBranch(childDom, href, depth + 1, chapterUrlsUI); + } catch (err) { + ErrorLog.log(`ChyoaParser: failed fetching ${href}: ${err}`); + } + } + } + + // ========================================================================= + // DOM scraping helpers + // ========================================================================= + + /** Extract choice/branch links from a chapter page. */ + _scrapeChoiceLinks(dom) { + let seen = new Set(); + let results = []; + + // On CHYOA, choices live in div.question-content as tags + for (let a of dom.querySelectorAll("div.question-content a")) { + let href = a.href; + if (!href) continue; + + // Skip CHYOA's own UI action links (same filters as the Python scraper) + if (href.includes("/new?type=")) continue; + let text = (a.textContent || "").trim(); + if (text === "Add a new chapter") continue; + if (text === "Write a chapter") continue; + if (text === "Link a chapter") continue; + + // Only follow links that stay on chyoa.com + if (!href.includes("chyoa.com")) continue; + + let norm = util.normalizeUrlForCompare(href); + if (seen.has(norm)) continue; + seen.add(norm); + + results.push({ href, text }); + } + return results; + } + + /** Get the chapter title. Mirrors parser.scrape_title_author() in chyoa.py. */ + _scrapeChapterTitle(dom) { + // Chapter pages:

+ let chHeader = dom.querySelector("header.chapter-header"); + if (chHeader) { + let h1 = chHeader.querySelector("h1"); + if (h1 && h1.textContent.trim()) return h1.textContent.trim(); + let h2 = chHeader.querySelector("h2"); + if (h2 && h2.textContent.trim()) return h2.textContent.trim(); + } + // Story root pages:

+ let stHeader = dom.querySelector("header.story-header"); + if (stHeader) { + let h1 = stHeader.querySelector("h1"); + if (h1 && h1.textContent.trim()) return h1.textContent.trim(); + } + // Last resort: + return (dom.title || "").trim(); } + /** + * Extract and clean the readable content for one chapter. + * Returns an array of DOM nodes suitable for writing into an XHTML file. + * Mirrors parser.scrape_content() + saveEpub() content assembly in chyoa.py. + */ + _scrapeContentNodes(dom, sourceUrl) { + // Identify the chapter body + let contentEl = + dom.querySelector("div.chapter-content") || + dom.querySelector("div.layout-content-wrapper"); + + // Build a clean wrapper div + let wrapper = document.createElement("div"); + + // 1. Chapter heading (from chapter-header) + let chHeader = dom.querySelector("header.chapter-header"); + if (chHeader) { + let h1 = chHeader.querySelector("h1"); + let h2 = chHeader.querySelector("h2"); + if (h1) { + let heading = document.createElement("h2"); + heading.textContent = h1.textContent.trim(); + // Author is also in the chapter meta + let meta = dom.querySelector("p.meta"); + if (meta) { + let a = meta.querySelector("a"); + if (a) heading.textContent += " – by " + a.textContent.trim(); + } + wrapper.appendChild(heading); + } + if (h2) { + let subheading = document.createElement("h3"); + subheading.textContent = h2.textContent.trim(); + wrapper.appendChild(subheading); + } + } + + // 2. Story content + if (contentEl) { + // Clone so we don't mutate the live DOM + let clone = contentEl.cloneNode(true); + + // Remove CHYOA clutter: ads, ratings, navigation links + for (let sel of [ + "footer", + "div.chyoa-adzone", + "div.ratings", + "div.links", + "nav", + "script", + "noscript", + "input", + "button" + ]) { + for (let el of [...clone.querySelectorAll(sel)]) { + el.remove(); + } + } + + wrapper.appendChild(clone); + } else { + let p = document.createElement("p"); + p.textContent = "[No content found for: " + sourceUrl + "]"; + wrapper.appendChild(p); + } + + // 3. Question / choice prompt + let qHeader = dom.querySelector("header.question-header h2"); + if (qHeader && qHeader.textContent.trim()) { + let hr = document.createElement("hr"); + wrapper.appendChild(hr); + let q = document.createElement("h3"); + q.textContent = qHeader.textContent.trim(); + wrapper.appendChild(q); + } + + // 4. Choices as a list of links + // We use relative xhtml filenames that fixupHyperlinksInEpubItems() + // will resolve later (it rewrites chyoa.com URLs → ../Text/xhtmlXXX.xhtml) + let choices = this._scrapeChoiceLinks(dom); + if (choices.length > 0) { + let ul = document.createElement("ul"); + for (let { href, text } of choices) { + let li = document.createElement("li"); + let a = document.createElement("a"); + a.href = href; // will be rewritten to relative path by fixupHyperlinks + a.textContent = text; + li.appendChild(a); + ul.appendChild(li); + } + wrapper.appendChild(ul); + } + + return Array.from(wrapper.childNodes); + } + + // ========================================================================= + // STEP 2 – fetchContent() + // All content was already fetched during getChapterUrls(), so this is a no-op. + // The framework calls this when the user clicks "Pack EPUB". + // ========================================================================= + fetchContent() { + return Promise.resolve(); + } + + // ========================================================================= + // STEP 3 – epubItemSupplier() + // Build EPUB items from our pre-collected chyoaItems list. + // ========================================================================= + epubItemSupplier() { + // Respect the user's selection: honour isIncludeable flags set via the UI + // (webPages map is keyed by sourceUrl and carries isIncludeable after + // populateChapterUrlsTable runs) + let includedUrls = new Set(); + for (let [url, page] of this.state.webPages) { + if (page.isIncludeable !== false) { + includedUrls.add(util.normalizeUrlForCompare(url)); + } + } + + // If the webPages map is empty (e.g., framework never called setPagesToFetch), + // include everything. + let filterBySelection = (includedUrls.size > 0); + + let epubItems = []; + for (let item of this.chyoaItems) { + if (filterBySelection && + !includedUrls.has(util.normalizeUrlForCompare(item.sourceUrl))) { + continue; + } + epubItems.push( + new ChyoaEpubItem( + item.sourceUrl, + item.title, + item.index, + item.depth, + item.nodes + ) + ); + } + + this.fixupHyperlinksInEpubItems(epubItems); + return new EpubItemSupplier(this, epubItems, this.imageCollector); + } + + // ========================================================================= + // UI + // ========================================================================= + + /** + * Appends a CHYOA-specific note to the "Searching for URLs, please wait" + * banner so users know the tree-walk takes longer than a normal site. + */ + populateUI(dom) { + super.populateUI(dom); + let waitMsg = document.getElementById("findingChapterUrlsMessageRow"); + if (waitMsg) { + let note = document.createElement("span"); + note.textContent = " (CHYOA: fetching every branch — this may take several minutes for large stories)"; + note.style.fontStyle = "italic"; + waitMsg.appendChild(note); + } + } + + // ========================================================================= + // Metadata + // ========================================================================= extractTitleImpl(dom) { - return dom.querySelector("story-map-header h1"); + // Story root has <header class="story-header"><h1>…</h1> + let h1 = dom.querySelector("header.story-header h1"); + if (h1) return h1.textContent.trim(); + return Parser.extractTitleDefault(dom); + } + + extractAuthor(dom) { + // Author link is inside <p class="meta"><a …>AuthorName</a> + let a = dom.querySelector("p.meta a"); + if (a) return a.textContent.trim(); + return "<unknown>"; + } + + findContent(dom) { + return ( + dom.querySelector("div.chapter-content") || + dom.querySelector("div.layout-content-wrapper") + ); + } + + findCoverImageUrl(dom) { + let img = dom.querySelector("div.cover img"); + return img ? img.src : null; } + // ========================================================================= + // Cleanup + // ========================================================================= removeUnwantedElementsFromContentElement(element) { - util.removeChildElementsMatchingSelector(element, "footer, div.chyoa-adzone, " - + "div.ratings, div.links" ); + util.removeChildElementsMatchingSelector( + element, + "footer, div.chyoa-adzone, div.ratings, div.links" + ); super.removeUnwantedElementsFromContentElement(element); } + /** + * For CHYOA we must NOT strip the choice links that appear in content – + * they ARE the story navigation. Override to do nothing. + */ removeNextAndPreviousChapterHyperlinks() { - // Don't remove next or previous links - // This is a "choose your own story" like site - // So user picks links to next chapters + // intentionally empty } } diff --git a/readme.md b/readme.md index 3d7089f2a..e81cccf61 100644 --- a/readme.md +++ b/readme.md @@ -823,6 +823,7 @@ Don't forget to give the project a star! Thanks again! <li>thanhlouis</li> <li>fnx4</li> <li>Fox6935</li> + <li>ARYAN-9099</li> </ul> </details> From 4b5f4ec8e7ac43ebd8f7af691f11084cc2a1d45b Mon Sep 17 00:00:00 2001 From: Aryan Mane <146648718+ARYAN-9099@users.noreply.github.com> Date: Fri, 3 Jul 2026 18:01:07 +0530 Subject: [PATCH 2/2] Fixed images --- plugin/js/parsers/ChyoaParser.js | 100 +++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/plugin/js/parsers/ChyoaParser.js b/plugin/js/parsers/ChyoaParser.js index 53c4b7f8a..34446e049 100644 --- a/plugin/js/parsers/ChyoaParser.js +++ b/plugin/js/parsers/ChyoaParser.js @@ -60,7 +60,7 @@ class ChyoaParser extends Parser { constructor() { super(); // visitedUrls maps normalized URL → epub-item index. - // Tracks what we've already scraped to handle circular links. + // Tracks what we've already fetched to handle circular links. this.visitedUrls = new Map(); // Flat ordered list; each entry = { sourceUrl, title, index, depth, nodes } this.chyoaItems = []; @@ -90,7 +90,7 @@ class ChyoaParser extends Parser { // ========================================================================= // Recursive tree-walker - // Mirrors parser.getlinksfromsite() / scrape_links() from chyoa.py + // Mirrors parser.getlinksfromsite() / fetch_links() from chyoa.py // ========================================================================= async _walkBranch(dom, url, depth, chapterUrlsUI) { let normUrl = util.normalizeUrlForCompare(url); @@ -104,8 +104,8 @@ class ChyoaParser extends Parser { this.visitedUrls.set(normUrl, index); // ── Extract content nodes for this chapter ── - let title = this._scrapeChapterTitle(dom); - let nodes = this._scrapeContentNodes(dom, url); + let title = this._extractChapterTitle(dom); + let nodes = this._extractContentNodes(dom, url); this.chyoaItems.push({ sourceUrl: url, title, index, depth, nodes }); @@ -115,7 +115,7 @@ class ChyoaParser extends Parser { } // ── Follow choice links ── - let choices = this._scrapeChoiceLinks(dom); + let choices = this._extractChoiceLinks(dom); for (let { href } of choices) { let childNorm = util.normalizeUrlForCompare(href); if (this.visitedUrls.has(childNorm)) { @@ -137,7 +137,7 @@ class ChyoaParser extends Parser { // ========================================================================= /** Extract choice/branch links from a chapter page. */ - _scrapeChoiceLinks(dom) { + _extractChoiceLinks(dom) { let seen = new Set(); let results = []; @@ -165,8 +165,8 @@ class ChyoaParser extends Parser { return results; } - /** Get the chapter title. Mirrors parser.scrape_title_author() in chyoa.py. */ - _scrapeChapterTitle(dom) { + /** Get the chapter title. Mirrors parser.extract_title_author() in chyoa.py. */ + _extractChapterTitle(dom) { // Chapter pages: <header class="chapter-header"><h1>…</h1> let chHeader = dom.querySelector("header.chapter-header"); if (chHeader) { @@ -188,9 +188,9 @@ class ChyoaParser extends Parser { /** * Extract and clean the readable content for one chapter. * Returns an array of DOM nodes suitable for writing into an XHTML file. - * Mirrors parser.scrape_content() + saveEpub() content assembly in chyoa.py. + * Mirrors parser.extract_content() + saveEpub() content assembly in chyoa.py. */ - _scrapeContentNodes(dom, sourceUrl) { + _extractContentNodes(dom, sourceUrl) { // Identify the chapter body let contentEl = dom.querySelector("div.chapter-content") || @@ -227,22 +227,8 @@ class ChyoaParser extends Parser { // Clone so we don't mutate the live DOM let clone = contentEl.cloneNode(true); - // Remove CHYOA clutter: ads, ratings, navigation links - for (let sel of [ - "footer", - "div.chyoa-adzone", - "div.ratings", - "div.links", - "nav", - "script", - "noscript", - "input", - "button" - ]) { - for (let el of [...clone.querySelectorAll(sel)]) { - el.remove(); - } - } + // Clean up unwanted elements + this.removeUnwantedElementsFromContentElement(clone); wrapper.appendChild(clone); } else { @@ -264,7 +250,7 @@ class ChyoaParser extends Parser { // 4. Choices as a list of links // We use relative xhtml filenames that fixupHyperlinksInEpubItems() // will resolve later (it rewrites chyoa.com URLs → ../Text/xhtmlXXX.xhtml) - let choices = this._scrapeChoiceLinks(dom); + let choices = this._extractChoiceLinks(dom); if (choices.length > 0) { let ul = document.createElement("ul"); for (let { href, text } of choices) { @@ -283,11 +269,51 @@ class ChyoaParser extends Parser { // ========================================================================= // STEP 2 – fetchContent() - // All content was already fetched during getChapterUrls(), so this is a no-op. - // The framework calls this when the user clicks "Pack EPUB". + // Content text was fetched during getChapterUrls(), but we must process + // images here so they are properly collected, downloaded, and included + // in the EPUB. // ========================================================================= - fetchContent() { - return Promise.resolve(); + async fetchContent() { + this.imageCollector.reset(); + this.imageCollector.setCoverImageUrl(CoverImageUI.getCoverImageUrl()); + + // Get the list of URLs the user actually selected to include + let includedUrls = new Set(); + for (let [url, page] of this.state.webPages) { + if (page.isIncludeable !== false) { + includedUrls.add(util.normalizeUrlForCompare(url)); + } + } + let filterBySelection = (includedUrls.size > 0); + let itemsToProcess = filterBySelection + ? this.chyoaItems.filter(item => includedUrls.has(util.normalizeUrlForCompare(item.sourceUrl))) + : this.chyoaItems; + + this.setUiToShowLoadingProgress(itemsToProcess.length); + + for (let item of itemsToProcess) { + let webPage = this.state.webPages.get(item.sourceUrl); + if (!webPage) { + webPage = { sourceUrl: item.sourceUrl, row: null }; + } + + if (webPage.row) { + ChapterUrlsUI.showDownloadState(webPage.row, ChapterUrlsUI.DOWNLOAD_STATE_DOWNLOADING); + } + + // Wrap the nodes back into a DOM element so imageCollector can process them + let wrapper = document.createElement("div"); + item.nodes.forEach(n => wrapper.appendChild(n)); + + try { + await this.fetchImagesUsedInDocument(wrapper, webPage); + } catch (err) { + ErrorLog.log(`ChyoaParser: failed fetching images for ${item.sourceUrl}: ${err}`); + } + + // Store the modified nodes (with image tags replaced) back into the item + item.nodes = Array.from(wrapper.childNodes); + } } // ========================================================================= @@ -315,13 +341,21 @@ class ChyoaParser extends Parser { !includedUrls.has(util.normalizeUrlForCompare(item.sourceUrl))) { continue; } + + // The imageCollector downloaded the images in fetchContent, but we must + // now mutate the DOM elements to point to the local EPUB files. + let wrapper = document.createElement("div"); + item.nodes.forEach(n => wrapper.appendChild(n)); + this.imageCollector.replaceImageTags(wrapper); + let updatedNodes = Array.from(wrapper.childNodes); + epubItems.push( new ChyoaEpubItem( item.sourceUrl, item.title, item.index, item.depth, - item.nodes + updatedNodes ) ); } @@ -384,7 +418,7 @@ class ChyoaParser extends Parser { removeUnwantedElementsFromContentElement(element) { util.removeChildElementsMatchingSelector( element, - "footer, div.chyoa-adzone, div.ratings, div.links" + "footer, div.chyoa-adzone, div.ratings, div.links, nav, script, noscript, input, button" ); super.removeUnwantedElementsFromContentElement(element); }