Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions plugin/js/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,17 @@ var main = (function() {

// actions to do when window opened
window.onload = async () => {
if (typeof DOMPurify === "undefined" || typeof zip === "undefined") {
let msg = "Error: WebToEpub is missing required third-party dependencies (DOMPurify or zip.js).\n\nIf you are running from a git clone, please run 'npm install' in the project root to fetch these dependencies.";
alert(msg);
let pleaseWait = document.getElementById("findingChapterUrlsMessageRow");
if (pleaseWait) {
pleaseWait.textContent = msg;
pleaseWait.style.color = "red";
pleaseWait.hidden = false;
}
return;
}
userPreferences = UserPreferences.readFromLocalStorage();
if (isRunningInTabMode()) {
ErrorLog.SuppressErrorLog = false;
Expand Down
136 changes: 131 additions & 5 deletions plugin/js/parsers/FreeWebNovelParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,72 @@ class FreeWebNovelParser extends Parser {
this.minimumThrottle = 1000;
}

async getChapterUrls(dom) {
async getChapterUrls(dom, chapterUrlsUI) {
let menu = dom.querySelector("ul#idData");
return util.hyperlinksToChapterList(menu);
let chapters = util.hyperlinksToChapterList(menu);

let totalPage = 1;
let indexSelect = dom.querySelector("#indexselect");
if (indexSelect) {
totalPage = indexSelect.querySelectorAll("option").length;
} else {
let scripts = [...dom.querySelectorAll("script")];
for (let script of scripts) {
let match = /totalPage:\s*(\d+)/.exec(script.textContent);
if (match) {
totalPage = parseInt(match[1]);
break;
}
}
}

if (totalPage > 1) {
chapterUrlsUI.showTocProgress(chapters);
let baseUrl = dom.baseURI;
let urlObj = new URL(baseUrl);
urlObj.search = "";
urlObj.hash = "";
let baseNovelUrl = urlObj.toString();

for (let page = 2; page <= totalPage; ++page) {
await this.rateLimitDelay();
let url = `${baseNovelUrl}?ajax=chapters&page=${page}`;
try {
let response = await HttpClient.fetchJson(url);
if (response?.json?.code === 200 && response.json.html) {
let parser = new DOMParser();
let tempDom = parser.parseFromString(response.json.html, "text/html");
util.setBaseTag(url, tempDom);
let partialChapters = util.hyperlinksToChapterList(tempDom);
if (partialChapters.length > 0) {
chapterUrlsUI.showTocProgress(partialChapters);
chapters = chapters.concat(partialChapters);
}
}
} catch (e) {
console.error("Failed to fetch TOC page: " + page, e);
}
}
}

return chapters;
}

extractTitleImpl(dom) {
return dom.querySelector("h1.tit");
}

extractAuthor(dom) {
return dom.querySelector("[title=Author]").parentNode.querySelector("a").textContent;
let element = dom.querySelector("[title=Author]");
return element ? element.parentNode.querySelector("a").textContent.trim() : "";
}

extractSubject(dom) {
let tags = [...dom.querySelector("[title=Genre]").parentNode.querySelectorAll("a")];
let element = dom.querySelector("[title=Genre]");
if (!element) {
return "";
}
let tags = [...element.parentNode.querySelectorAll("a")];
return tags.map(e => e.textContent.trim()).join(", ");
}

Expand All @@ -43,12 +94,77 @@ class FreeWebNovelParser extends Parser {
}

findContent(dom) {
return dom.querySelector("div.txt");
return dom.querySelector("div#article") || dom.querySelector("div.txt");
}

getInformationEpubItemChildNodes(dom) {
return [...dom.querySelectorAll("div.inner")];
}

removeUnwantedElementsFromContentElement(content) {
// Remove ads injected by third-party ad networks (such as SSP ads and PubFuture networks)
// whose div IDs start with 'bg-ssp-' or 'pf-'
util.removeChildElementsMatchingSelector(content, "div[id^='bg-ssp-'], div[id^='pf-']");

// Clean up any remaining ad divs or empty wrapper divs left behind after ads are deleted
for (let div of content.querySelectorAll("div")) {
if (div.id.startsWith("bg-ssp-") || div.id.startsWith("pf-")) {
div.remove();
}
// Remove parent wrapper divs if they are now completely empty
if (div.children.length === 0 && div.textContent.trim() === "") {
div.remove();
}
}

// Convert escaped/literal HTML tags (like &lt;strong&gt; or &lt;b&gt;) in text nodes to actual DOM elements
let walker = content.ownerDocument.createTreeWalker(
content,
NodeFilter.SHOW_TEXT,
null,
false
);
let nodesToReplace = [];
let node;
while ((node = walker.nextNode())) {
let val = node.nodeValue;
if (val && /(<strong|<b|<i|<em|<span|<br)/i.test(val)) {
nodesToReplace.push(node);
}
}
for (let tNode of nodesToReplace) {
let parent = tNode.parentNode;
if (parent) {
let doc = util.sanitize(tNode.nodeValue);
let body = doc.body;
while (body.firstChild) {
parent.insertBefore(body.firstChild, tNode);
}
tNode.remove();
}
}

// Clean embedded obfuscated/standard watermarks inside text nodes (e.g. freewebnovel.com, reewebnovel.com)
// Re-walk to ensure we also clean watermarks in any newly parsed text nodes
walker = content.ownerDocument.createTreeWalker(
content,
NodeFilter.SHOW_TEXT,
null,
false
);
while ((node = walker.nextNode())) {
let val = node.nodeValue;
if (val) {
// Normalize using NFKD to convert mathematical/stylized characters to standard ASCII
let normalized = val.normalize("NFKD");
if (/reewebnovel/i.test(normalized)) {
node.nodeValue = normalized.replace(/f?reewebnovel(?:\s*\.\s*com|\s+com)?/gi, "");
}
}
}

super.removeUnwantedElementsFromContentElement(content);
}
}

class NovelliveParser extends FreeWebNovelParser {
Expand Down Expand Up @@ -95,7 +211,17 @@ class FreeWebNovelComParser extends FreeWebNovelParser {
super();
}
removeUnwantedElementsFromContentElement(content) {
// Remove 'sub' elements inside paragraphs (which are sometimes used to hide watermarks or corrupt text)
util.removeChildElementsMatchingSelector(content, "p sub");

// Remove anti-scraping watermark paragraphs warning users to support the author on the original site
for (let p of content.querySelectorAll("p")) {
let text = p.textContent.toLowerCase();
if (text.includes("this story originates from") || text.includes("ensure the author gets the support")) {
p.remove();
}
}

super.removeUnwantedElementsFromContentElement(content);
}
}
2 changes: 1 addition & 1 deletion plugin/manifest.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"manifest_version": 3,
"name": "WebToEpub",
"version": "1.0.12.77",
"version": "1.0.12.78",
"default_locale": "en",
"icons": {
"128": "book128.png"
Expand Down
2 changes: 2 additions & 0 deletions unitTest/Tests.html
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
<script src="../plugin/js/parsers/WuxiaworldParser.js"></script>
<script src="../plugin/js/parsers/XbanxiaParser.js"></script>
<script src="../plugin/js/parsers/ZirusMusingsParser.js"></script>
<script src="../plugin/js/parsers/FreeWebNovelParser.js"></script>
<script src="../plugin/@zip.js/zip.js/dist/zip-no-worker.min.js"></script>
<script src="../plugin/js/EpubItemSupplier.js"></script>
<script src="../plugin/js/CoverImageUI.js"></script>
Expand Down Expand Up @@ -145,6 +146,7 @@
<script src="UtestWuxiaworldParser.js"></script>
<script src="UtestXbanxiaParser.js"></script>
<script src='UtestParserFactory.js'></script>
<script src="UtestFreeWebNovelParser.js"></script>

<!-- elements Plug-in needs in its HTML -->
<div hidden>
Expand Down
141 changes: 141 additions & 0 deletions unitTest/UtestFreeWebNovelParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"use strict";

module("FreeWebNovelParser");

QUnit.test("extractTitleImpl", function (assert) {
let dom = new DOMParser().parseFromString(FreeWebNovelNovelSample, "text/html");
let parser = new FreeWebNovelParser();
let title = parser.extractTitle(dom);
assert.equal(title, "All Jobs and Classes! I Just Wanted One Skill, Not Them All!");
});

QUnit.test("extractAuthor", function (assert) {
let dom = new DOMParser().parseFromString(FreeWebNovelNovelSample, "text/html");
let parser = new FreeWebNovelParser();
let author = parser.extractAuthor(dom);
assert.equal(author, "Comedian0");
});

QUnit.test("extractSubject", function (assert) {
let dom = new DOMParser().parseFromString(FreeWebNovelNovelSample, "text/html");
let parser = new FreeWebNovelParser();
let subject = parser.extractSubject(dom);
assert.equal(subject, "Action, Adventure, Comedy");
});

QUnit.test("findCoverImageUrl", function (assert) {
let dom = new DOMParser().parseFromString(FreeWebNovelNovelSample, "text/html");
let base = dom.createElement("base");
base.href = "https://freewebnovel.com/novel/all-jobs-and-classes-i-just-wanted-one-skill-not-them-all";
dom.head.appendChild(base);
let parser = new FreeWebNovelParser();
let cover = parser.findCoverImageUrl(dom);
assert.equal(cover, "https://freewebnovel.com/files/article/image/14/14511/14511s.jpg");
});

QUnit.test("getChapterUrls first page", async function (assert) {
let dom = new DOMParser().parseFromString(FreeWebNovelNovelSample, "text/html");
let parser = new FreeWebNovelParser();
let base = dom.createElement("base");
base.href = "https://freewebnovel.com/novel/all-jobs-and-classes-i-just-wanted-one-skill-not-them-all";
dom.head.appendChild(base);

let chapterUrlsUI = {
showTocProgress: function() {}
};

let chapters = await parser.getChapterUrls(dom, chapterUrlsUI);
assert.equal(chapters.length, 2);
assert.equal(chapters[0].title, "Chapter 01");
assert.equal(chapters[0].sourceUrl, "https://freewebnovel.com/novel/all-jobs-and-classes-i-just-wanted-one-skill-not-them-all/chapter-1");
});

QUnit.test("findChapterTitle", function (assert) {
let dom = new DOMParser().parseFromString(FreeWebNovelChapterSample, "text/html");
let parser = new FreeWebNovelParser();
let titleEl = parser.findChapterTitle(dom);
assert.equal(titleEl.textContent.trim(), "Chapter 01");
});

QUnit.test("findContent and clean", function (assert) {
let dom = new DOMParser().parseFromString(FreeWebNovelChapterSample, "text/html");
let parser = new FreeWebNovelComParser();
let content = parser.findContent(dom);
assert.ok(content !== null, "Content found");
parser.removeUnwantedElementsFromContentElement(content);

assert.equal(content.querySelector("div[id^='bg-ssp-']"), null, "Ads removed");
assert.equal(content.querySelector("div[id^='pf-']"), null, "PubFuture ads removed");

let paragraphs = [...content.querySelectorAll("p")];
let watermarkFound = paragraphs.some(p => p.textContent.includes("This story originates from"));
assert.notOk(watermarkFound, "Watermark paragraph removed");

// Check that embedded watermarks (with math alphanumeric characters and standard ASCII) are removed
assert.equal(paragraphs[0].textContent.trim(), "It started with the kind of cold that crawls under your nails and refuses to leave.");
assert.equal(paragraphs[1].textContent.trim(), "Screams. The thunder of crumpling steel.");
});

QUnit.test("convert literal HTML tags", function (assert) {
let dom = new DOMParser().parseFromString("<div><p>&lt;strong&gt;[Name: Aster Nilm&lt;/strong&gt;</p></div>", "text/html");
let parser = new FreeWebNovelParser();
let content = dom.querySelector("div");
parser.removeUnwantedElementsFromContentElement(content);

let strong = content.querySelector("strong");
assert.ok(strong !== null, "Strong element parsed");
assert.equal(strong.textContent, "[Name: Aster Nilm");
});

let FreeWebNovelNovelSample = `
<!DOCTYPE html>
<html>
<head>
<meta property="og:url" content="https://freewebnovel.com/novel/all-jobs-and-classes-i-just-wanted-one-skill-not-them-all">
</head>
<body>
<div class="m-imgtxt">
<div class="pic">
<img src="/files/article/image/14/14511/14511s.jpg">
</div>
<div class="txt">
<div class="item">
<span class="glyphicon glyphicon-user" title="Author"></span>
<div class="right"><a href="/author/Comedian0">Comedian0</a></div>
</div>
<div class="item">
<span class="glyphicon glyphicon-th-list" title="Genre"></span>
<div class="right">
<a href="/genre/Action">Action</a>, <a href="/genre/Adventure">Adventure</a>, <a href="/genre/Comedy">Comedy</a>
</div>
</div>
</div>
</div>
<div class="m-desc">
<h1 class="tit">All Jobs and Classes! I Just Wanted One Skill, Not Them All!</h1>
</div>
<ul class="ul-list5" id="idData">
<li><a href="/novel/all-jobs-and-classes-i-just-wanted-one-skill-not-them-all/chapter-1" title="Chapter 01">Chapter 01</a></li>
<li><a href="/novel/all-jobs-and-classes-i-just-wanted-one-skill-not-them-all/chapter-2" title="Chapter 02">Chapter 02</a></li>
</ul>
</body>
</html>
`;

let FreeWebNovelChapterSample = `
<!DOCTYPE html>
<html>
<body>
<span class="chapter">Chapter 01</span>
<div class="txt ">
<div id="article">
<div id="pf-1558-1">ad script</div>
<p>It started with the kind of <b>𝘧𝑟𝑒𝑒𝘸𝘦𝘣𝑛𝑜𝘷𝑒𝓁.𝘤𝘰𝓂</b> cold that crawls under your nails and refuses to leave.</p>
<div id="bg-ssp-6327">ad banner</div>
<p>This story originates from a different website. Ensure the author gets the support they deserve by reading it there.</p>
<p>Screams. The thunder of reewebnovel.com crumpling steel.</p>
</div>
</div>
</body>
</html>
`;