From 97383861f0bf801c687639ca305b895328015df8 Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Sun, 21 Dec 2025 17:51:42 +0400 Subject: [PATCH 1/2] Switch to lowercase and exodus/bytes --- .github/workflows/build.yml | 9 +++- lib/html-encoding-sniffer.js | 16 +++--- package-lock.json | 48 +++++++----------- package.json | 4 +- .../bom/{UTF-16BE.html => utf-16be.html} | Bin .../bom/{UTF-16LE.html => utf-16le.html} | Bin test/fixtures/bom/{UTF-8.html => utf-8.html} | 0 ..._UTF-8.html => charset-bracket_utf-8.html} | 0 ... => charset-short-comment_iso-8859-2.html} | 0 ...harset_KOI8-R.html => charset_koi8-r.html} | 0 ...l => http-equiv-no-quotes_iso-8859-5.html} | 0 ...http-equiv-second-charset_iso-8859-2.html} | 0 ...http-equiv-trailing-space_iso-8859-2.html} | 0 test/tests.js | 20 ++++---- 14 files changed, 46 insertions(+), 51 deletions(-) rename test/fixtures/bom/{UTF-16BE.html => utf-16be.html} (100%) rename test/fixtures/bom/{UTF-16LE.html => utf-16le.html} (100%) rename test/fixtures/bom/{UTF-8.html => utf-8.html} (100%) rename test/fixtures/normal/{charset-bracket_UTF-8.html => charset-bracket_utf-8.html} (100%) rename test/fixtures/normal/{charset-short-comment_ISO-8859-2.html => charset-short-comment_iso-8859-2.html} (100%) rename test/fixtures/normal/{charset_KOI8-R.html => charset_koi8-r.html} (100%) rename test/fixtures/normal/{http-equiv-no-quotes_ISO-8859-5.html => http-equiv-no-quotes_iso-8859-5.html} (100%) rename test/fixtures/normal/{http-equiv-second-charset_ISO-8859-2.html => http-equiv-second-charset_iso-8859-2.html} (100%) rename test/fixtures/normal/{http-equiv-trailing-space_ISO-8859-2.html => http-equiv-trailing-space_iso-8859-2.html} (100%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b248e5b..6b581ca 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,9 +12,14 @@ jobs: fail-fast: false matrix: node-version: - - 18 + # Explicitly test minimum Node.js versions. 
Keep in sync with package.json. + - 20.19.0 - 20 - - latest + - 22.12.0 + - 22 + - 24.0.0 + - lts/* # currently 24 + - latest # currently 25 steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 diff --git a/lib/html-encoding-sniffer.js b/lib/html-encoding-sniffer.js index 3603410..b6a97c9 100644 --- a/lib/html-encoding-sniffer.js +++ b/lib/html-encoding-sniffer.js @@ -1,12 +1,12 @@ "use strict"; -const whatwgEncoding = require("whatwg-encoding"); +const { getBOMEncoding, normalizeEncoding: labelToName } = require("@exodus/bytes/encoding-lite.js"); // https://html.spec.whatwg.org/#encoding-sniffing-algorithm module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { - let encoding = whatwgEncoding.getBOMEncoding(uint8Array); + let encoding = getBOMEncoding(uint8Array); if (encoding === null && transportLayerEncodingLabel !== undefined) { - encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel); + encoding = labelToName(transportLayerEncodingLabel); } if (encoding === null) { @@ -69,7 +69,7 @@ function prescanMetaCharset(uint8Array) { needPragma = true; } } else if (attrRes.attr.name === "charset") { - charset = whatwgEncoding.labelToName(attrRes.attr.value); + charset = labelToName(attrRes.attr.value); needPragma = false; } } @@ -86,8 +86,8 @@ function prescanMetaCharset(uint8Array) { continue; } - if (charset === "UTF-16LE" || charset === "UTF-16BE") { - charset = "UTF-8"; + if (charset === "utf-16le" || charset === "utf-16be") { + charset = "utf-8"; } if (charset === "x-user-defined") { charset = "windows-1252"; @@ -271,7 +271,7 @@ function extractCharacterEncodingFromMeta(string) { const nextIndex = string.indexOf(string[position], position + 1); if (nextIndex !== -1) { - return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex)); + return labelToName(string.substring(position + 1, nextIndex)); } // It is an unmatched quotation mark @@ -287,7 +287,7 @@ function 
extractCharacterEncodingFromMeta(string) { string.length : position + indexOfASCIIWhitespaceOrSemicolon + 1; - return whatwgEncoding.labelToName(string.substring(position, end)); + return labelToName(string.substring(position, end)); } function isSpaceCharacter(c) { diff --git a/package-lock.json b/package-lock.json index f1ff477..8f3a150 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,14 +9,14 @@ "version": "4.0.0", "license": "MIT", "dependencies": { - "whatwg-encoding": "^3.1.1" + "@exodus/bytes": "^1.0.0" }, "devDependencies": { "@domenic/eslint-config": "^3.0.0", "eslint": "^8.53.0" }, "engines": { - "node": ">=18" + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" } }, "node_modules/@aashutoshrathi/word-wrap": { @@ -93,6 +93,23 @@ "node": "^12.22.0 || ^14.17.0 || >=16.0.0" } }, + "node_modules/@exodus/bytes": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.0.0.tgz", + "integrity": "sha512-ca1yz/xhQrk0IKDKtNZMc9rzEX+kw8hiBkLb3uHNU30Dssd3UVU8R/jTSGIydSLv6az76Y35WI4DJ5DfDOA2Fg==", + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + }, + "peerDependencies": { + "@exodus/crypto": "^1.0.0-rc.4" + }, + "peerDependenciesMeta": { + "@exodus/crypto": { + "optional": true + } + } + }, "node_modules/@humanwhocodes/config-array": { "version": "0.11.13", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.13.tgz", @@ -645,17 +662,6 @@ "node": ">=8" } }, - "node_modules/iconv-lite": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", - "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", - "dependencies": { - "safer-buffer": ">= 2.1.2 < 3.0.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/ignore": { "version": "5.2.4", "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.2.4.tgz", @@ -1029,11 +1035,6 @@ "queue-microtask": "^1.2.2" 
} }, - "node_modules/safer-buffer": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", - "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" - }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", @@ -1130,17 +1131,6 @@ "punycode": "^2.1.0" } }, - "node_modules/whatwg-encoding": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", - "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", - "dependencies": { - "iconv-lite": "0.6.3" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/package.json b/package.json index 42139f9..8c54466 100644 --- a/package.json +++ b/package.json @@ -18,13 +18,13 @@ "lint": "eslint ." 
}, "dependencies": { - "whatwg-encoding": "^3.1.1" + "@exodus/bytes": "^1.0.0" }, "devDependencies": { "@domenic/eslint-config": "^3.0.0", "eslint": "^8.53.0" }, "engines": { - "node": ">=18" + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" } } diff --git a/test/fixtures/bom/UTF-16BE.html b/test/fixtures/bom/utf-16be.html similarity index 100% rename from test/fixtures/bom/UTF-16BE.html rename to test/fixtures/bom/utf-16be.html diff --git a/test/fixtures/bom/UTF-16LE.html b/test/fixtures/bom/utf-16le.html similarity index 100% rename from test/fixtures/bom/UTF-16LE.html rename to test/fixtures/bom/utf-16le.html diff --git a/test/fixtures/bom/UTF-8.html b/test/fixtures/bom/utf-8.html similarity index 100% rename from test/fixtures/bom/UTF-8.html rename to test/fixtures/bom/utf-8.html diff --git a/test/fixtures/normal/charset-bracket_UTF-8.html b/test/fixtures/normal/charset-bracket_utf-8.html similarity index 100% rename from test/fixtures/normal/charset-bracket_UTF-8.html rename to test/fixtures/normal/charset-bracket_utf-8.html diff --git a/test/fixtures/normal/charset-short-comment_ISO-8859-2.html b/test/fixtures/normal/charset-short-comment_iso-8859-2.html similarity index 100% rename from test/fixtures/normal/charset-short-comment_ISO-8859-2.html rename to test/fixtures/normal/charset-short-comment_iso-8859-2.html diff --git a/test/fixtures/normal/charset_KOI8-R.html b/test/fixtures/normal/charset_koi8-r.html similarity index 100% rename from test/fixtures/normal/charset_KOI8-R.html rename to test/fixtures/normal/charset_koi8-r.html diff --git a/test/fixtures/normal/http-equiv-no-quotes_ISO-8859-5.html b/test/fixtures/normal/http-equiv-no-quotes_iso-8859-5.html similarity index 100% rename from test/fixtures/normal/http-equiv-no-quotes_ISO-8859-5.html rename to test/fixtures/normal/http-equiv-no-quotes_iso-8859-5.html diff --git a/test/fixtures/normal/http-equiv-second-charset_ISO-8859-2.html b/test/fixtures/normal/http-equiv-second-charset_iso-8859-2.html 
similarity index 100% rename from test/fixtures/normal/http-equiv-second-charset_ISO-8859-2.html rename to test/fixtures/normal/http-equiv-second-charset_iso-8859-2.html diff --git a/test/fixtures/normal/http-equiv-trailing-space_ISO-8859-2.html b/test/fixtures/normal/http-equiv-trailing-space_iso-8859-2.html similarity index 100% rename from test/fixtures/normal/http-equiv-trailing-space_ISO-8859-2.html rename to test/fixtures/normal/http-equiv-trailing-space_iso-8859-2.html diff --git a/test/tests.js b/test/tests.js index 74ec2e0..dadd8b7 100644 --- a/test/tests.js +++ b/test/tests.js @@ -25,7 +25,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/bom"))) { it(`should sniff as ${desiredEncoding}, given overriding options`, () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1252", - defaultEncoding: "UTF-16LE" + defaultEncoding: "utf-16le" }); assert.strictEqual(sniffedEncoding, desiredEncoding); @@ -47,7 +47,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) { it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", - defaultEncoding: "ISO-8859-16" + defaultEncoding: "iso-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -56,7 +56,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) { it(`should sniff as ${desiredEncoding}, given only a default encoding`, () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "ISO-8859-16" + defaultEncoding: "iso-8859-16" }); assert.strictEqual(sniffedEncoding, desiredEncoding); @@ -78,7 +78,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result")) it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", 
- defaultEncoding: "ISO-8859-16" + defaultEncoding: "iso-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -87,10 +87,10 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result")) it("should sniff as the default encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "ISO-8859-16" + defaultEncoding: "iso-8859-16" }); - assert.strictEqual(sniffedEncoding, "ISO-8859-16"); + assert.strictEqual(sniffedEncoding, "iso-8859-16"); }); }); } @@ -102,13 +102,13 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) { it("should sniff as UTF-8, given no options", () => { const sniffedEncoding = htmlEncodingSniffer(buffer); - assert.strictEqual(sniffedEncoding, "UTF-8"); + assert.strictEqual(sniffedEncoding, "utf-8"); }); it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", - defaultEncoding: "ISO-8859-16" + defaultEncoding: "iso-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -117,10 +117,10 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) { it("should sniff as UTF-8, given only a default encoding", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "ISO-8859-16" + defaultEncoding: "iso-8859-16" }); - assert.strictEqual(sniffedEncoding, "UTF-8"); + assert.strictEqual(sniffedEncoding, "utf-8"); }); }); } From f4f466974513199c7f42e63bdafd8cfb35d7f5a3 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Tue, 23 Dec 2025 13:49:16 +0900 Subject: [PATCH 2/2] Update readme too --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 73338ea..6d562c9 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,11 @@ const sniffedEncoding = htmlEncodingSniffer(htmlBytes); The passed bytes are given as a `Uint8Array`; the 
Node.js `Buffer` subclass of `Uint8Array` will also work, as shown above. -The returned value will be a canonical [encoding name](https://encoding.spec.whatwg.org/#names-and-labels) (not a label). You might then combine this with the [whatwg-encoding](https://github.com/jsdom/whatwg-encoding) package to decode the result: +The returned value will be an [encoding label](https://encoding.spec.whatwg.org/#names-and-labels), and in particular, the label which is a lowercased version of the encoding's name. You might then combine this with the [`@exodus/bytes`](https://github.com/ExodusOSS/bytes/) package to decode the result: ```js -const whatwgEncoding = require("whatwg-encoding"); -const htmlString = whatwgEncoding.decode(htmlBytes, sniffedEncoding); +const { TextDecoder } = require("@exodus/bytes"); +const htmlString = (new TextDecoder(sniffedEncoding)).decode(htmlBytes); ``` ## Options