From 620ec97f623f613984609d954b4d76c9ed487f1e Mon Sep 17 00:00:00 2001 From: Dmitry Sharabin Date: Tue, 18 Nov 2025 23:14:00 +0100 Subject: [PATCH 1/7] First stab at functional `$inside` --- src/core/tokenize/match.js | 75 +++++++++++++++--- src/core/tokenize/util.js | 46 ++++++++++- src/languages/markdown.js | 79 ++++--------------- src/shared/util.js | 10 +++ .../markdown/code-block_feature.html.test | 4 +- ...block_language_detection_feature.html.test | 8 +- tests/languages/markdown/code_feature.test | 2 +- 7 files changed, 138 insertions(+), 86 deletions(-) diff --git a/src/core/tokenize/match.js b/src/core/tokenize/match.js index a151c36642..6a4763e40c 100644 --- a/src/core/tokenize/match.js +++ b/src/core/tokenize/match.js @@ -1,7 +1,7 @@ import { Token } from '../classes/token.js'; import singleton from '../prism.js'; import { tokenize } from './tokenize.js'; -import { resolve } from './util.js'; +import { resolve, tokenizeByNamedGroups } from './util.js'; /** * @this {Prism} @@ -21,7 +21,12 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re for (const token in grammar) { const tokenValue = grammar[token]; - if (!grammar.hasOwnProperty(token) || token.startsWith('$') || !tokenValue) { + if ( + !grammar.hasOwnProperty(token) || + token.startsWith('$') || + !tokenValue || + typeof tokenValue === 'function' + ) { continue; } @@ -36,9 +41,20 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re let { pattern, lookbehind = false, greedy = false, alias, inside } = patternObj; const insideGrammar = resolve.call(prism, inside); + let flagsToAdd = ''; + if (greedy && !pattern.global) { // Without the global flag, lastIndex won't work - patternObj.pattern = pattern = RegExp(pattern.source, pattern.flags + 'g'); + flagsToAdd += 'g'; + } + + if (pattern.source?.includes('(?<') && pattern.hasIndices === false) { + // Has named groups, we need to be able to capture their indices + flagsToAdd += 'd'; + } + + if 
(flagsToAdd) { + patternObj.pattern = pattern = RegExp(pattern.source, pattern.flags + flagsToAdd); } for ( @@ -63,7 +79,8 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re } let removeCount = 1; // this is the to parameter of removeBetween - let match; + /** @type {RegExpExecArray | null} */ + let match = null; if (greedy) { match = matchPattern(pattern, pos, text, lookbehind); @@ -117,6 +134,10 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re const from = match.index; const matchStr = match[0]; + + /** @type {TokenStream | string} */ + let content = matchStr; + const before = str.slice(0, from); const after = str.slice(from + matchStr.length); @@ -134,14 +155,42 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re tokenList.removeRange(removeFrom, removeCount); - const wrapped = new Token( - token, - insideGrammar - ? tokenize.call(prism, matchStr, /** @type {Grammar} */ (insideGrammar)) - : matchStr, - alias, - matchStr - ); + const byGroups = match.groups ? tokenizeByNamedGroups(match) : null; + if (byGroups && byGroups.length > 1) { + content = byGroups + .map(arg => { + let content = typeof arg === 'string' ? arg : arg.content; + const type = typeof arg === 'string' ? undefined : arg.type; + + if (insideGrammar) { + let localInsideGrammar = type ? insideGrammar[type] : insideGrammar; + + if (typeof localInsideGrammar === 'function') { + // Late resolving + localInsideGrammar = resolve.call( + prism, + localInsideGrammar(match.groups) + ); + } + + if (localInsideGrammar) { + // @ts-ignore + content = tokenize.call(prism, content, localInsideGrammar); + } + } + + return typeof arg === 'object' && arg.type + ? 
new Token(arg.type, content) + : content; + }) + .flat(); // Flatten tokens like ['foo'] + } + else if (insideGrammar) { + // @ts-ignore + content = tokenize.call(prism, content, insideGrammar); + } + + const wrapped = new Token(token, content, alias, matchStr); currentNode = tokenList.addAfter(removeFrom, wrapped); if (after) { @@ -216,7 +265,7 @@ function toGrammarToken (pattern) { /** * @import { Prism } from '../prism.js'; - * @import { Grammar, GrammarToken, GrammarTokens, RegExpLike } from '../../types.d.ts'; + * @import { Grammar, GrammarToken, GrammarTokens, TokenStream, RegExpLike } from '../../types.d.ts'; */ /** diff --git a/src/core/tokenize/util.js b/src/core/tokenize/util.js index d4b0ce94dd..a7573a9d0b 100644 --- a/src/core/tokenize/util.js +++ b/src/core/tokenize/util.js @@ -1,9 +1,10 @@ +import { camelToKebabCase } from '../../shared/util.js'; import singleton from '../prism.js'; /** * @this {Prism} - * @param {Grammar | string | null | undefined} reference - * @returns {Grammar | undefined} + * @param {Grammar | string | Function | null | undefined} reference + * @returns {Grammar | Function | undefined} */ export function resolve (reference) { const prism = this ?? singleton; @@ -13,6 +14,11 @@ export function resolve (reference) { ret = prism.languageRegistry.getLanguage(ret)?.resolvedGrammar; } + if (typeof ret === 'function' && ret.length === 0) { + // Function with no arguments, resolve eagerly + ret = ret.call(prism); + } + if (typeof ret === 'object' && ret.$rest) { const restGrammar = resolve.call(prism, ret.$rest) ?? 
{}; if (typeof restGrammar === 'object') { @@ -25,6 +31,42 @@ export function resolve (reference) { return /** @type {Grammar | undefined} */ (ret); } +/** + * + * @param {RegExpExecArray} match + * @returns {({type: string, content: string} | string)[]} + */ +export function tokenizeByNamedGroups (match) { + const str = match[0]; + const result = []; + let i = 0; + + const entries = Object.entries(match.indices?.groups || {}) + .map(([type, [start, end]]) => ({ + type, + start: start - match.index, + end: end - match.index, + })) + .sort((a, b) => a.start - b.start); + + for (let { type, start, end } of entries) { + if (start > i) { + result.push(str.slice(i, start)); + } + + const content = str.slice(start, end); + type = camelToKebabCase(type); + result.push({ type, content }); + i = end; + } + + if (i < str.length) { + result.push(str.slice(i)); + } + + return result; +} + /** * @import { Prism } from '../prism.js'; * @import { Grammar, LanguageRegistry } from '../../types.d.ts'; diff --git a/src/languages/markdown.js b/src/languages/markdown.js index 3b09d1e0e9..846c14b97b 100644 --- a/src/languages/markdown.js +++ b/src/languages/markdown.js @@ -99,73 +99,24 @@ export default { // ```optional language // code block // ``` - pattern: /^```[\s\S]*?^```$/m, - greedy: true, - inside: /** @type {Grammar} */ ({ - 'code-block': { - pattern: /^(```.*(?:\n|\r\n?))[\s\S]+?(?=(?:\n|\r\n?)^```$)/m, - lookbehind: true, - }, - 'code-language': { - pattern: /^(```).+/, - lookbehind: true, - }, - 'punctuation': /```/, - /** @type {Grammar['$tokenize']} */ - $tokenize (code, grammar, Prism) { - const tokens = Prism.tokenize(code, withoutTokenize(grammar)); - - /* - * Add the correct `language-xxxx` class to this code block. Keep in mind that the `code-language` token - * is optional. 
But the grammar is defined so that there is only one case we have to handle: - * - * token.content = [ - * ```, - * xxxx, - * '\n', // exactly one new lines (\r or \n or \r\n) - * ..., - * '\n', // exactly one new lines again - * ``` - * ]; - */ - - const codeLang = tokens[1]; - const codeBlock = tokens[3]; - - if ( - typeof codeLang === 'object' && - typeof codeBlock === 'object' && - codeLang.type === 'code-language' && - codeBlock.type === 'code-block' - ) { - // this might be a language that Prism does not support - - // do some replacements to support C++, C#, and F# - const lang = getTextContent(codeLang.content) - .replace(/\b#/g, 'sharp') - .replace(/\b\+\+/g, 'pp'); - // only use the first word - const langName = /[a-z][\w-]*/i.exec(lang)?.[0].toLowerCase(); - if (langName) { - codeBlock.addAlias('language-' + langName); - - const grammar = - Prism.languageRegistry.getLanguage(lang)?.resolvedGrammar; - if (grammar) { - codeBlock.content = Prism.tokenize( - getTextContent(codeBlock), - grammar - ); - } - else { - codeBlock.addAlias('needs-highlighting'); - } + pattern: + /^```(?:\s*)(?<codeLanguage>\{[^{}]*\}|[a-z+#-]+)(?:\n|\r\n?)(?<codeBlock>[\s\S]*?)(?:\n|\r\n?)```$/im, + inside: { + 'code-block': groups => { + let lang = groups.codeLanguage; + // Extract language code from curly braces like {r pressure, echo=FALSE} → r + if (lang.startsWith('{') && lang.endsWith('}')) { + const match = lang.slice(1, -1).match(/^(?:\s*)([a-z+#-]+)/i); + if (match) { + lang = match[0]; } } - - return tokens; + // Apply transformations: c++ → cpp, c# → csharp, f# → fsharp, etc.
+ lang = lang.replace(/\b#/g, 'sharp').replace(/\b\+\+/g, 'pp'); + return lang.toLowerCase(); }, - }), + 'punctuation': /```/, + }, }, ], 'title': [ diff --git a/src/shared/util.js b/src/shared/util.js index 04cb5b9f48..77b9a9e09a 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -76,3 +76,13 @@ export function kebabToCamelCase (kebab) { const [first, ...others] = kebab.split(/-/); return first + others.map(capitalize).join(''); } + +/** + * Converts the given camel case identifier to a kebab case identifier. + * + * @param {string} str + * @returns + */ +export function camelToKebabCase (str) { + return (str + '').replace(/[A-Z]/g, l => '-' + l.toLowerCase()); +} diff --git a/tests/languages/markdown/code-block_feature.html.test b/tests/languages/markdown/code-block_feature.html.test index 672c4cbca2..f443de39a7 100644 --- a/tests/languages/markdown/code-block_feature.html.test +++ b/tests/languages/markdown/code-block_feature.html.test @@ -11,7 +11,7 @@ ``` html - + < a @@ -38,7 +38,7 @@ ``` unknownLanguage - + <a href="#foo">Click me!</a> &amp; ``` diff --git a/tests/languages/markdown/code_block_language_detection_feature.html.test b/tests/languages/markdown/code_block_language_detection_feature.html.test index 6ccdc0b56e..95b8949427 100644 --- a/tests/languages/markdown/code_block_language_detection_feature.html.test +++ b/tests/languages/markdown/code_block_language_detection_feature.html.test @@ -19,27 +19,27 @@ plot(pressure) ``` js - let a = 0; + let a = 0; ``` ``` c++ - int a = 0; + int a = 0; ``` ``` c# - var a = 0; + var a = 0; ``` ``` {r pressure, echo=FALSE} - plot(pressure) + plot(pressure) ``` diff --git a/tests/languages/markdown/code_feature.test b/tests/languages/markdown/code_feature.test index b2e5a77f83..dda5b33710 100644 --- a/tests/languages/markdown/code_feature.test +++ b/tests/languages/markdown/code_feature.test @@ -21,7 +21,7 @@ var a = 0; ["code", "\tfoobar\r\n\tcontinuous"], ["code", [ - ["punctuation", "```"], 
["code-language", " js"], + ["punctuation", "```"], ["code-language", "js"], ["code-block", "var a = 0;"], ["punctuation", "```"] ]] From ca35296026adfa700ec7916e195e8a4235fb4b08 Mon Sep 17 00:00:00 2001 From: Dmitry Sharabin Date: Tue, 18 Nov 2025 19:18:13 +0100 Subject: [PATCH 2/7] [pattern tests] Don't treat named capturing groups as unused --- tests/pattern-tests.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pattern-tests.js b/tests/pattern-tests.js index 5c9e0cd3d0..6b98e079aa 100644 --- a/tests/pattern-tests.js +++ b/tests/pattern-tests.js @@ -269,7 +269,8 @@ function testPatterns (getPrism, mainLanguage) { await forEachPattern(({ ast, tokenPath, lookbehindGroup, reportError }) => { forEachCapturingGroup(ast.pattern, ({ group, number }) => { const isLookbehindGroup = group === lookbehindGroup; - if (group.references.length === 0 && !isLookbehindGroup) { + const isNamedGroup = !!group.name; // named capturing groups are used for tokenization, so they are not unused + if (group.references.length === 0 && !isLookbehindGroup && !isNamedGroup) { const fixes = []; fixes.push( `Make this group a non-capturing group ('(?:...)' instead of '(...)'). (It's usually this option.)` From 75a6d37613860b62e2666eaac7d86a44b81cfd14 Mon Sep 17 00:00:00 2001 From: Dmitry Sharabin Date: Wed, 19 Nov 2025 15:01:25 +0100 Subject: [PATCH 3/7] [regex-coverage test] Track patterns when Prism creates new `RegExp` objects The regex coverage test was failing to track pattern matches because Prism creates new `RegExp` objects when adding flags (e.g., `g` or `d`) during tokenization. The original approach intercepted `exec()` on individual regex objects, which missed matches on the newly created `RegExp` instances.
Changes: - Replace `String(regex)` key with normalized source+flags key to match patterns even when Prism creates new `RegExp` objects with different flags - Switch from per-regex interception to global `RegExp.prototype.exec` interception to catch all pattern matches, including on new `RegExp` objects - Use a simple loop instead of `String.replace` in flag normalization to avoid triggering our own `RegExp.exec` interception (which caused infinite loops) This fixes the tracking issue for patterns like the markdown code block pattern that were previously reported as untested despite being used in tests. --- tests/coverage.js | 53 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/tests/coverage.js b/tests/coverage.js index fa73f562fe..d3a1eedf64 100644 --- a/tests/coverage.js +++ b/tests/coverage.js @@ -9,6 +9,28 @@ describe('Pattern test coverage', () => { /** @type {Map} */ const patterns = new Map(); + /** + * Creates a key for pattern lookup based on source and normalized flags. + * Normalizes flags by removing `g` and `d` (which Prism may add) and sorting the rest. + * Uses a simple loop instead of `String.replace` to avoid triggering our `RegExp.exec` interception. 
+ * + * @param {string} source + * @param {string} flags + * @returns {string} + */ + function getSourceKey (source, flags) { + // Normalize flags: remove 'g' and 'd', then sort + let normalizedFlags = ''; + for (let i = 0; i < flags.length; i++) { + const flag = flags[i]; + if (flag !== 'g' && flag !== 'd') { + normalizedFlags += flag; + } + } + normalizedFlags = normalizedFlags.split('').sort().join(''); + return `${source}|${normalizedFlags}`; + } + /** * @param {string | string[]} languages * @returns {Promise} @@ -31,7 +53,9 @@ describe('Pattern test coverage', () => { const regex = makeGlobal(value); object[key] = regex; - const patternKey = String(regex); + // Register with the original regex's source and flags (before making global) + // This matches what Prism will use when creating new RegExp objects + const patternKey = getSourceKey(value.source, value.flags); let data = patterns.get(patternKey); if (!data) { data = { @@ -43,21 +67,30 @@ describe('Pattern test coverage', () => { patterns.set(patternKey, data); } data.from.add(tokenPath); - const { matches } = data; - - regex.exec = string => { - const match = RegExp.prototype.exec.call(regex, string); - if (match) { - matches.push(match); - } - return match; - }; } }); return Prism; } + // Intercept RegExp.prototype.exec globally to track all pattern matches. + // We use global interception (instead of per-regex interception) because Prism creates new RegExp + // objects when adding flags (see src/core/tokenize/match.js). Per-regex interception + // would only catch the original regex objects, missing matches on the new ones. + // This is safe because we only track patterns that exist in our map. 
+ const originalExec = RegExp.prototype.exec; + RegExp.prototype.exec = function (string) { + const match = originalExec.call(this, string); + if (match) { + const patternKey = getSourceKey(this.source, this.flags); + const data = patterns.get(patternKey); + if (data) { + data.matches.push(match); + } + } + return match; + }; + describe('Register all patterns', () => { it('all', async function () { this.slow(10 * 1000); From 8752c280849925b087794d8ce771304ae5b1a3f3 Mon Sep 17 00:00:00 2001 From: Dmitry Sharabin Date: Wed, 19 Nov 2025 15:05:56 +0100 Subject: [PATCH 4/7] [types] Adjust types and remove redundant imports --- src/shared/languages/templating.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/shared/languages/templating.js b/src/shared/languages/templating.js index 1410649070..6c47b7399d 100644 --- a/src/shared/languages/templating.js +++ b/src/shared/languages/templating.js @@ -127,9 +127,9 @@ export function templating (code, hostGrammar, templateGrammar, Prism) { hostGrammar = resolve.call(Prism, hostGrammar); templateGrammar = resolve.call(Prism, templateGrammar); - const { hostCode, tokenStack } = buildPlaceholders(code, templateGrammar, Prism); + const { hostCode, tokenStack } = buildPlaceholders(code, /** @type {Grammar | undefined} */ (templateGrammar), Prism); - const tokens = hostGrammar ? Prism.tokenize(hostCode, hostGrammar) : [hostCode]; + const tokens = hostGrammar ? 
Prism.tokenize(hostCode, /** @type {Grammar} */ (hostGrammar)) : [hostCode]; insertIntoHostToken(tokens, tokenStack); return tokens; } @@ -145,10 +145,10 @@ export function embeddedIn (hostGrammar) { } /** - * @import { Prism, Token } from '../../core.js'; - * @import { TokenStream, TokenStack, Grammar, LanguageRegistry} from '../../types.d.ts'; + * @import { Prism } from '../../core.js'; + * @import { TokenStream, TokenStack, Grammar } from '../../types.d.ts'; */ /** - * @typedef {Grammar | string | undefined | null} GrammarRef + * @typedef {Grammar | Function | string | undefined | null} GrammarRef */ From 8cb13942020df5dc9f8263cccbd38f7468337b06 Mon Sep 17 00:00:00 2001 From: Dmitry Sharabin Date: Thu, 20 Nov 2025 14:58:17 +0100 Subject: [PATCH 5/7] [markdown] Improve language detection Plus, add tests. --- src/languages/markdown.js | 2 +- .../code_block_language_detection_feature.html.test | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/languages/markdown.js b/src/languages/markdown.js index 846c14b97b..e502198d1e 100644 --- a/src/languages/markdown.js +++ b/src/languages/markdown.js @@ -100,7 +100,7 @@ export default { // code block // ``` pattern: - /^```(?:\s*)(?<codeLanguage>\{[^{}]*\}|[a-z+#-]+)(?:\n|\r\n?)(?<codeBlock>[\s\S]*?)(?:\n|\r\n?)```$/im, + /^```(?:\s*)(?<codeLanguage>\{[^{}]*\}|[a-z+#-]+)(?:[ \t][^\n\r]*)?(?:\n|\r\n?)(?<codeBlock>[\s\S]*?)(?:\n|\r\n?)```$/im, inside: { 'code-block': groups => { let lang = groups.codeLanguage; diff --git a/tests/languages/markdown/code_block_language_detection_feature.html.test b/tests/languages/markdown/code_block_language_detection_feature.html.test index 95b8949427..0d1460c6b3 100644 --- a/tests/languages/markdown/code_block_language_detection_feature.html.test +++ b/tests/languages/markdown/code_block_language_detection_feature.html.test @@ -14,6 +14,10 @@ var a = 0; plot(pressure) ``` +```js { data-copy="Copy the JavaScript snippet!"
} +let bar = 42; +``` + ---------------------------------------------------- @@ -43,3 +47,11 @@ plot(pressure) plot(pressure) ``` + + + ``` + js + { data-copy="Copy the JavaScript snippet!" } + let bar = 42; + ``` + \ No newline at end of file From 455b5bcfb75828eccaf1b67bde68c8cfdcf90e86 Mon Sep 17 00:00:00 2001 From: Dmitry Sharabin Date: Thu, 20 Nov 2025 15:39:14 +0100 Subject: [PATCH 6/7] [markdown] Remove redundant capturing groups --- src/languages/markdown.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/languages/markdown.js b/src/languages/markdown.js index e502198d1e..6ccf0d8b2f 100644 --- a/src/languages/markdown.js +++ b/src/languages/markdown.js @@ -100,13 +100,13 @@ export default { // code block // ``` pattern: - /^```(?:\s*)(?<codeLanguage>\{[^{}]*\}|[a-z+#-]+)(?:[ \t][^\n\r]*)?(?:\n|\r\n?)(?<codeBlock>[\s\S]*?)(?:\n|\r\n?)```$/im, + /^```\s*(?<codeLanguage>\{[^{}]*\}|[a-z+#-]+)(?:[ \t][^\n\r]*)?(?:\n|\r\n?)(?<codeBlock>[\s\S]*?)(?:\n|\r\n?)```$/im, inside: { 'code-block': groups => { let lang = groups.codeLanguage; // Extract language code from curly braces like {r pressure, echo=FALSE} → r if (lang.startsWith('{') && lang.endsWith('}')) { - const match = lang.slice(1, -1).match(/^(?:\s*)([a-z+#-]+)/i); + const match = lang.slice(1, -1).match(/^\s*([a-z+#-]+)/i); if (match) { lang = match[0]; } From ad8510285c5a32467389e710105ea1486bfdef14 Mon Sep 17 00:00:00 2001 From: Dmitry Sharabin Date: Wed, 3 Dec 2025 18:17:19 +0100 Subject: [PATCH 7/7] Address @LeaVerou's feedback: Add comment --- src/core/tokenize/match.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/tokenize/match.js b/src/core/tokenize/match.js index 6a4763e40c..99177fe0f2 100644 --- a/src/core/tokenize/match.js +++ b/src/core/tokenize/match.js @@ -25,7 +25,7 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re !grammar.hasOwnProperty(token) || token.startsWith('$') || !tokenValue || - typeof tokenValue === 'function' + typeof tokenValue ===
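'function' // functional tokens (only used in `inside` grammars for now) are resolved late, with the match's named groups, in the "Late resolving" branch of the named-group handling below; we should ignore them in all other cases ) { continue; }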