diff --git a/src/core/tokenize/match.js b/src/core/tokenize/match.js
index a151c36642..99177fe0f2 100644
--- a/src/core/tokenize/match.js
+++ b/src/core/tokenize/match.js
@@ -1,7 +1,7 @@
import { Token } from '../classes/token.js';
import singleton from '../prism.js';
import { tokenize } from './tokenize.js';
-import { resolve } from './util.js';
+import { resolve, tokenizeByNamedGroups } from './util.js';
/**
* @this {Prism}
@@ -21,7 +21,12 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re
for (const token in grammar) {
const tokenValue = grammar[token];
- if (!grammar.hasOwnProperty(token) || token.startsWith('$') || !tokenValue) {
+ if (
+ !grammar.hasOwnProperty(token) ||
+ token.startsWith('$') ||
+ !tokenValue ||
+ typeof tokenValue === 'function' // functional tokens ($inside for now) are resolved lazily when a match is tokenized by its named groups (see below); ignore them in all other cases
+ ) {
continue;
}
@@ -36,9 +41,20 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re
let { pattern, lookbehind = false, greedy = false, alias, inside } = patternObj;
const insideGrammar = resolve.call(prism, inside);
+ let flagsToAdd = '';
+
if (greedy && !pattern.global) {
// Without the global flag, lastIndex won't work
- patternObj.pattern = pattern = RegExp(pattern.source, pattern.flags + 'g');
+ flagsToAdd += 'g';
+ }
+
+ if (pattern.source?.includes('(?<') && pattern.hasIndices === false) {
+ // Has named groups, we need to be able to capture their indices
+ flagsToAdd += 'd';
+ }
+
+ if (flagsToAdd) {
+ patternObj.pattern = pattern = RegExp(pattern.source, pattern.flags + flagsToAdd);
}
for (
@@ -63,7 +79,8 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re
}
let removeCount = 1; // this is the to parameter of removeBetween
- let match;
+ /** @type {RegExpExecArray | null} */
+ let match = null;
if (greedy) {
match = matchPattern(pattern, pos, text, lookbehind);
@@ -117,6 +134,10 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re
const from = match.index;
const matchStr = match[0];
+
+ /** @type {TokenStream | string} */
+ let content = matchStr;
+
const before = str.slice(0, from);
const after = str.slice(from + matchStr.length);
@@ -134,14 +155,42 @@ export function _matchGrammar (text, tokenList, grammar, startNode, startPos, re
tokenList.removeRange(removeFrom, removeCount);
- const wrapped = new Token(
- token,
- insideGrammar
- ? tokenize.call(prism, matchStr, /** @type {Grammar} */ (insideGrammar))
- : matchStr,
- alias,
- matchStr
- );
+ const byGroups = match.groups ? tokenizeByNamedGroups(match) : null;
+ if (byGroups && byGroups.length > 1) {
+ content = byGroups
+ .map(arg => {
+ let content = typeof arg === 'string' ? arg : arg.content;
+ const type = typeof arg === 'string' ? undefined : arg.type;
+
+ if (insideGrammar) {
+ let localInsideGrammar = type ? insideGrammar[type] : insideGrammar;
+
+ if (typeof localInsideGrammar === 'function') {
+ // Late resolving
+ localInsideGrammar = resolve.call(
+ prism,
+ localInsideGrammar(match.groups)
+ );
+ }
+
+ if (localInsideGrammar) {
+ // @ts-ignore
+ content = tokenize.call(prism, content, localInsideGrammar);
+ }
+ }
+
+ return typeof arg === 'object' && arg.type
+ ? new Token(arg.type, content)
+ : content;
+ })
+ .flat(); // Flatten tokens like ['foo']
+ }
+ else if (insideGrammar) {
+ // @ts-ignore
+ content = tokenize.call(prism, content, insideGrammar);
+ }
+
+ const wrapped = new Token(token, content, alias, matchStr);
currentNode = tokenList.addAfter(removeFrom, wrapped);
if (after) {
@@ -216,7 +265,7 @@ function toGrammarToken (pattern) {
/**
* @import { Prism } from '../prism.js';
- * @import { Grammar, GrammarToken, GrammarTokens, RegExpLike } from '../../types.d.ts';
+ * @import { Grammar, GrammarToken, GrammarTokens, TokenStream, RegExpLike } from '../../types.d.ts';
*/
/**
diff --git a/src/core/tokenize/util.js b/src/core/tokenize/util.js
index d4b0ce94dd..a7573a9d0b 100644
--- a/src/core/tokenize/util.js
+++ b/src/core/tokenize/util.js
@@ -1,9 +1,10 @@
+import { camelToKebabCase } from '../../shared/util.js';
import singleton from '../prism.js';
/**
* @this {Prism}
- * @param {Grammar | string | null | undefined} reference
- * @returns {Grammar | undefined}
+ * @param {Grammar | string | Function | null | undefined} reference
+ * @returns {Grammar | Function | undefined}
*/
export function resolve (reference) {
const prism = this ?? singleton;
@@ -13,6 +14,11 @@ export function resolve (reference) {
ret = prism.languageRegistry.getLanguage(ret)?.resolvedGrammar;
}
+ if (typeof ret === 'function' && ret.length === 0) {
+ // Function with no arguments, resolve eagerly
+ ret = ret.call(prism);
+ }
+
if (typeof ret === 'object' && ret.$rest) {
const restGrammar = resolve.call(prism, ret.$rest) ?? {};
if (typeof restGrammar === 'object') {
@@ -25,6 +31,42 @@ export function resolve (reference) {
return /** @type {Grammar | undefined} */ (ret);
}
+/**
+ *
+ * @param {RegExpExecArray} match
+ * @returns {({type: string, content: string} | string)[]}
+ */
+export function tokenizeByNamedGroups (match) {
+ const str = match[0];
+ const result = [];
+ let i = 0;
+
+ const entries = Object.entries(match.indices?.groups || {})
+ .map(([type, [start, end]]) => ({
+ type,
+ start: start - match.index,
+ end: end - match.index,
+ }))
+ .sort((a, b) => a.start - b.start);
+
+ for (let { type, start, end } of entries) {
+ if (start > i) {
+ result.push(str.slice(i, start));
+ }
+
+ const content = str.slice(start, end);
+ type = camelToKebabCase(type);
+ result.push({ type, content });
+ i = end;
+ }
+
+ if (i < str.length) {
+ result.push(str.slice(i));
+ }
+
+ return result;
+}
+
/**
* @import { Prism } from '../prism.js';
* @import { Grammar, LanguageRegistry } from '../../types.d.ts';
diff --git a/src/languages/markdown.js b/src/languages/markdown.js
index 3b09d1e0e9..6ccf0d8b2f 100644
--- a/src/languages/markdown.js
+++ b/src/languages/markdown.js
@@ -99,73 +99,24 @@ export default {
// ```optional language
// code block
// ```
- pattern: /^```[\s\S]*?^```$/m,
- greedy: true,
- inside: /** @type {Grammar} */ ({
- 'code-block': {
- pattern: /^(```.*(?:\n|\r\n?))[\s\S]+?(?=(?:\n|\r\n?)^```$)/m,
- lookbehind: true,
- },
- 'code-language': {
- pattern: /^(```).+/,
- lookbehind: true,
- },
- 'punctuation': /```/,
- /** @type {Grammar['$tokenize']} */
- $tokenize (code, grammar, Prism) {
- const tokens = Prism.tokenize(code, withoutTokenize(grammar));
-
- /*
- * Add the correct `language-xxxx` class to this code block. Keep in mind that the `code-language` token
- * is optional. But the grammar is defined so that there is only one case we have to handle:
- *
- * token.content = [
- * ```,
- * xxxx,
- * '\n', // exactly one new lines (\r or \n or \r\n)
- * ...,
- * '\n', // exactly one new lines again
- * ```
- * ];
- */
-
- const codeLang = tokens[1];
- const codeBlock = tokens[3];
-
- if (
- typeof codeLang === 'object' &&
- typeof codeBlock === 'object' &&
- codeLang.type === 'code-language' &&
- codeBlock.type === 'code-block'
- ) {
- // this might be a language that Prism does not support
-
- // do some replacements to support C++, C#, and F#
- const lang = getTextContent(codeLang.content)
- .replace(/\b#/g, 'sharp')
- .replace(/\b\+\+/g, 'pp');
- // only use the first word
- const langName = /[a-z][\w-]*/i.exec(lang)?.[0].toLowerCase();
- if (langName) {
- codeBlock.addAlias('language-' + langName);
-
- const grammar =
- Prism.languageRegistry.getLanguage(lang)?.resolvedGrammar;
- if (grammar) {
- codeBlock.content = Prism.tokenize(
- getTextContent(codeBlock),
- grammar
- );
- }
- else {
- codeBlock.addAlias('needs-highlighting');
- }
+ pattern:
+ /^```\s*(?<codeLanguage>\{[^{}]*\}|[a-z+#-]+)(?:[ \t][^\n\r]*)?(?:\n|\r\n?)(?<codeBlock>[\s\S]*?)(?:\n|\r\n?)```$/im,
+ inside: {
+ 'code-block': groups => {
+ let lang = groups.codeLanguage;
+ // Extract language code from curly braces like {r pressure, echo=FALSE} → r
+ if (lang.startsWith('{') && lang.endsWith('}')) {
+ const match = lang.slice(1, -1).match(/^\s*([a-z+#-]+)/i);
+ if (match) {
+ lang = match[0];
}
}
-
- return tokens;
+ // Apply transformations: c++ → cpp, c# → csharp, f# → fsharp, etc.
+ lang = lang.replace(/\b#/g, 'sharp').replace(/\b\+\+/g, 'pp');
+ return lang.toLowerCase();
},
- }),
+ 'punctuation': /```/,
+ },
},
],
'title': [
diff --git a/src/shared/languages/templating.js b/src/shared/languages/templating.js
index 1410649070..6c47b7399d 100644
--- a/src/shared/languages/templating.js
+++ b/src/shared/languages/templating.js
@@ -127,9 +127,9 @@ export function templating (code, hostGrammar, templateGrammar, Prism) {
hostGrammar = resolve.call(Prism, hostGrammar);
templateGrammar = resolve.call(Prism, templateGrammar);
- const { hostCode, tokenStack } = buildPlaceholders(code, templateGrammar, Prism);
+ const { hostCode, tokenStack } = buildPlaceholders(code, /** @type {Grammar | undefined} */ (templateGrammar), Prism);
- const tokens = hostGrammar ? Prism.tokenize(hostCode, hostGrammar) : [hostCode];
+ const tokens = hostGrammar ? Prism.tokenize(hostCode, /** @type {Grammar} */ (hostGrammar)) : [hostCode];
insertIntoHostToken(tokens, tokenStack);
return tokens;
}
@@ -145,10 +145,10 @@ export function embeddedIn (hostGrammar) {
}
/**
- * @import { Prism, Token } from '../../core.js';
- * @import { TokenStream, TokenStack, Grammar, LanguageRegistry} from '../../types.d.ts';
+ * @import { Prism } from '../../core.js';
+ * @import { TokenStream, TokenStack, Grammar } from '../../types.d.ts';
*/
/**
- * @typedef {Grammar | string | undefined | null} GrammarRef
+ * @typedef {Grammar | Function | string | undefined | null} GrammarRef
*/
diff --git a/src/shared/util.js b/src/shared/util.js
index 04cb5b9f48..77b9a9e09a 100644
--- a/src/shared/util.js
+++ b/src/shared/util.js
@@ -76,3 +76,13 @@ export function kebabToCamelCase (kebab) {
const [first, ...others] = kebab.split(/-/);
return first + others.map(capitalize).join('');
}
+
+/**
+ * Converts the given camel case identifier to a kebab case identifier.
+ *
+ * @param {string} str
+ * @returns
+ */
+export function camelToKebabCase (str) {
+ return (str + '').replace(/[A-Z]/g, l => '-' + l.toLowerCase());
+}
diff --git a/tests/coverage.js b/tests/coverage.js
index fa73f562fe..d3a1eedf64 100644
--- a/tests/coverage.js
+++ b/tests/coverage.js
@@ -9,6 +9,28 @@ describe('Pattern test coverage', () => {
/** @type {Map} */
const patterns = new Map();
+ /**
+ * Creates a key for pattern lookup based on source and normalized flags.
+ * Normalizes flags by removing `g` and `d` (which Prism may add) and sorting the rest.
+ * Uses a simple loop instead of `String.replace` to avoid triggering our `RegExp.exec` interception.
+ *
+ * @param {string} source
+ * @param {string} flags
+ * @returns {string}
+ */
+ function getSourceKey (source, flags) {
+ // Normalize flags: remove 'g' and 'd', then sort
+ let normalizedFlags = '';
+ for (let i = 0; i < flags.length; i++) {
+ const flag = flags[i];
+ if (flag !== 'g' && flag !== 'd') {
+ normalizedFlags += flag;
+ }
+ }
+ normalizedFlags = normalizedFlags.split('').sort().join('');
+ return `${source}|${normalizedFlags}`;
+ }
+
/**
* @param {string | string[]} languages
* @returns {Promise}
@@ -31,7 +53,9 @@ describe('Pattern test coverage', () => {
const regex = makeGlobal(value);
object[key] = regex;
- const patternKey = String(regex);
+ // Register with the original regex's source and flags (before making global)
+ // This matches what Prism will use when creating new RegExp objects
+ const patternKey = getSourceKey(value.source, value.flags);
let data = patterns.get(patternKey);
if (!data) {
data = {
@@ -43,21 +67,30 @@ describe('Pattern test coverage', () => {
patterns.set(patternKey, data);
}
data.from.add(tokenPath);
- const { matches } = data;
-
- regex.exec = string => {
- const match = RegExp.prototype.exec.call(regex, string);
- if (match) {
- matches.push(match);
- }
- return match;
- };
}
});
return Prism;
}
+ // Intercept RegExp.prototype.exec globally to track all pattern matches.
+ // We use global interception (instead of per-regex interception) because Prism creates new RegExp
+ // objects when adding flags (see src/core/tokenize/match.js). Per-regex interception
+ // would only catch the original regex objects, missing matches on the new ones.
+ // This is safe because we only track patterns that exist in our map.
+ // NOTE(review): the native exec is kept in `originalExec`, but nothing in the code shown here
+ // restores it afterwards — confirm the override is meant to stay in place for the whole run.
+ const originalExec = RegExp.prototype.exec;
+ RegExp.prototype.exec = function (string) {
+ // Always delegate to the native implementation; the wrapper only observes the result
+ const match = originalExec.call(this, string);
+ if (match) {
+ // Look the regex up by source + normalized flags, so copies with added `g`/`d` flags share one entry
+ const patternKey = getSourceKey(this.source, this.flags);
+ const data = patterns.get(patternKey);
+ if (data) {
+ // Only regexes registered in `patterns` are recorded; all others pass through untouched
+ data.matches.push(match);
+ }
+ }
+ return match;
+ };
+
describe('Register all patterns', () => {
it('all', async function () {
this.slow(10 * 1000);
diff --git a/tests/languages/markdown/code-block_feature.html.test b/tests/languages/markdown/code-block_feature.html.test
index 672c4cbca2..f443de39a7 100644
--- a/tests/languages/markdown/code-block_feature.html.test
+++ b/tests/languages/markdown/code-block_feature.html.test
@@ -11,7 +11,7 @@
```
html
-
+
<
a
@@ -38,7 +38,7 @@
```
unknownLanguage
-
+
<a href="#foo">Click me!</a> &
```
diff --git a/tests/languages/markdown/code_block_language_detection_feature.html.test b/tests/languages/markdown/code_block_language_detection_feature.html.test
index 6ccdc0b56e..0d1460c6b3 100644
--- a/tests/languages/markdown/code_block_language_detection_feature.html.test
+++ b/tests/languages/markdown/code_block_language_detection_feature.html.test
@@ -14,32 +14,44 @@ var a = 0;
plot(pressure)
```
+```js { data-copy="Copy the JavaScript snippet!" }
+let bar = 42;
+```
+
----------------------------------------------------
```
js
- let a = 0;
+ let a = 0;
```
```
c++
- int a = 0;
+ int a = 0;
```
```
c#
- var a = 0;
+ var a = 0;
```
```
{r pressure, echo=FALSE}
- plot(pressure)
+ plot(pressure)
```
+
+
+ ```
+ js
+ { data-copy="Copy the JavaScript snippet!" }
+ let bar = 42;
+ ```
+
\ No newline at end of file
diff --git a/tests/languages/markdown/code_feature.test b/tests/languages/markdown/code_feature.test
index b2e5a77f83..dda5b33710 100644
--- a/tests/languages/markdown/code_feature.test
+++ b/tests/languages/markdown/code_feature.test
@@ -21,7 +21,7 @@ var a = 0;
["code", "\tfoobar\r\n\tcontinuous"],
["code", [
- ["punctuation", "```"], ["code-language", " js"],
+ ["punctuation", "```"], ["code-language", "js"],
["code-block", "var a = 0;"],
["punctuation", "```"]
]]
diff --git a/tests/pattern-tests.js b/tests/pattern-tests.js
index 5c9e0cd3d0..6b98e079aa 100644
--- a/tests/pattern-tests.js
+++ b/tests/pattern-tests.js
@@ -269,7 +269,8 @@ function testPatterns (getPrism, mainLanguage) {
await forEachPattern(({ ast, tokenPath, lookbehindGroup, reportError }) => {
forEachCapturingGroup(ast.pattern, ({ group, number }) => {
const isLookbehindGroup = group === lookbehindGroup;
- if (group.references.length === 0 && !isLookbehindGroup) {
+ const isNamedGroup = !!group.name; // named capturing groups are used for tokenization, so they are not unused
+ if (group.references.length === 0 && !isLookbehindGroup && !isNamedGroup) {
const fixes = [];
fixes.push(
`Make this group a non-capturing group ('(?:...)' instead of '(...)'). (It's usually this option.)`