From d140e7d321ff7e5d2f55a27927531c74d8b4f94e Mon Sep 17 00:00:00 2001 From: y21 <30553356+y21@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:38:25 +0100 Subject: [PATCH 1/5] rework regex engine to better understand backtracking --- crates/dash_compiler/src/instruction.rs | 9 +- crates/dash_compiler/src/lib.rs | 2 +- crates/dash_decompiler/src/decompiler.rs | 2 +- crates/dash_middle/src/compiler/constant.rs | 6 +- crates/dash_middle/src/parser/expr.rs | 8 +- crates/dash_parser/src/expr.rs | 14 +- crates/dash_regex/src/error.rs | 3 + crates/dash_regex/src/flags.rs | 3 +- crates/dash_regex/src/graph/build.rs | 160 +++++++++++++ crates/dash_regex/src/graph/eval.rs | 246 ++++++++++++++++++++ crates/dash_regex/src/graph/mod.rs | 52 +++++ crates/dash_regex/src/graph/node.rs | 93 ++++++++ crates/dash_regex/src/lib.rs | 94 +++++--- crates/dash_regex/src/node.rs | 24 +- crates/dash_regex/src/parser.rs | 16 +- crates/dash_vm/src/dispatch.rs | 4 +- crates/dash_vm/src/gc/trace.rs | 4 +- crates/dash_vm/src/js_std/regex.rs | 93 ++++---- crates/dash_vm/src/value/regex.rs | 12 +- 19 files changed, 700 insertions(+), 145 deletions(-) create mode 100644 crates/dash_regex/src/graph/build.rs create mode 100644 crates/dash_regex/src/graph/eval.rs create mode 100644 crates/dash_regex/src/graph/mod.rs create mode 100644 crates/dash_regex/src/graph/node.rs diff --git a/crates/dash_compiler/src/instruction.rs b/crates/dash_compiler/src/instruction.rs index 6821e6b6..62e8b2ef 100755 --- a/crates/dash_compiler/src/instruction.rs +++ b/crates/dash_compiler/src/instruction.rs @@ -133,13 +133,8 @@ impl InstructionBuilder<'_, '_> { Ok(()) } - pub fn build_regex_constant( - &mut self, - regex: dash_regex::ParsedRegex, - flags: dash_regex::Flags, - sym: Symbol, - ) -> Result<(), LimitExceededError> { - let RegexConstant(id) = self.current_function_mut().cp.add_regex((regex, flags, sym))?; + pub fn build_regex_constant(&mut self, regex: dash_regex::Regex, sym: Symbol) -> Result<(), 
LimitExceededError> { + let RegexConstant(id) = self.current_function_mut().cp.add_regex((regex, sym))?; self.write_instr(Instruction::Regex); self.writew(id); Ok(()) diff --git a/crates/dash_compiler/src/lib.rs b/crates/dash_compiler/src/lib.rs index b55bd7dd..60999611 100644 --- a/crates/dash_compiler/src/lib.rs +++ b/crates/dash_compiler/src/lib.rs @@ -606,7 +606,7 @@ impl Visitor> for FunctionCompiler<'_> { LiteralExpr::Number(n) => ib.build_number_constant(n), LiteralExpr::String(s) => ib.build_string_constant(s), LiteralExpr::Identifier(_) => unreachable!("identifiers are handled in visit_identifier_expression"), - LiteralExpr::Regex(regex, flags, sym) => ib.build_regex_constant(regex, flags, sym), + LiteralExpr::Regex(regex, sym) => ib.build_regex_constant(regex, sym), LiteralExpr::Null => ib.build_null_constant(), LiteralExpr::Undefined => ib.build_undefined_constant(), }; diff --git a/crates/dash_decompiler/src/decompiler.rs b/crates/dash_decompiler/src/decompiler.rs index 93de4195..9c544fd8 100644 --- a/crates/dash_decompiler/src/decompiler.rs +++ b/crates/dash_decompiler/src/decompiler.rs @@ -147,7 +147,7 @@ impl<'interner, 'buf> FunctionDecompiler<'interner, 'buf> { ), Instruction::Regex => ( "regex", - &self.interner.resolve(self.constants.regexes[RegexConstant(id)].2) as &dyn fmt::Display, + &self.interner.resolve(self.constants.regexes[RegexConstant(id)].1) as &dyn fmt::Display, ), _ => unreachable!(), }; diff --git a/crates/dash_middle/src/compiler/constant.rs b/crates/dash_middle/src/compiler/constant.rs index 9240bc07..3edca5ab 100755 --- a/crates/dash_middle/src/compiler/constant.rs +++ b/crates/dash_middle/src/compiler/constant.rs @@ -2,7 +2,7 @@ use core::fmt; use std::cell::Cell; use std::rc::Rc; -use dash_regex::{Flags, ParsedRegex}; +use dash_regex::Regex; use crate::index_type; use crate::indexvec::IndexThinVec; @@ -96,7 +96,7 @@ pub struct ConstantPool { pub symbols: IndexThinVec, pub booleans: IndexThinVec, pub functions: IndexThinVec, 
FunctionConstant>, - pub regexes: IndexThinVec<(ParsedRegex, Flags, Symbol), RegexConstant>, + pub regexes: IndexThinVec<(Regex, Symbol), RegexConstant>, } pub struct LimitExceededError; @@ -120,6 +120,6 @@ impl ConstantPool { add_symbol(symbols, Symbol) -> SymbolConstant, add_boolean(booleans, bool) -> BooleanConstant, add_function(functions, Rc) -> FunctionConstant, - add_regex(regexes, (ParsedRegex, Flags, Symbol)) -> RegexConstant + add_regex(regexes, (Regex, Symbol)) -> RegexConstant ); } diff --git a/crates/dash_middle/src/parser/expr.rs b/crates/dash_middle/src/parser/expr.rs index f3e5fa8a..2c4815d8 100644 --- a/crates/dash_middle/src/parser/expr.rs +++ b/crates/dash_middle/src/parser/expr.rs @@ -202,8 +202,8 @@ impl ExprKind { Self::Literal(LiteralExpr::Undefined) } - pub fn regex_literal(regex: dash_regex::ParsedRegex, flags: dash_regex::Flags, source: Symbol) -> Self { - Self::Literal(LiteralExpr::Regex(regex, flags, source)) + pub fn regex_literal(regex: dash_regex::Regex, source: Symbol) -> Self { + Self::Literal(LiteralExpr::Regex(regex, source)) } /// Creates a function call expression @@ -551,8 +551,8 @@ pub enum LiteralExpr { #[display(fmt = "\"{_0}\"")] String(Symbol), - #[display(fmt = "/{_2}/")] - Regex(dash_regex::ParsedRegex, dash_regex::Flags, Symbol), + #[display(fmt = "/{_1}/")] + Regex(dash_regex::Regex, Symbol), #[display(fmt = "null")] Null, diff --git a/crates/dash_parser/src/expr.rs b/crates/dash_parser/src/expr.rs index 64740d57..20857993 100644 --- a/crates/dash_parser/src/expr.rs +++ b/crates/dash_parser/src/expr.rs @@ -10,7 +10,6 @@ use dash_middle::parser::statement::{ StatementKind, }; use dash_middle::sourcemap::Span; -use dash_regex::Flags; use crate::{Parser, any}; @@ -874,14 +873,9 @@ impl Parser<'_, '_> { // Trim / prefix and suffix let full = self.interner.resolve(literal); let full = &full[1..full.len() - 1]; - let (nodes, flags) = match dash_regex::Parser::new(full.as_bytes()).parse_all().and_then(|node| { - self.interner 
- .resolve(flags) - .parse::() - .map_err(Into::into) - .map(|flags| (node, flags)) - }) { - Ok((nodes, flags)) => (nodes, flags), + let flags = self.interner.resolve(flags); + let regex = match dash_regex::compile(full, flags) { + Ok(regex) => regex, Err(err) => { let tok = *self.previous().unwrap(); self.error(Error::RegexSyntaxError(tok, err)); @@ -890,7 +884,7 @@ impl Parser<'_, '_> { }; Expr { span: current.span, - kind: ExprKind::regex_literal(nodes, flags, literal), + kind: ExprKind::regex_literal(regex, literal), } } other if other.is_identifier() => { diff --git a/crates/dash_regex/src/error.rs b/crates/dash_regex/src/error.rs index 1a3d127b..252be1e0 100644 --- a/crates/dash_regex/src/error.rs +++ b/crates/dash_regex/src/error.rs @@ -10,6 +10,9 @@ pub enum Error { #[error("unexpected character: {}", *.0 as char)] UnexpectedChar(u8), + #[error("number too large to fit in a u32")] + Overflow, + #[error("{0}")] Flags(#[from] flags::Error), } diff --git a/crates/dash_regex/src/flags.rs b/crates/dash_regex/src/flags.rs index 6b17cad2..eef465d3 100644 --- a/crates/dash_regex/src/flags.rs +++ b/crates/dash_regex/src/flags.rs @@ -1,12 +1,11 @@ use std::str::FromStr; use bitflags::bitflags; -use serde::{Deserialize, Serialize}; use thiserror::Error; bitflags! 
{ #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] - #[cfg_attr(feature = "format", derive(Serialize, Deserialize))] + #[cfg_attr(feature = "format", derive(serde::Serialize, serde::Deserialize))] pub struct Flags: u8 { const GLOBAL = 1; const IGNORE_CASE = 2; diff --git a/crates/dash_regex/src/graph/build.rs b/crates/dash_regex/src/graph/build.rs new file mode 100644 index 00000000..59dd2fce --- /dev/null +++ b/crates/dash_regex/src/graph/build.rs @@ -0,0 +1,160 @@ +use core::slice; +use std::collections::HashMap; + +use crate::graph::node::{BuildGraph, CharacterClassItem, Node, NodeId, NodeKind}; +use crate::node::{CharacterClassItem as ParsedCharacterClassItem, GroupCaptureMode}; + +use crate::node::Node as ParseNode; +use crate::parser::ParsedRegex; + +use super::node::Graph; + +type CaptureGroupMap = HashMap<*const ParseNode, u32>; + +pub fn number_groups(regex: &ParsedRegex) -> CaptureGroupMap { + fn inner(map: &mut CaptureGroupMap, _counter: &mut u32, nodes: &[ParseNode]) { + if let Some((node, rest)) = nodes.split_first() { + match node { + ParseNode::Group(id, nodes) => { + if let GroupCaptureMode::Id(id) = *id { + map.insert(node, id.try_into().unwrap()); + } + + inner(map, _counter, nodes); + } + ParseNode::Optional(node) => inner(map, _counter, slice::from_ref(&**node)), + ParseNode::Or(left, right) => { + inner(map, _counter, left); + inner(map, _counter, right); + } + ParseNode::Repetition { node, .. 
} => inner(map, _counter, slice::from_ref(&**node)), + ParseNode::AnyCharacter + | ParseNode::MetaSequence(_) + | ParseNode::LiteralCharacter(_) + | ParseNode::CharacterClass(_) + | ParseNode::Anchor(_) => {} // cannot contain group nodes + } + + inner(map, _counter, rest); + } + } + + let mut map = HashMap::new(); + let counter = &mut 0; + inner(&mut map, counter, ®ex.nodes); + map +} + +pub fn build(group_numbers: &CaptureGroupMap, regex: &ParsedRegex) -> (Graph, Option) { + fn lower_repetition( + graph: &mut BuildGraph, + group_numbers: &CaptureGroupMap, + node: &ParseNode, + min: u32, + max: Option, + next: Option, + ) -> NodeId { + let end_id = graph.push(Node { + next, + kind: NodeKind::RepetitionEnd { + start: NodeId::DUMMY, // will be set later + }, + }); + let inner_id = inner(graph, group_numbers, slice::from_ref(node), Some(end_id)).unwrap(); + let start_id = graph.push(Node { + next, + kind: NodeKind::RepetitionStart { + min, + max, + inner: inner_id, + }, + }); + let NodeKind::RepetitionEnd { start } = &mut graph[end_id].kind else { + unreachable!() + }; + *start = start_id; + start_id + } + + fn inner( + graph: &mut BuildGraph, + group_numbers: &CaptureGroupMap, + nodes: &[ParseNode], + outer_next: Option, + ) -> Option { + if let Some((current, rest)) = nodes.split_first() { + let next = inner(graph, group_numbers, rest, outer_next); + match *current { + ParseNode::AnyCharacter => Some(graph.push(Node { + next, + kind: NodeKind::AnyCharacter, + })), + ParseNode::MetaSequence(meta) => Some(graph.push(Node { + next, + kind: NodeKind::Meta(meta), + })), + ParseNode::Repetition { ref node, min, max } => { + Some(lower_repetition(graph, group_numbers, node, min, max, next)) + } + ParseNode::LiteralCharacter(literal) => Some(graph.push(Node { + next, + kind: NodeKind::Literal(literal), + })), + ParseNode::CharacterClass(ref parse_items) => { + let items = parse_items + .iter() + .map(|item| match *item { + 
ParsedCharacterClassItem::Node(ParseNode::AnyCharacter) => CharacterClassItem::AnyCharacter, + ParsedCharacterClassItem::Node(ParseNode::LiteralCharacter(literal)) => { + CharacterClassItem::Literal(literal) + } + ParsedCharacterClassItem::Node(ParseNode::MetaSequence(meta)) => { + CharacterClassItem::Meta(meta) + } + ParsedCharacterClassItem::Node(ref node) => { + panic!("cannot lower {node:?} in character class") + } + ParsedCharacterClassItem::Range(from, to) => CharacterClassItem::Range(from, to), + }) + .collect::>(); + + Some(graph.push(Node { + next, + kind: NodeKind::CharacterClass(items), + })) + } + ParseNode::Anchor(anchor) => Some(graph.push(Node { + next, + kind: NodeKind::Anchor(anchor), + })), + ParseNode::Or(ref left, ref right) => { + let left = inner(graph, group_numbers, left, next).unwrap(); + let right = inner(graph, group_numbers, right, next).unwrap(); + Some(graph.push(Node { + next, + kind: NodeKind::Or(left, right), + })) + } + ParseNode::Optional(ref node) => Some(lower_repetition(graph, group_numbers, node, 0, Some(1), next)), + ParseNode::Group(_, ref nodes) => { + let group_id = group_numbers.get(&(current as *const ParseNode)).copied(); + let end = graph.push(Node { + next, + kind: NodeKind::GroupEnd { group_id }, + }); + let inner_id = inner(graph, group_numbers, nodes, Some(end)).unwrap(); + Some(graph.push(Node { + next: Some(inner_id), + kind: NodeKind::GroupStart { group_id }, + })) + } + } + } else { + outer_next + } + } + + let mut graph = BuildGraph::new(); + let root = inner(&mut graph, group_numbers, ®ex.nodes, None); + (graph.finalize(), root) +} diff --git a/crates/dash_regex/src/graph/eval.rs b/crates/dash_regex/src/graph/eval.rs new file mode 100644 index 00000000..8867c3dc --- /dev/null +++ b/crates/dash_regex/src/graph/eval.rs @@ -0,0 +1,246 @@ +use crate::graph::node::CharacterClassItem; +use crate::node::Anchor; + +use super::Regex; +use super::node::{Graph, NodeId, NodeKind}; + +#[derive(Debug, Copy, Clone, 
PartialEq, Eq)] +pub enum ProcessedGroupState { + Confirmed, + Unconfirmed, +} + +struct Cx<'a> { + processed_groups: &'a mut [Option<(u32, u32, ProcessedGroupState)>], + pending_groups: &'a mut [(Option, Option)], + /// The full input source of this "attempt". + full_input: &'a [u8], + graph: &'a Graph, + /// The offset of `full_input` in the *original* input string. + offset_from_original: u32, + current_repetition_count: Option, +} + +impl Cx<'_> { + /// Returns the offset of the passed in slice relative to the full input. + /// The slice must actually be obtained from the full input for the return value to make sense. + /// The value is unspecified (but not undefined) if passed an input slice from somewhere else. + pub fn offset(&self, s: &[u8]) -> u32 { + (s.as_ptr().addr() - self.full_input.as_ptr().addr()) as u32 + } + + /// Same as `offset`, but returns it relative to the original input. + pub fn offset_from_original(&self, s: &[u8]) -> u32 { + self.offset_from_original + self.offset(s) + } + + /// Creates a new context usable for the specified node. + pub fn for_node(&mut self, node: NodeId, origin: NodeId) -> Cx<'_> { + let Self { + processed_groups: &mut ref mut processed_groups, + pending_groups: &mut ref mut pending_groups, + full_input, + graph, + offset_from_original, + mut current_repetition_count, + } = *self; + + if let NodeKind::RepetitionStart { .. } = graph[node].kind { + if let NodeKind::RepetitionEnd { .. } = graph[origin].kind { + current_repetition_count = Some(current_repetition_count.unwrap() + 1); + } else { + current_repetition_count = Some(0); + } + } + + Cx { + processed_groups, + pending_groups, + full_input, + graph, + offset_from_original, + current_repetition_count, + } + } +} + +fn step(mut cx: Cx, node_id: NodeId, mut input: &[u8]) -> bool { + // The reason for shadowing cx with a borrow here is so that you're forced to go through `Cx::for_node` when calling `step(...)`. 
+ // You can't pass the same `cx` when evaluating a sub-node. + let cx = &mut cx; + let node = &cx.graph[node_id]; + + let mut matches = match node.kind { + NodeKind::AnyCharacter => { + if let Some(rest) = input.get(1..) { + input = rest; + true + } else { + false + } + } + NodeKind::RepetitionStart { min, max, inner } => 'arm: { + let current_repetition_count = cx.current_repetition_count.unwrap(); + + if let Some(max) = max { + if current_repetition_count >= max { + // We've done `max` number of iterations. + break 'arm true; + } + } + + if step(cx.for_node(inner, node_id), inner, input) { + // This has automatically also checked the rest input. Don't need to do that again here after the match. + return true; + } + current_repetition_count >= min + } + NodeKind::Anchor(Anchor::StartOfString) => input.len() == cx.full_input.len(), + NodeKind::Anchor(Anchor::EndOfString) => input.is_empty(), + NodeKind::Meta(meta) => { + if let Some((_, rest)) = input.split_first().filter(|&(&c, _)| meta.matches(c)) { + input = rest; + true + } else { + false + } + } + NodeKind::CharacterClass(ref items) => { + if let Some((_, rest)) = input.split_first().filter(|&(&c, _)| { + items.iter().copied().any(|item| match item { + CharacterClassItem::Literal(lit) => lit == c, + CharacterClassItem::AnyCharacter => true, + CharacterClassItem::Meta(meta) => meta.matches(c), + CharacterClassItem::Range(min, max) => (min..=max).contains(&c), + }) + }) { + input = rest; + true + } else { + false + } + } + NodeKind::Literal(lit) => { + if let Some((_, rest)) = input.split_first().filter(|&(&c, _)| c == lit) { + input = rest; + true + } else { + false + } + } + NodeKind::Or(left, right) => { + return step(cx.for_node(left, node_id), left, input) || step(cx.for_node(right, node_id), right, input); + } + NodeKind::RepetitionEnd { start } => { + return step(cx.for_node(start, node_id), start, input); + } + NodeKind::GroupStart { group_id } => { + if let Some(group_id) = group_id { + let offset = 
cx.offset_from_original(input); + cx.pending_groups[group_id as usize] = (Some(offset), None); + } + true + } + NodeKind::GroupEnd { group_id } => { + if let Some(group_id) = group_id { + let group_id = group_id as usize; + + let old = cx.processed_groups[group_id]; + let start = cx.pending_groups[group_id].0.unwrap(); + let end = cx.offset_from_original(input); + cx.processed_groups[group_id] = Some((start, end, ProcessedGroupState::Unconfirmed)); + + return if let Some(next) = node.next { + let matches = step(cx.for_node(next, node_id), next, input); + cx.pending_groups[group_id] = (Some(start), Some(end)); + + if matches { + if cx.processed_groups[group_id].is_none_or(|(.., s)| s == ProcessedGroupState::Unconfirmed) { + // This group may have been processed again in a subsequent iteration. + // Only overwrite it back with this iteration's if it's still unconfirmed + cx.processed_groups[group_id] = Some((start, end, ProcessedGroupState::Confirmed)); + } + + true + } else { + // We did not match. Restore to old. + if let Some((a, b, _)) = old { + cx.processed_groups[group_id] = Some((a, b, ProcessedGroupState::Unconfirmed)); + } else { + cx.processed_groups[group_id] = None; + } + false + } + } else { + // No next node. + cx.processed_groups[group_id].as_mut().unwrap().2 = ProcessedGroupState::Confirmed; + true + }; + } + + true + } + }; + + if let Some(next) = node.next { + matches = matches && step(cx.for_node(next, node_id), next, input); + } + matches +} + +#[derive(Debug)] +pub struct EvalSuccess { + pub groups: Box<[Option<(u32, u32, ProcessedGroupState)>]>, +} + +#[derive(Debug)] +pub struct NoMatch; + +pub fn eval(regex: &Regex, mut input: &[u8]) -> Result { + let Some(root) = regex.root else { + // Nothing to do for empty regexes. 
+ return Ok(EvalSuccess { groups: Box::default() }); + }; + + let mut processed_groups = vec![None; regex.group_count as usize].into_boxed_slice(); + let mut pending_groups = vec![(None, None); regex.group_count as usize].into_boxed_slice(); + let mut offset_from_original = 0; + loop { + // TODO: add a fast reject path where we find the first required character and seek to it in `input` + processed_groups[0] = Some(( + offset_from_original, + offset_from_original + input.len() as u32, + ProcessedGroupState::Confirmed, + )); + processed_groups[1..].fill(None); + pending_groups.fill((None, None)); + + let cx = Cx { + processed_groups: &mut processed_groups, + pending_groups: &mut pending_groups, + current_repetition_count: if let NodeKind::RepetitionStart { .. } = regex.graph[root].kind { + Some(0) + } else { + None + }, + offset_from_original, + full_input: input, + graph: ®ex.graph, + }; + + if step(cx, root, input) { + return Ok(EvalSuccess { + groups: processed_groups, + }); + } + + if let Some(rest) = input.get(1..) { + offset_from_original += 1; + input = rest; + } else { + break; + } + } + + Err(NoMatch) +} diff --git a/crates/dash_regex/src/graph/mod.rs b/crates/dash_regex/src/graph/mod.rs new file mode 100644 index 00000000..a0e8b11c --- /dev/null +++ b/crates/dash_regex/src/graph/mod.rs @@ -0,0 +1,52 @@ +mod build; +pub mod eval; +pub mod node; + +use eval::{EvalSuccess, NoMatch}; +use node::{Graph, NodeId}; + +use crate::Flags; +use crate::parser::ParsedRegex; + +/// A finalized, compiled regex. 
+#[derive(Debug, Clone)] +#[cfg_attr(feature = "format", derive(serde::Serialize, serde::Deserialize))] +pub struct Regex { + graph: Graph, + flags: Flags, + root: Option, + group_count: u32, +} + +impl Regex { + pub fn eval(&self, input: &str) -> Result { + eval::eval(self, input.as_bytes()) + } + + pub fn matches(&self, input: &str) -> bool { + self.eval(input).is_ok() + } + + pub fn flags(&self) -> Flags { + self.flags + } +} + +pub fn compile(regex: ParsedRegex, flags: Flags) -> Regex { + // We're going to have a hashmap with pointers as keys. + // Accidentally moving the regex would invalidate pointers. + // We never actually dereference them so it doesn't matter for safety, but it would still lead to + // bugs. So make it a borrow. + let regex = ®ex; + + let numbered = build::number_groups(regex); + let (graph, root) = build::build(&numbered, regex); + let group_count = u32::try_from(regex.group_count).unwrap(); + + Regex { + graph, + group_count, + flags, + root, + } +} diff --git a/crates/dash_regex/src/graph/node.rs b/crates/dash_regex/src/graph/node.rs new file mode 100644 index 00000000..07afb359 --- /dev/null +++ b/crates/dash_regex/src/graph/node.rs @@ -0,0 +1,93 @@ +use std::ops::{Deref, DerefMut, Index, IndexMut}; + +use crate::node::{Anchor, MetaSequence}; + +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "format", derive(serde::Serialize, serde::Deserialize))] +pub struct NodeId(u32); +impl NodeId { + pub(super) const DUMMY: NodeId = NodeId(u32::MAX); +} + +#[derive(Debug, Clone)] +#[cfg_attr(feature = "format", derive(serde::Serialize, serde::Deserialize))] +pub struct Node { + pub next: Option, + pub kind: NodeKind, +} + +#[derive(Debug, Clone)] +#[cfg_attr(feature = "format", derive(serde::Serialize, serde::Deserialize))] +pub enum NodeKind { + AnyCharacter, + RepetitionStart { + min: u32, + max: Option, + /// The node being repeated + inner: NodeId, + }, + Anchor(Anchor), + Meta(MetaSequence), + CharacterClass(Box<[CharacterClassItem]>), + 
Literal(u8), + Or(NodeId, NodeId), + RepetitionEnd { + /// The `RepetitionStart` node to jump to when executing the next repetition iteration + start: NodeId, + }, + GroupStart { + group_id: Option, + }, + GroupEnd { + group_id: Option, + }, +} + +#[derive(Debug, Copy, Clone)] +#[cfg_attr(feature = "format", derive(serde::Serialize, serde::Deserialize))] +pub enum CharacterClassItem { + Literal(u8), + AnyCharacter, + Meta(MetaSequence), + Range(u8, u8), +} + +pub type BuildGraph = Graph>; + +#[derive(Debug, Clone)] +#[cfg_attr(feature = "format", derive(serde::Serialize, serde::Deserialize))] +pub struct Graph> { + nodes: C, +} + +impl BuildGraph { + pub fn new() -> Self { + Self { nodes: Vec::new() } + } + + pub fn push(&mut self, node: Node) -> NodeId { + let id = u32::try_from(self.nodes.len()).expect("attempted to insert more than 2^32 nodes"); + self.nodes.push(node); + NodeId(id) + } + + pub fn finalize(self) -> Graph { + Graph { + nodes: self.nodes.into_boxed_slice(), + } + } +} + +// Requires an indirection through the deref trait because `Box<[T]>` does not implement `Index`... 
+impl>> Index for Graph { + type Output = Node; + fn index(&self, index: NodeId) -> &Self::Output { + &self.nodes[index.0 as usize] + } +} + +impl>> IndexMut for Graph { + fn index_mut(&mut self, index: NodeId) -> &mut Self::Output { + &mut self.nodes[index.0 as usize] + } +} diff --git a/crates/dash_regex/src/lib.rs b/crates/dash_regex/src/lib.rs index 4f5d539c..09e9c150 100644 --- a/crates/dash_regex/src/lib.rs +++ b/crates/dash_regex/src/lib.rs @@ -1,55 +1,73 @@ +use std::str::FromStr; + pub use error::Error; -pub use matcher::Matcher; -pub use node::Node; -pub use parser::Parser; +pub use graph::eval::EvalSuccess; pub mod error; pub mod flags; -pub mod matcher; -pub mod node; -pub mod parser; -mod stream; -mod visitor; +mod node; +mod parser; + +mod graph; pub use flags::Flags; -pub use parser::ParsedRegex; +pub use graph::Regex; +use parser::Parser; + +pub trait ParseFlags { + fn parse(self) -> Result; +} + +impl ParseFlags for &str { + fn parse(self) -> Result { + Flags::from_str(self).map_err(Into::into) + } +} + +impl ParseFlags for Flags { + fn parse(self) -> Result { + Ok(self) + } +} + +pub fn compile(input: &str, flags: impl ParseFlags) -> Result { + let parsed = Parser::new(input.as_bytes()).parse_all()?; + let flags = flags.parse()?; + Ok(graph::compile(parsed, flags)) +} #[cfg(test)] #[test] pub fn test() { - use parser::Parser; - - use crate::matcher::Matcher; + fn assert_matches_groups(regex: &Regex, input: &str, groups: &[&str]) { + let res = regex.eval(input).unwrap(); - fn matches(regex: &str, input: &str) -> bool { - let nodes = Parser::new(regex.as_bytes()).parse_all().unwrap(); - let mut matcher = Matcher::new(&nodes, input.as_bytes()); - matcher.matches() + for (&expected, got) in groups.iter().zip(&res.groups[1..]) { + let (from, to, _) = got.expect("no group"); + assert_eq!(expected, &input[from as usize..to as usize]); + } } - fn matches_groups(regex: &str, input: &str, groups: &[&str]) -> bool { - let nodes = 
Parser::new(regex.as_bytes()).parse_all().unwrap(); - let mut matcher = Matcher::new(&nodes, input.as_bytes()); - matcher.matches() - && nodes.group_count - 1 == groups.len() - && matcher - .groups - .iter() - .skip(1) - .zip(groups) - .all(|(group, expected)| group.map(|range| &input[range]) == Some(*expected)) - } + let hex_regex = compile( + "^#?([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})$", + "", + ) + .unwrap(); + assert!(hex_regex.matches("#aabbccdd")); + assert!(!hex_regex.matches("#AAb")); + assert!(hex_regex.matches("#aBcDEEf0")); - const HEX_REGEX: &str = "^#?([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})$"; - assert!(matches(HEX_REGEX, "#aabbccdd")); - assert!(!matches(HEX_REGEX, "#AAb")); - assert!(matches(HEX_REGEX, "#aBcDEEf0")); + assert!(compile("\\d", "").unwrap().matches("a1")); + assert!(compile("V\\dX", "").unwrap().matches("aV1aVaXaV1Xs")); + assert!(!compile("V\\dX", "").unwrap().matches("aV1aVaXaV?Xs")); - assert!(matches("\\d", "a1")); - assert!(matches("V\\dX", "aV1aVaXaV1Xs")); - assert!(!matches("V\\dX", "aV1aVaXaV?Xs")); + let rgb_regex = compile(r"rgb[\s|\(]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))[,|\s]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))[,|\s]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))\s*\)?","").unwrap(); + assert!(rgb_regex.matches("rgb(255, 255, 255)")); + assert_matches_groups(&rgb_regex, "rgb(144, 17, 9)", &["144", "17", "9"]); - const RGB: &str = r"rgb[\s|\(]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))[,|\s]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))[,|\s]+((?:[-\+]?\d*\.\d+%?)|(?:[-\+]?\d+%?))\s*\)?"; - assert!(matches(RGB, "rgb(255, 255, 255)")); - assert!(matches_groups(RGB, "rgb(144, 17, 9)", &["144", "17", "9"])); + // Backtracking + assert_matches_groups(&compile("x(.+)x", "").unwrap(), "vxxxv", &["x"]); + assert_matches_groups(&compile(".(.)+abcd", "").unwrap(), "vxabcdabcabcabcabc", &["x"]); + assert_matches_groups(&compile("(.+)+a", "").unwrap(), "bba", &["bb"]); + 
assert_matches_groups(&compile("(.+)+ac", "").unwrap(), "bacbaabaabaa", &["b"]); } diff --git a/crates/dash_regex/src/node.rs b/crates/dash_regex/src/node.rs index 2b89bbcd..b13c2dac 100644 --- a/crates/dash_regex/src/node.rs +++ b/crates/dash_regex/src/node.rs @@ -9,7 +9,7 @@ pub enum CharacterClassItem { } #[cfg_attr(feature = "format", derive(Serialize, Deserialize))] -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum GroupCaptureMode { /// `(?:...)` None, @@ -24,8 +24,8 @@ pub enum Node { MetaSequence(MetaSequence), Repetition { node: Box, - min: usize, - max: Option, + min: u32, + max: Option, }, LiteralCharacter(u8), CharacterClass(Vec), @@ -36,14 +36,14 @@ pub enum Node { } impl Node { - pub fn unbounded_max_repetition(node: Node, min: usize) -> Self { + pub fn unbounded_max_repetition(node: Node, min: u32) -> Self { Self::Repetition { node: Box::new(node), min, max: None, } } - pub fn repetition(node: Node, min: usize, max: usize) -> Self { + pub fn repetition(node: Node, min: u32, max: u32) -> Self { Self::Repetition { node: Box::new(node), min, @@ -56,15 +56,25 @@ impl Node { } #[cfg_attr(feature = "format", derive(Serialize, Deserialize))] -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum MetaSequence { Digit, Word, Whitespace, } +impl MetaSequence { + pub fn matches(self, c: u8) -> bool { + match self { + MetaSequence::Digit => c.is_ascii_digit(), + MetaSequence::Word => c.is_ascii_alphanumeric() || c == b'_', + MetaSequence::Whitespace => c.is_ascii_whitespace(), + } + } +} + #[cfg_attr(feature = "format", derive(Serialize, Deserialize))] -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum Anchor { StartOfString, EndOfString, diff --git a/crates/dash_regex/src/parser.rs b/crates/dash_regex/src/parser.rs index aaa23eee..100c435b 100644 --- a/crates/dash_regex/src/parser.rs +++ b/crates/dash_regex/src/parser.rs @@ -1,7 +1,5 @@ use std::mem; -use 
serde::{Deserialize, Serialize}; - use crate::error::Error; use crate::node::{Anchor, CharacterClassItem, GroupCaptureMode, MetaSequence, Node}; @@ -12,7 +10,7 @@ pub struct Parser<'a> { } #[derive(Debug, Clone)] -#[cfg_attr(feature = "format", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "format", derive(serde::Serialize, serde::Deserialize))] pub struct ParsedRegex { pub nodes: Vec, pub group_count: usize, @@ -101,12 +99,14 @@ impl<'a> Parser<'a> { Ok(node) } - fn read_int(&mut self) -> Result { - let mut number = 0; + fn read_u32(&mut self) -> Result { + let mut number = 0u32; while let Some(byte) = self.current() { match byte { b'0'..=b'9' => { - number = number * 10 + (byte - b'0') as usize; + number = number.checked_mul(10).ok_or(Error::Overflow)?; + number = number.checked_add((byte - b'0') as u32).ok_or(Error::Overflow)?; + self.advance(); } _ => return Ok(number), @@ -116,7 +116,7 @@ impl<'a> Parser<'a> { } fn parse_bounded_repetition(&mut self, node: Node) -> Result { - let min = self.read_int()?; + let min = self.read_u32()?; match self.current() { Some(b',') => { self.advance(); @@ -126,7 +126,7 @@ impl<'a> Parser<'a> { Ok(Node::unbounded_max_repetition(node, min)) } _ => { - let max = self.read_int()?; + let max = self.read_u32()?; self.advance(); // } Ok(Node::repetition(node, min, max)) } diff --git a/crates/dash_vm/src/dispatch.rs b/crates/dash_vm/src/dispatch.rs index 4a6d418d..68167b64 100755 --- a/crates/dash_vm/src/dispatch.rs +++ b/crates/dash_vm/src/dispatch.rs @@ -618,9 +618,9 @@ mod handlers { pub fn regex_constant(mut cx: DispatchContext<'_>) -> Result, Unrooted> { let id = cx.fetchw_and_inc_ip(); - let (nodes, flags, source) = &cx.constants().regexes[RegexConstant(id)]; + let (regex, source) = &cx.constants().regexes[RegexConstant(id)]; - let regex = RegExp::new(nodes.clone(), *flags, JsString::from(*source), &cx.scope); + let regex = RegExp::new(regex.clone(), JsString::from(*source), &cx.scope); let regex = 
cx.scope.register(regex); cx.push_stack(Value::object(regex).into()); Ok(None) diff --git a/crates/dash_vm/src/gc/trace.rs b/crates/dash_vm/src/gc/trace.rs index 3e0e49b6..236fc72c 100644 --- a/crates/dash_vm/src/gc/trace.rs +++ b/crates/dash_vm/src/gc/trace.rs @@ -6,7 +6,7 @@ use std::rc::Rc; use dash_middle::compiler::constant::ConstantPool; use dash_middle::interner::StringInterner; -use dash_regex::{Flags, ParsedRegex}; +use dash_regex::Regex; use crate::value::Unrooted; use crate::value::primitive::{Null, Number, Undefined}; @@ -184,7 +184,7 @@ unsafe impl Trace for dash_middle::compiler::constant::Function { booleans.as_slice().trace(cx); functions.as_slice().trace(cx); - for (ParsedRegex { .. }, Flags { .. }, sym) in regexes.as_slice() { + for (Regex { .. }, sym) in regexes.as_slice() { sym.trace(cx); } } diff --git a/crates/dash_vm/src/js_std/regex.rs b/crates/dash_vm/src/js_std/regex.rs index a83d1025..1569ebc0 100644 --- a/crates/dash_vm/src/js_std/regex.rs +++ b/crates/dash_vm/src/js_std/regex.rs @@ -6,9 +6,7 @@ use crate::value::ops::conversions::ValueConversion; use crate::value::regex::{RegExp, RegExpInner}; use crate::value::{Value, ValueContext}; use dash_middle::interner::sym; -use dash_regex::Flags; -use dash_regex::matcher::Matcher as RegexMatcher; -use dash_regex::parser::Parser as RegexParser; +use dash_regex::{EvalSuccess, Flags}; use super::receiver_t; @@ -26,7 +24,7 @@ pub fn constructor(cx: CallContext) -> Result { None => Flags::empty(), }; - let nodes = match RegexParser::new(pattern.res(cx.scope).as_bytes()).parse_all() { + let nodes = match dash_regex::compile(pattern.res(cx.scope), flags) { Ok(nodes) => nodes, Err(err) => throw!(cx.scope, SyntaxError, "Regex parser error: {}", err), }; @@ -34,7 +32,6 @@ pub fn constructor(cx: CallContext) -> Result { let new_target = cx.new_target.unwrap_or(cx.scope.statics.regexp_ctor); let regex = RegExp::with_obj( nodes, - flags, pattern, NamedObject::instance_for_new_target(new_target, cx.scope)?, 
); @@ -47,35 +44,32 @@ pub fn test(cx: CallContext) -> Result { let regex = receiver_t::(cx.scope, &cx.this, "RegExp.prototype.test")?; - let RegExpInner { - regex, - last_index, - flags, - .. - } = match regex.inner() { + let RegExpInner { regex, last_index, .. } = match regex.inner() { Some(nodes) => nodes, None => throw!(cx.scope, TypeError, "Receiver must be an initialized RegExp object"), }; let text = text.res(cx.scope); - let is_global = flags.contains(Flags::GLOBAL); + let is_global = regex.flags().contains(Flags::GLOBAL); if is_global && last_index.get() >= text.len() { last_index.set(0); return Ok(Value::boolean(false)); } - let mut matcher = RegexMatcher::new(regex, &text.as_bytes()[last_index.get()..]); - if matcher.matches() { - if is_global { - last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end); + match regex.eval(&text[last_index.get()..]) { + Ok(EvalSuccess { groups }) => { + if is_global { + last_index.set(last_index.get() + groups[0].unwrap().1 as usize); + } + Ok(Value::boolean(true)) } - Ok(Value::boolean(true)) - } else { - if is_global { - last_index.set(0); + Err(_) => { + if is_global { + last_index.set(0); + } + Ok(Value::boolean(false)) } - Ok(Value::boolean(false)) } } @@ -84,49 +78,44 @@ pub fn exec(cx: CallContext<'_, '_>) -> Result { let regex = receiver_t::(cx.scope, &cx.this, "RegExp.prototype.exec")?; - let RegExpInner { - regex, - last_index, - flags, - .. - } = match regex.inner() { + let RegExpInner { regex, last_index, .. 
} = match regex.inner() { Some(nodes) => nodes, None => throw!(cx.scope, TypeError, "Receiver must be an initialized RegExp object"), }; let text = text.res(cx.scope).to_owned(); - let is_global = flags.contains(Flags::GLOBAL); + let is_global = regex.flags().contains(Flags::GLOBAL); if is_global && last_index.get() >= text.len() { last_index.set(0); return Ok(Value::null()); } - let mut matcher = RegexMatcher::new(regex, &text.as_bytes()[last_index.get()..]); - if matcher.matches() { - if is_global { - last_index.set(last_index.get() + matcher.groups.get(0).unwrap().end); + match regex.eval(&text[last_index.get()..]) { + Ok(EvalSuccess { groups }) => { + if is_global { + last_index.set(last_index.get() + groups[0].unwrap().1 as usize); + } + + let groups = groups + .into_iter() + .map(|group| { + let sub = match group { + Some((from, to, _)) => cx.scope.intern(&text[from as usize..to as usize]).into(), + None => sym::null.into(), + }; + PropertyValue::static_default(Value::string(sub)) + }) + .collect(); + + let groups = Array::from_vec(groups, cx.scope); + Ok(Value::object(cx.scope.register(groups))) } - - let groups = matcher - .groups - .iter() - .map(|g| { - let sub = match g { - Some(r) => cx.scope.intern(&text[r]).into(), - None => sym::null.into(), - }; - PropertyValue::static_default(Value::string(sub)) - }) - .collect(); - - let groups = Array::from_vec(groups, cx.scope); - Ok(Value::object(cx.scope.register(groups))) - } else { - if is_global { - last_index.set(0); + Err(_) => { + if is_global { + last_index.set(0); + } + Ok(Value::null()) } - - Ok(Value::null()) } } diff --git a/crates/dash_vm/src/value/regex.rs b/crates/dash_vm/src/value/regex.rs index 96a3e7ba..de245aaf 100644 --- a/crates/dash_vm/src/value/regex.rs +++ b/crates/dash_vm/src/value/regex.rs @@ -1,7 +1,7 @@ use std::cell::Cell; use dash_proc_macro::Trace; -use dash_regex::{Flags, ParsedRegex}; +use dash_regex::Regex; use crate::gc::trace::{Trace, TraceCtxt}; use crate::{Vm, delegate, 
extract}; @@ -11,8 +11,7 @@ use super::string::JsString; #[derive(Debug)] pub struct RegExpInner { - pub regex: ParsedRegex, - pub flags: Flags, + pub regex: Regex, pub source: JsString, pub last_index: Cell<usize>, } @@ -21,7 +20,6 @@ unsafe impl Trace for RegExpInner { fn trace(&self, cx: &mut TraceCtxt<'_>) { let Self { regex: _, - flags: _, source, last_index: _, } = self; @@ -36,11 +34,10 @@ pub struct RegExp { } impl RegExp { - pub fn new(regex: ParsedRegex, flags: Flags, source: JsString, vm: &Vm) -> Self { + pub fn new(regex: Regex, source: JsString, vm: &Vm) -> Self { Self { inner: Some(RegExpInner { regex, - flags, source, last_index: Cell::new(0), }), @@ -48,11 +45,10 @@ impl RegExp { } } - pub fn with_obj(regex: ParsedRegex, flags: Flags, source: JsString, object: NamedObject) -> Self { + pub fn with_obj(regex: Regex, source: JsString, object: NamedObject) -> Self { Self { inner: Some(RegExpInner { regex, - flags, source, last_index: Cell::new(0), }), From c47f684e9ac5ffb470b75f21ee5ea67d9302f2ab Mon Sep 17 00:00:00 2001 From: y21 <30553356+y21@users.noreply.github.com> Date: Mon, 31 Mar 2025 20:07:28 +0200 Subject: [PATCH 2/5] reimplement repetition state as a stack --- .gitignore | 2 + Cargo.lock | 17 ++- crates/dash_regex/src/graph/eval.rs | 174 ++++++++++++++++------------ crates/dash_regex/src/lib.rs | 14 ++- 4 files changed, 129 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index 38b8d1c2..47b6d028 100755 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ target *.json *.sh dash-cli/tests +typeck +jimp-testing diff --git a/Cargo.lock b/Cargo.lock index 4c48cf89..5909cd75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -157,9 +157,12 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.88" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02f341c093d19155a6e41631ce5971aac4e9a868262212153124c15fa22d1cdc" +checksum =
"be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" +dependencies = [ + "shlex", +] [[package]] name = "cfg-if" @@ -1084,9 +1087,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.153" +version = "0.2.170" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" [[package]] name = "libloading" @@ -1932,6 +1935,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" version = "1.4.1" diff --git a/crates/dash_regex/src/graph/eval.rs b/crates/dash_regex/src/graph/eval.rs index 8867c3dc..c71d5231 100644 --- a/crates/dash_regex/src/graph/eval.rs +++ b/crates/dash_regex/src/graph/eval.rs @@ -1,3 +1,5 @@ +use std::cell::Cell; + use crate::graph::node::CharacterClassItem; use crate::node::Anchor; @@ -10,7 +12,7 @@ pub enum ProcessedGroupState { Unconfirmed, } -struct Cx<'a> { +struct Shared<'a> { processed_groups: &'a mut [Option<(u32, u32, ProcessedGroupState)>], pending_groups: &'a mut [(Option, Option)], /// The full input source of this "attempt". @@ -18,69 +20,75 @@ struct Cx<'a> { graph: &'a Graph, /// The offset of `full_input` in the *original* input string. offset_from_original: u32, - current_repetition_count: Option, } - -impl Cx<'_> { +impl Shared<'_> { /// Returns the offset of the passed in slice relative to the full input. /// The slice must actually be obtained from the full input for the return value to make sense. /// The value is unspecified (but not undefined) if passed an input slice from somewhere else. 
- pub fn offset(&self, s: &[u8]) -> u32 { - (s.as_ptr().addr() - self.full_input.as_ptr().addr()) as u32 + pub fn offset_of(&self, remaining: &[u8]) -> u32 { + (remaining.as_ptr().addr() - self.full_input.as_ptr().addr()) as u32 } /// Same as `offset`, but returns it relative to the original input. - pub fn offset_from_original(&self, s: &[u8]) -> u32 { - self.offset_from_original + self.offset(s) + pub fn offset_of_from_original(&self, remaining: &[u8]) -> u32 { + self.offset_from_original + self.offset_of(remaining) } +} - /// Creates a new context usable for the specified node. - pub fn for_node(&mut self, node: NodeId, origin: NodeId) -> Cx<'_> { - let Self { - processed_groups: &mut ref mut processed_groups, - pending_groups: &mut ref mut pending_groups, - full_input, - graph, - offset_from_original, - mut current_repetition_count, - } = *self; +#[derive(Debug, Clone)] +struct Cx<'a> { + /// How many iterations have matched so far + current_repetition_count: Cell>, + /// The offset at the start of this iteration, used to determine if we're making any progress. + /// If this is the same as the offset at the end of an iteration, we can return true early as it will match forever. + current_repetition_start: Cell>, + parent: Option<&'a Cx<'a>>, +} + +impl<'a> Cx<'a> { + pub fn for_node(&'a self, shared: &Shared<'_>, target: NodeId, origin: NodeId, remaining: &[u8]) -> Cx<'a> { + let mut current_repetition_count = self.current_repetition_count.clone(); + let mut current_repetition_start = self.current_repetition_start.clone(); + let mut parent = self.parent; - if let NodeKind::RepetitionStart { .. } = graph[node].kind { - if let NodeKind::RepetitionEnd { .. } = graph[origin].kind { - current_repetition_count = Some(current_repetition_count.unwrap() + 1); + // Moving to a RepetitionStart means we either prepare/initialize a repetition (set to 0), + // or increment it if we're coming from a RepetitionEnd specifically. + if let NodeKind::RepetitionStart { .. 
} = shared.graph[target].kind { + current_repetition_start = Cell::new(Some(shared.offset_of(remaining))); + + if let NodeKind::RepetitionEnd { .. } = shared.graph[origin].kind { + *current_repetition_count.get_mut().as_mut().unwrap() += 1; } else { - current_repetition_count = Some(0); + current_repetition_count = Cell::new(Some(0)); + parent = Some(self); } } Cx { - processed_groups, - pending_groups, - full_input, - graph, - offset_from_original, current_repetition_count, + current_repetition_start, + parent, } } } -fn step(mut cx: Cx, node_id: NodeId, mut input: &[u8]) -> bool { +fn step(shared: &mut Shared<'_>, cx: Cx<'_>, node_id: NodeId, mut remaining: &[u8]) -> bool { // The reason for shadowing cx with a borrow here is so that you're forced to go through `Cx::for_node` when calling `step(...)`. // You can't pass the same `cx` when evaluating a sub-node. - let cx = &mut cx; - let node = &cx.graph[node_id]; + let mut cx = &cx; + let node = &shared.graph[node_id]; let mut matches = match node.kind { NodeKind::AnyCharacter => { - if let Some(rest) = input.get(1..) { - input = rest; + if let Some(rest) = remaining.get(1..) { + remaining = rest; true } else { false } } NodeKind::RepetitionStart { min, max, inner } => 'arm: { - let current_repetition_count = cx.current_repetition_count.unwrap(); + let current_repetition_count = cx.current_repetition_count.get().unwrap(); if let Some(max) = max { if current_repetition_count >= max { @@ -89,24 +97,30 @@ fn step(mut cx: Cx, node_id: NodeId, mut input: &[u8]) -> bool { } } - if step(cx.for_node(inner, node_id), inner, input) { - // This has automatically also checked the rest input. Don't need to do that again here after the match. + if step(shared, cx.for_node(shared, inner, node_id, remaining), inner, remaining) { + // This has automatically also checked the rest input. Don't (shouldn't) need to do that again here after the match. 
return true; } + + // Getting here means the regex cannot match the string with another repetition iteration, + // and we are on track to backtrack. + // This requires us to "pop" the current repetition and continue with the outer/parent repetition context, + // as this might be a nested repetition. + cx = cx.parent.unwrap(); current_repetition_count >= min } - NodeKind::Anchor(Anchor::StartOfString) => input.len() == cx.full_input.len(), - NodeKind::Anchor(Anchor::EndOfString) => input.is_empty(), + NodeKind::Anchor(Anchor::StartOfString) => remaining.len() == shared.full_input.len(), + NodeKind::Anchor(Anchor::EndOfString) => remaining.is_empty(), NodeKind::Meta(meta) => { - if let Some((_, rest)) = input.split_first().filter(|&(&c, _)| meta.matches(c)) { - input = rest; + if let Some((_, rest)) = remaining.split_first().filter(|&(&c, _)| meta.matches(c)) { + remaining = rest; true } else { false } } NodeKind::CharacterClass(ref items) => { - if let Some((_, rest)) = input.split_first().filter(|&(&c, _)| { + if let Some((_, rest)) = remaining.split_first().filter(|&(&c, _)| { items.iter().copied().any(|item| match item { CharacterClassItem::Literal(lit) => lit == c, CharacterClassItem::AnyCharacter => true, @@ -114,76 +128,80 @@ fn step(mut cx: Cx, node_id: NodeId, mut input: &[u8]) -> bool { CharacterClassItem::Range(min, max) => (min..=max).contains(&c), }) }) { - input = rest; + remaining = rest; true } else { false } } NodeKind::Literal(lit) => { - if let Some((_, rest)) = input.split_first().filter(|&(&c, _)| c == lit) { - input = rest; + if let Some((_, rest)) = remaining.split_first().filter(|&(&c, _)| c == lit) { + remaining = rest; true } else { false } } NodeKind::Or(left, right) => { - return step(cx.for_node(left, node_id), left, input) || step(cx.for_node(right, node_id), right, input); + return step(shared, cx.for_node(shared, left, node_id, remaining), left, remaining) + || step(shared, cx.for_node(shared, right, node_id, remaining), right, 
remaining); } NodeKind::RepetitionEnd { start } => { - return step(cx.for_node(start, node_id), start, input); + let end_off = shared.offset_of(remaining); + if cx.current_repetition_start.get().unwrap() == end_off { + // We haven't made any progress in this repetition iteration and won't. + return true; + } else { + return step(shared, cx.for_node(shared, start, node_id, remaining), start, remaining); + } } NodeKind::GroupStart { group_id } => { if let Some(group_id) = group_id { - let offset = cx.offset_from_original(input); - cx.pending_groups[group_id as usize] = (Some(offset), None); + let offset = shared.offset_of_from_original(remaining); + shared.pending_groups[group_id as usize] = (Some(offset), None); } true } NodeKind::GroupEnd { group_id } => { if let Some(group_id) = group_id { let group_id = group_id as usize; - - let old = cx.processed_groups[group_id]; - let start = cx.pending_groups[group_id].0.unwrap(); - let end = cx.offset_from_original(input); - cx.processed_groups[group_id] = Some((start, end, ProcessedGroupState::Unconfirmed)); + let old = shared.processed_groups[group_id]; + let start = shared.pending_groups[group_id].0.unwrap(); + let end = shared.offset_of_from_original(remaining); + shared.processed_groups[group_id] = Some((start, end, ProcessedGroupState::Unconfirmed)); return if let Some(next) = node.next { - let matches = step(cx.for_node(next, node_id), next, input); - cx.pending_groups[group_id] = (Some(start), Some(end)); - + let matches = step(shared, cx.for_node(shared, next, node_id, remaining), next, remaining); + shared.pending_groups[group_id] = (Some(start), Some(end)); if matches { - if cx.processed_groups[group_id].is_none_or(|(.., s)| s == ProcessedGroupState::Unconfirmed) { + if shared.processed_groups[group_id].is_none_or(|(.., s)| s == ProcessedGroupState::Unconfirmed) + { // This group may have been processed again in a subsequent iteration. 
// Only overwrite it back with this iteration's if it's still unconfirmed - cx.processed_groups[group_id] = Some((start, end, ProcessedGroupState::Confirmed)); + shared.processed_groups[group_id] = Some((start, end, ProcessedGroupState::Confirmed)); } - true } else { // We did not match. Restore to old. if let Some((a, b, _)) = old { - cx.processed_groups[group_id] = Some((a, b, ProcessedGroupState::Unconfirmed)); + shared.processed_groups[group_id] = Some((a, b, ProcessedGroupState::Unconfirmed)); } else { - cx.processed_groups[group_id] = None; + shared.processed_groups[group_id] = None; } false } } else { // No next node. - cx.processed_groups[group_id].as_mut().unwrap().2 = ProcessedGroupState::Confirmed; + shared.processed_groups[group_id].as_mut().unwrap().2 = ProcessedGroupState::Confirmed; true }; } - true } }; if let Some(next) = node.next { - matches = matches && step(cx.for_node(next, node_id), next, input); + matches = matches && step(shared, cx.for_node(shared, next, node_id, remaining), next, remaining); } matches } @@ -215,20 +233,32 @@ pub fn eval(regex: &Regex, mut input: &[u8]) -> Result { processed_groups[1..].fill(None); pending_groups.fill((None, None)); - let cx = Cx { - processed_groups: &mut processed_groups, - pending_groups: &mut pending_groups, - current_repetition_count: if let NodeKind::RepetitionStart { .. } = regex.graph[root].kind { - Some(0) + let outer_cx: Cx<'_> = Cx { + current_repetition_count: Cell::new(None), + current_repetition_start: Cell::new(None), + parent: None, + }; + let (current_repetition_count, current_repetition_start, outer_cx) = + if let NodeKind::RepetitionStart { .. 
} = regex.graph[root].kind { (Some(0), Some(0), Some(&outer_cx)) } else { - None - }, - offset_from_original, + (None, None, None) + }; + + let mut shared = Shared { full_input: input, graph: &regex.graph, + offset_from_original, + pending_groups: &mut pending_groups, + processed_groups: &mut processed_groups, + }; + let cx = Cx { + current_repetition_count: Cell::new(current_repetition_count), + current_repetition_start: Cell::new(current_repetition_start), + parent: outer_cx, }; - if step(cx, root, input) { + if step(&mut shared, cx, root, input) { return Ok(EvalSuccess { groups: processed_groups, }); diff --git a/crates/dash_regex/src/lib.rs b/crates/dash_regex/src/lib.rs index 09e9c150..860203a3 100644 --- a/crates/dash_regex/src/lib.rs +++ b/crates/dash_regex/src/lib.rs @@ -68,6 +68,16 @@ pub fn test() { // Backtracking assert_matches_groups(&compile("x(.+)x", "").unwrap(), "vxxxv", &["x"]); assert_matches_groups(&compile(".(.)+abcd", "").unwrap(), "vxabcdabcabcabcabc", &["x"]); - assert_matches_groups(&compile("(.+)+a", "").unwrap(), "bba", &["bb"]); - assert_matches_groups(&compile("(.+)+ac", "").unwrap(), "bacbaabaabaa", &["b"]); + assert_matches_groups(&compile("(.+)+a", "").unwrap(), "ba", &["b"]); + // Degenerate backtracking + assert_matches_groups(&compile("(.+)+ac", "").unwrap(), "bacbaabaabaabaa", &["b"]); + + assert_matches_groups(&compile("(ab+){3,}", "").unwrap(), "ababab", &["ab"]); + assert_matches_groups(&compile("(([ab]+)b){3,}", "").unwrap(), "abababaa", &["ab", "a"]); + assert!(compile("(([ab]+)b){3,}", "").unwrap().eval("ababaaaa").is_err()); + assert!(compile("(([ab]+)b){3,}", "").unwrap().eval("ababaaba").is_ok()); + + // Infinite regex needs to terminate eventually + assert_matches_groups(&compile("(.?)+", "").unwrap(), "", &[""]); + assert_matches_groups(&compile("(.?)+", "").unwrap(), "aa", &["a"]); } From 09411d8087e06a2979bbdd618ceb132295a30f3b Mon Sep 17 00:00:00 2001 From: y21 <30553356+y21@users.noreply.github.com> Date: Mon, 31
Mar 2025 21:22:52 +0200 Subject: [PATCH 3/5] add `NodeKind::End` to properly indicate matched string length --- crates/dash_regex/src/graph/build.rs | 72 +++++++++++++++------------- crates/dash_regex/src/graph/eval.rs | 22 +++++---- crates/dash_regex/src/graph/mod.rs | 2 +- crates/dash_regex/src/graph/node.rs | 1 + 4 files changed, 53 insertions(+), 44 deletions(-) diff --git a/crates/dash_regex/src/graph/build.rs b/crates/dash_regex/src/graph/build.rs index 59dd2fce..a635bd7d 100644 --- a/crates/dash_regex/src/graph/build.rs +++ b/crates/dash_regex/src/graph/build.rs @@ -45,24 +45,24 @@ pub fn number_groups(regex: &ParsedRegex) -> CaptureGroupMap { map } -pub fn build(group_numbers: &CaptureGroupMap, regex: &ParsedRegex) -> (Graph, Option) { +pub fn build(group_numbers: &CaptureGroupMap, regex: &ParsedRegex) -> (Graph, NodeId) { fn lower_repetition( graph: &mut BuildGraph, group_numbers: &CaptureGroupMap, node: &ParseNode, min: u32, max: Option, - next: Option, + next: NodeId, ) -> NodeId { let end_id = graph.push(Node { - next, + next: Some(next), kind: NodeKind::RepetitionEnd { start: NodeId::DUMMY, // will be set later }, }); - let inner_id = inner(graph, group_numbers, slice::from_ref(node), Some(end_id)).unwrap(); + let inner_id = inner(graph, group_numbers, slice::from_ref(node), end_id); let start_id = graph.push(Node { - next, + next: Some(next), kind: NodeKind::RepetitionStart { min, max, @@ -80,26 +80,26 @@ pub fn build(group_numbers: &CaptureGroupMap, regex: &ParsedRegex) -> (Graph, Op graph: &mut BuildGraph, group_numbers: &CaptureGroupMap, nodes: &[ParseNode], - outer_next: Option, - ) -> Option { + outer_next: NodeId, + ) -> NodeId { if let Some((current, rest)) = nodes.split_first() { let next = inner(graph, group_numbers, rest, outer_next); match *current { - ParseNode::AnyCharacter => Some(graph.push(Node { - next, + ParseNode::AnyCharacter => graph.push(Node { + next: Some(next), kind: NodeKind::AnyCharacter, - })), - 
ParseNode::MetaSequence(meta) => Some(graph.push(Node { - next, + }), + ParseNode::MetaSequence(meta) => graph.push(Node { + next: Some(next), kind: NodeKind::Meta(meta), - })), + }), ParseNode::Repetition { ref node, min, max } => { - Some(lower_repetition(graph, group_numbers, node, min, max, next)) + lower_repetition(graph, group_numbers, node, min, max, next) } - ParseNode::LiteralCharacter(literal) => Some(graph.push(Node { - next, + ParseNode::LiteralCharacter(literal) => graph.push(Node { + next: Some(next), kind: NodeKind::Literal(literal), - })), + }), ParseNode::CharacterClass(ref parse_items) => { let items = parse_items .iter() @@ -118,35 +118,35 @@ pub fn build(group_numbers: &CaptureGroupMap, regex: &ParsedRegex) -> (Graph, Op }) .collect::>(); - Some(graph.push(Node { - next, + graph.push(Node { + next: Some(next), kind: NodeKind::CharacterClass(items), - })) + }) } - ParseNode::Anchor(anchor) => Some(graph.push(Node { - next, + ParseNode::Anchor(anchor) => graph.push(Node { + next: Some(next), kind: NodeKind::Anchor(anchor), - })), + }), ParseNode::Or(ref left, ref right) => { - let left = inner(graph, group_numbers, left, next).unwrap(); - let right = inner(graph, group_numbers, right, next).unwrap(); - Some(graph.push(Node { - next, + let left = inner(graph, group_numbers, left, next); + let right = inner(graph, group_numbers, right, next); + graph.push(Node { + next: Some(next), kind: NodeKind::Or(left, right), - })) + }) } - ParseNode::Optional(ref node) => Some(lower_repetition(graph, group_numbers, node, 0, Some(1), next)), + ParseNode::Optional(ref node) => lower_repetition(graph, group_numbers, node, 0, Some(1), next), ParseNode::Group(_, ref nodes) => { let group_id = group_numbers.get(&(current as *const ParseNode)).copied(); let end = graph.push(Node { - next, + next: Some(next), kind: NodeKind::GroupEnd { group_id }, }); - let inner_id = inner(graph, group_numbers, nodes, Some(end)).unwrap(); - Some(graph.push(Node { + let inner_id = 
inner(graph, group_numbers, nodes, end); + graph.push(Node { next: Some(inner_id), kind: NodeKind::GroupStart { group_id }, - })) + }) } } } else { @@ -155,6 +155,10 @@ pub fn build(group_numbers: &CaptureGroupMap, regex: &ParsedRegex) -> (Graph, Op } let mut graph = BuildGraph::new(); - let root = inner(&mut graph, group_numbers, &regex.nodes, None); + let end = graph.push(Node { + kind: NodeKind::End, + next: None, + }); + let root = inner(&mut graph, group_numbers, &regex.nodes, end); (graph.finalize(), root) } diff --git a/crates/dash_regex/src/graph/eval.rs b/crates/dash_regex/src/graph/eval.rs index c71d5231..81fcfab4 100644 --- a/crates/dash_regex/src/graph/eval.rs +++ b/crates/dash_regex/src/graph/eval.rs @@ -20,6 +20,7 @@ struct Shared<'a> { graph: &'a Graph, /// The offset of `full_input` in the *original* input string. offset_from_original: u32, + end_offset: Option<u32>, } impl Shared<'_> { /// Returns the offset of the passed in slice relative to the full input. @@ -149,7 +150,8 @@ fn step(shared: &mut Shared<'_>, cx: Cx<'_>, node_id: NodeId, mut remaining: &[u NodeKind::RepetitionEnd { start } => { let end_off = shared.offset_of(remaining); if cx.current_repetition_start.get().unwrap() == end_off { - // We haven't made any progress in this repetition iteration and won't. + // We haven't made any progress in this repetition iteration and won't. Treat this as the end of the regex.
+ shared.end_offset = Some(shared.offset_of(remaining)); return true; } else { return step(shared, cx.for_node(shared, start, node_id, remaining), start, remaining); @@ -198,6 +200,11 @@ fn step(shared: &mut Shared<'_>, cx: Cx<'_>, node_id: NodeId, mut remaining: &[u } true } + NodeKind::End => { + shared.end_offset = Some(shared.offset_of(remaining)); + assert!(node.next.is_none()); + return true; + } }; if let Some(next) = node.next { @@ -215,11 +222,6 @@ pub struct EvalSuccess { pub struct NoMatch; pub fn eval(regex: &Regex, mut input: &[u8]) -> Result { - let Some(root) = regex.root else { - // Nothing to do for empty regexes. - return Ok(EvalSuccess { groups: Box::default() }); - }; - let mut processed_groups = vec![None; regex.group_count as usize].into_boxed_slice(); let mut pending_groups = vec![(None, None); regex.group_count as usize].into_boxed_slice(); let mut offset_from_original = 0; @@ -227,7 +229,7 @@ pub fn eval(regex: &Regex, mut input: &[u8]) -> Result { // TODO: add a fast reject path where we find the first required character and seek to it in `input` processed_groups[0] = Some(( offset_from_original, - offset_from_original + input.len() as u32, + offset_from_original, ProcessedGroupState::Confirmed, )); processed_groups[1..].fill(None); @@ -239,7 +241,7 @@ pub fn eval(regex: &Regex, mut input: &[u8]) -> Result { parent: None, }; let (current_repetition_count, current_repetition_start, outer_cx) = - if let NodeKind::RepetitionStart { .. } = regex.graph[root].kind { + if let NodeKind::RepetitionStart { .. 
} = regex.graph[regex.root].kind { (Some(0), Some(0), Some(&outer_cx)) } else { (None, None, None) @@ -251,6 +253,7 @@ pub fn eval(regex: &Regex, mut input: &[u8]) -> Result { offset_from_original, pending_groups: &mut pending_groups, processed_groups: &mut processed_groups, + end_offset: None, }; let cx = Cx { current_repetition_count: Cell::new(current_repetition_count), @@ -258,7 +261,8 @@ pub fn eval(regex: &Regex, mut input: &[u8]) -> Result { parent: outer_cx, }; - if step(&mut shared, cx, root, input) { + if step(&mut shared, cx, regex.root, input) { + processed_groups[0].as_mut().unwrap().1 += shared.end_offset.unwrap(); return Ok(EvalSuccess { groups: processed_groups, }); diff --git a/crates/dash_regex/src/graph/mod.rs b/crates/dash_regex/src/graph/mod.rs index a0e8b11c..41dbd8c7 100644 --- a/crates/dash_regex/src/graph/mod.rs +++ b/crates/dash_regex/src/graph/mod.rs @@ -14,7 +14,7 @@ use crate::parser::ParsedRegex; pub struct Regex { graph: Graph, flags: Flags, - root: Option, + root: NodeId, group_count: u32, } diff --git a/crates/dash_regex/src/graph/node.rs b/crates/dash_regex/src/graph/node.rs index 07afb359..1246d52c 100644 --- a/crates/dash_regex/src/graph/node.rs +++ b/crates/dash_regex/src/graph/node.rs @@ -41,6 +41,7 @@ pub enum NodeKind { GroupEnd { group_id: Option, }, + End, } #[derive(Debug, Copy, Clone)] From 8447443bbb129983463649ecb1a5c1914e7107f1 Mon Sep 17 00:00:00 2001 From: y21 <30553356+y21@users.noreply.github.com> Date: Mon, 31 Mar 2025 21:30:53 +0200 Subject: [PATCH 4/5] fix ^ anchors when retrying substrings --- crates/dash_regex/src/graph/eval.rs | 5 ++++- crates/dash_regex/src/lib.rs | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/crates/dash_regex/src/graph/eval.rs b/crates/dash_regex/src/graph/eval.rs index 81fcfab4..69ed7a4d 100644 --- a/crates/dash_regex/src/graph/eval.rs +++ b/crates/dash_regex/src/graph/eval.rs @@ -110,7 +110,10 @@ fn step(shared: &mut Shared<'_>, cx: Cx<'_>, node_id: NodeId, mut 
remaining: &[u cx = cx.parent.unwrap(); current_repetition_count >= min } - NodeKind::Anchor(Anchor::StartOfString) => remaining.len() == shared.full_input.len(), + NodeKind::Anchor(Anchor::StartOfString) => { + // Make sure it's both at the start of the current attempt as well as from all previous failed attempts + shared.offset_from_original == 0 && remaining.len() == shared.full_input.len() + } NodeKind::Anchor(Anchor::EndOfString) => remaining.is_empty(), NodeKind::Meta(meta) => { if let Some((_, rest)) = remaining.split_first().filter(|&(&c, _)| meta.matches(c)) { diff --git a/crates/dash_regex/src/lib.rs b/crates/dash_regex/src/lib.rs index 860203a3..69de1d07 100644 --- a/crates/dash_regex/src/lib.rs +++ b/crates/dash_regex/src/lib.rs @@ -80,4 +80,8 @@ pub fn test() { // Infinite regex needs to terminate eventually assert_matches_groups(&compile("(.?)+", "").unwrap(), "", &[""]); assert_matches_groups(&compile("(.?)+", "").unwrap(), "aa", &["a"]); + + // ^ anchor must not match when retrying substrings + assert!(!compile("^m", "").unwrap().matches("ama")); + assert!(compile("^m", "").unwrap().matches("ma")); } From 15e4eb32aca678e56e0f74f90ccaed508c8cb6cb Mon Sep 17 00:00:00 2001 From: y21 <30553356+y21@users.noreply.github.com> Date: Mon, 31 Mar 2025 21:58:00 +0200 Subject: [PATCH 5/5] dont parse character class contents as arbitrary nodes --- crates/dash_regex/src/parser.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/crates/dash_regex/src/parser.rs b/crates/dash_regex/src/parser.rs index 100c435b..9db3f558 100644 --- a/crates/dash_regex/src/parser.rs +++ b/crates/dash_regex/src/parser.rs @@ -153,7 +153,9 @@ impl<'a> Parser<'a> { Some(b'-') => { self.advance(); match nodes.last() { - Some(&CharacterClassItem::Node(Node::LiteralCharacter(start))) => { + Some(&CharacterClassItem::Node(Node::LiteralCharacter(start))) + if start.is_ascii_alphanumeric() => + { let end = self.next_byte().ok_or(Error::UnexpectedEof)?; 
nodes.pop(); nodes.push(CharacterClassItem::Range(start, end)); @@ -161,7 +163,15 @@ impl<'a> Parser<'a> { _ => nodes.push(CharacterClassItem::Node(Node::LiteralCharacter(b'-'))), } } - _ => nodes.push(CharacterClassItem::Node(self.parse_primary()?)), + Some(b'\\') => { + self.advance(); + nodes.push(CharacterClassItem::Node(self.parse_escape()?)); + } + Some(other) => { + self.advance(); + nodes.push(CharacterClassItem::Node(Node::LiteralCharacter(other))) + } + None => break, } }