mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-26 01:35:08 +02:00
Add LibRegex's new Rust ECMAScript regular expression engine. Replace the old parser's direct pattern-to-bytecode pipeline with a split architecture: parse patterns into a lossless AST first, then lower that AST into bytecode for a dedicated backtracking VM. Keep the syntax tree as the place for validation, analysis, and optimization instead of teaching every transformation to rewrite partially built bytecode. Specialize this backend for the job LibJS actually needs. The old C++ engine shared one generic parser and matcher stack across ECMA-262 and POSIX modes and supported both byte-string and UTF-16 inputs. The new engine focuses on ECMA-262 semantics on WTF-16 data, which lets it model lone surrogates and other JavaScript-specific behavior directly instead of carrying POSIX and multi-encoding constraints through the whole implementation. Fill in the ECMAScript features needed to replace the old engine for real web workloads: Unicode properties and sets, lookahead and lookbehind, named groups and backreferences, modifier groups, string properties, large quantifiers, lone surrogates, and the parser and VM corner cases those features exercise. Reshape the runtime around compile-time pattern hints and a hotter VM loop. Pre-resolve Unicode properties, derive first-character, character-class, and simple-scan filters, extract safe trailing literals for anchored patterns, add literal and literal-alternation fast paths, and keep reusable scratch storage for registers, backtracking state, and modifier stacks. Teach `find_all` to stay inside one VM so global searches stop paying setup costs on every match. Make those shortcuts semantics-aware instead of merely fast. In Unicode mode, do not use literal fast paths for lone surrogates, since ECMA-262 must not let `/\ud83d/u` match inside a surrogate pair. 
Likewise, only derive end-anchor suffix hints when the suffix lies on every path to `Match`, so lookarounds and disjunctions cannot skip into a shared tail and produce false negatives. This commit lands the Rust crate, the C++ wrapper, the build integration, and the initial LibJS-side plumbing needed to exercise the new engine under real RegExp callers before removing the legacy backend.
324 lines
9.9 KiB
Rust
324 lines
9.9 KiB
Rust
/*
|
|
* Copyright (c) 2026-present, the Ladybird developers.
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
//! Bytecode instruction set for the regex VM.
|
|
//!
|
|
//! This is an implementation artifact for the matcher closures described in
|
|
//! ECMA-262 Pattern Semantics.
|
|
//!
|
|
//! Spec:
|
|
//! - <https://tc39.es/ecma262/#sec-pattern-semantics>
|
|
//! - <https://tc39.es/ecma262/#sec-compilepattern>
|
|
//!
|
|
//! Instructions are compact and operate on a virtual machine with:
|
|
//! - A current position in the input string
|
|
//! - A set of registers for capture group positions
|
|
//! - A backtrack stack for saving/restoring state
|
|
|
|
/// Associates one named capture group with its numeric group index.
///
/// Entries are derived from the pattern's named captures.
/// <https://tc39.es/ecma262/#sec-parsepattern>
#[derive(Debug, Clone)]
pub struct NamedGroupEntry {
    /// The capture group's name.
    pub name: String,
    /// The index of the capture group that carries `name`.
    pub index: u32,
}
|
|
|
|
/// A compiled regex program.
|
|
///
|
|
/// Spec model: the internal matcher produced by `CompilePattern`.
|
|
/// <https://tc39.es/ecma262/#sec-compilepattern>
|
|
#[derive(Debug, Clone)]
|
|
pub struct Program {
|
|
/// The bytecode instructions.
|
|
pub instructions: Vec<Instruction>,
|
|
/// Number of capture groups (not counting group 0).
|
|
pub capture_count: u32,
|
|
/// Total number of registers needed (2 per capture group + 2 for group 0).
|
|
pub register_count: u32,
|
|
/// Whether Unicode mode is enabled (affects surrogate pair decoding).
|
|
pub unicode: bool,
|
|
/// Whether v-flag (unicode sets) mode is enabled.
|
|
pub unicode_sets: bool,
|
|
/// Base ignore_case flag from pattern flags.
|
|
pub ignore_case: bool,
|
|
/// Base multiline flag from pattern flags.
|
|
pub multiline: bool,
|
|
/// Base dot_all flag from pattern flags.
|
|
pub dot_all: bool,
|
|
/// Named capture groups (name → group index).
|
|
pub named_groups: Vec<NamedGroupEntry>,
|
|
}
|
|
|
|
/// A single bytecode instruction in Ladybird's concrete implementation of the
|
|
/// abstract matchers from ECMA-262 Pattern Semantics.
|
|
/// <https://tc39.es/ecma262/#sec-pattern-semantics>
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
pub enum Instruction {
|
|
/// Match a single character (u32 code point, supports WTF-16 lone surrogates).
|
|
Char(u32),
|
|
|
|
/// Match a single character, case-insensitive (u32 code points).
|
|
CharNoCase(u32, u32),
|
|
|
|
/// Match any character (`.`). If `dot_all` is true, matches newlines too.
|
|
AnyChar { dot_all: bool },
|
|
|
|
/// Match a character in a set of ranges. `negated` inverts the match.
|
|
CharClass {
|
|
ranges: Vec<CharRange>,
|
|
negated: bool,
|
|
},
|
|
|
|
/// Match a built-in character class (\d, \w, \s and negations).
|
|
BuiltinClass(BuiltinCharacterClass),
|
|
|
|
/// Match a Unicode property (boxed to keep Instruction small).
|
|
UnicodeProperty(Box<UnicodePropertyData>),
|
|
|
|
/// Unconditional jump to target instruction.
|
|
Jump(u32),
|
|
|
|
/// Split execution: try `prefer` first, backtrack to `other`.
|
|
/// This is the fundamental backtracking primitive.
|
|
Split { prefer: u32, other: u32 },
|
|
|
|
/// Save current input position to register `reg`.
|
|
Save(u32),
|
|
|
|
/// Clear register `reg` to -1 (no match).
|
|
ClearRegister(u32),
|
|
|
|
/// Assert start of input (or line if multiline).
|
|
AssertStart { multiline: bool },
|
|
|
|
/// Assert end of input (or line if multiline).
|
|
AssertEnd { multiline: bool },
|
|
|
|
/// Assert word boundary.
|
|
AssertWordBoundary,
|
|
|
|
/// Assert non-word boundary.
|
|
AssertNonWordBoundary,
|
|
|
|
/// Match succeeded.
|
|
Match,
|
|
|
|
/// Fail — force backtrack.
|
|
Fail,
|
|
|
|
/// Backreference: match the same string as capture group `index`.
|
|
Backref(u32),
|
|
|
|
/// Named backreference.
|
|
BackrefNamed(String),
|
|
|
|
/// Begin a repetition counter at register `counter_reg`.
|
|
/// Sets the register to 0.
|
|
RepeatStart { counter_reg: u32 },
|
|
|
|
/// Check repetition: if counter < max, increment and jump to `body`.
|
|
/// Otherwise fall through. Used with Split for greedy/lazy.
|
|
RepeatCheck {
|
|
counter_reg: u32,
|
|
min: u32,
|
|
max: Option<u32>,
|
|
body: u32,
|
|
greedy: bool,
|
|
},
|
|
|
|
/// Lookahead/lookbehind assertion.
|
|
/// `positive`: whether match must succeed or fail.
|
|
/// `forward`: lookahead (true) or lookbehind (false).
|
|
/// `body`: start of the assertion body.
|
|
/// `end`: instruction after the assertion.
|
|
LookStart {
|
|
positive: bool,
|
|
forward: bool,
|
|
end: u32,
|
|
},
|
|
|
|
/// End of a lookaround body. Signals success of the assertion sub-match.
|
|
LookEnd,
|
|
|
|
/// Push modifier flags onto the modifier stack.
|
|
PushModifiers {
|
|
ignore_case: Option<bool>,
|
|
multiline: Option<bool>,
|
|
dot_all: Option<bool>,
|
|
},
|
|
|
|
/// Pop modifier flags from the modifier stack.
|
|
PopModifiers,
|
|
|
|
/// No-op, used as a placeholder during compilation.
|
|
Nop,
|
|
|
|
/// Atomically match one string from a Unicode string property.
|
|
/// Tries multi-codepoint strings first (longest match wins), then falls
|
|
/// back to single-codepoint UnicodeProperty match. Does not create
|
|
/// backtrack points -- once a match is found, it's committed.
|
|
StringPropertyMatch {
|
|
/// Multi-codepoint strings, sorted longest first and packed as:
|
|
/// [len, cp0, cp1, ..., len, cp0, ...]
|
|
strings: Box<[u32]>,
|
|
/// Fallback for single-codepoint matches.
|
|
property: Box<UnicodePropertyData>,
|
|
},
|
|
|
|
/// Progress check: save position at `reg`, fail if no progress since last visit.
|
|
/// Used to prevent infinite loops in zero-width quantifier bodies.
|
|
/// When `clear_captures` is set, those registers are cleared to -1 before
|
|
/// backtracking on zero-width, per ECMA-262 RepeatMatcher step 2.b.
|
|
ProgressCheck { reg: u32, clear_captures: Vec<u32> },
|
|
|
|
/// Greedy loop for simple character matchers.
|
|
/// Greedily consumes as many matching characters as possible, then pushes a
|
|
/// single backtrack state. On backtrack, gives up one character at a time.
|
|
/// This avoids per-iteration Split/backtrack overhead for simple quantifiers.
|
|
GreedyLoop {
|
|
matcher: SimpleMatch,
|
|
min: u32,
|
|
max: Option<u32>,
|
|
},
|
|
|
|
/// Lazy loop for simple character matchers.
|
|
/// Tries to match as few characters as possible, then on backtrack consumes one more.
|
|
LazyLoop {
|
|
matcher: SimpleMatch,
|
|
min: u32,
|
|
max: Option<u32>,
|
|
},
|
|
}
|
|
|
|
/// The kind of a resolved Unicode property.
///
/// The enum is `#[repr(u8)]` with explicit discriminants, and
/// [`PropertyKind::from_u8`] is the inverse of casting a variant to `u8`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum PropertyKind {
    Script = 0,
    ScriptExtension = 1,
    GeneralCategory = 2,
    BinaryProperty = 3,
}

impl PropertyKind {
    /// Converts a raw discriminant back into a `PropertyKind`.
    ///
    /// Returns `None` for any value that is not one of the discriminants
    /// declared on the enum (`0..=3`).
    pub const fn from_u8(v: u8) -> Option<Self> {
        match v {
            0 => Some(Self::Script),
            1 => Some(Self::ScriptExtension),
            2 => Some(Self::GeneralCategory),
            3 => Some(Self::BinaryProperty),
            _ => None,
        }
    }
}
|
|
|
|
/// A resolved Unicode property — the string name/value has been resolved to
|
|
/// an ICU enum at compile time, so match-time lookups avoid string parsing.
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
pub struct ResolvedProperty {
|
|
/// The kind of Unicode property (Script, GeneralCategory, etc.).
|
|
pub kind: PropertyKind,
|
|
/// ICU enum value (e.g. script code, general category, binary property ID).
|
|
pub id: u32,
|
|
}
|
|
|
|
/// Data for a Unicode property match instruction.
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
pub struct UnicodePropertyData {
|
|
pub negated: bool,
|
|
pub name: String,
|
|
pub value: Option<String>,
|
|
/// Resolved property for fast O(1) ICU trie lookups at match time.
|
|
pub resolved: Option<ResolvedProperty>,
|
|
}
|
|
|
|
/// A simple character matcher for optimized greedy/lazy loops.
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
pub enum SimpleMatch {
|
|
/// Any character (`.`), with dot_all flag.
|
|
AnyChar { dot_all: bool },
|
|
/// A single character.
|
|
Char(u32),
|
|
/// Case-insensitive character.
|
|
CharNoCase(u32, u32),
|
|
/// Character class (set of ranges), negated flag.
|
|
CharClass {
|
|
ranges: Vec<CharRange>,
|
|
negated: bool,
|
|
},
|
|
/// Built-in class (\d, \w, \s, etc.)
|
|
BuiltinClass(BuiltinCharacterClass),
|
|
/// Unicode property (\p{...}, \P{...}).
|
|
UnicodeProperty(Box<UnicodePropertyData>),
|
|
}
|
|
|
|
/// A character range for CharClass instructions (u32 code points).
///
/// NOTE(review): both bounds are presumably inclusive — confirm against the
/// VM's range test before relying on it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharRange {
    /// First code point of the range.
    pub start: u32,
    /// Last code point of the range.
    pub end: u32,
}
|
|
|
|
/// Re-export for use by the compiler and VM.
|
|
pub use crate::ast::BuiltinCharacterClass;
|
|
|
|
impl Default for Program {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
impl Program {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
instructions: Vec::new(),
|
|
capture_count: 0,
|
|
register_count: 2, // group 0 always exists
|
|
unicode: false,
|
|
unicode_sets: false,
|
|
ignore_case: false,
|
|
multiline: false,
|
|
dot_all: false,
|
|
named_groups: Vec::new(),
|
|
}
|
|
}
|
|
|
|
pub fn emit(&mut self, inst: Instruction) -> u32 {
|
|
let idx = self.instructions.len() as u32;
|
|
self.instructions.push(inst);
|
|
idx
|
|
}
|
|
|
|
pub fn current_offset(&self) -> u32 {
|
|
self.instructions.len() as u32
|
|
}
|
|
|
|
pub fn patch_jump(&mut self, at: u32, target: u32) {
|
|
match &mut self.instructions[at as usize] {
|
|
Instruction::Jump(t) => *t = target,
|
|
Instruction::Split { prefer, .. } if *prefer == u32::MAX => *prefer = target,
|
|
Instruction::Split { other, .. } if *other == u32::MAX => *other = target,
|
|
inst => panic!("cannot patch non-jump instruction: {inst:?}"),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Encode a Unicode code point as WTF-16 into `out`.
///
/// - Code points up to U+FFFF are appended as a single code unit. This
///   includes values in the surrogate range, which are passed through
///   unchanged (a lone surrogate stays a lone surrogate).
/// - Code points in U+10000..=U+10FFFF are appended as a high/low surrogate
///   pair.
///
/// Returns `None` if the code point is out of range (> U+10FFFF); in that
/// case `out` is left untouched.
pub fn append_code_point_wtf16(out: &mut Vec<u16>, cp: u32) -> Option<()> {
    match cp {
        0..=0xFFFF => out.push(cp as u16),
        0x1_0000..=0x10_FFFF => {
            // Split the 20-bit offset above U+10000 into two 10-bit halves.
            let offset = cp - 0x1_0000;
            let high = 0xD800 | ((offset >> 10) as u16);
            let low = 0xDC00 | ((offset & 0x3FF) as u16);
            out.extend_from_slice(&[high, low]);
        }
        _ => return None,
    }
    Some(())
}
|