LibRegex: Optimize (^|literal) split prefixes

Patterns like `(?:^|;)\s*foo=...` can only start matching at input
start or at occurrences of the separator, but the generic
start-position loop still entered the VM at each byte and paid the
leading split/backtrack cost on every miss.

Teach the start-position analysis to recognize this `(^|literal)`
shape and jump straight to those candidate positions. Keep the
optimization narrow: wider literal sets would need a single-pass
scanner, and rescanning once per literal would make miss-heavy
alternations quadratic.

Add a LibRegex test for the cookie-style prefix. TestRegex still
passes, and a release js benchmark exercising this shape remains
fast.
This commit is contained in:
Andreas Kling
2026-03-27 21:55:36 +01:00
committed by Andreas Kling
parent 275e141823
commit 3efe8043f7
Notes: github-actions[bot] 2026-03-29 14:09:14 +00:00
2 changed files with 173 additions and 0 deletions

View File

@@ -485,6 +485,33 @@ pub fn find_all_with_scratch<I: Input>(
let mut count = 0i32;
let mut pos = start_pos;
if let Some(ref start_hint) = hints.start_position_hint
&& !program.unicode
{
while let Some(candidate_pos) = next_literal_start_from_hint(input, pos, start_hint) {
vm.reset(candidate_pos);
match vm.run() {
VmResult::Match => {
let match_start = vm.registers[0];
let match_end = vm.registers[1];
let idx = count as usize * 2;
if idx + 1 >= capacity {
return -1;
}
result_buf[idx] = match_start;
result_buf[idx + 1] = match_end;
count += 1;
pos = next_search_position(match_start, match_end);
}
VmResult::LimitExceeded => return -2,
VmResult::NoMatch => {
pos = candidate_pos + 1;
}
}
}
return count;
}
// Fast path: non-unicode pattern starting with a literal character.
if let Some((ch, false)) = hints.first_char
&& !program.unicode
@@ -623,6 +650,29 @@ fn execute_into_impl<I: Input>(
let mut vm = Vm::new(program, input, start_pos, scratch);
let mut hit_limit = false;
if let Some(ref start_hint) = hints.start_position_hint
&& !program.unicode
{
let mut pos = start_pos;
while let Some(candidate_pos) = next_literal_start_from_hint(input, pos, start_hint) {
vm.reset(candidate_pos);
match vm.run() {
VmResult::Match => {
copy_captures_to_out(vm.registers, program.capture_count, out);
return VmResult::Match;
}
VmResult::LimitExceeded => hit_limit = true,
VmResult::NoMatch => {}
}
pos = candidate_pos + 1;
}
return if hit_limit {
VmResult::LimitExceeded
} else {
VmResult::NoMatch
};
}
// Fast path: non-unicode pattern starting with a literal character.
// Use iter().position() for bulk scanning (LLVM can auto-vectorize this).
if let Some((ch, false)) = hints.first_char
@@ -827,6 +877,9 @@ pub struct PatternHints {
first_char: Option<(u32, bool)>,
/// First instruction filter: skip positions where the first matcher can't match.
first_filter: Option<SimpleMatch>,
/// Leading alternatives that can only begin at position 0 or at one of a
/// small set of literal code units.
start_position_hint: Option<StartPositionHint>,
/// Pattern starts with ^ (AssertStart) — only try at line starts (or input start).
starts_with_anchor: bool,
/// Whether the anchor is multiline (^ matches at line starts, not just input start).
@@ -842,6 +895,11 @@ pub struct PatternHints {
can_match_empty: bool,
}
/// Start-position hint for patterns whose leading alternation has the
/// `(^|literal)` shape.
///
/// Built by `analyze_start_position_hint` and consumed by
/// `next_literal_start_from_hint` to choose the next candidate start offset.
struct StartPositionHint {
/// True when one leading alternative begins with a non-multiline start
/// assertion, which makes offset 0 a candidate start.
includes_input_start: bool,
/// Distinct literal code units that begin the other leading alternatives.
/// `analyze_start_position_hint` only returns a hint when this holds exactly
/// one entry (and `includes_input_start` is true).
literal_code_units: Vec<u16>,
}
/// A simple pattern that can be scanned without the full VM.
enum SimpleScan {
/// A single character class.
@@ -906,6 +964,95 @@ fn first_char_at(instructions: &[Instruction], pc: usize) -> Option<(u32, bool)>
}
}
/// How a single leading alternative must begin, as classified by
/// `leading_alternative_start_at`.
enum LeadingAlternativeStart {
/// The alternative begins with `AssertStart { multiline: false }`.
InputStart,
/// The alternative begins with a `Char` whose value fits in 16 bits; the
/// payload is that value narrowed to `u16`.
LiteralCodeUnit(u16),
}
/// Classify how the alternative beginning at `pc` must start.
///
/// `Save` and `Nop` instructions are stepped over. The first other
/// instruction decides the result:
///
/// * `AssertStart { multiline: false }` yields `InputStart`;
/// * `Char(c)` with `c <= 0xFFFF` yields `LiteralCodeUnit(c as u16)`;
/// * anything else, or running off the end of `instructions`, yields `None`.
///
/// NOTE(review): only the per-instruction `multiline` flag is inspected here.
/// If a program-level multiline flag can also make `^` match at line starts,
/// the caller has to rule that out before trusting `InputStart` — confirm.
fn leading_alternative_start_at(
    instructions: &[Instruction],
    pc: usize,
) -> Option<LeadingAlternativeStart> {
    let mut cursor = pc;
    loop {
        match instructions.get(cursor)? {
            // Bookkeeping only; the real first instruction comes later.
            Instruction::Save(_) | Instruction::Nop => cursor += 1,
            Instruction::AssertStart { multiline: false } => {
                return Some(LeadingAlternativeStart::InputStart);
            }
            Instruction::Char(c) if *c <= 0xFFFF => {
                return Some(LeadingAlternativeStart::LiteralCodeUnit(*c as u16));
            }
            _ => return None,
        }
    }
}
/// Try to derive a [`StartPositionHint`] from the split chain beginning at
/// `start`.
///
/// The instruction at `start` must be a `Split`. The chain is walked by
/// classifying each split's `prefer` target with
/// `leading_alternative_start_at` and then continuing at its `other` target;
/// the first non-`Split` instruction reached is classified as the final
/// alternative. If any alternative cannot be classified, or an index falls
/// outside `instructions`, the result is `None`.
///
/// A hint is returned only for the narrow `(^|literal)` shape: the input-start
/// alternative must be present together with exactly one distinct literal
/// code unit.
fn analyze_start_position_hint(
    instructions: &[Instruction],
    start: usize,
) -> Option<StartPositionHint> {
    if !matches!(instructions.get(start)?, Instruction::Split { .. }) {
        return None;
    }

    let mut includes_input_start = false;
    let mut literal_code_units: Vec<u16> = Vec::new();
    let mut cursor = start;

    loop {
        // A split contributes its preferred branch and continues at the other
        // branch; anything else is the last alternative of the chain.
        let (alternative_pc, continuation) = match instructions.get(cursor)? {
            Instruction::Split { prefer, other } => (*prefer as usize, Some(*other as usize)),
            _ => (cursor, None),
        };

        match leading_alternative_start_at(instructions, alternative_pc)? {
            LeadingAlternativeStart::InputStart => includes_input_start = true,
            LeadingAlternativeStart::LiteralCodeUnit(unit) => {
                if !literal_code_units.contains(&unit) {
                    literal_code_units.push(unit);
                }
            }
        }

        match continuation {
            Some(next_pc) => cursor = next_pc,
            None => break,
        }
    }

    // Keep this hint narrowly scoped to `(^|literal)` prefixes. Wider literal
    // sets require a single-pass scanner; repeated `find_code_unit()` probes
    // per literal turn miss-heavy inputs quadratic.
    if includes_input_start && literal_code_units.len() == 1 {
        Some(StartPositionHint {
            includes_input_start,
            literal_code_units,
        })
    } else {
        None
    }
}
/// Return the next candidate start offset at or after `start` according to
/// `hint`, or `None` when there is none.
///
/// Hints that do not carry exactly one literal code unit never produce a
/// candidate. Otherwise offset 0 is returned when `start == 0` and the hint
/// includes the input start; in every other case the search is delegated to
/// `input.next_literal_start(start, literal)`.
#[inline(always)]
fn next_literal_start_from_hint<I: Input>(
    input: I,
    start: usize,
    hint: &StartPositionHint,
) -> Option<usize> {
    match hint.literal_code_units.as_slice() {
        // The `^` alternative can only apply at the very beginning.
        [_] if hint.includes_input_start && start == 0 => Some(0),
        [unit] => input.next_literal_start(start, *unit),
        _ => None,
    }
}
/// Analyze the program to extract optimization hints.
pub fn analyze_pattern(program: &Program, can_match_empty: bool) -> PatternHints {
// Pattern typically starts with Save(0), then the first real instruction.
@@ -955,6 +1102,12 @@ pub fn analyze_pattern(program: &Program, can_match_empty: bool) -> PatternHints
None
};
let start_position_hint = if !program.unicode {
analyze_start_position_hint(&program.instructions, filter_offset)
} else {
None
};
let (starts_with_anchor, anchor_multiline) = match first_inst {
Some(Instruction::AssertStart { multiline }) => (true, *multiline || program.multiline),
_ => (false, false),
@@ -1019,6 +1172,7 @@ pub fn analyze_pattern(program: &Program, can_match_empty: bool) -> PatternHints
PatternHints {
first_char,
first_filter,
start_position_hint,
starts_with_anchor,
anchor_multiline,
trailing_literal,