mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-25 17:25:08 +02:00
LibRegex: Optimize (^|literal) split prefixes
Patterns like `(?:^|;)\s*foo=...` can only start matching at input start or at occurrences of the separator, but the generic start-position loop still entered the VM at each byte and paid the leading split/backtrack cost on every miss. Teach the start-position analysis to recognize this `(^|literal)` shape and jump straight to those candidate positions. Keep the optimization narrow: wider literal sets would need a single-pass scanner, and rescanning once per literal would make miss-heavy alternations quadratic. Add a LibRegex test for the cookie-style prefix. TestRegex still passes, and a release js benchmark exercising this shape remains fast.
This commit is contained in:
committed by
Andreas Kling
parent
275e141823
commit
3efe8043f7
Notes:
github-actions[bot]
2026-03-29 14:09:14 +00:00
Author: https://github.com/awesomekling Commit: https://github.com/LadybirdBrowser/ladybird/commit/3efe8043f70 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/8667
@@ -485,6 +485,33 @@ pub fn find_all_with_scratch<I: Input>(
|
||||
let mut count = 0i32;
|
||||
let mut pos = start_pos;
|
||||
|
||||
if let Some(ref start_hint) = hints.start_position_hint
|
||||
&& !program.unicode
|
||||
{
|
||||
while let Some(candidate_pos) = next_literal_start_from_hint(input, pos, start_hint) {
|
||||
vm.reset(candidate_pos);
|
||||
match vm.run() {
|
||||
VmResult::Match => {
|
||||
let match_start = vm.registers[0];
|
||||
let match_end = vm.registers[1];
|
||||
let idx = count as usize * 2;
|
||||
if idx + 1 >= capacity {
|
||||
return -1;
|
||||
}
|
||||
result_buf[idx] = match_start;
|
||||
result_buf[idx + 1] = match_end;
|
||||
count += 1;
|
||||
pos = next_search_position(match_start, match_end);
|
||||
}
|
||||
VmResult::LimitExceeded => return -2,
|
||||
VmResult::NoMatch => {
|
||||
pos = candidate_pos + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
// Fast path: non-unicode pattern starting with a literal character.
|
||||
if let Some((ch, false)) = hints.first_char
|
||||
&& !program.unicode
|
||||
@@ -623,6 +650,29 @@ fn execute_into_impl<I: Input>(
|
||||
let mut vm = Vm::new(program, input, start_pos, scratch);
|
||||
let mut hit_limit = false;
|
||||
|
||||
if let Some(ref start_hint) = hints.start_position_hint
|
||||
&& !program.unicode
|
||||
{
|
||||
let mut pos = start_pos;
|
||||
while let Some(candidate_pos) = next_literal_start_from_hint(input, pos, start_hint) {
|
||||
vm.reset(candidate_pos);
|
||||
match vm.run() {
|
||||
VmResult::Match => {
|
||||
copy_captures_to_out(vm.registers, program.capture_count, out);
|
||||
return VmResult::Match;
|
||||
}
|
||||
VmResult::LimitExceeded => hit_limit = true,
|
||||
VmResult::NoMatch => {}
|
||||
}
|
||||
pos = candidate_pos + 1;
|
||||
}
|
||||
return if hit_limit {
|
||||
VmResult::LimitExceeded
|
||||
} else {
|
||||
VmResult::NoMatch
|
||||
};
|
||||
}
|
||||
|
||||
// Fast path: non-unicode pattern starting with a literal character.
|
||||
// Use iter().position() for bulk scanning (LLVM can auto-vectorize this).
|
||||
if let Some((ch, false)) = hints.first_char
|
||||
@@ -827,6 +877,9 @@ pub struct PatternHints {
|
||||
first_char: Option<(u32, bool)>,
|
||||
/// First instruction filter: skip positions where the first matcher can't match.
|
||||
first_filter: Option<SimpleMatch>,
|
||||
/// Leading alternatives that can only begin at position 0 or at one of a
|
||||
/// small set of literal code units.
|
||||
start_position_hint: Option<StartPositionHint>,
|
||||
/// Pattern starts with ^ (AssertStart) — only try at line starts (or input start).
|
||||
starts_with_anchor: bool,
|
||||
/// Whether the anchor is multiline (^ matches at line starts, not just input start).
|
||||
@@ -842,6 +895,11 @@ pub struct PatternHints {
|
||||
can_match_empty: bool,
|
||||
}
|
||||
|
||||
/// Describes the only positions at which a pattern with a `(^|literal)`-style
/// prefix can begin to match: the start of the input and/or the occurrences
/// of a small set of literal code units.
struct StartPositionHint {
    /// True when one leading alternative begins with a non-multiline `^`.
    includes_input_start: bool,
    /// Distinct 16-bit literal code units that begin the other leading
    /// alternatives, in the order they were discovered.
    literal_code_units: Vec<u16>,
}
|
||||
|
||||
/// A simple pattern that can be scanned without the full VM.
|
||||
enum SimpleScan {
|
||||
/// A single character class.
|
||||
@@ -906,6 +964,95 @@ fn first_char_at(instructions: &[Instruction], pc: usize) -> Option<(u32, bool)>
|
||||
}
|
||||
}
|
||||
|
||||
/// How a single leading alternative of a pattern begins.
enum LeadingAlternativeStart {
    /// The alternative begins with a non-multiline start assertion (`^`).
    InputStart,
    /// The alternative begins with this literal 16-bit code unit.
    LiteralCodeUnit(u16),
}
|
||||
|
||||
fn leading_alternative_start_at(
|
||||
instructions: &[Instruction],
|
||||
pc: usize,
|
||||
) -> Option<LeadingAlternativeStart> {
|
||||
match instructions.get(pc)? {
|
||||
Instruction::AssertStart { multiline: false } => Some(LeadingAlternativeStart::InputStart),
|
||||
Instruction::Char(c) if *c <= 0xFFFF => {
|
||||
Some(LeadingAlternativeStart::LiteralCodeUnit(*c as u16))
|
||||
}
|
||||
Instruction::Save(_) | Instruction::Nop => {
|
||||
leading_alternative_start_at(instructions, pc + 1)
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn analyze_start_position_hint(
|
||||
instructions: &[Instruction],
|
||||
start: usize,
|
||||
) -> Option<StartPositionHint> {
|
||||
let Instruction::Split { .. } = instructions.get(start)? else {
|
||||
return None;
|
||||
};
|
||||
|
||||
let mut hint = StartPositionHint {
|
||||
includes_input_start: false,
|
||||
literal_code_units: Vec::new(),
|
||||
};
|
||||
let mut pc = start;
|
||||
|
||||
loop {
|
||||
match instructions.get(pc)? {
|
||||
Instruction::Split { prefer, other } => {
|
||||
match leading_alternative_start_at(instructions, *prefer as usize)? {
|
||||
LeadingAlternativeStart::InputStart => hint.includes_input_start = true,
|
||||
LeadingAlternativeStart::LiteralCodeUnit(ch) => {
|
||||
if !hint.literal_code_units.contains(&ch) {
|
||||
hint.literal_code_units.push(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
pc = *other as usize;
|
||||
}
|
||||
_ => {
|
||||
match leading_alternative_start_at(instructions, pc)? {
|
||||
LeadingAlternativeStart::InputStart => hint.includes_input_start = true,
|
||||
LeadingAlternativeStart::LiteralCodeUnit(ch) => {
|
||||
if !hint.literal_code_units.contains(&ch) {
|
||||
hint.literal_code_units.push(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Keep this hint narrowly scoped to `(^|literal)` prefixes. Wider literal
|
||||
// sets require a single-pass scanner; repeated `find_code_unit()` probes
|
||||
// per literal turn miss-heavy inputs quadratic.
|
||||
if hint.includes_input_start && hint.literal_code_units.len() == 1 {
|
||||
Some(hint)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn next_literal_start_from_hint<I: Input>(
|
||||
input: I,
|
||||
start: usize,
|
||||
hint: &StartPositionHint,
|
||||
) -> Option<usize> {
|
||||
let [literal] = hint.literal_code_units.as_slice() else {
|
||||
return None;
|
||||
};
|
||||
|
||||
if hint.includes_input_start && start == 0 {
|
||||
return Some(0);
|
||||
}
|
||||
|
||||
input.next_literal_start(start, *literal)
|
||||
}
|
||||
|
||||
/// Analyze the program to extract optimization hints.
|
||||
pub fn analyze_pattern(program: &Program, can_match_empty: bool) -> PatternHints {
|
||||
// Pattern typically starts with Save(0), then the first real instruction.
|
||||
@@ -955,6 +1102,12 @@ pub fn analyze_pattern(program: &Program, can_match_empty: bool) -> PatternHints
|
||||
None
|
||||
};
|
||||
|
||||
let start_position_hint = if !program.unicode {
|
||||
analyze_start_position_hint(&program.instructions, filter_offset)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let (starts_with_anchor, anchor_multiline) = match first_inst {
|
||||
Some(Instruction::AssertStart { multiline }) => (true, *multiline || program.multiline),
|
||||
_ => (false, false),
|
||||
@@ -1019,6 +1172,7 @@ pub fn analyze_pattern(program: &Program, can_match_empty: bool) -> PatternHints
|
||||
PatternHints {
|
||||
first_char,
|
||||
first_filter,
|
||||
start_position_hint,
|
||||
starts_with_anchor,
|
||||
anchor_multiline,
|
||||
trailing_literal,
|
||||
|
||||
Reference in New Issue
Block a user