mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-25 17:25:08 +02:00
LibRegex: Leave suffix minima for repeated simple loops
Repeated simple loops like "a+".repeat(100) compile to a chain of greedy loop instructions. When one loop failed, the VM only knew how to give back one character at a time unless the next instruction was a literal char, so V8's regexp-fallback case ran into the backtrack limit instead of finding the obvious match. When a greedy simple loop is followed by more loops for the same matcher, sum their minimum counts and backtrack far enough to leave the missing suffix in one step. If that suffix is already available to the right, still give back one character so the VM makes progress instead of reusing the same greedy state forever. The RegExp runtime test now covers the Chromium regexp-fallback case through exec(), global exec(), and both replace() paths, plus bounded same-matcher chains where the suffix minimum is partly missing or already available.
This commit is contained in:
committed by
Andreas Kling
parent
f24bdb9f94
commit
c828c87408
Notes:
github-actions[bot]
2026-03-31 14:01:59 +00:00
Author: https://github.com/awesomekling Commit: https://github.com/LadybirdBrowser/ladybird/commit/c828c87408b Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/8696
@@ -2702,6 +2702,116 @@ impl<'a, I: Input> Vm<'a, I> {
|
||||
);
|
||||
}
|
||||
|
||||
// Sum the minimum counts of immediately following loops that match the
|
||||
// same simple atom, so a greedy loop can leave that suffix in one step.
|
||||
fn same_matcher_loop_suffix_min(&self, mut pc: usize, matcher: &SimpleMatch) -> Option<u32> {
|
||||
let mut total = 0u32;
|
||||
let mut saw_positive_min = false;
|
||||
|
||||
while let Some(instruction) = self.program.instructions.get(pc) {
|
||||
let min = match instruction {
|
||||
Instruction::GreedyLoop {
|
||||
matcher: next_matcher,
|
||||
min,
|
||||
..
|
||||
}
|
||||
| Instruction::LazyLoop {
|
||||
matcher: next_matcher,
|
||||
min,
|
||||
..
|
||||
} if next_matcher == matcher => *min,
|
||||
_ => break,
|
||||
};
|
||||
total = total.checked_add(min)?;
|
||||
saw_positive_min |= min > 0;
|
||||
pc += 1;
|
||||
}
|
||||
|
||||
if saw_positive_min { Some(total) } else { None }
|
||||
}
|
||||
|
||||
fn count_available_same_matcher_chars(
|
||||
&self,
|
||||
mut pos: usize,
|
||||
matcher: &SimpleMatch,
|
||||
max: u32,
|
||||
backward: bool,
|
||||
) -> u32 {
|
||||
let mut count = 0;
|
||||
|
||||
while count < max {
|
||||
let cp = if backward {
|
||||
let next_pos = self.retreat_one_char(pos);
|
||||
if next_pos == pos {
|
||||
break;
|
||||
}
|
||||
pos = next_pos;
|
||||
decode_code_point(self.program.unicode, self.input, pos)
|
||||
} else {
|
||||
if pos >= self.input_len() {
|
||||
break;
|
||||
}
|
||||
let cp = decode_code_point(self.program.unicode, self.input, pos);
|
||||
pos = self.advance_one_char(pos);
|
||||
cp
|
||||
};
|
||||
|
||||
if !self.match_simple(cp, matcher) {
|
||||
break;
|
||||
}
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
count
|
||||
}
|
||||
|
||||
fn same_matcher_loop_suffix_deficit(
|
||||
&self,
|
||||
pc: usize,
|
||||
current_pos: usize,
|
||||
backward: bool,
|
||||
) -> Option<u32> {
|
||||
let matcher = match self.program.instructions.get(pc) {
|
||||
Some(Instruction::GreedyLoop { matcher, .. }) => matcher,
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
let suffix_min = self.same_matcher_loop_suffix_min(pc + 1, matcher)?;
|
||||
let available =
|
||||
self.count_available_same_matcher_chars(current_pos, matcher, suffix_min, backward);
|
||||
let deficit = suffix_min.saturating_sub(available);
|
||||
(deficit > 0).then_some(deficit)
|
||||
}
|
||||
|
||||
fn retreat_n_chars(&self, mut pos: usize, count: u32) -> Option<usize> {
|
||||
if !self.program.unicode {
|
||||
return pos.checked_sub(count as usize);
|
||||
}
|
||||
for _ in 0..count {
|
||||
let next = self.retreat_one_char(pos);
|
||||
if next == pos {
|
||||
return None;
|
||||
}
|
||||
pos = next;
|
||||
}
|
||||
Some(pos)
|
||||
}
|
||||
|
||||
fn advance_n_chars(&self, mut pos: usize, count: u32) -> Option<usize> {
|
||||
if !self.program.unicode {
|
||||
return pos.checked_add(count as usize);
|
||||
}
|
||||
for _ in 0..count {
|
||||
let next = self.advance_one_char(pos);
|
||||
if next == pos {
|
||||
return None;
|
||||
}
|
||||
pos = next;
|
||||
}
|
||||
Some(pos)
|
||||
}
|
||||
|
||||
fn backtrack(&mut self) -> bool {
|
||||
if self.backtrack_stack.len() <= self.bt_floor {
|
||||
return false;
|
||||
@@ -2737,6 +2847,9 @@ impl<'a, I: Input> Vm<'a, I> {
|
||||
self.modifiers = modifiers;
|
||||
self.modifier_stack.truncate(modifier_stack_len);
|
||||
|
||||
let same_matcher_loop_suffix_deficit =
|
||||
self.same_matcher_loop_suffix_deficit(pc as usize, current_pos, backward);
|
||||
|
||||
if backward {
|
||||
// Backward mode (lookbehind): the greedy loop consumed
|
||||
// leftward from start_pos to current_pos.
|
||||
@@ -2756,62 +2869,75 @@ impl<'a, I: Input> Vm<'a, I> {
|
||||
};
|
||||
|
||||
let next_inst = self.program.instructions.get(pc as usize + 1);
|
||||
let new_pos = match next_inst {
|
||||
Some(Instruction::Char(target))
|
||||
if !self.modifiers.ignore_case && *target <= 0xFFFF =>
|
||||
{
|
||||
let target = *target as u16;
|
||||
// In backward mode the next Char reads the code
|
||||
// point immediately to the left of the current
|
||||
// position, so scan candidate boundaries and
|
||||
// check the code unit before each boundary.
|
||||
let mut scan_pos = self.advance_one_char(current_pos);
|
||||
loop {
|
||||
if scan_pos > max_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
if scan_pos > 0 && self.input_code_unit(scan_pos - 1) == target
|
||||
{
|
||||
break scan_pos;
|
||||
}
|
||||
let next_scan_pos = self.advance_one_char(scan_pos);
|
||||
if next_scan_pos == scan_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
scan_pos = next_scan_pos;
|
||||
}
|
||||
let new_pos = if let Some(suffix_deficit) = same_matcher_loop_suffix_deficit
|
||||
{
|
||||
let Some(pos) = self.advance_n_chars(current_pos, suffix_deficit)
|
||||
else {
|
||||
return self.backtrack();
|
||||
};
|
||||
if pos > max_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
Some(Instruction::Char(target))
|
||||
if !self.modifiers.ignore_case && *target > 0xFFFF =>
|
||||
{
|
||||
let hi = ((*target - 0x10000) >> 10) as u16 + 0xD800;
|
||||
let lo = ((*target - 0x10000) & 0x3FF) as u16 + 0xDC00;
|
||||
let mut scan_pos = self.advance_one_char(current_pos);
|
||||
loop {
|
||||
if scan_pos > max_pos {
|
||||
pos
|
||||
} else {
|
||||
match next_inst {
|
||||
Some(Instruction::Char(target))
|
||||
if !self.modifiers.ignore_case && *target <= 0xFFFF =>
|
||||
{
|
||||
let target = *target as u16;
|
||||
// In backward mode the next Char reads the code
|
||||
// point immediately to the left of the current
|
||||
// position, so scan candidate boundaries and
|
||||
// check the code unit before each boundary.
|
||||
let mut scan_pos = self.advance_one_char(current_pos);
|
||||
loop {
|
||||
if scan_pos > max_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
if scan_pos > 0
|
||||
&& self.input_code_unit(scan_pos - 1) == target
|
||||
{
|
||||
break scan_pos;
|
||||
}
|
||||
let next_scan_pos = self.advance_one_char(scan_pos);
|
||||
if next_scan_pos == scan_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
scan_pos = next_scan_pos;
|
||||
}
|
||||
}
|
||||
Some(Instruction::Char(target))
|
||||
if !self.modifiers.ignore_case && *target > 0xFFFF =>
|
||||
{
|
||||
let hi = ((*target - 0x10000) >> 10) as u16 + 0xD800;
|
||||
let lo = ((*target - 0x10000) & 0x3FF) as u16 + 0xDC00;
|
||||
let mut scan_pos = self.advance_one_char(current_pos);
|
||||
loop {
|
||||
if scan_pos > max_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
if scan_pos >= 2
|
||||
&& self.input_code_unit(scan_pos - 2) == hi
|
||||
&& self.input_code_unit(scan_pos - 1) == lo
|
||||
{
|
||||
break scan_pos;
|
||||
}
|
||||
let next_scan_pos = self.advance_one_char(scan_pos);
|
||||
if next_scan_pos == scan_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
scan_pos = next_scan_pos;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Give back one character: advance rightward.
|
||||
let advanced = self.advance_one_char(current_pos);
|
||||
if advanced > max_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
if scan_pos >= 2
|
||||
&& self.input_code_unit(scan_pos - 2) == hi
|
||||
&& self.input_code_unit(scan_pos - 1) == lo
|
||||
{
|
||||
break scan_pos;
|
||||
}
|
||||
let next_scan_pos = self.advance_one_char(scan_pos);
|
||||
if next_scan_pos == scan_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
scan_pos = next_scan_pos;
|
||||
advanced
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Give back one character: advance rightward.
|
||||
let advanced = self.advance_one_char(current_pos);
|
||||
if advanced > max_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
advanced
|
||||
}
|
||||
};
|
||||
|
||||
// Push updated state if we can still give back more.
|
||||
@@ -2853,59 +2979,71 @@ impl<'a, I: Input> Vm<'a, I> {
|
||||
// Char, scan backward for that char to skip positions
|
||||
// that can't match.
|
||||
let next_inst = self.program.instructions.get(pc as usize + 1);
|
||||
let new_pos = match next_inst {
|
||||
Some(Instruction::Char(target))
|
||||
if !self.modifiers.ignore_case && *target <= 0xFFFF =>
|
||||
{
|
||||
// BMP char: scan code units directly.
|
||||
let target = *target as u16;
|
||||
let mut scan_pos =
|
||||
if current_pos > 0 { current_pos - 1 } else { 0 };
|
||||
loop {
|
||||
if scan_pos < min_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
if self.input_code_unit(scan_pos) == target {
|
||||
break scan_pos;
|
||||
}
|
||||
if scan_pos == 0 {
|
||||
return self.backtrack();
|
||||
}
|
||||
scan_pos -= 1;
|
||||
}
|
||||
let new_pos = if let Some(suffix_deficit) = same_matcher_loop_suffix_deficit
|
||||
{
|
||||
let Some(pos) = self.retreat_n_chars(current_pos, suffix_deficit)
|
||||
else {
|
||||
return self.backtrack();
|
||||
};
|
||||
if pos < min_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
Some(Instruction::Char(target))
|
||||
if !self.modifiers.ignore_case && *target > 0xFFFF =>
|
||||
{
|
||||
// Supplementary char: scan for the surrogate pair.
|
||||
let hi = ((*target - 0x10000) >> 10) as u16 + 0xD800;
|
||||
let lo = ((*target - 0x10000) & 0x3FF) as u16 + 0xDC00;
|
||||
let mut scan_pos =
|
||||
if current_pos > 1 { current_pos - 2 } else { 0 };
|
||||
loop {
|
||||
if scan_pos < min_pos {
|
||||
pos
|
||||
} else {
|
||||
match next_inst {
|
||||
Some(Instruction::Char(target))
|
||||
if !self.modifiers.ignore_case && *target <= 0xFFFF =>
|
||||
{
|
||||
// BMP char: scan code units directly.
|
||||
let target = *target as u16;
|
||||
let mut scan_pos =
|
||||
if current_pos > 0 { current_pos - 1 } else { 0 };
|
||||
loop {
|
||||
if scan_pos < min_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
if self.input_code_unit(scan_pos) == target {
|
||||
break scan_pos;
|
||||
}
|
||||
if scan_pos == 0 {
|
||||
return self.backtrack();
|
||||
}
|
||||
scan_pos -= 1;
|
||||
}
|
||||
}
|
||||
Some(Instruction::Char(target))
|
||||
if !self.modifiers.ignore_case && *target > 0xFFFF =>
|
||||
{
|
||||
// Supplementary char: scan for the surrogate pair.
|
||||
let hi = ((*target - 0x10000) >> 10) as u16 + 0xD800;
|
||||
let lo = ((*target - 0x10000) & 0x3FF) as u16 + 0xDC00;
|
||||
let mut scan_pos =
|
||||
if current_pos > 1 { current_pos - 2 } else { 0 };
|
||||
loop {
|
||||
if scan_pos < min_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
if scan_pos + 1 < self.input_len()
|
||||
&& self.input_code_unit(scan_pos) == hi
|
||||
&& self.input_code_unit(scan_pos + 1) == lo
|
||||
{
|
||||
break scan_pos;
|
||||
}
|
||||
if scan_pos == 0 {
|
||||
return self.backtrack();
|
||||
}
|
||||
scan_pos -= 1;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Default: retreat one character.
|
||||
let retreated = self.retreat_one_char(current_pos);
|
||||
if retreated < min_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
if scan_pos + 1 < self.input_len()
|
||||
&& self.input_code_unit(scan_pos) == hi
|
||||
&& self.input_code_unit(scan_pos + 1) == lo
|
||||
{
|
||||
break scan_pos;
|
||||
}
|
||||
if scan_pos == 0 {
|
||||
return self.backtrack();
|
||||
}
|
||||
scan_pos -= 1;
|
||||
retreated
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Default: retreat one character.
|
||||
let retreated = self.retreat_one_char(current_pos);
|
||||
if retreated < min_pos {
|
||||
return self.backtrack();
|
||||
}
|
||||
retreated
|
||||
}
|
||||
};
|
||||
|
||||
// Push updated greedy state for further backtracking.
|
||||
|
||||
Reference in New Issue
Block a user