LibRegex: Leave suffix minima for repeated simple loops

Repeated simple loops like "a+".repeat(100) compile to a chain of
greedy loop instructions. When one loop failed, the VM only knew how
to give back one character at a time unless the next instruction was a
literal char, so V8's regexp-fallback case ran into the backtrack
limit instead of finding the obvious match.

When a greedy simple loop is followed by more loops for the same
matcher, sum their minimum counts and backtrack far enough to leave the
missing suffix in one step. If that suffix is already available to the
right, still give back one character so the VM makes progress instead
of reusing the same greedy state forever.

The RegExp runtime test now covers the Chromium regexp-fallback case
through exec(), global exec(), and both replace() paths, plus bounded
same-matcher chains where the suffix minimum is partly missing or
already available.
This commit is contained in:
Andreas Kling
2026-03-30 18:29:29 +02:00
committed by Andreas Kling
parent f24bdb9f94
commit c828c87408
Notes: github-actions[bot] 2026-03-31 14:01:59 +00:00
2 changed files with 266 additions and 99 deletions

View File

@@ -2702,6 +2702,116 @@ impl<'a, I: Input> Vm<'a, I> {
);
}
// Sum the minimum counts of immediately following loops that match the
// same simple atom, so a greedy loop can leave that suffix in one step.
fn same_matcher_loop_suffix_min(&self, mut pc: usize, matcher: &SimpleMatch) -> Option<u32> {
let mut total = 0u32;
let mut saw_positive_min = false;
while let Some(instruction) = self.program.instructions.get(pc) {
let min = match instruction {
Instruction::GreedyLoop {
matcher: next_matcher,
min,
..
}
| Instruction::LazyLoop {
matcher: next_matcher,
min,
..
} if next_matcher == matcher => *min,
_ => break,
};
total = total.checked_add(min)?;
saw_positive_min |= min > 0;
pc += 1;
}
if saw_positive_min { Some(total) } else { None }
}
fn count_available_same_matcher_chars(
&self,
mut pos: usize,
matcher: &SimpleMatch,
max: u32,
backward: bool,
) -> u32 {
let mut count = 0;
while count < max {
let cp = if backward {
let next_pos = self.retreat_one_char(pos);
if next_pos == pos {
break;
}
pos = next_pos;
decode_code_point(self.program.unicode, self.input, pos)
} else {
if pos >= self.input_len() {
break;
}
let cp = decode_code_point(self.program.unicode, self.input, pos);
pos = self.advance_one_char(pos);
cp
};
if !self.match_simple(cp, matcher) {
break;
}
count += 1;
}
count
}
fn same_matcher_loop_suffix_deficit(
&self,
pc: usize,
current_pos: usize,
backward: bool,
) -> Option<u32> {
let matcher = match self.program.instructions.get(pc) {
Some(Instruction::GreedyLoop { matcher, .. }) => matcher,
_ => return None,
};
let suffix_min = self.same_matcher_loop_suffix_min(pc + 1, matcher)?;
let available =
self.count_available_same_matcher_chars(current_pos, matcher, suffix_min, backward);
let deficit = suffix_min.saturating_sub(available);
(deficit > 0).then_some(deficit)
}
fn retreat_n_chars(&self, mut pos: usize, count: u32) -> Option<usize> {
if !self.program.unicode {
return pos.checked_sub(count as usize);
}
for _ in 0..count {
let next = self.retreat_one_char(pos);
if next == pos {
return None;
}
pos = next;
}
Some(pos)
}
fn advance_n_chars(&self, mut pos: usize, count: u32) -> Option<usize> {
if !self.program.unicode {
return pos.checked_add(count as usize);
}
for _ in 0..count {
let next = self.advance_one_char(pos);
if next == pos {
return None;
}
pos = next;
}
Some(pos)
}
fn backtrack(&mut self) -> bool {
if self.backtrack_stack.len() <= self.bt_floor {
return false;
@@ -2737,6 +2847,9 @@ impl<'a, I: Input> Vm<'a, I> {
self.modifiers = modifiers;
self.modifier_stack.truncate(modifier_stack_len);
let same_matcher_loop_suffix_deficit =
self.same_matcher_loop_suffix_deficit(pc as usize, current_pos, backward);
if backward {
// Backward mode (lookbehind): the greedy loop consumed
// leftward from start_pos to current_pos.
@@ -2756,62 +2869,75 @@ impl<'a, I: Input> Vm<'a, I> {
};
let next_inst = self.program.instructions.get(pc as usize + 1);
let new_pos = match next_inst {
Some(Instruction::Char(target))
if !self.modifiers.ignore_case && *target <= 0xFFFF =>
{
let target = *target as u16;
// In backward mode the next Char reads the code
// point immediately to the left of the current
// position, so scan candidate boundaries and
// check the code unit before each boundary.
let mut scan_pos = self.advance_one_char(current_pos);
loop {
if scan_pos > max_pos {
return self.backtrack();
}
if scan_pos > 0 && self.input_code_unit(scan_pos - 1) == target
{
break scan_pos;
}
let next_scan_pos = self.advance_one_char(scan_pos);
if next_scan_pos == scan_pos {
return self.backtrack();
}
scan_pos = next_scan_pos;
}
let new_pos = if let Some(suffix_deficit) = same_matcher_loop_suffix_deficit
{
let Some(pos) = self.advance_n_chars(current_pos, suffix_deficit)
else {
return self.backtrack();
};
if pos > max_pos {
return self.backtrack();
}
Some(Instruction::Char(target))
if !self.modifiers.ignore_case && *target > 0xFFFF =>
{
let hi = ((*target - 0x10000) >> 10) as u16 + 0xD800;
let lo = ((*target - 0x10000) & 0x3FF) as u16 + 0xDC00;
let mut scan_pos = self.advance_one_char(current_pos);
loop {
if scan_pos > max_pos {
pos
} else {
match next_inst {
Some(Instruction::Char(target))
if !self.modifiers.ignore_case && *target <= 0xFFFF =>
{
let target = *target as u16;
// In backward mode the next Char reads the code
// point immediately to the left of the current
// position, so scan candidate boundaries and
// check the code unit before each boundary.
let mut scan_pos = self.advance_one_char(current_pos);
loop {
if scan_pos > max_pos {
return self.backtrack();
}
if scan_pos > 0
&& self.input_code_unit(scan_pos - 1) == target
{
break scan_pos;
}
let next_scan_pos = self.advance_one_char(scan_pos);
if next_scan_pos == scan_pos {
return self.backtrack();
}
scan_pos = next_scan_pos;
}
}
Some(Instruction::Char(target))
if !self.modifiers.ignore_case && *target > 0xFFFF =>
{
let hi = ((*target - 0x10000) >> 10) as u16 + 0xD800;
let lo = ((*target - 0x10000) & 0x3FF) as u16 + 0xDC00;
let mut scan_pos = self.advance_one_char(current_pos);
loop {
if scan_pos > max_pos {
return self.backtrack();
}
if scan_pos >= 2
&& self.input_code_unit(scan_pos - 2) == hi
&& self.input_code_unit(scan_pos - 1) == lo
{
break scan_pos;
}
let next_scan_pos = self.advance_one_char(scan_pos);
if next_scan_pos == scan_pos {
return self.backtrack();
}
scan_pos = next_scan_pos;
}
}
_ => {
// Give back one character: advance rightward.
let advanced = self.advance_one_char(current_pos);
if advanced > max_pos {
return self.backtrack();
}
if scan_pos >= 2
&& self.input_code_unit(scan_pos - 2) == hi
&& self.input_code_unit(scan_pos - 1) == lo
{
break scan_pos;
}
let next_scan_pos = self.advance_one_char(scan_pos);
if next_scan_pos == scan_pos {
return self.backtrack();
}
scan_pos = next_scan_pos;
advanced
}
}
_ => {
// Give back one character: advance rightward.
let advanced = self.advance_one_char(current_pos);
if advanced > max_pos {
return self.backtrack();
}
advanced
}
};
// Push updated state if we can still give back more.
@@ -2853,59 +2979,71 @@ impl<'a, I: Input> Vm<'a, I> {
// Char, scan backward for that char to skip positions
// that can't match.
let next_inst = self.program.instructions.get(pc as usize + 1);
let new_pos = match next_inst {
Some(Instruction::Char(target))
if !self.modifiers.ignore_case && *target <= 0xFFFF =>
{
// BMP char: scan code units directly.
let target = *target as u16;
let mut scan_pos =
if current_pos > 0 { current_pos - 1 } else { 0 };
loop {
if scan_pos < min_pos {
return self.backtrack();
}
if self.input_code_unit(scan_pos) == target {
break scan_pos;
}
if scan_pos == 0 {
return self.backtrack();
}
scan_pos -= 1;
}
let new_pos = if let Some(suffix_deficit) = same_matcher_loop_suffix_deficit
{
let Some(pos) = self.retreat_n_chars(current_pos, suffix_deficit)
else {
return self.backtrack();
};
if pos < min_pos {
return self.backtrack();
}
Some(Instruction::Char(target))
if !self.modifiers.ignore_case && *target > 0xFFFF =>
{
// Supplementary char: scan for the surrogate pair.
let hi = ((*target - 0x10000) >> 10) as u16 + 0xD800;
let lo = ((*target - 0x10000) & 0x3FF) as u16 + 0xDC00;
let mut scan_pos =
if current_pos > 1 { current_pos - 2 } else { 0 };
loop {
if scan_pos < min_pos {
pos
} else {
match next_inst {
Some(Instruction::Char(target))
if !self.modifiers.ignore_case && *target <= 0xFFFF =>
{
// BMP char: scan code units directly.
let target = *target as u16;
let mut scan_pos =
if current_pos > 0 { current_pos - 1 } else { 0 };
loop {
if scan_pos < min_pos {
return self.backtrack();
}
if self.input_code_unit(scan_pos) == target {
break scan_pos;
}
if scan_pos == 0 {
return self.backtrack();
}
scan_pos -= 1;
}
}
Some(Instruction::Char(target))
if !self.modifiers.ignore_case && *target > 0xFFFF =>
{
// Supplementary char: scan for the surrogate pair.
let hi = ((*target - 0x10000) >> 10) as u16 + 0xD800;
let lo = ((*target - 0x10000) & 0x3FF) as u16 + 0xDC00;
let mut scan_pos =
if current_pos > 1 { current_pos - 2 } else { 0 };
loop {
if scan_pos < min_pos {
return self.backtrack();
}
if scan_pos + 1 < self.input_len()
&& self.input_code_unit(scan_pos) == hi
&& self.input_code_unit(scan_pos + 1) == lo
{
break scan_pos;
}
if scan_pos == 0 {
return self.backtrack();
}
scan_pos -= 1;
}
}
_ => {
// Default: retreat one character.
let retreated = self.retreat_one_char(current_pos);
if retreated < min_pos {
return self.backtrack();
}
if scan_pos + 1 < self.input_len()
&& self.input_code_unit(scan_pos) == hi
&& self.input_code_unit(scan_pos + 1) == lo
{
break scan_pos;
}
if scan_pos == 0 {
return self.backtrack();
}
scan_pos -= 1;
retreated
}
}
_ => {
// Default: retreat one character.
let retreated = self.retreat_one_char(current_pos);
if retreated < min_pos {
return self.backtrack();
}
retreated
}
};
// Push updated greedy state for further backtracking.