LibRegex: Inline \w and \d ranges in legacy positive char classes

Previously, a character class containing any builtin (\d, \w, \s)
forced the compiler down the slow "complex class" path, which emits
a disjunction of alternatives and backtracks at runtime.

For non-unicode, non-unicode-sets, non-negated classes, \w and \d
can be inlined as their raw ASCII code-point ranges. The resulting
class stays on the fast path and compiles into a single sorted
CharClass instruction.

The unicode/unicode_sets and negation guards are required for
correctness: with the /u + /i flags, \w gains non-ASCII members
via case folding (e.g. U+017F, U+212A), and negated classes have
a separate, smarter compilation path.
This commit is contained in:
Aliaksandr Kalenik
2026-04-21 15:22:00 +02:00
committed by Andreas Kling
parent 8c4870f207
commit 7685a5e14a
Notes: github-actions[bot] 2026-04-21 14:37:38 +00:00
2 changed files with 53 additions and 0 deletions

View File

@@ -58,6 +58,34 @@ struct Compiler {
}
impl Compiler {
/// Appends one inclusive range to `char_ranges`, widening the two ASCII byte
/// endpoints to `u32` code points.
fn append_ascii_char_range(char_ranges: &mut Vec<CharRange>, start: u8, end: u8) {
    let (start, end) = (u32::from(start), u32::from(end));
    char_ranges.push(CharRange { start, end });
}
/// Expands a builtin class into explicit ASCII ranges appended to `char_ranges`.
///
/// Returns `true` when `builtin_class` was expanded (`Digit` and `Word`).
/// Returns `false` for every other builtin, leaving `char_ranges` untouched,
/// so the caller can fall back to its generic builtin handling.
fn append_builtin_class_ranges_for_legacy_positive_class(
    char_ranges: &mut Vec<CharRange>,
    builtin_class: BuiltinCharacterClass,
) -> bool {
    // `\d` expands to `[0-9]`.
    const DIGIT_RANGES: &[(u8, u8)] = &[(b'0', b'9')];
    // Legacy `\w` is ASCII-only: `[A-Za-z0-9_]`, listed in ascending
    // code-point order.
    const WORD_RANGES: &[(u8, u8)] = &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')];

    let ascii_ranges = match builtin_class {
        BuiltinCharacterClass::Digit => DIGIT_RANGES,
        BuiltinCharacterClass::Word => WORD_RANGES,
        // Any other builtin is not inlined here.
        _ => return false,
    };

    for &(lo, hi) in ascii_ranges {
        Self::append_ascii_char_range(char_ranges, lo, hi);
    }
    true
}
fn new(pattern: &Pattern) -> Self {
// Registers: 0-1 for group 0, then 2 per capture group.
let register_count = 2 + pattern.capture_count * 2;
@@ -951,6 +979,7 @@ impl Compiler {
let mut has_builtin = false;
let split_surrogates = !self.program.unicode && !self.program.unicode_sets;
let can_inline_builtin_ranges = !cc.negated && !self.program.unicode && !self.program.unicode_sets;
for r in ranges {
match r {
CharacterClassRange::Single(cp) => {
@@ -966,6 +995,12 @@ impl Compiler {
CharacterClassRange::Range(lo, hi) => {
char_ranges.push(CharRange { start: *lo, end: *hi });
}
CharacterClassRange::BuiltinClass(class)
if can_inline_builtin_ranges
&& Self::append_builtin_class_ranges_for_legacy_positive_class(
&mut char_ranges,
*class,
) => {}
CharacterClassRange::BuiltinClass(_) | CharacterClassRange::UnicodeProperty(_) => {
has_builtin = true;
}

View File

@@ -201,6 +201,24 @@ TEST_CASE(unicode_ignore_case_word_boundary_literal_preserves_behavior)
EXPECT_EQ(regex.test(u"\u212A"sv, 0), regex::MatchResult::Match);
}
// A positive class mixing `\w` with a literal, compiled with ignore-case only
// (no unicode flags), must keep `\w` ASCII-only.
TEST_CASE(mixed_positive_class_with_word_builtin_preserves_legacy_ignore_case_behavior)
{
    auto word_or_dollar = MUST(regex::ECMAScriptRegex::compile("[\\w\\$]+"sv, { .ignore_case = true }));

    // Plain ASCII word characters and the literal `$` are accepted.
    EXPECT_EQ(word_or_dollar.test("AZ_09$"sv, 0), regex::MatchResult::Match);

    // U+017F (long s) and U+212A (Kelvin sign) must not match here.
    EXPECT_EQ(word_or_dollar.test(u"\u017F"sv, 0), regex::MatchResult::NoMatch);
    EXPECT_EQ(word_or_dollar.test(u"\u212A"sv, 0), regex::MatchResult::NoMatch);
}
// A positive class mixing an explicit range, `\d` and a literal `-`, compiled
// with ignore-case, must still honour every member of the class.
TEST_CASE(mixed_positive_class_with_digit_builtin_preserves_behavior)
{
    auto upper_digit_or_dash = MUST(regex::ECMAScriptRegex::compile("[A-Z\\d-]+"sv, { .ignore_case = true }));

    // Upper-case letters, the literal dash and digits all match.
    EXPECT_EQ(upper_digit_or_dash.test("ABC-123"sv, 0), regex::MatchResult::Match);

    // Lower-case input is expected to match `A-Z` because ignore-case is set.
    EXPECT_EQ(upper_digit_or_dash.test("abc"sv, 0), regex::MatchResult::Match);

    // A character outside the class is rejected.
    EXPECT_EQ(upper_digit_or_dash.test("!"sv, 0), regex::MatchResult::NoMatch);
}
TEST_CASE(find_all_returns_non_overlapping_matches)
{
auto regex = MUST(regex::ECMAScriptRegex::compile("aba"sv, {}));