LibRegex: Inline \w and \d ranges in legacy positive char classes

Previously, a character class containing any builtin (\d, \w, \s) forced the compiler down the slow "complex class" path, which emits a disjunction of alternatives and backtracks at runtime. For non-unicode, non-unicode-sets, non-negated classes, \w and \d can be inlined as their raw ASCII code-point ranges. The resulting class stays on the fast path and compiles into a single sorted CharClass instruction. The unicode/unicode_sets and negation guards are required for correctness: with the /u + /i flags, \w gains non-ASCII members via case folding (e.g. U+017F, U+212A), and negated classes have a separate, smarter compilation path.
Author: https://github.com/kalenikaliaksandr Commit: https://github.com/LadybirdBrowser/ladybird/commit/7685a5e14af Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/9011
2026-04-25 17:25:08 +02:00 · 2026-04-21 15:22:00 +02:00 · 2026-04-21 14:37:38 +00:00
parent 8c4870f207
commit 7685a5e14a
2 changed files with 53 additions and 0 deletions
--- a/Libraries/LibRegex/Rust/src/compiler.rs
+++ b/Libraries/LibRegex/Rust/src/compiler.rs
@@ -58,6 +58,34 @@ struct Compiler {
 }

 impl Compiler {
+    fn append_ascii_char_range(char_ranges: &mut Vec<CharRange>, start: u8, end: u8) { // Push one `CharRange` built from two ASCII byte bounds.
+        char_ranges.push(CharRange {
+            start: u32::from(start), // lossless u8 -> u32 widening
+            end: u32::from(end), // visible callers pass the last member itself (e.g. `_`..`_` for one char)
+        });
+    }
+
+    fn append_builtin_class_ranges_for_legacy_positive_class( // Expand a `Digit`/`Word` builtin into explicit ASCII ranges.
+        char_ranges: &mut Vec<CharRange>, // output: ranges are appended, existing entries are untouched
+        builtin_class: BuiltinCharacterClass, // the builtin to expand
+    ) -> bool { // true = ranges were appended; false = nothing appended, caller must handle the class itself
+        match builtin_class {
+            BuiltinCharacterClass::Digit => {
+                Self::append_ascii_char_range(char_ranges, b'0', b'9');
+                true
+            }
+            BuiltinCharacterClass::Word => {
+                // Legacy `\w` is ASCII-only: `[A-Za-z0-9_]`.
+                Self::append_ascii_char_range(char_ranges, b'0', b'9'); // the four pushes below are in ascending code-point order
+                Self::append_ascii_char_range(char_ranges, b'A', b'Z');
+                Self::append_ascii_char_range(char_ranges, b'_', b'_'); // single-character range
+                Self::append_ascii_char_range(char_ranges, b'a', b'z');
+                true
+            }
+            _ => false, // any other builtin class: append nothing and report it as not handled
+        }
+    }
+
    fn new(pattern: &Pattern) -> Self {
        // Registers: 0-1 for group 0, then 2 per capture group.
        let register_count = 2 + pattern.capture_count * 2;
@@ -951,6 +979,7 @@ impl Compiler {
                let mut has_builtin = false;

                let split_surrogates = !self.program.unicode && !self.program.unicode_sets;
+                let can_inline_builtin_ranges = !cc.negated && !self.program.unicode && !self.program.unicode_sets;
                for r in ranges {
                    match r {
                        CharacterClassRange::Single(cp) => {
@@ -966,6 +995,12 @@ impl Compiler {
                        CharacterClassRange::Range(lo, hi) => {
                            char_ranges.push(CharRange { start: *lo, end: *hi });
                        }
+                        CharacterClassRange::BuiltinClass(class)
+                            if can_inline_builtin_ranges
+                                && Self::append_builtin_class_ranges_for_legacy_positive_class(
+                                    &mut char_ranges,
+                                    *class,
+                                ) => {}
                        CharacterClassRange::BuiltinClass(_) | CharacterClassRange::UnicodeProperty(_) => {
                            has_builtin = true;
                        }
--- a/Tests/LibRegex/TestRegex.cpp
+++ b/Tests/LibRegex/TestRegex.cpp
@@ -201,6 +201,24 @@ TEST_CASE(unicode_ignore_case_word_boundary_literal_preserves_behavior)
    EXPECT_EQ(regex.test(u"\u212A"sv, 0), regex::MatchResult::Match);
 }

+TEST_CASE(mixed_positive_class_with_word_builtin_preserves_legacy_ignore_case_behavior)
+{
+    auto regex = MUST(regex::ECMAScriptRegex::compile("[\\w\\$]+"sv, { .ignore_case = true })); // positive class mixing `\w` with a literal `$`; only ignore_case is set
+
+    EXPECT_EQ(regex.test("AZ_09$"sv, 0), regex::MatchResult::Match); // ASCII word characters plus `$` all match
+    EXPECT_EQ(regex.test(u"\u017F"sv, 0), regex::MatchResult::NoMatch); // U+017F (LATIN SMALL LETTER LONG S) must not match
+    EXPECT_EQ(regex.test(u"\u212A"sv, 0), regex::MatchResult::NoMatch); // U+212A (KELVIN SIGN) must not match
+}
+
+TEST_CASE(mixed_positive_class_with_digit_builtin_preserves_behavior)
+{
+    auto regex = MUST(regex::ECMAScriptRegex::compile("[A-Z\\d-]+"sv, { .ignore_case = true })); // positive class mixing a range, `\d`, and a literal `-`
+
+    EXPECT_EQ(regex.test("ABC-123"sv, 0), regex::MatchResult::Match); // uppercase letters, `-`, and digits are all members
+    EXPECT_EQ(regex.test("abc"sv, 0), regex::MatchResult::Match); // lowercase input is expected to match `A-Z` because ignore_case is set
+    EXPECT_EQ(regex.test("!"sv, 0), regex::MatchResult::NoMatch); // a character outside the class must not match
+}
+
 TEST_CASE(find_all_returns_non_overlapping_matches)
 {
    auto regex = MUST(regex::ECMAScriptRegex::compile("aba"sv, {}));