diff --git a/Libraries/LibRegex/Rust/src/compiler.rs b/Libraries/LibRegex/Rust/src/compiler.rs index 4529465bb0b..975ab27965e 100644 --- a/Libraries/LibRegex/Rust/src/compiler.rs +++ b/Libraries/LibRegex/Rust/src/compiler.rs @@ -58,6 +58,34 @@ struct Compiler { } impl Compiler { + fn append_ascii_char_range(char_ranges: &mut Vec<CharRange>, start: u8, end: u8) { + char_ranges.push(CharRange { + start: u32::from(start), + end: u32::from(end), + }); + } + + fn append_builtin_class_ranges_for_legacy_positive_class( + char_ranges: &mut Vec<CharRange>, + builtin_class: BuiltinCharacterClass, + ) -> bool { + match builtin_class { + BuiltinCharacterClass::Digit => { + Self::append_ascii_char_range(char_ranges, b'0', b'9'); + true + } + BuiltinCharacterClass::Word => { + // Legacy `\w` is ASCII-only: `[A-Za-z0-9_]`. + Self::append_ascii_char_range(char_ranges, b'0', b'9'); + Self::append_ascii_char_range(char_ranges, b'A', b'Z'); + Self::append_ascii_char_range(char_ranges, b'_', b'_'); + Self::append_ascii_char_range(char_ranges, b'a', b'z'); + true + } + _ => false, + } + } + fn new(pattern: &Pattern) -> Self { // Registers: 0-1 for group 0, then 2 per capture group. 
let register_count = 2 + pattern.capture_count * 2; @@ -951,6 +979,7 @@ impl Compiler { let mut has_builtin = false; let split_surrogates = !self.program.unicode && !self.program.unicode_sets; + let can_inline_builtin_ranges = !cc.negated && !self.program.unicode && !self.program.unicode_sets; for r in ranges { match r { CharacterClassRange::Single(cp) => { @@ -966,6 +995,12 @@ impl Compiler { CharacterClassRange::Range(lo, hi) => { char_ranges.push(CharRange { start: *lo, end: *hi }); } + CharacterClassRange::BuiltinClass(class) + if can_inline_builtin_ranges + && Self::append_builtin_class_ranges_for_legacy_positive_class( + &mut char_ranges, + *class, + ) => {} CharacterClassRange::BuiltinClass(_) | CharacterClassRange::UnicodeProperty(_) => { has_builtin = true; } diff --git a/Tests/LibRegex/TestRegex.cpp b/Tests/LibRegex/TestRegex.cpp index 126f5a2aeae..383af0700bb 100644 --- a/Tests/LibRegex/TestRegex.cpp +++ b/Tests/LibRegex/TestRegex.cpp @@ -201,6 +201,24 @@ TEST_CASE(unicode_ignore_case_word_boundary_literal_preserves_behavior) EXPECT_EQ(regex.test(u"\u212A"sv, 0), regex::MatchResult::Match); } +TEST_CASE(mixed_positive_class_with_word_builtin_preserves_legacy_ignore_case_behavior) +{ + auto regex = MUST(regex::ECMAScriptRegex::compile("[\\w\\$]+"sv, { .ignore_case = true })); + + EXPECT_EQ(regex.test("AZ_09$"sv, 0), regex::MatchResult::Match); + EXPECT_EQ(regex.test(u"\u017F"sv, 0), regex::MatchResult::NoMatch); + EXPECT_EQ(regex.test(u"\u212A"sv, 0), regex::MatchResult::NoMatch); +} + +TEST_CASE(mixed_positive_class_with_digit_builtin_preserves_behavior) +{ + auto regex = MUST(regex::ECMAScriptRegex::compile("[A-Z\\d-]+"sv, { .ignore_case = true })); + + EXPECT_EQ(regex.test("ABC-123"sv, 0), regex::MatchResult::Match); + EXPECT_EQ(regex.test("abc"sv, 0), regex::MatchResult::Match); + EXPECT_EQ(regex.test("!"sv, 0), regex::MatchResult::NoMatch); +} + TEST_CASE(find_all_returns_non_overlapping_matches) { auto regex = 
MUST(regex::ECMAScriptRegex::compile("aba"sv, {}));