mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-25 17:25:08 +02:00
LibRegex: Inline \w and \d ranges in legacy positive char classes
Previously, a character class containing any builtin (\d, \w, \s) forced the compiler down the slow "complex class" path, which emits a disjunction of alternatives and backtracks at runtime. For non-unicode, non-unicode-sets, non-negated classes, \w and \d can be inlined as their raw ASCII code-point ranges. The resulting class stays on the fast path and compiles into a single sorted CharClass instruction. The unicode/unicode_sets and negation guards are required for correctness: with the /u + /i flags, \w gains non-ASCII members via case folding (e.g. U+017F, U+212A), and negated classes have a separate, smarter compilation path.
This commit is contained in:
committed by
Andreas Kling
parent
8c4870f207
commit
7685a5e14a
Notes:
github-actions[bot]
2026-04-21 14:37:38 +00:00
Author: https://github.com/kalenikaliaksandr Commit: https://github.com/LadybirdBrowser/ladybird/commit/7685a5e14af Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/9011
@@ -58,6 +58,34 @@ struct Compiler {
|
||||
}
|
||||
|
||||
impl Compiler {
|
||||
fn append_ascii_char_range(char_ranges: &mut Vec<CharRange>, start: u8, end: u8) {
|
||||
char_ranges.push(CharRange {
|
||||
start: u32::from(start),
|
||||
end: u32::from(end),
|
||||
});
|
||||
}
|
||||
|
||||
fn append_builtin_class_ranges_for_legacy_positive_class(
|
||||
char_ranges: &mut Vec<CharRange>,
|
||||
builtin_class: BuiltinCharacterClass,
|
||||
) -> bool {
|
||||
match builtin_class {
|
||||
BuiltinCharacterClass::Digit => {
|
||||
Self::append_ascii_char_range(char_ranges, b'0', b'9');
|
||||
true
|
||||
}
|
||||
BuiltinCharacterClass::Word => {
|
||||
// Legacy `\w` is ASCII-only: `[A-Za-z0-9_]`.
|
||||
Self::append_ascii_char_range(char_ranges, b'0', b'9');
|
||||
Self::append_ascii_char_range(char_ranges, b'A', b'Z');
|
||||
Self::append_ascii_char_range(char_ranges, b'_', b'_');
|
||||
Self::append_ascii_char_range(char_ranges, b'a', b'z');
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn new(pattern: &Pattern) -> Self {
|
||||
// Registers: 0-1 for group 0, then 2 per capture group.
|
||||
let register_count = 2 + pattern.capture_count * 2;
|
||||
@@ -951,6 +979,7 @@ impl Compiler {
|
||||
let mut has_builtin = false;
|
||||
|
||||
let split_surrogates = !self.program.unicode && !self.program.unicode_sets;
|
||||
let can_inline_builtin_ranges = !cc.negated && !self.program.unicode && !self.program.unicode_sets;
|
||||
for r in ranges {
|
||||
match r {
|
||||
CharacterClassRange::Single(cp) => {
|
||||
@@ -966,6 +995,12 @@ impl Compiler {
|
||||
CharacterClassRange::Range(lo, hi) => {
|
||||
char_ranges.push(CharRange { start: *lo, end: *hi });
|
||||
}
|
||||
CharacterClassRange::BuiltinClass(class)
|
||||
if can_inline_builtin_ranges
|
||||
&& Self::append_builtin_class_ranges_for_legacy_positive_class(
|
||||
&mut char_ranges,
|
||||
*class,
|
||||
) => {}
|
||||
CharacterClassRange::BuiltinClass(_) | CharacterClassRange::UnicodeProperty(_) => {
|
||||
has_builtin = true;
|
||||
}
|
||||
|
||||
@@ -201,6 +201,24 @@ TEST_CASE(unicode_ignore_case_word_boundary_literal_preserves_behavior)
|
||||
EXPECT_EQ(regex.test(u"\u212A"sv, 0), regex::MatchResult::Match);
|
||||
}
|
||||
|
||||
// A positive, non-unicode class that mixes `\w` with another member must keep
// legacy (ASCII-only) `\w` semantics even when matching case-insensitively.
TEST_CASE(mixed_positive_class_with_word_builtin_preserves_legacy_ignore_case_behavior)
{
    auto word_or_dollar = MUST(regex::ECMAScriptRegex::compile("[\\w\\$]+"sv, { .ignore_case = true }));

    // Every ASCII word character plus the explicitly listed `$` is accepted.
    EXPECT_EQ(word_or_dollar.test("AZ_09$"sv, 0), regex::MatchResult::Match);

    // U+017F and U+212A must not match: without the unicode flag, `\w` does
    // not pick up non-ASCII members through case folding.
    EXPECT_EQ(word_or_dollar.test(u"\u017F"sv, 0), regex::MatchResult::NoMatch);
    EXPECT_EQ(word_or_dollar.test(u"\u212A"sv, 0), regex::MatchResult::NoMatch);
}
|
||||
|
||||
// A positive class that mixes an explicit range, `\d` and a literal `-` must
// behave the same as before under case-insensitive matching.
TEST_CASE(mixed_positive_class_with_digit_builtin_preserves_behavior)
{
    auto upper_digit_or_dash = MUST(regex::ECMAScriptRegex::compile("[A-Z\\d-]+"sv, { .ignore_case = true }));

    // Upper-case letters, the literal `-` and digits are all class members.
    EXPECT_EQ(upper_digit_or_dash.test("ABC-123"sv, 0), regex::MatchResult::Match);
    // Lower-case input is expected to match `A-Z` because of `ignore_case`.
    EXPECT_EQ(upper_digit_or_dash.test("abc"sv, 0), regex::MatchResult::Match);
    // A character outside every member of the class is rejected.
    EXPECT_EQ(upper_digit_or_dash.test("!"sv, 0), regex::MatchResult::NoMatch);
}
|
||||
|
||||
TEST_CASE(find_all_returns_non_overlapping_matches)
|
||||
{
|
||||
auto regex = MUST(regex::ECMAScriptRegex::compile("aba"sv, {}));
|
||||
|
||||
Reference in New Issue
Block a user