LibRegex: Add support for string literals in character classes

This commit is contained in:
aplefull
2025-11-25 17:32:35 +01:00
committed by Ali Mohammad Pur
parent a49c39de32
commit eed4dd3745
Notes: github-actions[bot] 2025-11-26 10:35:42 +00:00
4 changed files with 230 additions and 25 deletions

View File

@@ -34,6 +34,15 @@ static constexpr StringView identity_escape_characters(bool unicode, bool browse
return "^$\\.*+?()[]{}|"sv;
}
static bool has_multiple_code_points(String const& str)
{
Utf8View utf8_view { str.bytes_as_string_view() };
auto it = utf8_view.begin();
if (it == utf8_view.end())
return false;
return ++it != utf8_view.end();
}
ALWAYS_INLINE bool Parser::set_error(Error error)
{
if (m_parser_state.error == Error::NoError) {
@@ -1855,7 +1864,16 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_
compares.empend(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 });
}
match_length_minimum += 1;
bool has_empty_string_set = any_of(compares, [this](auto const& compare) {
if (compare.type != CharacterCompareType::StringSet)
return false;
auto const& trie = m_parser_state.bytecode.string_set_table().get_u8_trie(compare.value);
return trie.has_metadata() && trie.metadata_value();
});
if (!has_empty_string_set)
match_length_minimum += 1;
stack.insert_bytecode_compare_values(move(compares));
return true;
}
@@ -2299,6 +2317,15 @@ Optional<u32> ECMA262Parser::parse_class_set_character()
return {};
}
if (match(TokenType::EscapeSequence)) {
auto escape_value = m_parser_state.current_token.value();
consume();
if (escape_value[0] == '\\' && escape_value.length() == 2) {
return escape_value[1];
}
}
auto start_position = tell();
ArmedScopeGuard restore { [&] { back(tell() - start_position + 1); } };
@@ -2359,6 +2386,62 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
{
auto start_position = tell();
// ClassStringDisjunction :: "\q{" ClassStringDisjunctionContents "}"
// ClassStringDisjunctionContents :: ClassString | ClassString "|" ClassStringDisjunctionContents
// ClassString :: [empty] | NonEmptyClassString
// NonEmptyClassString :: ClassCharacter NonEmptyClassString[opt]
if (try_skip("\\q"sv)) {
if (!match(TokenType::LeftCurly)) {
back(2);
return false;
}
consume();
Vector<String> strings;
StringBuilder current_string;
while (!match(TokenType::RightCurly)) {
if (done()) {
set_error(Error::MismatchingBrace);
return false;
}
if (match(TokenType::Pipe)) {
consume();
strings.append(MUST(current_string.to_string()));
current_string.clear();
continue;
}
auto character = parse_class_set_character();
if (!character.has_value()) {
if (has_error())
return false;
set_error(Error::InvalidCharacterClass);
return false;
}
current_string.append_code_point(character.value());
}
strings.append(MUST(current_string.to_string()));
consume(TokenType::RightCurly, Error::MismatchingBrace);
bool is_negated = any_of(compares, [](auto const& compare) {
return compare.type == CharacterCompareType::Inverse;
});
if (is_negated && any_of(strings, has_multiple_code_points)) {
set_error(Error::NegatedCharacterClassStrings);
return false;
}
auto string_set_index = m_parser_state.bytecode.string_set_table().set(strings);
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
return true;
}
// ClassSetOperand :: ClassSetCharacter | ClassStringDisjunction | NestedClass
if (auto character = parse_class_set_character(); character.has_value()) {
compares.append({ CharacterCompareType::Char, character.value() });
@@ -2414,15 +2497,6 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
if (has_error())
return false;
// ClassStringDisjunction :: "\q{" ClassStringDisjunctionContents "}"
// ClassStringDisjunctionContents :: ClassString | ClassString "|" ClassStringDisjunctionContents
// ClassString :: [empty] | NonEmptyClassString
// NonEmptyClassString :: ClassCharacter NonEmptyClassString[opt]
if (try_skip("\\q{"sv)) {
// FIXME: Implement this :P
return set_error(Error::InvalidCharacterClass);
}
back(tell() - start_position + 1);
return false;
}
@@ -2491,14 +2565,17 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
return;
}
for (auto const& compare : compares) {
if (compare.type == CharacterCompareType::Inverse) {
set_error(Error::NegatedCharacterClassStrings);
return;
}
auto strings = Unicode::get_property_strings(property);
bool is_negated = any_of(compares, [](auto const& compare) {
return compare.type == CharacterCompareType::Inverse;
});
if (is_negated && any_of(strings, has_multiple_code_points)) {
set_error(Error::NegatedCharacterClassStrings);
return;
}
auto strings = Unicode::get_property_strings(property);
if (!strings.is_empty()) {
auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });