mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-28 10:37:17 +02:00
LibRegex: Support UTF-16 RegexStringView and improve Unicode matching
When the Unicode option is not set, regular expressions should match based on code units; when it is set, they should match based on code points. To do so, the regex parser must combine surrogate pairs into a single code point when the Unicode option is set. Further, RegexStringView needs to know whether the flag is set in order to return code-point-based or code-unit-based string lengths and substrings.
This commit is contained in:
committed by
Linus Groh
parent
2e45e52993
commit
47f6bb38a1
Notes:
sideshowbarker
2024-07-18 08:26:31 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/47f6bb38a1b Pull-request: https://github.com/SerenityOS/serenity/pull/8931 Reviewed-by: https://github.com/linusg ✅
@@ -10,6 +10,7 @@
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/StringUtils.h>
|
||||
#include <AK/Utf16View.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
@@ -1440,13 +1441,31 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
||||
|
||||
if (try_skip("u")) {
|
||||
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) {
|
||||
// FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode.
|
||||
// In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
|
||||
// rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,
|
||||
// but doesn't form a valid surrogate pair, insert bytecode for both code units individually.
|
||||
Optional<u32> low_surrogate;
|
||||
if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) {
|
||||
low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4);
|
||||
if (!low_surrogate.has_value()) {
|
||||
set_error(Error::InvalidPattern);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (Utf16View::is_low_surrogate(*low_surrogate)) {
|
||||
*code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate);
|
||||
low_surrogate.clear();
|
||||
}
|
||||
}
|
||||
|
||||
match_length_minimum += 1;
|
||||
StringBuilder builder;
|
||||
builder.append_code_point(code_point.value());
|
||||
// FIXME: This isn't actually correct for ECMAScript.
|
||||
auto u8_encoded = builder.string_view();
|
||||
stack.insert_bytecode_compare_string(u8_encoded);
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
|
||||
|
||||
if (low_surrogate.has_value()) {
|
||||
match_length_minimum += 1;
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } });
|
||||
}
|
||||
|
||||
return true;
|
||||
} else if (!unicode) {
|
||||
// '\u' is allowed in non-unicode mode, just matches 'u'.
|
||||
|
||||
Reference in New Issue
Block a user