LibRegex: Properly track code units in u/v modes

Previously, both string_position and view_index held code unit offsets
regardless of mode. In unicode mode, these variables now track code
point positions, while string_position_in_code_units is updated
separately to hold the corresponding code unit offset.
This commit is contained in:
aplefull
2025-10-22 13:40:15 +02:00
committed by Ali Mohammad Pur
parent fb258639d1
commit 5632a52531
Notes: github-actions[bot] 2025-10-24 19:24:41 +00:00
2 changed files with 51 additions and 3 deletions

View File

@@ -237,10 +237,17 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
input.view = view;
dbgln_if(REGEX_DEBUG, "[match] Starting match with view ({}): _{}_", view.length(), view);
auto view_length = view.length_in_code_units();
auto view_length = view.length();
size_t view_index = m_pattern->start_offset;
state.string_position = view_index;
state.string_position_in_code_units = view_index;
if (view.unicode()) {
if (view_index < view_length)
state.string_position_in_code_units = view.code_unit_offset_of(view_index);
else
state.string_position_in_code_units = view.length_in_code_units();
} else {
state.string_position_in_code_units = view_index;
}
bool succeeded = false;
if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
@@ -303,7 +310,14 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
input.match_index = match_count;
state.string_position = view_index;
state.string_position_in_code_units = view_index;
if (input.view.unicode()) {
if (view_index < view_length)
state.string_position_in_code_units = input.view.code_unit_offset_of(view_index);
else
state.string_position_in_code_units = input.view.length_in_code_units();
} else {
state.string_position_in_code_units = view_index;
}
state.instruction_position = 0;
state.repetition_marks.clear();