mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-25 17:25:08 +02:00
LibRegex: Remove the legacy C++ ECMA-262 engine
Delete the old C++ ECMA-262 parser, optimizer, and matcher now that all in-tree users compile and execute through `ECMAScriptRegex`. Stop building the legacy engine, remove its source files and the POSIX-only fuzzers that depended on it, and update the remaining LibRegex tests to target the Rust-backed facade instead of the deleted implementation. Clean up the last includes, comments, and helper paths that only existed to support the old backend. After this commit LibRegex has a single ECMAScript engine in-tree, eliminating duplicated maintenance and unifying future regex work.
This commit is contained in:
committed by
Ali Mohammad Pur
parent
e243e146de
commit
d7bf9d3898
Notes:
github-actions[bot]
2026-03-27 16:35:07 +00:00
Author: https://github.com/awesomekling Commit: https://github.com/LadybirdBrowser/ladybird/commit/d7bf9d3898c Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/8612 Reviewed-by: https://github.com/jdahlin Reviewed-by: https://github.com/trflynn89
@@ -1,21 +1,15 @@
|
||||
set(SOURCES
|
||||
RegexByteCode.cpp
|
||||
RegexLexer.cpp
|
||||
RegexMatcher.cpp
|
||||
RegexOptimizer.cpp
|
||||
RegexParser.cpp
|
||||
)
|
||||
|
||||
if(SERENITYOS)
|
||||
list(APPEND SOURCES C/Regex.cpp)
|
||||
if (NOT ENABLE_RUST)
|
||||
message(FATAL_ERROR "LibRegex requires ENABLE_RUST; the legacy C++ regex engine has been removed")
|
||||
endif()
|
||||
|
||||
set(SOURCES
|
||||
ECMAScriptRegex.cpp
|
||||
RustRegex.cpp
|
||||
)
|
||||
|
||||
ladybird_lib(LibRegex regex EXPLICIT_SYMBOL_EXPORT)
|
||||
target_link_libraries(LibRegex PRIVATE LibUnicode)
|
||||
|
||||
if (ENABLE_RUST)
|
||||
target_sources(LibRegex PRIVATE RustRegex.cpp)
|
||||
import_rust_crate(MANIFEST_PATH Rust/Cargo.toml CRATE_NAME libregex_rust)
|
||||
target_link_libraries(LibRegex PRIVATE libregex_rust)
|
||||
target_compile_definitions(LibRegex PRIVATE ENABLE_RUST)
|
||||
endif()
|
||||
import_rust_crate(MANIFEST_PATH Rust/Cargo.toml CRATE_NAME libregex_rust)
|
||||
target_link_libraries(LibRegex PRIVATE libregex_rust)
|
||||
target_compile_definitions(LibRegex PRIVATE ENABLE_RUST)
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Types.h>
|
||||
#include <LibRegex/Export.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
struct CompareTypeAndValuePair;
|
||||
|
||||
enum class Error : u8;
|
||||
class Lexer;
|
||||
class PosixExtendedParser;
|
||||
class ECMA262Parser;
|
||||
|
||||
class ByteCode;
|
||||
|
||||
class RegexStringView;
|
||||
|
||||
}
|
||||
|
||||
using regex::ECMA262Parser;
|
||||
using regex::Lexer;
|
||||
using regex::PosixExtendedParser;
|
||||
using regex::RegexStringView;
|
||||
@@ -1,10 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <LibRegex/Forward.h>
|
||||
#include <LibRegex/RegexMatcher.h>
|
||||
@@ -1,524 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include "RegexByteCode.h"
|
||||
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Returns the enumerator name of `result` as a string view.
// The case list is generated from ENUMERATE_EXECUTION_RESULTS; any value outside
// that list trips VERIFY_NOT_REACHED().
StringView execution_result_name(ExecutionResult result)
{
    switch (result) {
#define __ENUMERATE_EXECUTION_RESULT(name) \
    case ExecutionResult::name:            \
        return #name##sv;
        ENUMERATE_EXECUTION_RESULTS
#undef __ENUMERATE_EXECUTION_RESULT
    default:
        break;
    }

    VERIFY_NOT_REACHED();
    return "<Unknown>"sv;
}
|
||||
|
||||
// Returns the enumerator name of `opcode` as a string view.
// The case list is generated from ENUMERATE_OPCODES; any value outside that list
// trips VERIFY_NOT_REACHED().
StringView opcode_id_name(OpCodeId opcode)
{
    switch (opcode) {
#define __ENUMERATE_OPCODE(name) \
    case OpCodeId::name:         \
        return #name##sv;

        ENUMERATE_OPCODES

#undef __ENUMERATE_OPCODE
    default:
        break;
    }

    VERIFY_NOT_REACHED();
    return "<Unknown>"sv;
}
|
||||
|
||||
// Returns the enumerator name of `condition` as a string view.
// Unlike the sibling *_name helpers, a value outside ENUMERATE_FORK_IF_CONDITIONS does
// not assert; it yields "<Unknown>".
StringView fork_if_condition_name(ForkIfCondition condition)
{
    switch (condition) {
#define __ENUMERATE_FORK_IF_CONDITION(name) \
    case ForkIfCondition::name:             \
        return #name##sv;
        ENUMERATE_FORK_IF_CONDITIONS
#undef __ENUMERATE_FORK_IF_CONDITION
    default:
        break;
    }

    return "<Unknown>"sv;
}
|
||||
|
||||
// Returns the enumerator name of `ty` as a string view.
// The case list is generated from ENUMERATE_BOUNDARY_CHECK_TYPES; any value outside
// that list trips VERIFY_NOT_REACHED().
StringView boundary_check_type_name(BoundaryCheckType ty)
{
    switch (ty) {
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(name) \
    case BoundaryCheckType::name:             \
        return #name##sv;
        ENUMERATE_BOUNDARY_CHECK_TYPES
#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
    default:
        break;
    }

    VERIFY_NOT_REACHED();
    return "<Unknown>"sv;
}
|
||||
|
||||
// Returns the enumerator name of `ch_compare_type` as a string view.
// The case list is generated from ENUMERATE_CHARACTER_COMPARE_TYPES; any value outside
// that list trips VERIFY_NOT_REACHED().
StringView character_compare_type_name(CharacterCompareType ch_compare_type)
{
    switch (ch_compare_type) {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(name) \
    case CharacterCompareType::name:             \
        return #name##sv;
        ENUMERATE_CHARACTER_COMPARE_TYPES
#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
    default:
        break;
    }

    VERIFY_NOT_REACHED();
    return "<Unknown>"sv;
}
|
||||
|
||||
// Returns the enumerator name of `ch_class` as a string view.
// The case list is generated from ENUMERATE_CHARACTER_CLASSES; any value outside
// that list trips VERIFY_NOT_REACHED().
StringView character_class_name(CharClass ch_class)
{
    switch (ch_class) {
#define __ENUMERATE_CHARACTER_CLASS(name) \
    case CharClass::name:                 \
        return #name##sv;
        ENUMERATE_CHARACTER_CLASSES
#undef __ENUMERATE_CHARACTER_CLASS
    default:
        break;
    }

    VERIFY_NOT_REACHED();
    return "<Unknown>"sv;
}
|
||||
|
||||
// Decides whether `code_point` counts as a "word" character.
// ASCII letters, digits and '_' always qualify. When both `case_insensitive` and
// `unicode_mode` are set, a code point also qualifies if Unicode::canonicalize()
// maps it onto one of those.
static bool is_word_character(u32 code_point, bool case_insensitive, bool unicode_mode)
{
    auto const is_basic_word_character = [](u32 candidate) {
        return is_ascii_alphanumeric(candidate) || candidate == '_';
    };

    if (is_basic_word_character(code_point))
        return true;

    // Canonicalization is only consulted for case-insensitive unicode matching.
    if (!case_insensitive || !unicode_mode)
        return false;

    return is_basic_word_character(Unicode::canonicalize(code_point, unicode_mode));
}
|
||||
|
||||
// Definition of ByteCode's static checkpoint serial counter, starting at 0.
// NOTE(review): the code that advances it is not visible in this file.
size_t ByteCode::s_next_checkpoint_serial_id { 0 };
|
||||
// Next string-table serial, starting at 1. Not declared static here, so it has
// external linkage; NOTE(review): presumably consumed by the string table code
// in the header - confirm.
u32 s_next_string_table_serial { 1 };
|
||||
// Next serial handed to a StringSetTable by its constructors; file-local, starts at 1.
static u32 s_next_string_set_table_serial { 1 };
|
||||
|
||||
StringSetTable::StringSetTable()
|
||||
: m_serial(s_next_string_set_table_serial++)
|
||||
{
|
||||
}
|
||||
|
||||
// Hands the serial back to the counter, but only when this table holds the most
// recently issued serial and its u8 trie map is empty.
StringSetTable::~StringSetTable()
{
    bool const holds_latest_serial = m_serial == s_next_string_set_table_serial - 1;
    if (!holds_latest_serial)
        return;

    if (m_u8_tries.is_empty())
        --s_next_string_set_table_serial;
}
|
||||
|
||||
StringSetTable::StringSetTable(StringSetTable const& other)
|
||||
: m_serial(s_next_string_set_table_serial++)
|
||||
{
|
||||
for (auto const& entry : other.m_u8_tries)
|
||||
m_u8_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
|
||||
for (auto const& entry : other.m_u16_tries)
|
||||
m_u16_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
|
||||
}
|
||||
|
||||
// Copy assignment: replaces both trie maps with deep copies of `other`'s tries.
// The serial of this table is left untouched. Self-assignment is a no-op.
StringSetTable& StringSetTable::operator=(StringSetTable const& other)
{
    if (this == &other)
        return *this;

    m_u8_tries.clear();
    m_u16_tries.clear();

    // deep_copy() is invoked through a non-const reference, hence the const_cast.
    for (auto const& source : other.m_u8_tries) {
        auto& trie = const_cast<StringSetTrie&>(source.value);
        m_u8_tries.set(source.key, MUST(trie.deep_copy()));
    }

    for (auto const& source : other.m_u16_tries) {
        auto& trie = const_cast<StringSetTrie&>(source.value);
        m_u16_tries.set(source.key, MUST(trie.deep_copy()));
    }

    return *this;
}
|
||||
|
||||
// Tests whether code point `ch` belongs to `character_class`.
// Most classes are plain ASCII predicates. `insensitive` widens Lower/Upper to accept
// the other ASCII case, and both flags are forwarded to the Word check.
// Space accepts a fixed list of code points plus anything in the Unicode
// space-separator general category.
bool matches_character_class(CharClass character_class, u32 ch, bool insensitive, bool unicode_mode)
{
    switch (character_class) {
    case CharClass::Alnum:
        return is_ascii_alphanumeric(ch);
    case CharClass::Alpha:
        return is_ascii_alpha(ch);
    case CharClass::Blank:
        return is_ascii_blank(ch);
    case CharClass::Cntrl:
        return is_ascii_control(ch);
    case CharClass::Digit:
        return is_ascii_digit(ch);
    case CharClass::Graph:
        return is_ascii_graphical(ch);
    case CharClass::Lower:
        if (is_ascii_lower_alpha(ch))
            return true;
        return insensitive && is_ascii_upper_alpha(ch);
    case CharClass::Print:
        return is_ascii_printable(ch);
    case CharClass::Punct:
        return is_ascii_punctuation(ch);
    case CharClass::Space:
        switch (ch) {
        // Line terminators.
        case 0x0a:
        case 0x0d:
        case 0x2028:
        case 0x2029:
        // Tab, vertical tab, form feed and U+FEFF.
        case 0x09:
        case 0x0b:
        case 0x0c:
        case 0xfeff:
            return true;
        default:
            return Unicode::code_point_has_space_separator_general_category(ch);
        }
    case CharClass::Upper:
        if (is_ascii_upper_alpha(ch))
            return true;
        return insensitive && is_ascii_lower_alpha(ch);
    case CharClass::Word:
        return is_word_character(ch, insensitive, unicode_mode);
    case CharClass::Xdigit:
        return is_ascii_hex_digit(ch);
    }

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
// Renders the operands of the instruction at `ip` as text for debug dumps.
//
// `data` is the flat bytecode; operand N of the instruction is data[ip + 1 + N].
// `state` supplies the string position and repetition counters echoed by some opcodes.
// `bytecode` is used to resolve string-table indices for CompareSimple string operands.
// Opcodes without operands produce an empty string. An opcode that has no case below
// reaches VERIFY_NOT_REACHED().
ByteString opcode_arguments_string(OpCodeId id, ByteCodeValueType const* data, size_t ip, MatchState const& state, ByteCodeBase const& bytecode)
{
    // argument(N) = data[ip + 1 + N]
    auto arg = [&](size_t n) -> ByteCodeValueType { return data[ip + 1 + n]; };
    auto sz = opcode_size(id, data, ip);

    // Jump-like opcodes store a signed offset in operand 0, measured from the end of
    // the current instruction.
    auto relative_offset = [&] { return static_cast<ssize_t>(arg(0)); };
    auto jump_target = [&] { return ip + sz + static_cast<ssize_t>(arg(0)); };

    switch (id) {
    case OpCodeId::SaveModifiers:
        return ByteString::formatted("new_modifiers={:#x}", arg(0));
    case OpCodeId::RestoreModifiers:
    case OpCodeId::Exit:
    case OpCodeId::FailForks:
    case OpCodeId::PopSaved:
    case OpCodeId::Save:
    case OpCodeId::Restore:
    case OpCodeId::CheckBegin:
    case OpCodeId::CheckEnd:
        return ByteString::empty();
    case OpCodeId::GoBack:
        return ByteString::formatted("count={}", arg(0));
    case OpCodeId::SetStepBack:
        return ByteString::formatted("step={}", static_cast<i64>(arg(0)));
    case OpCodeId::IncStepBack:
        return ByteString::formatted("inc step back");
    case OpCodeId::CheckStepBack:
        return ByteString::formatted("check step back");
    case OpCodeId::CheckSavedPosition:
        return ByteString::formatted("check saved back");
    case OpCodeId::Jump:
        return ByteString::formatted("offset={} [&{}]", relative_offset(), jump_target());
    case OpCodeId::ForkJump:
    case OpCodeId::ForkReplaceJump:
    case OpCodeId::ForkStay:
    case OpCodeId::ForkReplaceStay:
        // All four fork flavours share the same operand layout and rendering.
        return ByteString::formatted("offset={} [&{}], sp: {}", relative_offset(), jump_target(), state.string_position);
    case OpCodeId::CheckBoundary:
        return ByteString::formatted("kind={} ({})", static_cast<unsigned long>(arg(0)), boundary_check_type_name(static_cast<BoundaryCheckType>(arg(0))));
    case OpCodeId::ClearCaptureGroup:
    case OpCodeId::SaveLeftCaptureGroup:
    case OpCodeId::SaveRightCaptureGroup:
    case OpCodeId::Checkpoint:
        return ByteString::formatted("id={}", arg(0));
    case OpCodeId::FailIfEmpty:
        return ByteString::formatted("checkpoint={}", arg(0));
    case OpCodeId::SaveRightNamedCaptureGroup:
        return ByteString::formatted("name_id={}, id={}", arg(0), arg(1));
    case OpCodeId::RSeekTo: {
        auto ch = arg(0);
        // Print printable ASCII as the character itself (formatting the raw operand
        // would print its decimal value inside the quotes). Everything else, including
        // ASCII control characters, is shown as a hex code point.
        if (ch <= 0x7f && is_ascii_printable(ch))
            return ByteString::formatted("before '{:c}'", static_cast<char>(ch));
        return ByteString::formatted("before u+{:04x}", ch);
    }
    case OpCodeId::Compare:
        return ByteString::formatted("argc={}, args={} ", arg(0), arg(1));
    case OpCodeId::CompareSimple: {
        StringBuilder builder;
        auto type = static_cast<CharacterCompareType>(arg(1));
        builder.append(character_compare_type_name(type));

        // Appends `count` CharRange operands starting at operand index `first`.
        auto append_ranges = [&](size_t first, size_t count) {
            for (size_t j = 0; j < count; ++j) {
                auto range = static_cast<CharRange>(arg(first + j));
                builder.appendff(" {:x}-{:x}", range.from, range.to);
            }
        };

        switch (type) {
        case CharacterCompareType::Char: {
            auto ch = arg(2);
            if (is_ascii_printable(ch))
                builder.appendff(" '{:c}'", static_cast<char>(ch));
            else
                builder.appendff(" 0x{:x}", ch);
            break;
        }
        case CharacterCompareType::String: {
            auto string_index = arg(2);
            auto string = bytecode.get_u16_string(string_index);
            builder.appendff(" \"{}\"", string);
            break;
        }
        case CharacterCompareType::CharClass: {
            auto character_class = static_cast<CharClass>(arg(2));
            builder.appendff(" {}", character_class_name(character_class));
            break;
        }
        case CharacterCompareType::Reference:
            builder.appendff(" number={}", arg(2));
            break;
        case CharacterCompareType::NamedReference:
            builder.appendff(" named_number={}", arg(2));
            break;
        case CharacterCompareType::GeneralCategory:
        case CharacterCompareType::Property:
        case CharacterCompareType::Script:
        case CharacterCompareType::ScriptExtension:
        case CharacterCompareType::StringSet:
            builder.appendff(" value={}", arg(2));
            break;
        case CharacterCompareType::LookupTable: {
            // Layout: sensitive count, insensitive count, sensitive ranges, insensitive ranges.
            auto count_sensitive = arg(2);
            auto count_insensitive = arg(3);
            append_ranges(4, count_sensitive);
            if (count_insensitive > 0) {
                builder.append(" [insensitive ranges:"sv);
                append_ranges(4 + count_sensitive, count_insensitive);
                builder.append(" ]"sv);
            }
            break;
        }
        case CharacterCompareType::CharRange:
            append_ranges(2, 1);
            break;
        default:
            break;
        }
        return builder.to_byte_string();
    }
    case OpCodeId::Repeat: {
        auto repeat_id = arg(2);
        // A repeat id with no recorded mark yet is reported as zero repetitions.
        auto reps = repeat_id < state.repetition_marks.size() ? state.repetition_marks.at(repeat_id) : 0;
        return ByteString::formatted("offset={} [&{}] count={} id={} rep={}, sp: {}",
            static_cast<ssize_t>(arg(0)),
            ip - arg(0),
            arg(1) + 1,
            repeat_id,
            reps + 1,
            state.string_position);
    }
    case OpCodeId::ResetRepeat: {
        auto repeat_id = arg(0);
        auto reps = repeat_id < state.repetition_marks.size() ? state.repetition_marks.at(repeat_id) : 0;
        return ByteString::formatted("id={} rep={}", repeat_id, reps + 1);
    }
    case OpCodeId::JumpNonEmpty:
        return ByteString::formatted("{} offset={} [&{}], cp={}",
            opcode_id_name(static_cast<OpCodeId>(arg(2))),
            relative_offset(), jump_target(),
            arg(1));
    case OpCodeId::ForkIf:
        return ByteString::formatted("{} {} offset={} [&{}]",
            opcode_id_name(static_cast<OpCodeId>(arg(1))),
            fork_if_condition_name(static_cast<ForkIfCondition>(arg(2))),
            relative_offset(), jump_target());
    }

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
// Produces debug lines describing every argument of the Compare instruction at `ip`.
//
// The argument count is read from data[ip + 1] and the argument stream starts at
// data[ip + 3]. Each argument contributes a "type=..." line followed by type-specific
// detail lines. When `input` is provided, the lines also show the part of the subject
// string (or the capture group) the argument would be compared against.
Vector<ByteString> compare_variable_arguments_to_byte_string(ByteCodeValueType const* data, size_t ip, MatchState const& state, ByteCodeBase const& bytecode, Optional<MatchInput const&> input)
{
    Vector<ByteString> lines;

    RegexStringView const& view = input.has_value() ? input.value().view : StringView {};
    auto const argument_count = data[ip + 1]; // arguments_count for Compare
    size_t cursor = ip + 3;

    for (size_t i = 0; i < argument_count; ++i) {
        auto const compare_type = static_cast<CharacterCompareType>(data[cursor++]);
        lines.empend(ByteString::formatted("type={} [{}]", static_cast<size_t>(compare_type), character_compare_type_name(compare_type)));

        auto const match_start = state.string_position_before_match;

        // True when there is a subject string and the current position lies inside it.
        auto const subject_available = [&] {
            return !view.is_null() && view.length() > state.string_position;
        };

        // Emits the single unit of the subject located at the match start.
        // (Under the guard above the requested length is always 1.)
        auto const append_single_unit_subject = [&] {
            if (!subject_available())
                return;
            lines.empend(ByteString::formatted(
                " compare against: '{}'",
                view.substring_view(match_start, 1).to_byte_string()));
        };

        auto const append_ranges = [&](size_t count) {
            for (size_t j = 0; j < count; ++j) {
                auto const range = static_cast<CharRange>(data[cursor++]);
                lines.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to));
            }
        };

        switch (compare_type) {
        case CharacterCompareType::Char: {
            auto const ch = data[cursor++];
            bool const printable = is_ascii_printable(ch);
            if (printable)
                lines.empend(ByteString::formatted(" value='{:c}'", static_cast<char>(ch)));
            else
                lines.empend(ByteString::formatted(" value={:x}", ch));

            // Note: this branch guards on the match start rather than the current position.
            if (view.is_null() || view.length() <= match_start)
                break;

            auto const subject = view.substring_view(match_start, 1).to_byte_string();
            if (printable) {
                lines.empend(ByteString::formatted(" compare against: '{}'", subject));
                break;
            }

            // Non-printable operand: dump up to eight raw bytes of the subject instead.
            u8 raw[8] { 0 };
            __builtin_memcpy(raw, subject.characters(), min(subject.length(), sizeof(raw)));
            lines.empend(ByteString::formatted(" compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
                raw[0], raw[1], raw[2], raw[3], raw[4], raw[5], raw[6], raw[7]));
            break;
        }
        case CharacterCompareType::Reference: {
            auto const ref = data[cursor++];
            lines.empend(ByteString::formatted(" number={}", ref));
            if (!input.has_value())
                break;

            if (state.capture_group_matches_size() <= input->match_index) {
                lines.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state.capture_group_matches_size() - 1));
                break;
            }

            auto match = state.capture_group_matches(input->match_index);
            if (match.size() <= ref) {
                lines.empend(ByteString::formatted(" (invalid ref, max={})", match.size() - 1));
                break;
            }

            auto& group = match[ref];
            lines.empend(ByteString::formatted(" left={}", group.left_column));
            lines.empend(ByteString::formatted(" right={}", group.left_column + group.view.length_in_code_units()));
            lines.empend(ByteString::formatted(" contents='{}'", group.view));
            break;
        }
        case CharacterCompareType::NamedReference: {
            auto const ref = data[cursor++];
            lines.empend(ByteString::formatted(" named_number={}", ref));
            if (!input.has_value())
                break;

            if (state.capture_group_matches_size() <= input->match_index) {
                lines.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state.capture_group_matches_size() - 1));
                break;
            }

            auto match = state.capture_group_matches(input->match_index);
            if (match.size() <= ref) {
                lines.empend(ByteString::formatted(" (invalid ref {}, max={})", ref, match.size() - 1));
                break;
            }

            auto& group = match[ref];
            lines.empend(ByteString::formatted(" left={}", group.left_column));
            lines.empend(ByteString::formatted(" right={}", group.left_column + group.view.length_in_code_units()));
            lines.empend(ByteString::formatted(" contents='{}'", group.view));
            break;
        }
        case CharacterCompareType::String: {
            auto const str_id = data[cursor++];
            auto string = bytecode.get_u16_string(str_id);
            lines.empend(ByteString::formatted(" value=\"{}\"", string));
            if (!subject_available())
                break;

            // Show nothing rather than a truncated slice when the needle would overrun the subject.
            auto const needle_length = string.length_in_code_units();
            auto const shown_length = match_start + needle_length > view.length() ? 0 : needle_length;
            lines.empend(ByteString::formatted(
                " compare against: \"{}\"",
                input.value().view.substring_view(match_start, shown_length).to_byte_string()));
            break;
        }
        case CharacterCompareType::CharClass: {
            auto const character_class = static_cast<CharClass>(data[cursor++]);
            lines.empend(ByteString::formatted(" ch_class={} [{}]", static_cast<size_t>(character_class), character_class_name(character_class)));
            append_single_unit_subject();
            break;
        }
        case CharacterCompareType::CharRange: {
            auto const value = static_cast<CharRange>(data[cursor++]);
            lines.empend(ByteString::formatted(" ch_range={:x}-{:x}", value.from, value.to));
            append_single_unit_subject();
            break;
        }
        case CharacterCompareType::LookupTable: {
            // Layout: sensitive count, insensitive count, sensitive ranges, insensitive ranges.
            auto const count_sensitive = data[cursor++];
            auto const count_insensitive = data[cursor++];
            append_ranges(count_sensitive);
            if (count_insensitive > 0) {
                lines.append(" [insensitive ranges:");
                append_ranges(count_insensitive);
                lines.append(" ]");
            }
            append_single_unit_subject();
            break;
        }
        case CharacterCompareType::GeneralCategory:
        case CharacterCompareType::Property:
        case CharacterCompareType::Script:
        case CharacterCompareType::ScriptExtension:
        case CharacterCompareType::StringSet: {
            auto const value = data[cursor++];
            lines.empend(ByteString::formatted(" value={}", value));
            break;
        }
        default:
            // Remaining compare types carry no operand and get no detail lines.
            break;
        }
    }

    return lines;
}
|
||||
|
||||
// Decodes the compare arguments of the instruction at `ip` into (type, value) pairs.
//
// With `is_simple` set there is exactly one argument and the stream starts at
// data[ip + 2]; otherwise the count is read from the Compare instruction and the stream
// starts at data[ip + 3]. A LookupTable argument is expanded into one CharRange pair per
// case-sensitive range, and its case-insensitive ranges are skipped. Compare types
// without an operand are recorded with a value of 0.
Vector<CompareTypeAndValuePair> flat_compares_at(ByteCodeValueType const* data, size_t ip, bool is_simple)
{
    Vector<CompareTypeAndValuePair> pairs;

    auto argument_count = is_simple ? 1 : data[ip + OpArgs::Compare::arguments_count];
    size_t cursor = ip + (is_simple ? 2 : 3);

    for (size_t i = 0; i < argument_count; ++i) {
        auto compare_type = static_cast<CharacterCompareType>(data[cursor++]);

        switch (compare_type) {
        // Every one of these carries exactly one operand, stored verbatim.
        case CharacterCompareType::Char:
        case CharacterCompareType::Reference:
        case CharacterCompareType::NamedReference:
        case CharacterCompareType::String:
        case CharacterCompareType::CharClass:
        case CharacterCompareType::CharRange:
        case CharacterCompareType::GeneralCategory:
        case CharacterCompareType::Property:
        case CharacterCompareType::Script:
        case CharacterCompareType::ScriptExtension:
        case CharacterCompareType::StringSet: {
            auto operand = data[cursor++];
            pairs.append({ compare_type, operand });
            break;
        }
        case CharacterCompareType::LookupTable: {
            auto count_sensitive = data[cursor++];
            auto count_insensitive = data[cursor++];
            for (size_t j = 0; j < count_sensitive; ++j)
                pairs.append({ CharacterCompareType::CharRange, data[cursor++] });
            cursor += count_insensitive; // Skip insensitive ranges
            break;
        }
        default:
            pairs.append({ compare_type, 0 });
            break;
        }
    }

    return pairs;
}
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,21 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2021, Ali Mohammad Pur <mpfard@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Forward.h"
|
||||
#include <AK/Vector.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Static-only helper interface of the bytecode optimizer. Only declarations are visible
// here; NOTE(review): behaviour described below is inferred from the signatures alone -
// confirm against the implementation file.
class Optimizer {
|
||||
public:
|
||||
// Two-way alternation: takes ownership of `left` and `right` (rvalue references) and
// writes into `target`.
static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
|
||||
// N-way overload of the above, taking the alternatives as a span.
static void append_alternation(ByteCode& target, Span<ByteCode> alternatives);
|
||||
// Consumes `pairs` (rvalue reference) and writes into `target`.
static void append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs);
|
||||
};
|
||||
|
||||
}
|
||||
@@ -1,173 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <LibRegex/RegexMatcher.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
template<typename ByteCode>
|
||||
class RegexDebug {
|
||||
public:
|
||||
RegexDebug(FILE* file = stdout)
|
||||
: m_file(file)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~RegexDebug() = default;
|
||||
|
||||
template<typename T>
|
||||
void print_raw_bytecode(Regex<T>& regex) const
|
||||
{
|
||||
auto& bytecode = regex.parser_result.bytecode.template get<ByteCode>();
|
||||
size_t index { 0 };
|
||||
for (auto& value : bytecode) {
|
||||
outln(m_file, "OpCode i={:3} [{:#02X}]", index, value);
|
||||
++index;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void print_bytecode(Regex<T> const& regex) const
|
||||
{
|
||||
print_bytecode(regex.parser_result.bytecode.template get<ByteCode>());
|
||||
}
|
||||
|
||||
void print_bytecode(ByteCode const& bytecode) const
|
||||
{
|
||||
auto state = MatchState::only_for_enumeration();
|
||||
ByteCodeValueType const* data;
|
||||
auto data_size = bytecode.size();
|
||||
Optional<Vector<ByteCodeValueType>> flat_storage;
|
||||
|
||||
if constexpr (IsSame<ByteCode, FlatByteCode>) {
|
||||
data = bytecode.flat_data().data();
|
||||
} else {
|
||||
flat_storage.emplace();
|
||||
flat_storage->ensure_capacity(data_size);
|
||||
for (size_t i = 0; i < data_size; ++i)
|
||||
flat_storage->unchecked_append(bytecode[i]);
|
||||
data = flat_storage->data();
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
auto id = (data_size <= state.instruction_position)
|
||||
? OpCodeId::Exit
|
||||
: static_cast<OpCodeId>(data[state.instruction_position]);
|
||||
auto sz = opcode_size(id, data, state.instruction_position);
|
||||
print_opcode("PrintBytecode", id, data, state, bytecode);
|
||||
out(m_file, "{}", m_debug_stripline);
|
||||
|
||||
if (id == OpCodeId::Exit)
|
||||
break;
|
||||
|
||||
state.instruction_position += sz;
|
||||
}
|
||||
|
||||
out(m_file, "String Table:\n");
|
||||
for (auto const& entry : bytecode.string_table().m_table)
|
||||
outln(m_file, "+ {} -> {:x}", entry.key, entry.value);
|
||||
out(m_file, "Reverse String Table:\n");
|
||||
for (auto const& entry : bytecode.string_table().m_inverse_table)
|
||||
outln(m_file, "+ {:x} -> {}", entry.key, entry.value);
|
||||
|
||||
out(m_file, "(u16) String Table:\n");
|
||||
for (auto const& entry : bytecode.u16_string_table().m_table)
|
||||
outln(m_file, "+ {} -> {:x}", entry.key, entry.value);
|
||||
out(m_file, "Reverse (u16) String Table:\n");
|
||||
for (auto const& entry : bytecode.u16_string_table().m_inverse_table)
|
||||
outln(m_file, "+ {:x} -> {}", entry.key, entry.value);
|
||||
|
||||
fflush(m_file);
|
||||
}
|
||||
|
||||
void print_opcode(ByteString const& system, OpCodeId id, ByteCodeValueType const* data, MatchState& state, ByteCodeBase const& bytecode, size_t recursion = 0, bool newline = true) const
|
||||
{
|
||||
auto opcode_str = ByteString::formatted("[{:#02X}] {}", (int)id, opcode_id_name(id));
|
||||
out(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}",
|
||||
system.characters(),
|
||||
state.instruction_position,
|
||||
recursion,
|
||||
opcode_str.characters(),
|
||||
opcode_arguments_string(id, data, state.instruction_position, state, bytecode).characters(),
|
||||
ByteString::formatted("ip: {:3}, sp: {:3}", state.instruction_position, state.string_position));
|
||||
if (newline)
|
||||
outln();
|
||||
if (newline && id == OpCodeId::Compare) {
|
||||
for (auto& line : compare_variable_arguments_to_byte_string(data, state.instruction_position, state, bytecode))
|
||||
outln(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}", "", "", "", "", line, "");
|
||||
}
|
||||
}
|
||||
|
||||
void print_result(OpCodeId id, ByteCodeValueType const* data, size_t data_size, ByteCodeBase const& bytecode, MatchInput const& input, MatchState& state, size_t current_opcode_size, ExecutionResult result) const
|
||||
{
|
||||
StringBuilder builder;
|
||||
builder.append(execution_result_name(result));
|
||||
builder.appendff(", fc: {}, ss: {}", input.fail_counter, input.saved_positions.size());
|
||||
if (result == ExecutionResult::Succeeded) {
|
||||
builder.appendff(", ip: {}/{}, sp: {}/{}", state.instruction_position, data_size - 1, state.string_position, input.view.length() - 1);
|
||||
} else if (result == ExecutionResult::Fork_PrioHigh) {
|
||||
builder.appendff(", next ip: {}", state.fork_at_position + current_opcode_size);
|
||||
} else if (result != ExecutionResult::Failed) {
|
||||
builder.appendff(", next ip: {}", state.instruction_position + current_opcode_size);
|
||||
}
|
||||
|
||||
outln(m_file, " | {:20}", builder.to_byte_string());
|
||||
|
||||
if (id == OpCodeId::CheckSavedPosition) {
|
||||
auto last_saved = input.saved_positions.is_empty()
|
||||
? "saved: <empty>"_string
|
||||
: MUST(String::formatted("saved: {}", input.saved_positions.last()));
|
||||
outln(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}", "", "", "", "", last_saved, "");
|
||||
}
|
||||
if (id == OpCodeId::CheckStepBack || id == OpCodeId::IncStepBack) {
|
||||
auto last_step_back = state.step_backs.is_empty()
|
||||
? "step: <empty>"_string
|
||||
: MUST(String::formatted("step: {}", state.step_backs.last()));
|
||||
outln(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}", "", "", "", "", last_step_back, "");
|
||||
}
|
||||
|
||||
if (id == OpCodeId::Compare) {
|
||||
for (auto& line : compare_variable_arguments_to_byte_string(data, state.instruction_position, state, bytecode, input)) {
|
||||
outln(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}", "", "", "", "", line, "");
|
||||
}
|
||||
}
|
||||
|
||||
out(m_file, "{}", m_debug_stripline);
|
||||
}
|
||||
|
||||
void print_header()
|
||||
{
|
||||
StringBuilder builder;
|
||||
builder.appendff("{:15} | {:5} | {:9} | {:35} | {:30} | {:20} | {:20}\n", "System", "Index", "Recursion", "OpCode", "Arguments", "State", "Result");
|
||||
auto length = builder.length();
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
builder.append('=');
|
||||
}
|
||||
auto str = builder.to_byte_string();
|
||||
VERIFY(!str.is_empty());
|
||||
|
||||
outln(m_file, "{}", str);
|
||||
fflush(m_file);
|
||||
|
||||
builder.clear();
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
builder.append('-');
|
||||
}
|
||||
builder.append('\n');
|
||||
m_debug_stripline = builder.to_byte_string();
|
||||
}
|
||||
|
||||
private:
|
||||
ByteString m_debug_stripline;
|
||||
FILE* m_file;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
using regex::RegexDebug;
|
||||
@@ -1,56 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* Copyright (c) 2020-2022, Ali Mohammad Pur <mpfard@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// Plain (unscoped) error codes for the regex engine. Values are assigned implicitly in
// declaration order starting at 0, so new entries must be appended rather than inserted.
enum __Regex_Error {
    __Regex_NoError,
    __Regex_InvalidPattern,               // Invalid regular expression.
    __Regex_InvalidCollationElement,      // Invalid collating element referenced.
    __Regex_InvalidCharacterClass,        // Invalid character class type referenced.
    __Regex_InvalidTrailingEscape,        // Trailing \ in pattern.
    __Regex_InvalidNumber,                // Number in \digit invalid or in error.
    __Regex_MismatchingBracket,           // [ ] imbalance.
    __Regex_MismatchingParen,             // ( ) imbalance.
    __Regex_MismatchingBrace,             // { } imbalance.
    __Regex_InvalidBraceContent,          // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
    __Regex_InvalidBracketContent,        // Content of [] invalid.
    __Regex_InvalidRange,                 // Invalid endpoint in range expression.
    __Regex_InvalidRepetitionMarker,      // ?, * or + not preceded by valid regular expression.
    __Regex_ReachedMaxRecursion,          // MaximumRecursion has been reached.
    __Regex_EmptySubExpression,           // Sub expression has empty content.
    __Regex_InvalidCaptureGroup,          // Content of capture group is invalid.
    __Regex_InvalidNameForCaptureGroup,   // Name of capture group is invalid.
    __Regex_InvalidNameForProperty,       // Name of property is invalid.
    __Regex_DuplicateNamedCapture,        // Duplicate named capture group.
    __Regex_InvalidCharacterClassEscape,  // Invalid escaped entity in character class.
    __Regex_NegatedCharacterClassStrings, // Negated character class cannot contain strings.
    __Regex_InvalidModifierGroup,         // Invalid modifier group.
    __Regex_RepeatedModifierFlag,         // Repeated flag in modifier group.
};
|
||||
|
||||
// Option bits for the regex engine. Every enumerator is a distinct single bit so that
// flags can be OR-ed together; bit 9 is intentionally left unassigned here.
enum __RegexAllFlags {
    __Regex_Global = 1 << 0,                        // Find all matches instead of returning after the first one.
    __Regex_Insensitive = 1 << 1,                   // Case-insensitive matching (ignores case of [a-zA-Z]).
    __Regex_Ungreedy = 1 << 2,                      // Quantifiers are lazy by default; a trailing ? makes them greedy.
    __Regex_Unicode = 1 << 3,                       // Enable all unicode features and interpret unicode escape sequences as such.
    __Regex_Extended = 1 << 4,                      // Ignore whitespace; spaces and text after a # in the pattern are skipped.
    __Regex_Extra = 1 << 5,                         // Disallow meaningless escapes: a \ followed by a letter with no special meaning is an error.
    __Regex_MatchNotBeginOfLine = 1 << 6,           // Pattern is not forced to ^ -> search in whole string!
    __Regex_MatchNotEndOfLine = 1 << 7,             // Don't force $ to always match the end of the string instead of the end of the line. Ignored if the Multiline flag is set.
    __Regex_SkipSubExprResults = 1 << 8,            // Do not return sub expressions in the result.
    __Regex_SingleLine = 1 << 10,                   // Dot matches newline characters.
    __Regex_Sticky = 1 << 11,                       // Only match consecutively, starting where the previous match ended.
    __Regex_Multiline = 1 << 12,                    // Handle newline characters; match each line, one by one.
    __Regex_SingleMatch = 1 << 13,                  // Stop after acquiring a single match.
    __Regex_UnicodeSets = 1 << 14,                  // ECMA262 parser specific: allow set operations in character classes.
    __Regex_Internal_Stateful = 1 << 15,            // Internal flag; enables stateful matches.
    __Regex_Internal_BrowserExtended = 1 << 16,     // Internal flag; enable browser-specific ECMA262 extensions.
    __Regex_Internal_ConsiderNewline = 1 << 17,     // Internal flag; allow matchers to consider newlines as line separators.
    __Regex_Internal_ECMA262DotSemantics = 1 << 18, // Internal flag; ECMA262 semantics for dot ('.'): disallow CR/LF/LS/PS instead of just CR.
    __Regex_Last = __Regex_Internal_ECMA262DotSemantics,
};
|
||||
@@ -1,96 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "RegexDefs.h"
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Types.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Scoped mirror of __Regex_Error: every enumerator takes the value of its
// __Regex_* counterpart, stored in a u8.
enum class Error : u8 {
    NoError = __Regex_NoError,
    InvalidPattern = __Regex_InvalidPattern,                             // Invalid regular expression.
    InvalidCollationElement = __Regex_InvalidCollationElement,           // Invalid collating element referenced.
    InvalidCharacterClass = __Regex_InvalidCharacterClass,               // Invalid character class type referenced.
    InvalidTrailingEscape = __Regex_InvalidTrailingEscape,               // Trailing \ in pattern.
    InvalidNumber = __Regex_InvalidNumber,                               // Number in \digit invalid or in error.
    MismatchingBracket = __Regex_MismatchingBracket,                     // [ ] imbalance.
    MismatchingParen = __Regex_MismatchingParen,                         // ( ) imbalance.
    MismatchingBrace = __Regex_MismatchingBrace,                         // { } imbalance.
    InvalidBraceContent = __Regex_InvalidBraceContent,                   // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
    InvalidBracketContent = __Regex_InvalidBracketContent,               // Content of [] invalid.
    InvalidRange = __Regex_InvalidRange,                                 // Invalid endpoint in range expression.
    InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker,           // ?, * or + not preceded by valid regular expression.
    ReachedMaxRecursion = __Regex_ReachedMaxRecursion,                   // MaximumRecursion has been reached.
    EmptySubExpression = __Regex_EmptySubExpression,                     // Sub expression has empty content.
    InvalidCaptureGroup = __Regex_InvalidCaptureGroup,                   // Content of capture group is invalid.
    InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup,     // Name of capture group is invalid.
    InvalidNameForProperty = __Regex_InvalidNameForProperty,             // Name of property is invalid.
    DuplicateNamedCapture = __Regex_DuplicateNamedCapture,               // Duplicate named capture group.
    InvalidCharacterClassEscape = __Regex_InvalidCharacterClassEscape,   // Invalid escaped entity in character class.
    NegatedCharacterClassStrings = __Regex_NegatedCharacterClassStrings, // Negated character class cannot contain strings.
    InvalidModifierGroup = __Regex_InvalidModifierGroup,                 // Invalid modifier group.
    RepeatedModifierFlag = __Regex_RepeatedModifierFlag,                 // Repeated flag in modifier group.
};
|
||||
|
||||
// Maps an Error to a fixed human-readable message.
// The switch deliberately has no default label; a value that matches none of the
// enumerators falls through to the trailing "Undefined error." return.
inline StringView get_error_string(Error error)
{
    switch (error) {
    case Error::NoError:
        return "No error"sv;
    case Error::InvalidPattern:
        return "Invalid regular expression."sv;
    case Error::InvalidCollationElement:
        return "Invalid collating element referenced."sv;
    case Error::InvalidCharacterClass:
        return "Invalid character class type referenced."sv;
    case Error::InvalidTrailingEscape:
        return "Trailing \\ in pattern."sv;
    case Error::InvalidNumber:
        return "Number in \\digit invalid or in error."sv;
    case Error::MismatchingBracket:
        return "[ ] imbalance."sv;
    case Error::MismatchingParen:
        return "( ) imbalance."sv;
    case Error::MismatchingBrace:
        return "{ } imbalance."sv;
    case Error::InvalidBraceContent:
        return "Content of {} invalid: not a number, number too large, more than two numbers, first larger than second."sv;
    case Error::InvalidBracketContent:
        return "Content of [] invalid."sv;
    case Error::InvalidRange:
        return "Invalid endpoint in range expression."sv;
    case Error::InvalidRepetitionMarker:
        return "?, * or + not preceded by valid regular expression."sv;
    case Error::ReachedMaxRecursion:
        return "Maximum recursion has been reached."sv;
    case Error::EmptySubExpression:
        return "Sub expression has empty content."sv;
    case Error::InvalidCaptureGroup:
        return "Content of capture group is invalid."sv;
    case Error::InvalidNameForCaptureGroup:
        return "Name of capture group is invalid."sv;
    case Error::InvalidNameForProperty:
        return "Name of property is invalid."sv;
    case Error::DuplicateNamedCapture:
        return "Duplicate capture group name"sv;
    case Error::InvalidCharacterClassEscape:
        return "Invalid escaped entity in character class."sv;
    case Error::NegatedCharacterClassStrings:
        return "Negated character class cannot contain strings."sv;
    case Error::InvalidModifierGroup:
        return "Invalid modifier group."sv;
    case Error::RepeatedModifierFlag:
        return "Repeated flag in modifier group."sv;
    }
    return "Undefined error."sv;
}
|
||||
|
||||
}
|
||||
|
||||
using regex::get_error_string;
|
||||
@@ -1,183 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include "RegexLexer.h"
|
||||
#include <AK/Assertions.h>
|
||||
#include <AK/Debug.h>
|
||||
#include <AK/Format.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Returns the identifier of the given token type as a C string (e.g. "LeftParen").
// The case labels are generated by expanding ENUMERATE_REGEX_TOKENS with a local
// __ENUMERATE_REGEX_TOKEN definition, which stringifies each enumerator name.
char const* Token::name(TokenType const type)
{
    switch (type) {
#define __ENUMERATE_REGEX_TOKEN(x) \
    case TokenType::x:             \
        return #x;
        ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
    default:
        // Any value not produced by the macro list is treated as a programming error.
        VERIFY_NOT_REACHED();
        return "<Unknown>";
    }
}
|
||||
|
||||
// Returns the name of this token's own type; forwards to the static overload.
char const* Token::name() const
{
    return name(m_type);
}
|
||||
|
||||
// Constructs a lexer over an empty (default-constructed) StringView.
Lexer::Lexer()
    : GenericLexer(StringView {})
{
}
|
||||
|
||||
// Constructs a lexer that tokenizes the given source view.
Lexer::Lexer(StringView const source)
    : GenericLexer(source)
{
}
|
||||
|
||||
void Lexer::back(size_t offset)
|
||||
{
|
||||
if (offset == m_index + 1)
|
||||
offset = m_index; // 'position == 0' occurs twice.
|
||||
|
||||
VERIFY(offset <= m_index);
|
||||
if (!offset)
|
||||
return;
|
||||
m_index -= offset;
|
||||
m_previous_position = (m_index > 0) ? m_index - 1 : 0;
|
||||
}
|
||||
|
||||
// Consumes one character via GenericLexer::consume(), first recording the index of
// that character in m_previous_position (the assignment must precede the base call).
char Lexer::consume()
{
    m_previous_position = m_index;
    return GenericLexer::consume();
}
|
||||
|
||||
// Rewinds the lexer to the start of its input and resets the current token to Eof.
// The input view itself (m_input) is left untouched.
void Lexer::reset()
{
    m_index = 0;
    m_current_token = { TokenType::Eof, 0, {} };
    m_previous_position = 0;
}
|
||||
|
||||
// Produces the next token from the input.
// Metacharacters become their dedicated single-character token types, a backslash
// followed by a recognised metacharacter becomes a two-character EscapeSequence,
// any other character becomes Char, and an exhausted input yields Eof.
Token Lexer::next()
{
    // Builds a one-character token of the given type at the cursor, then advances past it.
    auto emit_single = [&](TokenType kind) -> Token& {
        m_current_token = Token(kind, m_index, m_input.substring_view(m_index, 1));
        consume();
        return m_current_token;
    };

    // Length (backslash included) of the escape sequence at the cursor, or 0 when the
    // character after the backslash is not one of the recognised metacharacters.
    auto escape_length = [&]() -> size_t {
        switch (peek(1)) {
        case '^':
        case '.':
        case '[':
        case ']':
        case '$':
        case '(':
        case ')':
        case '|':
        case '*':
        case '+':
        case '?':
        case '{':
        case '\\':
            return 2;
        default:
            dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1));
            return 0;
        }
    };

    if (m_index >= m_input.length())
        return Token(TokenType::Eof, m_index, {});

    switch (peek()) {
    case '(':
        return emit_single(TokenType::LeftParen);
    case ')':
        return emit_single(TokenType::RightParen);
    case '{':
        return emit_single(TokenType::LeftCurly);
    case '}':
        return emit_single(TokenType::RightCurly);
    case '[':
        return emit_single(TokenType::LeftBracket);
    case ']':
        return emit_single(TokenType::RightBracket);
    case '.':
        return emit_single(TokenType::Period);
    case '*':
        return emit_single(TokenType::Asterisk);
    case '+':
        return emit_single(TokenType::Plus);
    case '$':
        return emit_single(TokenType::Dollar);
    case '^':
        return emit_single(TokenType::Circumflex);
    case '|':
        return emit_single(TokenType::Pipe);
    case '?':
        return emit_single(TokenType::Questionmark);
    case ',':
        return emit_single(TokenType::Comma);
    case '/':
        return emit_single(TokenType::Slash);
    case '=':
        return emit_single(TokenType::EqualSign);
    case ':':
        return emit_single(TokenType::Colon);
    case '-':
        return emit_single(TokenType::HyphenMinus);
    case '\\': {
        auto const length = escape_length();
        if (length == 0)
            break; // Unrecognised escape: the backslash alone is emitted as a Char below.

        auto const start = m_index;
        for (size_t consumed = 0; consumed < length; ++consumed)
            consume();

        // consume() left m_previous_position on the last character of the sequence.
        VERIFY(m_previous_position + 1 <= m_input.length());
        m_current_token = Token(TokenType::EscapeSequence, start, m_input.substring_view(start, m_previous_position - start + 1));
        return m_current_token;
    }
    default:
        break;
    }

    return emit_single(TokenType::Char);
}
|
||||
|
||||
}
|
||||
@@ -1,86 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/GenericLexer.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <LibRegex/Forward.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// X-macro listing every lexer token kind. Define __ENUMERATE_REGEX_TOKEN(x) before
// expanding this list and #undef it afterwards. The order of the entries determines
// the enumerator order of TokenType.
#define ENUMERATE_REGEX_TOKENS              \
    __ENUMERATE_REGEX_TOKEN(Eof)            \
    __ENUMERATE_REGEX_TOKEN(Char)           \
    __ENUMERATE_REGEX_TOKEN(Circumflex)     \
    __ENUMERATE_REGEX_TOKEN(Period)         \
    __ENUMERATE_REGEX_TOKEN(LeftParen)      \
    __ENUMERATE_REGEX_TOKEN(RightParen)     \
    __ENUMERATE_REGEX_TOKEN(LeftCurly)      \
    __ENUMERATE_REGEX_TOKEN(RightCurly)     \
    __ENUMERATE_REGEX_TOKEN(LeftBracket)    \
    __ENUMERATE_REGEX_TOKEN(RightBracket)   \
    __ENUMERATE_REGEX_TOKEN(Asterisk)       \
    __ENUMERATE_REGEX_TOKEN(EscapeSequence) \
    __ENUMERATE_REGEX_TOKEN(Dollar)         \
    __ENUMERATE_REGEX_TOKEN(Pipe)           \
    __ENUMERATE_REGEX_TOKEN(Plus)           \
    __ENUMERATE_REGEX_TOKEN(Comma)          \
    __ENUMERATE_REGEX_TOKEN(Slash)          \
    __ENUMERATE_REGEX_TOKEN(EqualSign)      \
    __ENUMERATE_REGEX_TOKEN(HyphenMinus)    \
    __ENUMERATE_REGEX_TOKEN(Colon)          \
    __ENUMERATE_REGEX_TOKEN(Questionmark)
|
||||
|
||||
// Token kinds produced by the lexer; the enumerators are generated from ENUMERATE_REGEX_TOKENS.
enum class TokenType {
#define __ENUMERATE_REGEX_TOKEN(x) x,
    ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
};
|
||||
|
||||
// A lexed token: its type, the position in the source where it starts, and a view of its text.
// The value is a non-owning StringView, so it is only valid while the underlying source is.
class Token {
public:
    // A default-constructed token is an Eof token at position 0 with an empty value.
    Token() = default;
    Token(TokenType const type, size_t const start_position, StringView const value)
        : m_type(type)
        , m_position(start_position)
        , m_value(value)
    {
    }

    TokenType type() const { return m_type; }
    StringView value() const { return m_value; }
    size_t position() const { return m_position; }

    // Name of this token's type / of an arbitrary token type, as a C string.
    char const* name() const;
    static char const* name(TokenType);

private:
    TokenType m_type { TokenType::Eof };
    size_t m_position { 0 };
    StringView m_value {};
};
|
||||
|
||||
// Regex pattern lexer built on GenericLexer; turns the source view into Tokens one at a time.
class REGEX_API Lexer : public GenericLexer {
public:
    Lexer();
    explicit Lexer(StringView source);
    // Returns the next token, or an Eof token once the input is exhausted.
    Token next();
    // Rewinds to the start of the input and resets the current token.
    void reset();
    // Moves the cursor back by `offset` characters.
    void back(size_t offset);
    // Wraps GenericLexer::consume() and additionally tracks the previous position.
    char consume();
    // Replaces the input view. The body does not touch the cursor; call reset() to rewind.
    void set_source(StringView const source) { m_input = source; }
    auto const& source() const { return m_input; }

private:
    // Index of the most recently consumed character.
    size_t m_previous_position { 0 };
    Token m_current_token { TokenType::Eof, 0, {} };
};
|
||||
|
||||
}
|
||||
|
||||
using regex::Lexer;
|
||||
@@ -1,576 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Forward.h"
|
||||
#include "RegexOptions.h"
|
||||
|
||||
#include <AK/ByteString.h>
|
||||
#include <AK/COWVector.h>
|
||||
#include <AK/Error.h>
|
||||
#include <AK/FlyString.h>
|
||||
#include <AK/MemMem.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/UnicodeUtils.h>
|
||||
#include <AK/Utf16String.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <AK/Variant.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
class RegexStringView {
|
||||
public:
|
||||
RegexStringView() = default;
|
||||
|
||||
RegexStringView(String const& string)
|
||||
: m_view(string.bytes_as_string_view())
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(StringView const view)
|
||||
: m_view(view)
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(Utf16View view)
|
||||
: m_view(view)
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(String&&) = delete;
|
||||
|
||||
Utf16View const& u16_view() const
|
||||
{
|
||||
return m_view.get<Utf16View>();
|
||||
}
|
||||
|
||||
bool is_u16_view() const
|
||||
{
|
||||
return m_view.has<Utf16View>();
|
||||
}
|
||||
|
||||
bool unicode() const { return m_unicode; }
|
||||
void set_unicode(bool unicode) { m_unicode = unicode; }
|
||||
|
||||
bool is_empty() const
|
||||
{
|
||||
return m_view.visit([](auto& view) { return view.is_empty(); });
|
||||
}
|
||||
|
||||
bool is_null() const
|
||||
{
|
||||
return m_view.visit([](auto& view) { return view.is_null(); });
|
||||
}
|
||||
|
||||
size_t length() const
|
||||
{
|
||||
if (unicode()) {
|
||||
return m_view.visit(
|
||||
[](Utf16View const& view) { return view.length_in_code_points(); },
|
||||
[](auto const& view) { return view.length(); });
|
||||
}
|
||||
|
||||
return length_in_code_units();
|
||||
}
|
||||
|
||||
size_t length_in_code_units() const
|
||||
{
|
||||
return m_view.visit(
|
||||
[](Utf16View const& view) { return view.length_in_code_units(); },
|
||||
[](auto const& view) { return view.length(); });
|
||||
}
|
||||
|
||||
size_t length_of_code_point(u32 code_point) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](Utf16View const&) {
|
||||
if (code_point < 0x10000)
|
||||
return 1;
|
||||
return 2;
|
||||
},
|
||||
[&](auto const&) {
|
||||
if (code_point <= 0x7f)
|
||||
return 1;
|
||||
if (code_point <= 0x07ff)
|
||||
return 2;
|
||||
if (code_point <= 0xffff)
|
||||
return 3;
|
||||
return 4;
|
||||
});
|
||||
}
|
||||
|
||||
RegexStringView typed_null_view()
|
||||
{
|
||||
auto view = m_view.visit(
|
||||
[&]<typename T>(T const&) {
|
||||
return RegexStringView { T {} };
|
||||
});
|
||||
view.set_unicode(unicode());
|
||||
return view;
|
||||
}
|
||||
|
||||
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16String& optional_utf16_storage) const
|
||||
{
|
||||
auto view = m_view.visit(
|
||||
[&optional_string_storage, data]<typename T>(T const&) {
|
||||
StringBuilder builder;
|
||||
for (auto ch : data)
|
||||
builder.append(ch); // Note: The type conversion is intentional.
|
||||
optional_string_storage = builder.to_byte_string();
|
||||
return RegexStringView { T { *optional_string_storage } };
|
||||
},
|
||||
[&optional_utf16_storage, data](Utf16View) {
|
||||
optional_utf16_storage = Utf16String::from_utf32({ data.data(), data.size() });
|
||||
return RegexStringView { optional_utf16_storage.utf16_view() };
|
||||
});
|
||||
|
||||
view.set_unicode(unicode());
|
||||
return view;
|
||||
}
|
||||
|
||||
Vector<RegexStringView> lines() const
|
||||
{
|
||||
return m_view.visit(
|
||||
[](StringView view) {
|
||||
auto views = view.lines(StringView::ConsiderCarriageReturn::No);
|
||||
Vector<RegexStringView> new_views;
|
||||
for (auto& view : views)
|
||||
new_views.empend(view);
|
||||
return new_views;
|
||||
},
|
||||
[](Utf16View view) {
|
||||
if (view.is_empty())
|
||||
return Vector<RegexStringView> { view };
|
||||
|
||||
Vector<RegexStringView> views;
|
||||
while (!view.is_empty()) {
|
||||
auto position = view.find_code_unit_offset(u'\n');
|
||||
if (!position.has_value())
|
||||
break;
|
||||
auto offset = position.value() / sizeof(u16);
|
||||
views.empend(view.substring_view(0, offset));
|
||||
view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
|
||||
}
|
||||
if (!view.is_empty())
|
||||
views.empend(view);
|
||||
return views;
|
||||
});
|
||||
}
|
||||
|
||||
RegexStringView substring_view(size_t offset, size_t length) const
|
||||
{
|
||||
if (unicode()) {
|
||||
auto view = m_view.visit(
|
||||
[&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
|
||||
[&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
|
||||
|
||||
view.set_unicode(unicode());
|
||||
return view;
|
||||
}
|
||||
|
||||
auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
|
||||
view.set_unicode(unicode());
|
||||
return view;
|
||||
}
|
||||
|
||||
ByteString to_byte_string() const
|
||||
{
|
||||
return m_view.visit(
|
||||
[](StringView view) { return view.to_byte_string(); },
|
||||
[](Utf16View view) { return view.to_byte_string().release_value_but_fixme_should_propagate_errors(); });
|
||||
}
|
||||
|
||||
ErrorOr<String> to_string() const
|
||||
{
|
||||
return m_view.visit(
|
||||
[](StringView view) { return String::from_utf8(view); },
|
||||
[](Utf16View view) { return view.to_utf8(); });
|
||||
}
|
||||
|
||||
u32 code_point_at(size_t code_unit_index) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](StringView view) -> u32 {
|
||||
auto ch = view[code_unit_index];
|
||||
if constexpr (IsSigned<char>) {
|
||||
if (ch < 0)
|
||||
return 256u + ch;
|
||||
return ch;
|
||||
}
|
||||
},
|
||||
[&](Utf16View const& view) -> u32 { return view.code_point_at(code_unit_index); });
|
||||
}
|
||||
|
||||
// Returns the code point at the code unit offset if the Unicode flag is set. Otherwise, returns the code unit.
|
||||
u32 unicode_aware_code_point_at(size_t code_unit_index) const
|
||||
{
|
||||
if (unicode())
|
||||
return code_point_at(code_unit_index);
|
||||
|
||||
return m_view.visit(
|
||||
[&](StringView view) -> u32 {
|
||||
auto ch = view[code_unit_index];
|
||||
if constexpr (IsSigned<char>) {
|
||||
if (ch < 0)
|
||||
return 256u + ch;
|
||||
return ch;
|
||||
}
|
||||
},
|
||||
[&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); });
|
||||
}
|
||||
|
||||
size_t code_unit_offset_of(size_t code_point_index) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](StringView view) -> u32 {
|
||||
Utf8View utf8_view { view };
|
||||
return utf8_view.byte_offset_of(code_point_index);
|
||||
},
|
||||
[&](Utf16View const& view) -> u32 {
|
||||
return view.code_unit_offset_of(code_point_index);
|
||||
});
|
||||
}
|
||||
|
||||
bool operator==(char const* cstring) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](Utf16View) { return to_byte_string() == cstring; },
|
||||
[&](StringView view) { return view == cstring; });
|
||||
}
|
||||
|
||||
bool operator==(StringView string) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](Utf16View) { return to_byte_string() == string; },
|
||||
[&](StringView view) { return view == string; });
|
||||
}
|
||||
|
||||
bool operator==(Utf16View const& other) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](Utf16View const& view) { return view == other; },
|
||||
[&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
|
||||
}
|
||||
|
||||
bool equals(RegexStringView other) const
|
||||
{
|
||||
return other.m_view.visit([this](auto const& view) { return operator==(view); });
|
||||
}
|
||||
|
||||
bool equals_ignoring_case(RegexStringView other, bool unicode_mode) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](StringView view) {
|
||||
return other.m_view.visit(
|
||||
[&](StringView other_view) {
|
||||
if (!unicode_mode)
|
||||
return view.equals_ignoring_ascii_case(other_view);
|
||||
|
||||
Utf8View view_utf8(view);
|
||||
Utf8View other_utf8(other_view);
|
||||
return Unicode::ranges_equal_ignoring_case(view_utf8, other_utf8, unicode_mode);
|
||||
},
|
||||
[&](Utf16View other_view) {
|
||||
Utf8View view_utf8(view);
|
||||
return Unicode::ranges_equal_ignoring_case(view_utf8, other_view, unicode_mode);
|
||||
},
|
||||
[](auto&) -> bool { TODO(); });
|
||||
},
|
||||
[&](Utf16View view) {
|
||||
return other.m_view.visit(
|
||||
[&](StringView other_view) {
|
||||
Utf8View other_utf8(other_view);
|
||||
return Unicode::ranges_equal_ignoring_case(view, other_utf8, unicode_mode);
|
||||
},
|
||||
[&](Utf16View other_view) {
|
||||
if (!unicode_mode)
|
||||
return view.equals_ignoring_ascii_case(other_view);
|
||||
|
||||
return Unicode::ranges_equal_ignoring_case(view, other_view, unicode_mode);
|
||||
},
|
||||
[](auto&) -> bool { TODO(); });
|
||||
},
|
||||
[](auto&) -> bool { TODO(); });
|
||||
}
|
||||
|
||||
bool starts_with(StringView str) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](Utf16View) -> bool {
|
||||
TODO();
|
||||
},
|
||||
[&](StringView view) { return view.starts_with(str); });
|
||||
}
|
||||
|
||||
struct FoundIndex {
|
||||
size_t code_unit_index;
|
||||
size_t code_point_index;
|
||||
};
|
||||
Optional<FoundIndex> find_index_of_previous(u32 code_point, size_t end_code_point_index, size_t end_code_unit_index) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&](Utf16View const& view) -> Optional<FoundIndex> {
|
||||
auto result = view.find_last_code_point_offset(code_point, end_code_unit_index);
|
||||
if (!result.has_value())
|
||||
return {};
|
||||
return FoundIndex { result.value(), view.code_point_offset_of(result.value()) };
|
||||
},
|
||||
[&](StringView const& view) -> Optional<FoundIndex> {
|
||||
if (unicode()) {
|
||||
Utf8View utf8_view { view };
|
||||
auto it = utf8_view.begin();
|
||||
size_t current_code_point_index = 0;
|
||||
Optional<FoundIndex> found_index;
|
||||
|
||||
for (; it != utf8_view.end(); ++it, ++current_code_point_index) {
|
||||
if (current_code_point_index >= end_code_point_index)
|
||||
break;
|
||||
if (*it == code_point) {
|
||||
auto byte_index = utf8_view.byte_offset_of(it);
|
||||
found_index = { byte_index, current_code_point_index };
|
||||
}
|
||||
}
|
||||
|
||||
return found_index;
|
||||
}
|
||||
|
||||
auto byte_index = view.substring_view(0, min(end_code_unit_index, view.length())).find_last(code_point);
|
||||
if (!byte_index.has_value())
|
||||
return {};
|
||||
return FoundIndex { byte_index.value(), byte_index.value() };
|
||||
});
|
||||
}
|
||||
|
||||
FoundIndex find_end_of_line(size_t start_code_point_index, size_t start_code_unit_index) const
|
||||
{
|
||||
constexpr auto is_newline = [](u32 ch) { return ch == '\n' || ch == '\r' || ch == 0x2028 || ch == 0x2029; };
|
||||
|
||||
return m_view.visit(
|
||||
[&](Utf16View const& view) -> FoundIndex {
|
||||
size_t code_unit_index = start_code_unit_index;
|
||||
size_t code_point_index = start_code_point_index;
|
||||
while (code_unit_index < view.length_in_code_units()) {
|
||||
auto code_unit = view.code_unit_at(code_unit_index);
|
||||
u32 ch = code_unit;
|
||||
size_t code_units_for_this = 1;
|
||||
if (AK::UnicodeUtils::is_utf16_high_surrogate(code_unit) && code_unit_index + 1 < view.length_in_code_units()) {
|
||||
auto next_code_unit = view.code_unit_at(code_unit_index + 1);
|
||||
if (AK::UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) {
|
||||
ch = AK::UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
|
||||
code_units_for_this = 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_newline(ch))
|
||||
return FoundIndex { code_unit_index, code_point_index };
|
||||
code_unit_index += code_units_for_this;
|
||||
++code_point_index;
|
||||
}
|
||||
return FoundIndex { view.length_in_code_units(), code_point_index };
|
||||
},
|
||||
[&](StringView const& view) -> FoundIndex {
|
||||
if (unicode()) {
|
||||
Utf8View utf8_view { view };
|
||||
auto it = utf8_view.begin();
|
||||
size_t current_code_point_index = 0;
|
||||
|
||||
// Skip to start position
|
||||
while (it != utf8_view.end() && current_code_point_index < start_code_point_index) {
|
||||
++it;
|
||||
++current_code_point_index;
|
||||
}
|
||||
|
||||
for (; it != utf8_view.end(); ++it, ++current_code_point_index) {
|
||||
if (is_newline(*it)) {
|
||||
return FoundIndex { utf8_view.byte_offset_of(it), current_code_point_index };
|
||||
}
|
||||
}
|
||||
|
||||
return FoundIndex { view.length(), utf8_view.length() };
|
||||
}
|
||||
|
||||
for (size_t i = start_code_unit_index; i < view.length(); ++i) {
|
||||
if (is_newline(static_cast<u8>(view[i])))
|
||||
return FoundIndex { i, i };
|
||||
}
|
||||
return FoundIndex { view.length(), view.length() };
|
||||
});
|
||||
}
|
||||
|
||||
private:
|
||||
NO_UNIQUE_ADDRESS Variant<StringView, Utf16View> m_view { StringView {} };
|
||||
NO_UNIQUE_ADDRESS bool m_unicode { false };
|
||||
};
|
||||
|
||||
// One match (or capture group match): the matched view plus where it was found.
class Match final {
public:
    Match() = default;
    ~Match() = default;

    // Unnamed match; left_column starts out equal to column.
    Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
        : view(view_)
        , line(line_)
        , column(column_)
        , global_offset(global_offset_)
        , left_column(column_)
    {
    }

    // Match belonging to a named capture group; capture_group_name_ is stored in capture_group_name.
    Match(RegexStringView const view_, size_t capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
        : view(view_)
        , capture_group_name(capture_group_name_)
        , line(line_)
        , column(column_)
        , global_offset(global_offset_)
        , left_column(column_)
    {
    }

    // Clears the match back to its default state while keeping the view's alternative
    // (StringView vs. Utf16View) and unicode flag, as provided by typed_null_view().
    void reset()
    {
        view = view.typed_null_view();
        capture_group_name = -1;
        line = 0;
        column = 0;
        global_offset = 0;
        left_column = 0;
    }

    RegexStringView view {};

    // This is a string table index. -1 if none. Not using Optional to keep the struct trivially copyable.
    ssize_t capture_group_name { -1 };

    size_t line { 0 };
    size_t column { 0 };
    size_t global_offset { 0 };

    // ugly, as not usable by user, but needed to prevent to create extra vectors that are
    // able to store the column when the left paren has been found
    size_t left_column { 0 };
};
|
||||
|
||||
struct MatchInput {
|
||||
RegexStringView view {};
|
||||
AllOptions regex_options {};
|
||||
size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
|
||||
|
||||
size_t match_index { 0 };
|
||||
size_t line { 0 };
|
||||
size_t column { 0 };
|
||||
|
||||
size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
|
||||
|
||||
mutable size_t fail_counter { 0 };
|
||||
mutable Vector<size_t> saved_positions;
|
||||
mutable Vector<size_t> saved_code_unit_positions;
|
||||
mutable Vector<size_t> saved_forks_since_last_save;
|
||||
mutable Optional<size_t> fork_to_replace;
|
||||
|
||||
bool in_the_middle_of_a_line { false };
|
||||
StringView pattern {};
|
||||
};
|
||||
|
||||
struct MatchState {
|
||||
size_t capture_group_count;
|
||||
size_t string_position_before_match { 0 };
|
||||
size_t string_position { 0 };
|
||||
size_t string_position_in_code_units { 0 };
|
||||
size_t instruction_position { 0 };
|
||||
size_t fork_at_position { 0 };
|
||||
size_t forks_since_last_save { 0 };
|
||||
size_t string_position_before_rseek { NumericLimits<size_t>::max() };
|
||||
size_t string_position_in_code_units_before_rseek { NumericLimits<size_t>::max() };
|
||||
Optional<size_t> initiating_fork;
|
||||
COWVector<Match> matches;
|
||||
COWVector<Match> flat_capture_group_matches; // Vector<Vector<Match>> indexed by match index, then by capture group id; flattened for performance
|
||||
COWVector<u64> repetition_marks;
|
||||
Vector<u64, 64> checkpoints;
|
||||
Vector<i64> step_backs;
|
||||
Vector<FlagsUnderlyingType, 1> modifier_stack;
|
||||
AllOptions current_options;
|
||||
|
||||
explicit MatchState(size_t capture_group_count, AllOptions options = {})
|
||||
: capture_group_count(capture_group_count)
|
||||
, current_options(options)
|
||||
{
|
||||
}
|
||||
|
||||
MatchState(MatchState const&) = default;
|
||||
MatchState(MatchState&&) = default;
|
||||
|
||||
MatchState& operator=(MatchState const&) = default;
|
||||
MatchState& operator=(MatchState&&) = default;
|
||||
|
||||
static MatchState only_for_enumeration() { return MatchState { 0 }; }
|
||||
|
||||
size_t capture_group_matches_size() const
|
||||
{
|
||||
return flat_capture_group_matches.size() / capture_group_count;
|
||||
}
|
||||
|
||||
Span<Match const> capture_group_matches(size_t match_index) const
|
||||
{
|
||||
return flat_capture_group_matches.span().slice(match_index * capture_group_count, capture_group_count);
|
||||
}
|
||||
|
||||
Span<Match> mutable_capture_group_matches(size_t match_index)
|
||||
{
|
||||
return flat_capture_group_matches.mutable_span().slice(match_index * capture_group_count, capture_group_count);
|
||||
}
|
||||
|
||||
// For size_t in {0..300}, ips in {0..750} and repetitions in {0..50}, there are zero collisions.
|
||||
u64 u64_hash() const
|
||||
{
|
||||
u64 hash = 0xcbf29ce484222325;
|
||||
auto combine = [&hash](auto value) {
|
||||
hash ^= static_cast<u64>(value);
|
||||
hash *= 0x9e3779b97f4a7c15;
|
||||
};
|
||||
auto combine_vector = [&combine](auto const& vector, auto tag) {
|
||||
combine(tag);
|
||||
combine(vector.size());
|
||||
for (auto& value : vector)
|
||||
combine(value);
|
||||
};
|
||||
|
||||
combine(string_position_before_match);
|
||||
combine(string_position);
|
||||
combine(string_position_in_code_units);
|
||||
combine(instruction_position);
|
||||
combine(fork_at_position);
|
||||
combine(initiating_fork.value_or(0) + initiating_fork.has_value());
|
||||
combine_vector(repetition_marks, 0xbeefbeefbeefbeef);
|
||||
combine_vector(checkpoints, 0xfacefacefaceface);
|
||||
combine_vector(step_backs, 0xfedefedefedefede);
|
||||
|
||||
return hash;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
using regex::RegexStringView;
|
||||
|
||||
template<>
|
||||
struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
|
||||
ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
|
||||
{
|
||||
auto string = value.to_byte_string();
|
||||
return Formatter<StringView>::format(builder, string);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct AK::Traits<regex::Match> : public AK::DefaultTraits<regex::Match> {
|
||||
constexpr static bool is_trivial() { return true; }
|
||||
};
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,312 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "RegexByteCode.h"
|
||||
#include "RegexMatch.h"
|
||||
#include "RegexOptions.h"
|
||||
#include "RegexParser.h"
|
||||
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/GenericLexer.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
namespace Detail {
|
||||
|
||||
struct Block {
|
||||
size_t start;
|
||||
size_t end;
|
||||
StringView comment { "N/A"sv };
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// Recursion limit constant (constexpr already implies const).
static constexpr size_t c_max_recursion = 5000;
|
||||
|
||||
// Outcome of a match/search call. A default-constructed result reports failure.
struct REGEX_API RegexResult final {
    bool success = false;
    size_t count = 0;
    Vector<Match> matches;
    // Backing storage for the capture group matches of all matches.
    Vector<Match> flat_capture_group_matches;
    // One span of capture group matches per entry in `matches`.
    Vector<Span<Match>> capture_group_matches;
    size_t n_operations = 0;
    size_t n_capture_groups = 0;
    size_t n_named_capture_groups = 0;
};
|
||||
|
||||
template<class Parser>
|
||||
class REGEX_API Regex;
|
||||
|
||||
template<class Parser>
|
||||
class REGEX_API Matcher final {
|
||||
|
||||
public:
|
||||
Matcher(Regex<Parser> const* pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
: m_pattern(pattern)
|
||||
, m_regex_options(regex_options.value_or({}))
|
||||
{
|
||||
}
|
||||
~Matcher() = default;
|
||||
|
||||
RegexResult match(RegexStringView, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
|
||||
RegexResult match(Vector<RegexStringView> const&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
|
||||
|
||||
typename ParserTraits<Parser>::OptionsType options() const
|
||||
{
|
||||
return m_regex_options;
|
||||
}
|
||||
|
||||
void reset_pattern(Badge<Regex<Parser>>, Regex<Parser> const* pattern)
|
||||
{
|
||||
m_pattern = pattern;
|
||||
}
|
||||
|
||||
private:
|
||||
enum class ExecuteResult {
|
||||
DidNotMatch,
|
||||
Matched,
|
||||
DidNotMatchAndNoFurtherPossibleMatchesInView,
|
||||
};
|
||||
ExecuteResult execute(MatchInput const& input, MatchState& state, size_t& operations) const;
|
||||
|
||||
Regex<Parser> const* m_pattern;
|
||||
typename ParserTraits<Parser>::OptionsType const m_regex_options;
|
||||
};
|
||||
|
||||
template<class Parser>
|
||||
class REGEX_API Regex final {
|
||||
public:
|
||||
ByteString pattern_value;
|
||||
regex::Parser::Result parser_result;
|
||||
OwnPtr<Matcher<Parser>> matcher { nullptr };
|
||||
mutable size_t start_offset { 0 };
|
||||
|
||||
static regex::Parser::Result parse_pattern(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
|
||||
|
||||
explicit Regex(ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
|
||||
Regex(regex::Parser::Result parse_result, ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
|
||||
Regex(Regex const&);
|
||||
~Regex() = default;
|
||||
Regex(Regex&&);
|
||||
Regex& operator=(Regex&&);
|
||||
|
||||
typename ParserTraits<Parser>::OptionsType options() const;
|
||||
ByteString error_string(Optional<ByteString> message = {}) const;
|
||||
|
||||
RegexResult match(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return {};
|
||||
return matcher->match(view, regex_options);
|
||||
}
|
||||
|
||||
RegexResult match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return {};
|
||||
return matcher->match(views, regex_options);
|
||||
}
|
||||
|
||||
ByteString replace(RegexStringView view, StringView replacement_pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return {};
|
||||
|
||||
StringBuilder builder;
|
||||
size_t start_offset = 0;
|
||||
RegexResult result = matcher->match(view, regex_options);
|
||||
if (!result.success)
|
||||
return view.to_byte_string();
|
||||
|
||||
for (size_t i = 0; i < result.matches.size(); ++i) {
|
||||
auto& match = result.matches[i];
|
||||
builder.append(view.substring_view(start_offset, match.global_offset - start_offset).to_byte_string());
|
||||
start_offset = match.global_offset + match.view.length();
|
||||
GenericLexer lexer(replacement_pattern);
|
||||
while (!lexer.is_eof()) {
|
||||
if (lexer.consume_specific('\\')) {
|
||||
if (lexer.consume_specific('\\')) {
|
||||
builder.append('\\');
|
||||
continue;
|
||||
}
|
||||
auto number = lexer.consume_while(isdigit);
|
||||
if (auto index = number.to_number<unsigned>(); index.has_value() && result.n_capture_groups >= index.value()) {
|
||||
builder.append(result.capture_group_matches[i][index.value() - 1].view.to_byte_string());
|
||||
} else {
|
||||
builder.appendff("\\{}", number);
|
||||
}
|
||||
} else {
|
||||
builder.append(lexer.consume_while([](auto ch) { return ch != '\\'; }));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
builder.append(view.substring_view(start_offset, view.length() - start_offset).to_byte_string());
|
||||
|
||||
return builder.to_byte_string();
|
||||
}
|
||||
|
||||
// FIXME: replace(Vector<RegexStringView> const , ...)
|
||||
|
||||
RegexResult search(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return {};
|
||||
|
||||
AllOptions options = (AllOptions)regex_options.value_or({});
|
||||
if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
|
||||
options.reset_flag(AllFlags::MatchNotEndOfLine);
|
||||
options.reset_flag(AllFlags::MatchNotBeginOfLine);
|
||||
}
|
||||
options.reset_flag(AllFlags::Internal_Stateful);
|
||||
options |= AllFlags::Global;
|
||||
|
||||
return matcher->match(view, options);
|
||||
}
|
||||
|
||||
RegexResult search(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return {};
|
||||
|
||||
AllOptions options = (AllOptions)regex_options.value_or({});
|
||||
if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
|
||||
options.reset_flag(AllFlags::MatchNotEndOfLine);
|
||||
options.reset_flag(AllFlags::MatchNotBeginOfLine);
|
||||
}
|
||||
options.reset_flag(AllFlags::Internal_Stateful);
|
||||
options |= AllFlags::Global;
|
||||
|
||||
return matcher->match(views, options);
|
||||
}
|
||||
|
||||
bool match(RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
m = match(view, regex_options);
|
||||
return m.success;
|
||||
}
|
||||
|
||||
bool match(Vector<RegexStringView> const& views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
m = match(views, regex_options);
|
||||
return m.success;
|
||||
}
|
||||
|
||||
bool search(RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
m = search(view, regex_options);
|
||||
return m.success;
|
||||
}
|
||||
|
||||
bool search(Vector<RegexStringView> const& views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
m = search(views, regex_options);
|
||||
return m.success;
|
||||
}
|
||||
|
||||
bool has_match(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return false;
|
||||
RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
|
||||
return result.success;
|
||||
}
|
||||
|
||||
bool has_match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return false;
|
||||
RegexResult result = matcher->match(views, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
|
||||
return result.success;
|
||||
}
|
||||
|
||||
using BasicBlockList = Vector<Detail::Block>;
|
||||
static BasicBlockList split_basic_blocks(ByteCode const&);
|
||||
|
||||
private:
|
||||
void run_optimization_passes();
|
||||
void rewrite_with_useless_jumps_removed();
|
||||
void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&);
|
||||
bool attempt_rewrite_entire_match_as_substring_search(BasicBlockList const&);
|
||||
void attempt_rewrite_adjacent_compares_as_string_compare(BasicBlockList const&);
|
||||
void attempt_rewrite_dot_star_sequences_as_seek(BasicBlockList const&);
|
||||
void rewrite_simple_compares(BasicBlockList const&);
|
||||
void fill_optimization_data(BasicBlockList const&);
|
||||
};
|
||||
|
||||
// free standing functions for match, search and has_match
|
||||
template<class Parser>
|
||||
RegexResult match(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
RegexResult match(Vector<RegexStringView> const& view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool match(RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool match(Vector<RegexStringView> const& view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
RegexResult search(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.search(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
RegexResult search(Vector<RegexStringView> const& views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.search(views, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool search(RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.search(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool search(Vector<RegexStringView> const& views, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.search(views, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool has_match(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.has_match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool has_match(Vector<RegexStringView> const& views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.has_match(views, regex_options);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
using regex::has_match;
|
||||
using regex::match;
|
||||
using regex::Regex;
|
||||
using regex::RegexResult;
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,143 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "RegexDefs.h"
|
||||
#include <AK/Types.h>
|
||||
#include <stdio.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
using FlagsUnderlyingType = u32;
|
||||
|
||||
enum class AllFlags {
|
||||
Default = 0,
|
||||
Global = __Regex_Global, // All matches (don't return after first match)
|
||||
Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z])
|
||||
Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
|
||||
Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such
|
||||
Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
|
||||
Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
|
||||
MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string!
|
||||
MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
|
||||
SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result
|
||||
SingleLine = __Regex_SingleLine, // Dot matches newline characters
|
||||
Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
|
||||
Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
|
||||
SingleMatch = __Regex_SingleMatch, // Stop after acquiring a single match.
|
||||
UnicodeSets = __Regex_UnicodeSets, // Only for ECMA262, Allow set operations in character classes.
|
||||
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
|
||||
Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
|
||||
Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
|
||||
Internal_ECMA262DotSemantics = __Regex_Internal_ECMA262DotSemantics, // Use ECMA262 dot semantics: disallow matching CR/LF/LS/PS instead of just CR.
|
||||
Last = Internal_BrowserExtended,
|
||||
};
|
||||
|
||||
// The subset of AllFlags exposed to POSIX patterns; every enumerator has the same numeric value
// as its AllFlags counterpart.
enum class PosixFlags : FlagsUnderlyingType {
    Default = 0,
    Global = static_cast<FlagsUnderlyingType>(AllFlags::Global),
    Insensitive = static_cast<FlagsUnderlyingType>(AllFlags::Insensitive),
    Ungreedy = static_cast<FlagsUnderlyingType>(AllFlags::Ungreedy),
    Unicode = static_cast<FlagsUnderlyingType>(AllFlags::Unicode),
    Extended = static_cast<FlagsUnderlyingType>(AllFlags::Extended),
    Extra = static_cast<FlagsUnderlyingType>(AllFlags::Extra),
    MatchNotBeginOfLine = static_cast<FlagsUnderlyingType>(AllFlags::MatchNotBeginOfLine),
    MatchNotEndOfLine = static_cast<FlagsUnderlyingType>(AllFlags::MatchNotEndOfLine),
    SkipSubExprResults = static_cast<FlagsUnderlyingType>(AllFlags::SkipSubExprResults),
    Multiline = static_cast<FlagsUnderlyingType>(AllFlags::Multiline),
    SingleMatch = static_cast<FlagsUnderlyingType>(AllFlags::SingleMatch),
};
|
||||
|
||||
// The subset of AllFlags exposed to ECMAScript patterns.
// Unlike the other flag sets, Default is not zero: it carries Internal_ECMA262DotSemantics.
enum class ECMAScriptFlags : FlagsUnderlyingType {
    Default = static_cast<FlagsUnderlyingType>(AllFlags::Internal_ECMA262DotSemantics),
    // Note: ECMAScript "Global" creates a stateful regex.
    Global = static_cast<FlagsUnderlyingType>(AllFlags::Global) | static_cast<FlagsUnderlyingType>(AllFlags::Internal_Stateful),
    Insensitive = static_cast<FlagsUnderlyingType>(AllFlags::Insensitive),
    Ungreedy = static_cast<FlagsUnderlyingType>(AllFlags::Ungreedy),
    Unicode = static_cast<FlagsUnderlyingType>(AllFlags::Unicode),
    Extended = static_cast<FlagsUnderlyingType>(AllFlags::Extended),
    Extra = static_cast<FlagsUnderlyingType>(AllFlags::Extra),
    SingleLine = static_cast<FlagsUnderlyingType>(AllFlags::SingleLine),
    Sticky = static_cast<FlagsUnderlyingType>(AllFlags::Sticky),
    Multiline = static_cast<FlagsUnderlyingType>(AllFlags::Multiline),
    UnicodeSets = static_cast<FlagsUnderlyingType>(AllFlags::UnicodeSets),
    BrowserExtended = static_cast<FlagsUnderlyingType>(AllFlags::Internal_BrowserExtended),
};
|
||||
|
||||
template<class T>
|
||||
class RegexOptions {
|
||||
public:
|
||||
using FlagsType = T;
|
||||
|
||||
RegexOptions() = default;
|
||||
|
||||
constexpr RegexOptions(T flags)
|
||||
: m_flags(static_cast<T>(to_underlying(flags) | to_underlying(T::Default)))
|
||||
{
|
||||
}
|
||||
|
||||
template<class U>
|
||||
constexpr RegexOptions(RegexOptions<U> other)
|
||||
: RegexOptions(static_cast<T>(to_underlying(other.value())))
|
||||
{
|
||||
}
|
||||
|
||||
operator bool() const { return !!*this; }
|
||||
bool operator!() const { return (FlagsUnderlyingType)m_flags == 0; }
|
||||
|
||||
constexpr RegexOptions<T> operator|(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag) }; }
|
||||
constexpr RegexOptions<T> operator&(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag) }; }
|
||||
|
||||
constexpr RegexOptions<T>& operator|=(T flag)
|
||||
{
|
||||
m_flags = (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag);
|
||||
return *this;
|
||||
}
|
||||
|
||||
constexpr RegexOptions<T>& operator&=(T flag)
|
||||
{
|
||||
m_flags = (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag);
|
||||
return *this;
|
||||
}
|
||||
|
||||
void reset_flags() { m_flags = (T)0; }
|
||||
void reset_flag(T flag) { m_flags = (T)((FlagsUnderlyingType)m_flags & ~(FlagsUnderlyingType)flag); }
|
||||
void set_flag(T flag) { *this |= flag; }
|
||||
bool has_flag_set(T flag) const { return (FlagsUnderlyingType)flag == ((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag); }
|
||||
constexpr T value() const { return m_flags; }
|
||||
|
||||
private:
|
||||
T m_flags { T::Default };
|
||||
};
|
||||
|
||||
template<class T>
|
||||
constexpr RegexOptions<T> operator|(T lhs, T rhs)
|
||||
{
|
||||
return RegexOptions<T> { lhs } |= rhs;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
constexpr RegexOptions<T> operator&(T lhs, T rhs)
|
||||
{
|
||||
return RegexOptions<T> { lhs } &= rhs;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
constexpr T operator~(T flag)
|
||||
{
|
||||
return (T) ~((FlagsUnderlyingType)flag);
|
||||
}
|
||||
|
||||
using AllOptions = RegexOptions<AllFlags>;
|
||||
using ECMAScriptOptions = RegexOptions<ECMAScriptFlags>;
|
||||
using PosixOptions = RegexOptions<PosixFlags>;
|
||||
|
||||
}
|
||||
|
||||
using regex::ECMAScriptFlags;
|
||||
using regex::ECMAScriptOptions;
|
||||
using regex::PosixFlags;
|
||||
using regex::PosixOptions;
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,360 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "RegexByteCode.h"
|
||||
#include "RegexError.h"
|
||||
#include "RegexLexer.h"
|
||||
#include "RegexOptions.h"
|
||||
|
||||
#include <AK/FlyString.h>
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <AK/HashTable.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibUnicode/Forward.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
class PosixExtendedParser;
|
||||
class PosixBasicParser;
|
||||
class ECMA262Parser;
|
||||
|
||||
// Exposes T as the options type; serves as the base for the ParserTraits specializations.
template<typename T>
struct GenericParserTraits {
    using OptionsType = T;
};
|
||||
|
||||
template<typename T>
|
||||
struct ParserTraits : public GenericParserTraits<T> {
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
|
||||
};
|
||||
|
||||
// Pairs a capture group index with an alternative id. Neither field has a default value.
struct NamedCaptureGroup {
    size_t group_index;
    size_t alternative_id;
};
|
||||
|
||||
class REGEX_API Parser {
|
||||
public:
|
||||
struct Result {
|
||||
Variant<ByteCode, FlatByteCode> bytecode;
|
||||
size_t capture_groups_count { 0 };
|
||||
size_t named_capture_groups_count { 0 };
|
||||
size_t match_length_minimum { 0 };
|
||||
Error error { Error::NoError };
|
||||
Token error_token {};
|
||||
Vector<FlyString> capture_groups {};
|
||||
AllOptions options {};
|
||||
|
||||
struct {
|
||||
Optional<Vector<u16>> pure_substring_search;
|
||||
// If populated, the pattern only accepts strings that start with a character in these ranges.
|
||||
Vector<CharRange> starting_ranges;
|
||||
Vector<CharRange> starting_ranges_insensitive;
|
||||
bool only_start_of_line = false;
|
||||
} optimization_data {};
|
||||
};
|
||||
|
||||
explicit Parser(Lexer& lexer)
|
||||
: m_parser_state(lexer)
|
||||
{
|
||||
}
|
||||
|
||||
Parser(Lexer& lexer, AllOptions regex_options)
|
||||
: m_parser_state(lexer, regex_options)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~Parser() = default;
|
||||
|
||||
Result parse(Optional<AllOptions> regex_options = {});
|
||||
bool has_error() const { return m_parser_state.error != Error::NoError; }
|
||||
Error error() const { return m_parser_state.error; }
|
||||
AllOptions options() const { return m_parser_state.regex_options; }
|
||||
|
||||
protected:
|
||||
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
|
||||
bool resolve_forward_named_references();
|
||||
|
||||
ALWAYS_INLINE bool match(TokenType type) const;
|
||||
ALWAYS_INLINE bool match(char ch) const;
|
||||
ALWAYS_INLINE bool match_ordinary_characters();
|
||||
ALWAYS_INLINE Token consume();
|
||||
ALWAYS_INLINE Token consume(TokenType type, Error error);
|
||||
ALWAYS_INLINE bool consume(ByteString const&);
|
||||
ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
|
||||
ALWAYS_INLINE bool try_skip(StringView);
|
||||
ALWAYS_INLINE bool lookahead_any(StringView);
|
||||
ALWAYS_INLINE unsigned char skip();
|
||||
ALWAYS_INLINE void back(size_t = 1);
|
||||
ALWAYS_INLINE void reset();
|
||||
ALWAYS_INLINE bool done() const;
|
||||
ALWAYS_INLINE bool set_error(Error error);
|
||||
|
||||
size_t tell() const { return m_parser_state.current_token.position(); }
|
||||
|
||||
struct ParserState {
|
||||
Lexer& lexer;
|
||||
Token current_token;
|
||||
Error error = Error::NoError;
|
||||
Token error_token { TokenType::Eof, 0, {} };
|
||||
ByteCode bytecode;
|
||||
size_t capture_groups_count { 0 };
|
||||
size_t named_capture_groups_count { 0 };
|
||||
size_t match_length_minimum { 0 };
|
||||
bool greedy_lookaround { true };
|
||||
size_t repetition_mark_count { 0 };
|
||||
bool in_negated_character_class { false };
|
||||
AllOptions regex_options;
|
||||
HashMap<size_t, size_t> capture_group_minimum_lengths;
|
||||
HashTable<size_t> optional_capture_groups;
|
||||
OrderedHashMap<FlyString, Vector<NamedCaptureGroup>> named_capture_groups;
|
||||
|
||||
struct UnresolvedNamedReference {
|
||||
FlyString name;
|
||||
size_t bytecode_offset;
|
||||
};
|
||||
Vector<UnresolvedNamedReference> unresolved_named_references;
|
||||
|
||||
explicit ParserState(Lexer& lexer)
|
||||
: lexer(lexer)
|
||||
, current_token(lexer.next())
|
||||
{
|
||||
}
|
||||
explicit ParserState(Lexer& lexer, AllOptions regex_options)
|
||||
: lexer(lexer)
|
||||
, current_token(lexer.next())
|
||||
, regex_options(regex_options)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
ParserState m_parser_state;
|
||||
};
|
||||
|
||||
// Common base of the POSIX parsers; only constructible by derived classes.
class REGEX_API AbstractPosixParser : public Parser {
protected:
    explicit AbstractPosixParser(Lexer& lexer)
        : Parser(lexer)
    {
    }

    // Absent options fall back to default-constructed POSIX options.
    AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
        : Parser(lexer, regex_options.value_or({}))
    {
    }

    ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
};
|
||||
|
||||
// Parser for POSIX basic regular expressions.
class REGEX_API PosixBasicParser final : public AbstractPosixParser {
public:
    explicit PosixBasicParser(Lexer& lexer)
        : AbstractPosixParser(lexer)
    {
    }

    // Absent options fall back to default-constructed POSIX options.
    PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
        : AbstractPosixParser(lexer, regex_options.value_or({}))
    {
    }

    ~PosixBasicParser() = default;

private:
    bool parse_internal(ByteCode&, size_t&) override;

    bool parse_root(ByteCode&, size_t&);
    bool parse_re_expression(ByteCode&, size_t&);
    bool parse_simple_re(ByteCode&, size_t&);
    bool parse_nonduplicating_re(ByteCode&, size_t&);
    bool parse_one_char_or_collation_element(ByteCode&, size_t&);

    // Bookkeeping is kept for at most this many capture groups.
    static constexpr size_t number_of_addressable_capture_groups = 9;
    size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
    bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
    size_t m_current_capture_group_depth { 0 };
};
|
||||
|
||||
// Parser for POSIX extended regular expressions.
class REGEX_API PosixExtendedParser final : public AbstractPosixParser {
    // Always-on options. PosixFlags has no enumerators for these two flags, so they are
    // cast over from AllFlags.
    static constexpr auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);

public:
    explicit PosixExtendedParser(Lexer& lexer)
        : AbstractPosixParser(lexer, default_options)
    {
    }

    // Caller-supplied options (or defaults when absent) are combined with default_options.
    PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
        : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
    {
    }

    ~PosixExtendedParser() = default;

private:
    ALWAYS_INLINE bool match_repetition_symbol();

    bool parse_internal(ByteCode&, size_t&) override;

    bool parse_root(ByteCode&, size_t&);
    ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
    ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
    ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
};
|
||||
|
||||
// Parser for the ECMA-262 (ECMAScript) regular expression flavour.
class REGEX_API ECMA262Parser final : public Parser {
    // Always-on flag for this parser; it is OR-ed into any caller-provided options.
    static constexpr ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);

public:
    // Creates a parser that runs with the default option set only.
    explicit ECMA262Parser(Lexer& lexer)
        : Parser(lexer, default_options)
    {
        // Open the outermost capture-group scope.
        enter_capture_group_scope();
    }

    // Creates a parser from optional caller-provided options; a missing value is treated
    // as a default-constructed option set. The defaults are merged in either way.
    ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
        : Parser(lexer, regex_options.value_or({}) | default_options.value())
    {
        // The browser-extended grammar is opt-in: options must be present and carry the flag.
        bool const browser_extended_requested = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
        m_should_use_browser_extended_grammar = browser_extended_requested;

        // Open the outermost capture-group scope.
        enter_capture_group_scope();
    }

    ~ECMA262Parser() = default;

private:
    // Switches passed by value through the parse_* functions below.
    struct ParseFlags {
        bool unicode { false };
        bool named { false };
        bool unicode_sets { false };
    };

    // Selector passed to the read_digits* helpers regarding an initial zero.
    enum class ReadDigitsInitialZeroState {
        Allow,
        Disallow,
    };

    // A Unicode script paired with an "is extension" marker.
    struct Script {
        Unicode::Script script {};
        bool is_extension { false };
    };

    // The alternatives a Unicode property escape can hold.
    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>;

    // Overridden parse entry point; the base declaration is outside this excerpt.
    bool parse_internal(ByteCode&, size_t&) override;

    // Low-level readers (defined out of line).
    StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
    Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
    FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);
    Optional<PropertyEscape> read_unicode_property_escape();

    // Pattern-level parse functions (defined out of line).
    bool parse_pattern(ByteCode&, size_t&, ParseFlags);
    bool parse_disjunction(ByteCode&, size_t&, ParseFlags);
    bool parse_alternative(ByteCode&, size_t&, ParseFlags);
    bool parse_term(ByteCode&, size_t&, ParseFlags);
    bool parse_assertion(ByteCode&, size_t&, ParseFlags);
    bool parse_atom(ByteCode&, size_t&, ParseFlags);
    bool parse_quantifier(ByteCode&, size_t&, ParseFlags);
    bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
    bool parse_atom_escape(ByteCode&, size_t&, ParseFlags);
    bool parse_character_class(ByteCode&, size_t&, ParseFlags);
    bool parse_capture_group(ByteCode&, size_t&, ParseFlags);
    Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
    bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, ParseFlags);
    bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);

    bool parse_character_escape(Vector<CompareTypeAndValuePair>&, size_t&, ParseFlags);

    // Class-set parse functions (defined out of line).
    bool parse_class_set_expression(Vector<CompareTypeAndValuePair>&);
    bool parse_class_union(Vector<CompareTypeAndValuePair>&);
    bool parse_class_intersection(Vector<CompareTypeAndValuePair>&);
    bool parse_class_subtraction(Vector<CompareTypeAndValuePair>&);
    bool parse_class_set_range(Vector<CompareTypeAndValuePair>&);
    bool parse_class_set_operand(Vector<CompareTypeAndValuePair>&);
    bool parse_nested_class(Vector<CompareTypeAndValuePair>&);
    Optional<u32> parse_class_set_character();

    // Only needed for B.1.4, Regular Expression Patterns (Extended for use in browsers).
    bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags);
    bool parse_extended_atom(ByteCode&, size_t&, ParseFlags);
    bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, ParseFlags);
    // Note: this either parses and *fails*, or parses nothing at all and returns false.
    bool parse_invalid_braced_quantifier();
    Optional<u8> parse_legacy_octal_escape();

    bool has_duplicate_in_current_alternative(FlyString const& name);

    // NOTE(review): presumably fills m_total_number_of_capturing_parenthesis on demand - confirm in the definition.
    size_t ensure_total_number_of_capturing_parenthesis();

    // Snapshots the current token and lexer position, and returns an armed scope guard
    // whose callback puts the token back and rewinds the lexer by however far it has
    // advanced since the snapshot.
    auto save_parser_state()
    {
        auto token_snapshot = m_parser_state.current_token;
        auto lexer_position_snapshot = m_parser_state.lexer.tell();

        return ArmedScopeGuard { [this, token_snapshot, lexer_position_snapshot] {
            m_parser_state.current_token = token_snapshot;
            auto const consumed_since_snapshot = m_parser_state.lexer.tell() - lexer_position_snapshot;
            m_parser_state.lexer.back(consumed_since_snapshot);
        } };
    }

    // Pushes a fresh, empty scope onto the capture-group scope stack.
    void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }

    // Pops the innermost scope and folds its groups into the scope that enclosed it.
    void exit_capture_group_scope()
    {
        auto closed_scope = m_capture_groups_in_scope.take_last();
        auto& enclosing_scope = m_capture_groups_in_scope.last();
        enclosing_scope.extend(move(closed_scope));
    }

    // Emits one "clear capture group" instruction per group recorded in the innermost scope.
    void clear_all_capture_groups_in_scope(ByteCode& stack)
    {
        auto& innermost_scope = m_capture_groups_in_scope.last();
        for (auto& group_index : innermost_scope)
            stack.insert_bytecode_clear_capture_group(group_index);
    }

    // Flags every capture group numbered above `first_group`, up to and including the
    // current group count, as optional.
    void mark_capture_groups_as_optional_from(size_t first_group)
    {
        size_t group = first_group + 1;
        while (group <= m_parser_state.capture_groups_count) {
            m_parser_state.optional_capture_groups.set(group);
            ++group;
        }
    }

    // ECMA-262 lets a backreference point at a capture that only appears later in the
    // pattern; such a backreference always matches the empty string. Handling that
    // requires the total number of capturing parentheses, but parsing everything twice
    // up front would be wasteful, so the count is only computed when it is really needed.
    // For most patterns this stays unset.
    Optional<size_t> m_total_number_of_capturing_parenthesis;

    // Identifies the alternative currently being parsed, so that duplicate named capture
    // groups within one alternative can be detected.
    size_t m_current_alternative_id { 0 };

    // Annex B behavior is gated behind this flag; users turn it on by passing
    // `ECMAScriptFlags::BrowserExtended`.
    bool m_should_use_browser_extended_grammar { false };

    // ECMA-262 effectively demands that a capture group's inner captures are cleared
    // before the group is attempted again: (...)+ may only expose the matches of its last
    // iteration. To support that, the groups that are currently "in scope" are tracked
    // here so they can be cleared when required.
    Vector<Vector<size_t>> m_capture_groups_in_scope;
};
|
||||
|
||||
using PosixExtended = PosixExtendedParser;
|
||||
using PosixBasic = PosixBasicParser;
|
||||
using ECMA262 = ECMA262Parser;
|
||||
|
||||
}
|
||||
|
||||
// Make the regex parser aliases usable from the enclosing scope without qualification.
using regex::PosixExtended;
using regex::PosixBasic;
using regex::ECMA262;
|
||||
Reference in New Issue
Block a user