LibRegex: Remove the legacy C++ ECMA-262 engine

Delete the old C++ ECMA-262 parser, optimizer, and matcher now that all
in-tree users compile and execute through `ECMAScriptRegex`.

Stop building the legacy engine, remove its source files and the
POSIX-only fuzzers that depended on it, and update the remaining
LibRegex tests to target the Rust-backed facade instead of the deleted
implementation. Clean up the last includes, comments, and helper paths
that only existed to support the old backend.

After this commit LibRegex has a single ECMAScript engine in-tree,
eliminating duplicated maintenance and unifying future regex work.
This commit is contained in:
Andreas Kling
2026-03-25 10:52:59 +01:00
committed by Ali Mohammad Pur
parent e243e146de
commit d7bf9d3898
Notes: github-actions[bot] 2026-03-27 16:35:07 +00:00
25 changed files with 59 additions and 13411 deletions

View File

@@ -1,21 +1,15 @@
# Build script for the LibRegex library target.
# NOTE(review): this listing comes from a diff hunk whose +/- markers were lost, so removed and
# added lines appear interleaved (e.g. the import/link/define trio shows up twice below).
# Confirm against the actual CMakeLists.txt before relying on statement order.
set(SOURCES
RegexByteCode.cpp
RegexLexer.cpp
RegexMatcher.cpp
RegexOptimizer.cpp
RegexParser.cpp
)
if(SERENITYOS)
list(APPEND SOURCES C/Regex.cpp)
# Abort configuration early when Rust support is disabled.
if (NOT ENABLE_RUST)
message(FATAL_ERROR "LibRegex requires ENABLE_RUST; the legacy C++ regex engine has been removed")
endif()
set(SOURCES
ECMAScriptRegex.cpp
RustRegex.cpp
)
ladybird_lib(LibRegex regex EXPLICIT_SYMBOL_EXPORT)
target_link_libraries(LibRegex PRIVATE LibUnicode)
# Conditional variant: the Rust crate is only imported and linked when ENABLE_RUST is set.
if (ENABLE_RUST)
target_sources(LibRegex PRIVATE RustRegex.cpp)
import_rust_crate(MANIFEST_PATH Rust/Cargo.toml CRATE_NAME libregex_rust)
target_link_libraries(LibRegex PRIVATE libregex_rust)
target_compile_definitions(LibRegex PRIVATE ENABLE_RUST)
endif()
# Unconditional variant: import the libregex_rust crate, link it, and define ENABLE_RUST for LibRegex sources.
import_rust_crate(MANIFEST_PATH Rust/Cargo.toml CRATE_NAME libregex_rust)
target_link_libraries(LibRegex PRIVATE libregex_rust)
target_compile_definitions(LibRegex PRIVATE ENABLE_RUST)

View File

@@ -1,30 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Types.h>
#include <LibRegex/Export.h>
// Forward declarations of LibRegex types. Only the names are introduced here;
// the definitions live in other headers.
namespace regex {
struct CompareTypeAndValuePair;
// Opaque declaration with a fixed u8 underlying type; the enumerators are not visible from this header.
enum class Error : u8;
class Lexer;
class PosixExtendedParser;
class ECMA262Parser;
class ByteCode;
class RegexStringView;
}
// Make selected regex:: names usable without qualification in the enclosing scope.
using regex::ECMA262Parser;
using regex::Lexer;
using regex::PosixExtendedParser;
using regex::RegexStringView;

View File

@@ -1,10 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <LibRegex/Forward.h>
#include <LibRegex/RegexMatcher.h>

View File

@@ -1,524 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "RegexByteCode.h"
#include <AK/CharacterTypes.h>
#include <AK/StringBuilder.h>
#include <LibUnicode/CharacterTypes.h>
namespace regex {
// Returns the enumerator name of `result` as a string view.
// The cases are generated by ENUMERATE_EXECUTION_RESULTS (defined outside this file), which is
// expected to invoke __ENUMERATE_EXECUTION_RESULT once per enumerator.
StringView execution_result_name(ExecutionResult result)
{
switch (result) {
#define __ENUMERATE_EXECUTION_RESULT(x) \
case ExecutionResult::x: \
return #x##sv;
ENUMERATE_EXECUTION_RESULTS
#undef __ENUMERATE_EXECUTION_RESULT
default:
// A value outside the enumerated set trips the verification macro.
// NOTE(review): the return below is presumably unreachable if VERIFY_NOT_REACHED() never returns.
VERIFY_NOT_REACHED();
return "<Unknown>"sv;
}
}
// Returns the enumerator name of `opcode` as a string view.
// The cases are generated by ENUMERATE_OPCODES (defined outside this file), which is expected to
// invoke __ENUMERATE_OPCODE once per opcode.
StringView opcode_id_name(OpCodeId opcode)
{
switch (opcode) {
#define __ENUMERATE_OPCODE(x) \
case OpCodeId::x: \
return #x##sv;
ENUMERATE_OPCODES
#undef __ENUMERATE_OPCODE
default:
// A value outside the enumerated set trips the verification macro.
// NOTE(review): the return below is presumably unreachable if VERIFY_NOT_REACHED() never returns.
VERIFY_NOT_REACHED();
return "<Unknown>"sv;
}
}
// Returns the enumerator name of `condition` as a string view.
// The cases are generated by ENUMERATE_FORK_IF_CONDITIONS (defined outside this file), which is
// expected to invoke __ENUMERATE_FORK_IF_CONDITION once per enumerator.
StringView fork_if_condition_name(ForkIfCondition condition)
{
switch (condition) {
#define __ENUMERATE_FORK_IF_CONDITION(x) \
case ForkIfCondition::x: \
return #x##sv;
ENUMERATE_FORK_IF_CONDITIONS
#undef __ENUMERATE_FORK_IF_CONDITION
default:
// Unlike the other *_name helpers in this file, an unknown value is tolerated here and
// reported as "<Unknown>" instead of failing verification.
return "<Unknown>"sv;
}
}
// Returns the enumerator name of `ty` as a string view.
// The cases are generated by ENUMERATE_BOUNDARY_CHECK_TYPES (defined outside this file), which is
// expected to invoke __ENUMERATE_BOUNDARY_CHECK_TYPE once per enumerator.
StringView boundary_check_type_name(BoundaryCheckType ty)
{
switch (ty) {
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
case BoundaryCheckType::x: \
return #x##sv;
ENUMERATE_BOUNDARY_CHECK_TYPES
#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
default:
// A value outside the enumerated set trips the verification macro.
// NOTE(review): the return below is presumably unreachable if VERIFY_NOT_REACHED() never returns.
VERIFY_NOT_REACHED();
return "<Unknown>"sv;
}
}
// Returns the enumerator name of `ch_compare_type` as a string view.
// The cases are generated by ENUMERATE_CHARACTER_COMPARE_TYPES (defined outside this file), which
// is expected to invoke __ENUMERATE_CHARACTER_COMPARE_TYPE once per enumerator.
StringView character_compare_type_name(CharacterCompareType ch_compare_type)
{
switch (ch_compare_type) {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
case CharacterCompareType::x: \
return #x##sv;
ENUMERATE_CHARACTER_COMPARE_TYPES
#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
default:
// A value outside the enumerated set trips the verification macro.
// NOTE(review): the return below is presumably unreachable if VERIFY_NOT_REACHED() never returns.
VERIFY_NOT_REACHED();
return "<Unknown>"sv;
}
}
// Returns the enumerator name of `ch_class` as a string view.
// The cases are generated by ENUMERATE_CHARACTER_CLASSES (defined outside this file), which is
// expected to invoke __ENUMERATE_CHARACTER_CLASS once per enumerator.
StringView character_class_name(CharClass ch_class)
{
switch (ch_class) {
#define __ENUMERATE_CHARACTER_CLASS(x) \
case CharClass::x: \
return #x##sv;
ENUMERATE_CHARACTER_CLASSES
#undef __ENUMERATE_CHARACTER_CLASS
default:
// A value outside the enumerated set trips the verification macro.
// NOTE(review): the return below is presumably unreachable if VERIFY_NOT_REACHED() never returns.
VERIFY_NOT_REACHED();
return "<Unknown>"sv;
}
}
// Decides whether `code_point` counts as a word character: an ASCII letter, an ASCII digit or '_'.
// When both `case_insensitive` and `unicode_mode` are set, the result of
// Unicode::canonicalize() for the code point is accepted under the same test.
static bool is_word_character(u32 code_point, bool case_insensitive, bool unicode_mode)
{
    auto const is_basic_word_character = [](auto candidate) {
        return is_ascii_alphanumeric(candidate) || candidate == '_';
    };

    if (is_basic_word_character(code_point))
        return true;

    // Canonicalization is only consulted for case-insensitive matching in unicode mode.
    if (!(case_insensitive && unicode_mode))
        return false;

    return is_basic_word_character(Unicode::canonicalize(code_point, unicode_mode));
}
// Definition of ByteCode's static checkpoint serial counter, starting at 0.
size_t ByteCode::s_next_checkpoint_serial_id { 0 };
// Next string table serial, starting at 1. Has external linkage (not `static`),
// so it is presumably referenced from another translation unit — confirm.
u32 s_next_string_table_serial { 1 };
// Next serial handed to a StringSetTable by its constructors; file-local, starting at 1.
static u32 s_next_string_set_table_serial { 1 };
// Creates an empty table and assigns it the next string-set-table serial,
// post-incrementing the file-local counter.
StringSetTable::StringSetTable()
: m_serial(s_next_string_set_table_serial++)
{
}
// Hands the serial back to the counter, but only when this table holds the most recently
// issued serial and its u8 trie map is empty. The u16 trie map is not consulted.
StringSetTable::~StringSetTable()
{
    if (m_serial != s_next_string_set_table_serial - 1)
        return;
    if (!m_u8_tries.is_empty())
        return;
    --s_next_string_set_table_serial;
}
// Copy constructor: the new table takes a fresh serial (it does not reuse `other`'s) and
// receives a deep copy of every u8 and u16 trie held by `other`.
StringSetTable::StringSetTable(StringSetTable const& other)
    : m_serial(s_next_string_set_table_serial++)
{
    // deep_copy() is called through a non-const reference, hence the const_cast;
    // MUST() treats a failed copy as fatal.
    auto const clone_tries = [](auto& destination, auto const& source) {
        for (auto const& slot : source)
            destination.set(slot.key, MUST(const_cast<StringSetTrie&>(slot.value).deep_copy()));
    };

    clone_tries(m_u8_tries, other.m_u8_tries);
    clone_tries(m_u16_tries, other.m_u16_tries);
}
// Copy assignment: replaces this table's tries with deep copies of `other`'s tries.
// The serial of this table is left untouched. Self-assignment is a no-op.
StringSetTable& StringSetTable::operator=(StringSetTable const& other)
{
    if (this == &other)
        return *this;

    // deep_copy() is called through a non-const reference, hence the const_cast;
    // MUST() treats a failed copy as fatal.
    auto const clone_tries = [](auto& destination, auto const& source) {
        for (auto const& slot : source)
            destination.set(slot.key, MUST(const_cast<StringSetTrie&>(slot.value).deep_copy()));
    };

    m_u8_tries.clear();
    m_u16_tries.clear();
    clone_tries(m_u8_tries, other.m_u8_tries);
    clone_tries(m_u16_tries, other.m_u16_tries);
    return *this;
}
// Tests whether code point `ch` belongs to `character_class`.
// `insensitive` lets Lower accept ASCII upper-case letters and Upper accept ASCII lower-case letters;
// `insensitive` and `unicode_mode` are also forwarded to the Word check.
// Any class value not handled by the switch fails verification.
bool matches_character_class(CharClass character_class, u32 ch, bool insensitive, bool unicode_mode)
{
    // Space: a fixed list of line terminators and whitespace code points, or any code point
    // in the space-separator general category.
    constexpr auto is_whitespace_like = [](u32 code_point) {
        switch (code_point) {
        case 0x0a:
        case 0x0d:
        case 0x2028:
        case 0x2029:
        case 0x09:
        case 0x0b:
        case 0x0c:
        case 0xfeff:
            return true;
        default:
            return Unicode::code_point_has_space_separator_general_category(code_point);
        }
    };

    switch (character_class) {
    case CharClass::Alnum:
        return is_ascii_alphanumeric(ch);
    case CharClass::Alpha:
        return is_ascii_alpha(ch);
    case CharClass::Blank:
        return is_ascii_blank(ch);
    case CharClass::Cntrl:
        return is_ascii_control(ch);
    case CharClass::Digit:
        return is_ascii_digit(ch);
    case CharClass::Graph:
        return is_ascii_graphical(ch);
    case CharClass::Lower:
        if (is_ascii_lower_alpha(ch))
            return true;
        return insensitive && is_ascii_upper_alpha(ch);
    case CharClass::Print:
        return is_ascii_printable(ch);
    case CharClass::Punct:
        return is_ascii_punctuation(ch);
    case CharClass::Space:
        return is_whitespace_like(ch);
    case CharClass::Upper:
        if (is_ascii_upper_alpha(ch))
            return true;
        return insensitive && is_ascii_lower_alpha(ch);
    case CharClass::Word:
        return is_word_character(ch, insensitive, unicode_mode);
    case CharClass::Xdigit:
        return is_ascii_hex_digit(ch);
    }
    VERIFY_NOT_REACHED();
}
// Renders the operands of the instruction `id` located at `data[ip]` as a human-readable string
// for bytecode dumps.
//
// - `data`/`ip`: flat bytecode and the index of the opcode id; operand N is data[ip + 1 + N].
// - `state`: supplies the current string position and repetition marks shown for some opcodes.
// - `bytecode`: used to resolve string-table indices (CompareSimple with a String operand).
//
// Returns an empty string for opcodes that print no operands. An opcode id that is not handled
// by the switch fails verification.
ByteString opcode_arguments_string(OpCodeId id, ByteCodeValueType const* data, size_t ip, MatchState const& state, ByteCodeBase const& bytecode)
{
    // argument(N) = data[ip + 1 + N]
    auto arg = [&](size_t n) -> ByteCodeValueType { return data[ip + 1 + n]; };
    // Jump targets are relative to the end of the current instruction.
    auto sz = opcode_size(id, data, ip);
    switch (id) {
    case OpCodeId::SaveModifiers:
        return ByteString::formatted("new_modifiers={:#x}", arg(0));
    case OpCodeId::RestoreModifiers:
    case OpCodeId::Exit:
    case OpCodeId::FailForks:
    case OpCodeId::PopSaved:
    case OpCodeId::Save:
    case OpCodeId::Restore:
    case OpCodeId::CheckBegin:
    case OpCodeId::CheckEnd:
        return ByteString::empty();
    case OpCodeId::GoBack:
        return ByteString::formatted("count={}", arg(0));
    case OpCodeId::SetStepBack:
        return ByteString::formatted("step={}", static_cast<i64>(arg(0)));
    case OpCodeId::IncStepBack:
        return ByteString::formatted("inc step back");
    case OpCodeId::CheckStepBack:
        return ByteString::formatted("check step back");
    case OpCodeId::CheckSavedPosition:
        return ByteString::formatted("check saved back");
    case OpCodeId::Jump:
        return ByteString::formatted("offset={} [&{}]", static_cast<ssize_t>(arg(0)), ip + sz + static_cast<ssize_t>(arg(0)));
    // All fork variants share one layout: a signed relative offset, shown with its absolute target.
    case OpCodeId::ForkJump:
    case OpCodeId::ForkReplaceJump:
    case OpCodeId::ForkStay:
    case OpCodeId::ForkReplaceStay:
        return ByteString::formatted("offset={} [&{}], sp: {}", static_cast<ssize_t>(arg(0)), ip + sz + static_cast<ssize_t>(arg(0)), state.string_position);
    case OpCodeId::CheckBoundary:
        return ByteString::formatted("kind={} ({})", static_cast<unsigned long>(arg(0)), boundary_check_type_name(static_cast<BoundaryCheckType>(arg(0))));
    case OpCodeId::ClearCaptureGroup:
    case OpCodeId::SaveLeftCaptureGroup:
    case OpCodeId::SaveRightCaptureGroup:
    case OpCodeId::Checkpoint:
        return ByteString::formatted("id={}", arg(0));
    case OpCodeId::FailIfEmpty:
        return ByteString::formatted("checkpoint={}", arg(0));
    case OpCodeId::SaveRightNamedCaptureGroup:
        return ByteString::formatted("name_id={}, id={}", arg(0), arg(1));
    case OpCodeId::RSeekTo: {
        auto ch = arg(0);
        // Print the character itself inside the quotes; formatting the raw bytecode value
        // with "{}" would emit its numeric value instead.
        if (ch <= 0x7f)
            return ByteString::formatted("before '{:c}'", static_cast<char>(ch));
        return ByteString::formatted("before u+{:04x}", ch);
    }
    case OpCodeId::Compare:
        return ByteString::formatted("argc={}, args={} ", arg(0), arg(1));
    case OpCodeId::CompareSimple: {
        StringBuilder builder;
        auto type = static_cast<CharacterCompareType>(arg(1));
        builder.append(character_compare_type_name(type));
        switch (type) {
        case CharacterCompareType::Char: {
            auto ch = arg(2);
            if (is_ascii_printable(ch))
                builder.append(ByteString::formatted(" '{:c}'", static_cast<char>(ch)));
            else
                builder.append(ByteString::formatted(" 0x{:x}", ch));
            break;
        }
        case CharacterCompareType::String: {
            auto string_index = arg(2);
            auto string = bytecode.get_u16_string(string_index);
            builder.appendff(" \"{}\"", string);
            break;
        }
        case CharacterCompareType::CharClass: {
            auto character_class = static_cast<CharClass>(arg(2));
            builder.appendff(" {}", character_class_name(character_class));
            break;
        }
        case CharacterCompareType::Reference: {
            auto ref = arg(2);
            builder.appendff(" number={}", ref);
            break;
        }
        case CharacterCompareType::NamedReference: {
            auto ref = arg(2);
            builder.appendff(" named_number={}", ref);
            break;
        }
        case CharacterCompareType::GeneralCategory:
        case CharacterCompareType::Property:
        case CharacterCompareType::Script:
        case CharacterCompareType::ScriptExtension:
        case CharacterCompareType::StringSet: {
            builder.appendff(" value={}", arg(2));
            break;
        }
        case CharacterCompareType::LookupTable: {
            // Layout: sensitive count, insensitive count, sensitive ranges, insensitive ranges.
            auto count_sensitive = arg(2);
            auto count_insensitive = arg(3);
            for (size_t j = 0; j < count_sensitive; ++j) {
                auto range = static_cast<CharRange>(arg(4 + j));
                builder.appendff(" {:x}-{:x}", range.from, range.to);
            }
            if (count_insensitive > 0) {
                builder.append(" [insensitive ranges:"sv);
                for (size_t j = 0; j < count_insensitive; ++j) {
                    auto range = static_cast<CharRange>(arg(4 + count_sensitive + j));
                    builder.appendff(" {:x}-{:x}", range.from, range.to);
                }
                builder.append(" ]"sv);
            }
            break;
        }
        case CharacterCompareType::CharRange: {
            auto value = arg(2);
            auto range = static_cast<CharRange>(value);
            builder.appendff(" {:x}-{:x}", range.from, range.to);
            break;
        }
        default:
            break;
        }
        return builder.to_byte_string();
    }
    case OpCodeId::Repeat: {
        auto repeat_id = arg(2);
        // An id without a recorded mark is displayed as zero repetitions so far.
        auto reps = repeat_id < state.repetition_marks.size() ? state.repetition_marks.at(repeat_id) : 0;
        return ByteString::formatted("offset={} [&{}] count={} id={} rep={}, sp: {}",
            static_cast<ssize_t>(arg(0)),
            ip - arg(0),
            arg(1) + 1,
            repeat_id,
            reps + 1,
            state.string_position);
    }
    case OpCodeId::ResetRepeat: {
        auto repeat_id = arg(0);
        auto reps = repeat_id < state.repetition_marks.size() ? state.repetition_marks.at(repeat_id) : 0;
        return ByteString::formatted("id={} rep={}", repeat_id, reps + 1);
    }
    case OpCodeId::JumpNonEmpty:
        return ByteString::formatted("{} offset={} [&{}], cp={}",
            opcode_id_name(static_cast<OpCodeId>(arg(2))),
            static_cast<ssize_t>(arg(0)), ip + sz + static_cast<ssize_t>(arg(0)),
            arg(1));
    case OpCodeId::ForkIf:
        return ByteString::formatted("{} {} offset={} [&{}]",
            opcode_id_name(static_cast<OpCodeId>(arg(1))),
            fork_if_condition_name(static_cast<ForkIfCondition>(arg(2))),
            static_cast<ssize_t>(arg(0)), ip + sz + static_cast<ssize_t>(arg(0)));
    }
    VERIFY_NOT_REACHED();
}
// Produces one or more descriptive strings for every argument of the Compare instruction at `data[ip]`.
//
// - `state`: supplies string_position_before_match / string_position and the capture group matches.
// - `bytecode`: used to resolve string-table indices for String arguments.
// - `input`: optional; when present, the subject text being compared and the referenced
//   capture groups are included in the output.
//
// The argument stream is consumed strictly in order via `offset++`, so the branches below must
// read exactly as many values as each compare type occupies.
Vector<ByteString> compare_variable_arguments_to_byte_string(ByteCodeValueType const* data, size_t ip, MatchState const& state, ByteCodeBase const& bytecode, Optional<MatchInput const&> input)
{
Vector<ByteString> result;
// Arguments start after the opcode id and its two leading operands
// (printed as `argc` and `args` by opcode_arguments_string).
size_t offset = ip + 3;
// Without an input, fall back to a default-constructed view so the is_null() checks below skip subject output.
RegexStringView const& view = input.has_value() ? input.value().view : StringView {};
auto argument_count = data[ip + 1]; // arguments_count for Compare
for (size_t i = 0; i < argument_count; ++i) {
auto compare_type = static_cast<CharacterCompareType>(data[offset++]);
result.empend(ByteString::formatted("type={} [{}]", static_cast<size_t>(compare_type), character_compare_type_name(compare_type)));
auto string_start_offset = state.string_position_before_match;
if (compare_type == CharacterCompareType::Char) {
auto ch = data[offset++];
auto is_ascii = is_ascii_printable(ch);
if (is_ascii)
result.empend(ByteString::formatted(" value='{:c}'", static_cast<char>(ch)));
else
result.empend(ByteString::formatted(" value={:x}", ch));
if (!view.is_null() && view.length() > string_start_offset) {
if (is_ascii) {
result.empend(ByteString::formatted(
" compare against: '{}'",
view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_byte_string()));
} else {
// Non-printable expected value: show up to the first eight bytes of the subject as hex, zero-padded.
auto str = view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_byte_string();
u8 buf[8] { 0 };
__builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf)));
result.empend(ByteString::formatted(" compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
}
}
} else if (compare_type == CharacterCompareType::Reference) {
auto ref = data[offset++];
result.empend(ByteString::formatted(" number={}", ref));
// With an input available, also show the bounds and contents of the referenced capture group.
if (input.has_value()) {
if (state.capture_group_matches_size() > input->match_index) {
auto match = state.capture_group_matches(input->match_index);
if (match.size() > ref) {
auto& group = match[ref];
result.empend(ByteString::formatted(" left={}", group.left_column));
result.empend(ByteString::formatted(" right={}", group.left_column + group.view.length_in_code_units()));
result.empend(ByteString::formatted(" contents='{}'", group.view));
} else {
result.empend(ByteString::formatted(" (invalid ref, max={})", match.size() - 1));
}
} else {
result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state.capture_group_matches_size() - 1));
}
}
} else if (compare_type == CharacterCompareType::NamedReference) {
// Same reporting as Reference, except for the label and the "invalid ref" message, which also prints `ref`.
auto ref = data[offset++];
result.empend(ByteString::formatted(" named_number={}", ref));
if (input.has_value()) {
if (state.capture_group_matches_size() > input->match_index) {
auto match = state.capture_group_matches(input->match_index);
if (match.size() > ref) {
auto& group = match[ref];
result.empend(ByteString::formatted(" left={}", group.left_column));
result.empend(ByteString::formatted(" right={}", group.left_column + group.view.length_in_code_units()));
result.empend(ByteString::formatted(" contents='{}'", group.view));
} else {
result.empend(ByteString::formatted(" (invalid ref {}, max={})", ref, match.size() - 1));
}
} else {
result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state.capture_group_matches_size() - 1));
}
}
} else if (compare_type == CharacterCompareType::String) {
auto str_id = data[offset++];
auto string = bytecode.get_u16_string(str_id);
result.empend(ByteString::formatted(" value=\"{}\"", string));
// Show the subject slice of the same length, or an empty slice when it would run past the end of the view.
if (!view.is_null() && view.length() > state.string_position)
result.empend(ByteString::formatted(
" compare against: \"{}\"",
input.value().view.substring_view(string_start_offset, string_start_offset + string.length_in_code_units() > view.length() ? 0 : string.length_in_code_units()).to_byte_string()));
} else if (compare_type == CharacterCompareType::CharClass) {
auto character_class = static_cast<CharClass>(data[offset++]);
result.empend(ByteString::formatted(" ch_class={} [{}]", static_cast<size_t>(character_class), character_class_name(character_class)));
if (!view.is_null() && view.length() > state.string_position)
result.empend(ByteString::formatted(
" compare against: '{}'",
input.value().view.substring_view(string_start_offset, state.string_position > view.length() ? 0 : 1).to_byte_string()));
} else if (compare_type == CharacterCompareType::CharRange) {
auto value = static_cast<CharRange>(data[offset++]);
result.empend(ByteString::formatted(" ch_range={:x}-{:x}", value.from, value.to));
if (!view.is_null() && view.length() > state.string_position)
result.empend(ByteString::formatted(
" compare against: '{}'",
input.value().view.substring_view(string_start_offset, state.string_position > view.length() ? 0 : 1).to_byte_string()));
} else if (compare_type == CharacterCompareType::LookupTable) {
// Layout: sensitive count, insensitive count, then that many ranges of each kind.
auto count_sensitive = data[offset++];
auto count_insensitive = data[offset++];
for (size_t j = 0; j < count_sensitive; ++j) {
auto range = static_cast<CharRange>(data[offset++]);
result.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to));
}
if (count_insensitive > 0) {
result.append(" [insensitive ranges:");
for (size_t j = 0; j < count_insensitive; ++j) {
auto range = static_cast<CharRange>(data[offset++]);
result.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to));
}
result.append(" ]");
}
if (!view.is_null() && view.length() > state.string_position)
result.empend(ByteString::formatted(
" compare against: '{}'",
input.value().view.substring_view(string_start_offset, state.string_position > view.length() ? 0 : 1).to_byte_string()));
} else if (compare_type == CharacterCompareType::GeneralCategory
|| compare_type == CharacterCompareType::Property
|| compare_type == CharacterCompareType::Script
|| compare_type == CharacterCompareType::ScriptExtension
|| compare_type == CharacterCompareType::StringSet) {
// These types carry a single value, printed as-is.
auto value = data[offset++];
result.empend(ByteString::formatted(" value={}", value));
}
// Any other compare type consumes no further values here.
}
return result;
}
// Decodes the compare arguments of the instruction at `data[ip]` into a flat list of
// (compare type, value) pairs.
//
// With `is_simple` set, exactly one argument is read starting at data[ip + 2]; otherwise the
// count comes from the Compare instruction's arguments_count operand and reading starts at
// data[ip + 3].
//
// A LookupTable argument is expanded into one CharRange pair per case-sensitive range; its
// case-insensitive ranges are skipped. Types without a recognised payload yield a pair with value 0.
Vector<CompareTypeAndValuePair> flat_compares_at(ByteCodeValueType const* data, size_t ip, bool is_simple)
{
    Vector<CompareTypeAndValuePair> pairs;
    size_t cursor = ip + (is_simple ? 2 : 3);
    auto argument_count = is_simple ? 1 : data[ip + OpArgs::Compare::arguments_count];

    for (size_t index = 0; index < argument_count; ++index) {
        auto const type = static_cast<CharacterCompareType>(data[cursor++]);
        switch (type) {
        case CharacterCompareType::LookupTable: {
            auto const sensitive_count = data[cursor++];
            auto const insensitive_count = data[cursor++];
            for (size_t j = 0; j < sensitive_count; ++j)
                pairs.append({ CharacterCompareType::CharRange, data[cursor++] });
            // The case-insensitive ranges are not reported; step over them.
            cursor += insensitive_count;
            break;
        }
        // Every one of these types is followed by exactly one value.
        case CharacterCompareType::Char:
        case CharacterCompareType::Reference:
        case CharacterCompareType::NamedReference:
        case CharacterCompareType::String:
        case CharacterCompareType::CharClass:
        case CharacterCompareType::CharRange:
        case CharacterCompareType::GeneralCategory:
        case CharacterCompareType::Property:
        case CharacterCompareType::Script:
        case CharacterCompareType::ScriptExtension:
        case CharacterCompareType::StringSet:
            pairs.append({ type, data[cursor++] });
            break;
        default:
            pairs.append({ type, 0 });
            break;
        }
    }
    return pairs;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,21 +0,0 @@
/*
* Copyright (c) 2021, Ali Mohammad Pur <mpfard@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "Forward.h"
#include <AK/Vector.h>
namespace regex {
// Collection of static helpers that write into a caller-provided `target` ByteCode.
// Only the declarations are visible here; the behaviour notes below are inferred from the
// names and signatures — confirm against the implementation.
class Optimizer {
public:
// Presumably appends an alternation of `left` and `right` to `target`; both are taken by rvalue reference.
static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
// Overload taking any number of alternatives as a span.
static void append_alternation(ByteCode& target, Span<ByteCode> alternatives);
// Presumably appends a character class built from `pairs` to `target`; `pairs` is taken by rvalue reference.
static void append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs);
};
}

View File

@@ -1,173 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/StringBuilder.h>
#include <LibRegex/RegexMatcher.h>
namespace regex {
template<typename ByteCode>
class RegexDebug {
public:
RegexDebug(FILE* file = stdout)
: m_file(file)
{
}
virtual ~RegexDebug() = default;
template<typename T>
void print_raw_bytecode(Regex<T>& regex) const
{
auto& bytecode = regex.parser_result.bytecode.template get<ByteCode>();
size_t index { 0 };
for (auto& value : bytecode) {
outln(m_file, "OpCode i={:3} [{:#02X}]", index, value);
++index;
}
}
template<typename T>
void print_bytecode(Regex<T> const& regex) const
{
print_bytecode(regex.parser_result.bytecode.template get<ByteCode>());
}
void print_bytecode(ByteCode const& bytecode) const
{
auto state = MatchState::only_for_enumeration();
ByteCodeValueType const* data;
auto data_size = bytecode.size();
Optional<Vector<ByteCodeValueType>> flat_storage;
if constexpr (IsSame<ByteCode, FlatByteCode>) {
data = bytecode.flat_data().data();
} else {
flat_storage.emplace();
flat_storage->ensure_capacity(data_size);
for (size_t i = 0; i < data_size; ++i)
flat_storage->unchecked_append(bytecode[i]);
data = flat_storage->data();
}
for (;;) {
auto id = (data_size <= state.instruction_position)
? OpCodeId::Exit
: static_cast<OpCodeId>(data[state.instruction_position]);
auto sz = opcode_size(id, data, state.instruction_position);
print_opcode("PrintBytecode", id, data, state, bytecode);
out(m_file, "{}", m_debug_stripline);
if (id == OpCodeId::Exit)
break;
state.instruction_position += sz;
}
out(m_file, "String Table:\n");
for (auto const& entry : bytecode.string_table().m_table)
outln(m_file, "+ {} -> {:x}", entry.key, entry.value);
out(m_file, "Reverse String Table:\n");
for (auto const& entry : bytecode.string_table().m_inverse_table)
outln(m_file, "+ {:x} -> {}", entry.key, entry.value);
out(m_file, "(u16) String Table:\n");
for (auto const& entry : bytecode.u16_string_table().m_table)
outln(m_file, "+ {} -> {:x}", entry.key, entry.value);
out(m_file, "Reverse (u16) String Table:\n");
for (auto const& entry : bytecode.u16_string_table().m_inverse_table)
outln(m_file, "+ {:x} -> {}", entry.key, entry.value);
fflush(m_file);
}
void print_opcode(ByteString const& system, OpCodeId id, ByteCodeValueType const* data, MatchState& state, ByteCodeBase const& bytecode, size_t recursion = 0, bool newline = true) const
{
auto opcode_str = ByteString::formatted("[{:#02X}] {}", (int)id, opcode_id_name(id));
out(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}",
system.characters(),
state.instruction_position,
recursion,
opcode_str.characters(),
opcode_arguments_string(id, data, state.instruction_position, state, bytecode).characters(),
ByteString::formatted("ip: {:3}, sp: {:3}", state.instruction_position, state.string_position));
if (newline)
outln();
if (newline && id == OpCodeId::Compare) {
for (auto& line : compare_variable_arguments_to_byte_string(data, state.instruction_position, state, bytecode))
outln(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}", "", "", "", "", line, "");
}
}
void print_result(OpCodeId id, ByteCodeValueType const* data, size_t data_size, ByteCodeBase const& bytecode, MatchInput const& input, MatchState& state, size_t current_opcode_size, ExecutionResult result) const
{
StringBuilder builder;
builder.append(execution_result_name(result));
builder.appendff(", fc: {}, ss: {}", input.fail_counter, input.saved_positions.size());
if (result == ExecutionResult::Succeeded) {
builder.appendff(", ip: {}/{}, sp: {}/{}", state.instruction_position, data_size - 1, state.string_position, input.view.length() - 1);
} else if (result == ExecutionResult::Fork_PrioHigh) {
builder.appendff(", next ip: {}", state.fork_at_position + current_opcode_size);
} else if (result != ExecutionResult::Failed) {
builder.appendff(", next ip: {}", state.instruction_position + current_opcode_size);
}
outln(m_file, " | {:20}", builder.to_byte_string());
if (id == OpCodeId::CheckSavedPosition) {
auto last_saved = input.saved_positions.is_empty()
? "saved: <empty>"_string
: MUST(String::formatted("saved: {}", input.saved_positions.last()));
outln(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}", "", "", "", "", last_saved, "");
}
if (id == OpCodeId::CheckStepBack || id == OpCodeId::IncStepBack) {
auto last_step_back = state.step_backs.is_empty()
? "step: <empty>"_string
: MUST(String::formatted("step: {}", state.step_backs.last()));
outln(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}", "", "", "", "", last_step_back, "");
}
if (id == OpCodeId::Compare) {
for (auto& line : compare_variable_arguments_to_byte_string(data, state.instruction_position, state, bytecode, input)) {
outln(m_file, "{:15} | {:5} | {:9} | {:35} | {:30} | {:20}", "", "", "", "", line, "");
}
}
out(m_file, "{}", m_debug_stripline);
}
void print_header()
{
StringBuilder builder;
builder.appendff("{:15} | {:5} | {:9} | {:35} | {:30} | {:20} | {:20}\n", "System", "Index", "Recursion", "OpCode", "Arguments", "State", "Result");
auto length = builder.length();
for (size_t i = 0; i < length; ++i) {
builder.append('=');
}
auto str = builder.to_byte_string();
VERIFY(!str.is_empty());
outln(m_file, "{}", str);
fflush(m_file);
builder.clear();
for (size_t i = 0; i < length; ++i) {
builder.append('-');
}
builder.append('\n');
m_debug_stripline = builder.to_byte_string();
}
private:
ByteString m_debug_stripline;
FILE* m_file;
};
}
using regex::RegexDebug;

View File

@@ -1,56 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* Copyright (c) 2020-2022, Ali Mohammad Pur <mpfard@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
// Unscoped enumeration of regex error codes, numbered consecutively from 0 (__Regex_NoError).
// The enumerators of regex::Error are initialised from these values.
enum __Regex_Error {
__Regex_NoError,
__Regex_InvalidPattern, // Invalid regular expression.
__Regex_InvalidCollationElement, // Invalid collating element referenced.
__Regex_InvalidCharacterClass, // Invalid character class type referenced.
__Regex_InvalidTrailingEscape, // Trailing \ in pattern.
__Regex_InvalidNumber, // Number in \digit invalid or in error.
__Regex_MismatchingBracket, // [ ] imbalance.
__Regex_MismatchingParen, // ( ) imbalance.
__Regex_MismatchingBrace, // { } imbalance.
__Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
__Regex_InvalidBracketContent, // Content of [] invalid.
__Regex_InvalidRange, // Invalid endpoint in range expression.
__Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
__Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
__Regex_EmptySubExpression, // Sub expression has empty content.
__Regex_InvalidCaptureGroup, // Content of capture group is invalid.
__Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
__Regex_InvalidNameForProperty, // Name of property is invalid.
__Regex_DuplicateNamedCapture, // Duplicate named capture group
__Regex_InvalidCharacterClassEscape, // Invalid escaped entity in character class.
__Regex_NegatedCharacterClassStrings, // Negated character class cannot contain strings.
__Regex_InvalidModifierGroup, // Invalid modifier group.
__Regex_RepeatedModifierFlag, // Repeated flag in modifier group.
};
// Bit flags for regex options. Each flag is a distinct single bit, expressed as a shift of
// __Regex_Global (bit 0). Bit 9 is not assigned: the sequence goes from << 8 directly to << 10.
// __Regex_Last aliases the highest assigned flag.
enum __RegexAllFlags {
__Regex_Global = 1, // All matches (don't return after first match)
__Regex_Insensitive = __Regex_Global << 1, // Case insensitive match (ignores case of [a-zA-Z])
__Regex_Ungreedy = __Regex_Global << 2, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
__Regex_Unicode = __Regex_Global << 3, // Enable all unicode features and interpret all unicode escape sequences as such
__Regex_Extended = __Regex_Global << 4, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
__Regex_Extra = __Regex_Global << 5, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
__Regex_MatchNotBeginOfLine = __Regex_Global << 6, // Pattern is not forced to ^ -> search in whole string!
__Regex_MatchNotEndOfLine = __Regex_Global << 7, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
__Regex_SkipSubExprResults = __Regex_Global << 8, // Do not return sub expressions in the result
__Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters
__Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended.
__Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one.
__Regex_SingleMatch = __Regex_Global << 13, // Stop after acquiring a single match.
__Regex_UnicodeSets = __Regex_Global << 14, // ECMA262 Parser specific: Allow set operations in char classes.
__Regex_Internal_Stateful = __Regex_Global << 15, // Internal flag; enables stateful matches.
__Regex_Internal_BrowserExtended = __Regex_Global << 16, // Internal flag; enable browser-specific ECMA262 extensions.
__Regex_Internal_ConsiderNewline = __Regex_Global << 17, // Internal flag; allow matchers to consider newlines as line separators.
__Regex_Internal_ECMA262DotSemantics = __Regex_Global << 18, // Internal flag; use ECMA262 semantics for dot ('.') - disallow CR/LF/LS/PS instead of just CR.
__Regex_Last = __Regex_Internal_ECMA262DotSemantics,
};

View File

@@ -1,96 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "RegexDefs.h"
#include <AK/StringView.h>
#include <AK/Types.h>
namespace regex {
// Error codes reported for a regular expression pattern.
// Every enumerator takes the value of the same-named __Regex_* constant.
enum class Error : u8 {
NoError = __Regex_NoError,
InvalidPattern = __Regex_InvalidPattern, // Invalid regular expression.
InvalidCollationElement = __Regex_InvalidCollationElement, // Invalid collating element referenced.
InvalidCharacterClass = __Regex_InvalidCharacterClass, // Invalid character class type referenced.
InvalidTrailingEscape = __Regex_InvalidTrailingEscape, // Trailing \ in pattern.
InvalidNumber = __Regex_InvalidNumber, // Number in \digit invalid or in error.
MismatchingBracket = __Regex_MismatchingBracket, // [ ] imbalance.
MismatchingParen = __Regex_MismatchingParen, // ( ) imbalance.
MismatchingBrace = __Regex_MismatchingBrace, // { } imbalance.
InvalidBraceContent = __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
InvalidBracketContent = __Regex_InvalidBracketContent, // Content of [] invalid.
InvalidRange = __Regex_InvalidRange, // Invalid endpoint in range expression.
InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
ReachedMaxRecursion = __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
EmptySubExpression = __Regex_EmptySubExpression, // Sub expression has empty content.
InvalidCaptureGroup = __Regex_InvalidCaptureGroup, // Content of capture group is invalid.
InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
InvalidNameForProperty = __Regex_InvalidNameForProperty, // Name of property is invalid.
DuplicateNamedCapture = __Regex_DuplicateNamedCapture, // Duplicate capture group name.
InvalidCharacterClassEscape = __Regex_InvalidCharacterClassEscape, // Invalid escaped entity in character class.
NegatedCharacterClassStrings = __Regex_NegatedCharacterClassStrings, // Negated character class cannot contain strings.
InvalidModifierGroup = __Regex_InvalidModifierGroup, // Invalid modifier group.
RepeatedModifierFlag = __Regex_RepeatedModifierFlag, // Repeated flag in modifier group.
};
// Translates an Error code into its human-readable description.
// Any value that is not one of the enumerators handled below yields "Undefined error.".
inline StringView get_error_string(Error error)
{
    auto description = "Undefined error."sv;

    switch (error) {
    case Error::NoError:
        description = "No error"sv;
        break;
    case Error::InvalidPattern:
        description = "Invalid regular expression."sv;
        break;
    case Error::InvalidCollationElement:
        description = "Invalid collating element referenced."sv;
        break;
    case Error::InvalidCharacterClass:
        description = "Invalid character class type referenced."sv;
        break;
    case Error::InvalidTrailingEscape:
        description = "Trailing \\ in pattern."sv;
        break;
    case Error::InvalidNumber:
        description = "Number in \\digit invalid or in error."sv;
        break;
    case Error::MismatchingBracket:
        description = "[ ] imbalance."sv;
        break;
    case Error::MismatchingParen:
        description = "( ) imbalance."sv;
        break;
    case Error::MismatchingBrace:
        description = "{ } imbalance."sv;
        break;
    case Error::InvalidBraceContent:
        description = "Content of {} invalid: not a number, number too large, more than two numbers, first larger than second."sv;
        break;
    case Error::InvalidBracketContent:
        description = "Content of [] invalid."sv;
        break;
    case Error::InvalidRange:
        description = "Invalid endpoint in range expression."sv;
        break;
    case Error::InvalidRepetitionMarker:
        description = "?, * or + not preceded by valid regular expression."sv;
        break;
    case Error::ReachedMaxRecursion:
        description = "Maximum recursion has been reached."sv;
        break;
    case Error::EmptySubExpression:
        description = "Sub expression has empty content."sv;
        break;
    case Error::InvalidCaptureGroup:
        description = "Content of capture group is invalid."sv;
        break;
    case Error::InvalidNameForCaptureGroup:
        description = "Name of capture group is invalid."sv;
        break;
    case Error::InvalidNameForProperty:
        description = "Name of property is invalid."sv;
        break;
    case Error::DuplicateNamedCapture:
        description = "Duplicate capture group name"sv;
        break;
    case Error::InvalidCharacterClassEscape:
        description = "Invalid escaped entity in character class."sv;
        break;
    case Error::NegatedCharacterClassStrings:
        description = "Negated character class cannot contain strings."sv;
        break;
    case Error::InvalidModifierGroup:
        description = "Invalid modifier group."sv;
        break;
    case Error::RepeatedModifierFlag:
        description = "Repeated flag in modifier group."sv;
        break;
    }

    return description;
}
}
using regex::get_error_string;

View File

@@ -1,183 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "RegexLexer.h"
#include <AK/Assertions.h>
#include <AK/Debug.h>
#include <AK/Format.h>
namespace regex {
// Returns the spelling of a token type as a C string.
// One `case` per entry of ENUMERATE_REGEX_TOKENS is generated by temporarily defining
// __ENUMERATE_REGEX_TOKEN; each case returns the stringified enumerator name.
char const* Token::name(TokenType const type)
{
switch (type) {
#define __ENUMERATE_REGEX_TOKEN(x) \
case TokenType::x: \
return #x;
ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
default:
// Any value that is not a listed token type is treated as a programming error.
VERIFY_NOT_REACHED();
return "<Unknown>";
}
}
// Convenience overload: the name of this token's own type.
char const* Token::name() const
{
return name(m_type);
}
// Creates a lexer over an empty (default-constructed) StringView.
Lexer::Lexer()
: GenericLexer(StringView {})
{
}
// Creates a lexer over `source`; the view is handed straight to GenericLexer.
Lexer::Lexer(StringView const source)
: GenericLexer(source)
{
}
// Moves the cursor back by `offset` characters and re-derives the previous position.
// Asking for exactly one more than the current index is clamped to the current index
// (the original note here reads: 'position == 0' occurs twice); anything larger is a
// programming error.
void Lexer::back(size_t offset)
{
    auto const rewind_by = (offset == m_index + 1) ? m_index : offset;
    VERIFY(rewind_by <= m_index);

    if (rewind_by == 0)
        return;

    m_index -= rewind_by;
    if (m_index > 0)
        m_previous_position = m_index - 1;
    else
        m_previous_position = 0;
}
// Consumes one character through GenericLexer and remembers the index it was read from.
char Lexer::consume()
{
    auto const index_before_consume = m_index;
    auto const consumed = GenericLexer::consume();
    m_previous_position = index_before_consume;
    return consumed;
}
void Lexer::reset()
{
m_index = 0;
m_current_token = { TokenType::Eof, 0, {} };
m_previous_position = 0;
}
// Produces the next token from the input.
//
// - At end of input, an Eof token positioned at the cursor is returned (m_current_token is left untouched).
// - A backslash followed by one of ^ . [ ] $ ( ) | * + ? { \ becomes a two-character EscapeSequence token.
// - Each of the punctuation characters handled in the switch below becomes its own one-character token.
// - Everything else, including a backslash that does not start a recognized escape, is a Char token.
Token Lexer::next()
{
    if (m_index >= m_input.length())
        return Token(TokenType::Eof, m_index, {});

    // Emits a one-character token of the given type at the cursor and steps past it.
    auto single_character_token = [&](TokenType type) -> Token& {
        m_current_token = Token(type, m_index, m_input.substring_view(m_index, 1));
        consume();
        return m_current_token;
    };

    auto const current = peek();

    if (current == '\\') {
        switch (peek(1)) {
        case '^':
        case '.':
        case '[':
        case ']':
        case '$':
        case '(':
        case ')':
        case '|':
        case '*':
        case '+':
        case '?':
        case '{':
        case '\\': {
            // Recognized escape: the token spans the backslash and the escaped character.
            auto const token_start = m_index;
            consume();
            consume();
            VERIFY(m_previous_position + 1 <= m_input.length());
            auto const text = m_input.substring_view(token_start, m_previous_position - token_start + 1);
            m_current_token = Token(TokenType::EscapeSequence, token_start, text);
            return m_current_token;
        }
        default:
            dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1));
            break;
        }
        // Unrecognized escape: the backslash itself is handed out as a plain character.
        return single_character_token(TokenType::Char);
    }

    switch (current) {
    case '(':
        return single_character_token(TokenType::LeftParen);
    case ')':
        return single_character_token(TokenType::RightParen);
    case '{':
        return single_character_token(TokenType::LeftCurly);
    case '}':
        return single_character_token(TokenType::RightCurly);
    case '[':
        return single_character_token(TokenType::LeftBracket);
    case ']':
        return single_character_token(TokenType::RightBracket);
    case '.':
        return single_character_token(TokenType::Period);
    case '*':
        return single_character_token(TokenType::Asterisk);
    case '+':
        return single_character_token(TokenType::Plus);
    case '$':
        return single_character_token(TokenType::Dollar);
    case '^':
        return single_character_token(TokenType::Circumflex);
    case '|':
        return single_character_token(TokenType::Pipe);
    case '?':
        return single_character_token(TokenType::Questionmark);
    case ',':
        return single_character_token(TokenType::Comma);
    case '/':
        return single_character_token(TokenType::Slash);
    case '=':
        return single_character_token(TokenType::EqualSign);
    case ':':
        return single_character_token(TokenType::Colon);
    case '-':
        return single_character_token(TokenType::HyphenMinus);
    default:
        return single_character_token(TokenType::Char);
    }
}
}

View File

@@ -1,86 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Forward.h>
#include <AK/GenericLexer.h>
#include <AK/StringView.h>
#include <LibRegex/Forward.h>
namespace regex {
// X-macro listing every token type the lexer can produce.
// Users define __ENUMERATE_REGEX_TOKEN(x) before expanding this list and undefine it afterwards;
// it is used to generate both the TokenType enumerators and their printable names.
#define ENUMERATE_REGEX_TOKENS \
__ENUMERATE_REGEX_TOKEN(Eof) \
__ENUMERATE_REGEX_TOKEN(Char) \
__ENUMERATE_REGEX_TOKEN(Circumflex) \
__ENUMERATE_REGEX_TOKEN(Period) \
__ENUMERATE_REGEX_TOKEN(LeftParen) \
__ENUMERATE_REGEX_TOKEN(RightParen) \
__ENUMERATE_REGEX_TOKEN(LeftCurly) \
__ENUMERATE_REGEX_TOKEN(RightCurly) \
__ENUMERATE_REGEX_TOKEN(LeftBracket) \
__ENUMERATE_REGEX_TOKEN(RightBracket) \
__ENUMERATE_REGEX_TOKEN(Asterisk) \
__ENUMERATE_REGEX_TOKEN(EscapeSequence) \
__ENUMERATE_REGEX_TOKEN(Dollar) \
__ENUMERATE_REGEX_TOKEN(Pipe) \
__ENUMERATE_REGEX_TOKEN(Plus) \
__ENUMERATE_REGEX_TOKEN(Comma) \
__ENUMERATE_REGEX_TOKEN(Slash) \
__ENUMERATE_REGEX_TOKEN(EqualSign) \
__ENUMERATE_REGEX_TOKEN(HyphenMinus) \
__ENUMERATE_REGEX_TOKEN(Colon) \
__ENUMERATE_REGEX_TOKEN(Questionmark)
// Kinds of tokens; one enumerator per entry of ENUMERATE_REGEX_TOKENS, in list order (Eof comes first).
enum class TokenType {
#define __ENUMERATE_REGEX_TOKEN(x) x,
ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
};
// A lexed token: its type, the position it starts at in the source, and the source text it covers.
class Token {
public:
// A default-constructed token is an Eof token at position 0 with an empty value.
Token() = default;
Token(TokenType const type, size_t const start_position, StringView const value)
: m_type(type)
, m_position(start_position)
, m_value(value)
{
}
TokenType type() const { return m_type; }
// The covered text; this is a view, so it does not own the underlying characters.
StringView value() const { return m_value; }
size_t position() const { return m_position; }
// Printable name of this token's type / of an arbitrary token type.
char const* name() const;
static char const* name(TokenType);
private:
TokenType m_type { TokenType::Eof };
size_t m_position { 0 };
StringView m_value {};
};
// Tokenizer built on top of GenericLexer; hands out Token values one at a time via next().
class REGEX_API Lexer : public GenericLexer {
public:
Lexer();
explicit Lexer(StringView source);
// Returns the next token, or an Eof token once the input is exhausted.
Token next();
// Rewinds to the start of the input.
void reset();
// Steps the cursor back by `offset` characters.
void back(size_t offset);
// Declared here in addition to GenericLexer's consume(); it also tracks m_previous_position.
char consume();
// Replaces the input view. Note that this alone does not reset the cursor.
void set_source(StringView const source) { m_input = source; }
auto const& source() const { return m_input; }
private:
// Index of the most recently consumed character.
size_t m_previous_position { 0 };
// The token most recently produced by next().
Token m_current_token { TokenType::Eof, 0, {} };
};
}
using regex::Lexer;

View File

@@ -1,576 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "Forward.h"
#include "RegexOptions.h"
#include <AK/ByteString.h>
#include <AK/COWVector.h>
#include <AK/Error.h>
#include <AK/FlyString.h>
#include <AK/MemMem.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/UnicodeUtils.h>
#include <AK/Utf16String.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <AK/Variant.h>
#include <AK/Vector.h>
#include <LibUnicode/CharacterTypes.h>
namespace regex {
class RegexStringView {
public:
// Default: an empty StringView-backed view (see the initializer of m_view) with the unicode flag off.
RegexStringView() = default;
// Views the bytes of `string` as a StringView.
RegexStringView(String const& string)
: m_view(string.bytes_as_string_view())
{
}
// Wraps a byte-oriented view.
RegexStringView(StringView const view)
: m_view(view)
{
}
// Wraps a UTF-16 view.
RegexStringView(Utf16View view)
: m_view(view)
{
}
// Temporary Strings are rejected — presumably because this type only stores a view and would dangle.
RegexStringView(String&&) = delete;
// Returns the underlying Utf16View. Only meaningful when is_u16_view() is true.
Utf16View const& u16_view() const
{
return m_view.get<Utf16View>();
}
// True when this view wraps a Utf16View rather than a StringView.
bool is_u16_view() const
{
return m_view.has<Utf16View>();
}
// The unicode flag selects code point based (true) or code unit based (false) behavior
// in length(), substring_view() and unicode_aware_code_point_at().
bool unicode() const { return m_unicode; }
void set_unicode(bool unicode) { m_unicode = unicode; }
// Forwards to is_empty() of whichever view is stored.
bool is_empty() const
{
return m_view.visit([](auto& view) { return view.is_empty(); });
}
// Forwards to is_null() of whichever view is stored.
bool is_null() const
{
return m_view.visit([](auto& view) { return view.is_null(); });
}
// Length of the view: in code units when the unicode flag is off; when it is on, UTF-16 views
// report code points while byte views still report their plain length().
size_t length() const
{
    if (!unicode())
        return length_in_code_units();

    return m_view.visit(
        [](Utf16View const& utf16) { return utf16.length_in_code_points(); },
        [](auto const& other) { return other.length(); });
}
// Length in code units: UTF-16 code units for a Utf16View, length() of the StringView otherwise.
size_t length_in_code_units() const
{
    if (is_u16_view())
        return u16_view().length_in_code_units();
    return m_view.get<StringView>().length();
}
// Number of code units `code_point` occupies in this view's encoding:
// 1 or 2 for a UTF-16 view, 1 to 4 (UTF-8 sizing) for a byte view.
size_t length_of_code_point(u32 code_point) const
{
    if (is_u16_view())
        return code_point < 0x10000 ? 1 : 2;

    if (code_point <= 0x7f)
        return 1;
    if (code_point <= 0x07ff)
        return 2;
    return code_point <= 0xffff ? 3 : 4;
}
// Returns a view wrapping a default-constructed view of the same underlying type
// (StringView or Utf16View), carrying over the unicode flag.
RegexStringView typed_null_view()
{
    RegexStringView null_view = is_u16_view()
        ? RegexStringView { Utf16View {} }
        : RegexStringView { StringView {} };
    null_view.set_unicode(unicode());
    return null_view;
}
// Builds a new view of the same underlying type as this one from the values in `data`.
// The view does not own its text: the backing string is written into one of the two caller-provided
// storage arguments (`optional_string_storage` for byte views, `optional_utf16_storage` for UTF-16
// views), so that storage has to stay alive for as long as the returned view is used.
// The unicode flag is carried over.
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16String& optional_utf16_storage) const
{
auto view = m_view.visit(
// Generic branch; the Utf16View case is taken by the dedicated lambda below.
[&optional_string_storage, data]<typename T>(T const&) {
StringBuilder builder;
for (auto ch : data)
builder.append(ch); // Note: The type conversion is intentional.
optional_string_storage = builder.to_byte_string();
return RegexStringView { T { *optional_string_storage } };
},
[&optional_utf16_storage, data](Utf16View) {
optional_utf16_storage = Utf16String::from_utf32({ data.data(), data.size() });
return RegexStringView { optional_utf16_storage.utf16_view() };
});
view.set_unicode(unicode());
return view;
}
// Splits the view into lines.
// Byte views delegate to StringView::lines() with carriage returns not treated specially.
// UTF-16 views are split manually on u'\n'.
// Note that the returned views are freshly constructed and therefore have the unicode flag off.
Vector<RegexStringView> lines() const
{
return m_view.visit(
[](StringView view) {
auto views = view.lines(StringView::ConsiderCarriageReturn::No);
Vector<RegexStringView> new_views;
for (auto& view : views)
new_views.empend(view);
return new_views;
},
[](Utf16View view) {
// An empty input still yields a single (empty) line.
if (view.is_empty())
return Vector<RegexStringView> { view };
Vector<RegexStringView> views;
while (!view.is_empty()) {
auto position = view.find_code_unit_offset(u'\n');
if (!position.has_value())
break;
// NOTE(review): the result is divided by sizeof(u16) as if it were a byte offset, although the
// function name suggests it already is a code unit offset — confirm against Utf16View.
auto offset = position.value() / sizeof(u16);
views.empend(view.substring_view(0, offset));
// Continue after the separator.
view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
}
// Whatever follows the last newline is the final line; an empty remainder adds nothing.
if (!view.is_empty())
views.empend(view);
return views;
});
}
// Returns the sub-view described by `offset` and `length`, carrying over the unicode flag.
// For UTF-16 views with the unicode flag set, unicode_substring_view() is used;
// in every other case the plain substring_view() of the stored view is used.
RegexStringView substring_view(size_t offset, size_t length) const
{
    auto slice = m_view.visit(
        [&](Utf16View const& utf16) {
            if (unicode())
                return RegexStringView { utf16.unicode_substring_view(offset, length) };
            return RegexStringView { utf16.substring_view(offset, length) };
        },
        [&](StringView bytes) { return RegexStringView { bytes.substring_view(offset, length) }; });
    slice.set_unicode(unicode());
    return slice;
}
// Copies the viewed text into a ByteString. Conversion failures of a UTF-16 view are not
// propagated (see the release_value_but_fixme_should_propagate_errors() call).
ByteString to_byte_string() const
{
    if (is_u16_view())
        return u16_view().to_byte_string().release_value_but_fixme_should_propagate_errors();
    return m_view.get<StringView>().to_byte_string();
}
// Converts the viewed text into a String, reporting failure through ErrorOr.
ErrorOr<String> to_string() const
{
    if (is_u16_view())
        return u16_view().to_utf8();
    return String::from_utf8(m_view.get<StringView>());
}
// Returns the value at `code_unit_index`.
// UTF-16 views delegate to Utf16View::code_point_at(). Byte views return the single byte at that
// index as an unsigned value in 0..255; no UTF-8 decoding happens here.
u32 code_point_at(size_t code_unit_index) const
{
    return m_view.visit(
        [&](StringView view) -> u32 {
            // Reinterpret the byte as unsigned no matter whether `char` is signed on this platform.
            // Yields the same values as the former `256u + ch` / `ch` split, and also returns a value
            // where `char` is unsigned (the former `if constexpr (IsSigned<char>)` returned nothing there).
            return static_cast<u8>(view[code_unit_index]);
        },
        [&](Utf16View const& view) -> u32 { return view.code_point_at(code_unit_index); });
}
// Returns the code point at the code unit offset if the Unicode flag is set. Otherwise, returns the code unit.
// For byte views the code unit is the byte at that index as an unsigned value in 0..255.
u32 unicode_aware_code_point_at(size_t code_unit_index) const
{
    if (unicode())
        return code_point_at(code_unit_index);

    return m_view.visit(
        [&](StringView view) -> u32 {
            // Reinterpret the byte as unsigned no matter whether `char` is signed on this platform.
            // Yields the same values as the former `256u + ch` / `ch` split, and also returns a value
            // where `char` is unsigned (the former `if constexpr (IsSigned<char>)` returned nothing there).
            return static_cast<u8>(view[code_unit_index]);
        },
        [&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); });
}
// Converts a code point index into a code unit offset: a byte offset (via Utf8View) for byte
// views, a UTF-16 code unit offset for UTF-16 views.
size_t code_unit_offset_of(size_t code_point_index) const
{
    // The visitors return size_t (not u32) so that large offsets are not truncated on the way out.
    return m_view.visit(
        [&](StringView view) -> size_t {
            Utf8View utf8_view { view };
            return utf8_view.byte_offset_of(code_point_index);
        },
        [&](Utf16View const& view) -> size_t {
            return view.code_unit_offset_of(code_point_index);
        });
}
// Compares against a C string. UTF-16 views are converted to a ByteString first.
bool operator==(char const* cstring) const
{
    if (is_u16_view())
        return to_byte_string() == cstring;
    return m_view.get<StringView>() == cstring;
}
// Compares against a byte view. UTF-16 views are converted to a ByteString first.
bool operator==(StringView string) const
{
    if (is_u16_view())
        return to_byte_string() == string;
    return m_view.get<StringView>() == string;
}
// Compares against a UTF-16 view. If this view is byte based, `other` is converted to a ByteString first.
bool operator==(Utf16View const& other) const
{
    if (is_u16_view())
        return u16_view() == other;
    return m_view.get<StringView>() == RegexStringView { other }.to_byte_string();
}
// Compares with another RegexStringView by dispatching to the operator== overload that
// matches the type of view stored in `other`.
bool equals(RegexStringView other) const
{
    if (other.is_u16_view())
        return this->operator==(other.u16_view());
    return this->operator==(other.m_view.get<StringView>());
}
// Case-insensitive comparison with `other`.
// When both sides store the same kind of view and `unicode_mode` is off, the ASCII-only comparison
// of that view type is used. Every other combination goes through
// Unicode::ranges_equal_ignoring_case(), wrapping byte views in a Utf8View first.
bool equals_ignoring_case(RegexStringView other, bool unicode_mode) const
{
    if (is_u16_view()) {
        Utf16View lhs = u16_view();
        if (other.is_u16_view()) {
            Utf16View rhs = other.u16_view();
            if (!unicode_mode)
                return lhs.equals_ignoring_ascii_case(rhs);
            return Unicode::ranges_equal_ignoring_case(lhs, rhs, unicode_mode);
        }
        StringView rhs = other.m_view.get<StringView>();
        Utf8View rhs_utf8(rhs);
        return Unicode::ranges_equal_ignoring_case(lhs, rhs_utf8, unicode_mode);
    }

    StringView lhs = m_view.get<StringView>();
    if (other.is_u16_view()) {
        Utf16View rhs = other.u16_view();
        Utf8View lhs_utf8(lhs);
        return Unicode::ranges_equal_ignoring_case(lhs_utf8, rhs, unicode_mode);
    }

    StringView rhs = other.m_view.get<StringView>();
    if (!unicode_mode)
        return lhs.equals_ignoring_ascii_case(rhs);
    Utf8View lhs_utf8(lhs);
    Utf8View rhs_utf8(rhs);
    return Unicode::ranges_equal_ignoring_case(lhs_utf8, rhs_utf8, unicode_mode);
}
// Prefix test against a byte string. Not implemented for UTF-16 views (hits TODO()).
bool starts_with(StringView str) const
{
    if (is_u16_view())
        TODO();
    return m_view.get<StringView>().starts_with(str);
}
// A position inside the view, expressed both in code units and in code points.
// Returned by find_index_of_previous() and find_end_of_line().
struct FoundIndex {
size_t code_unit_index;
size_t code_point_index;
};
// Finds the last occurrence of `code_point` before the given end position.
//
// - UTF-16 views search via find_last_code_point_offset() bounded by `end_code_unit_index`.
// - Byte views with the unicode flag walk the text as UTF-8 from the beginning and remember the last
//   hit whose code point index is below `end_code_point_index`.
// - Byte views without the flag search the first min(end_code_unit_index, length) bytes; both
//   indices of the result are the byte index.
//
// Returns an empty Optional when there is no such occurrence.
Optional<FoundIndex> find_index_of_previous(u32 code_point, size_t end_code_point_index, size_t end_code_unit_index) const
{
    if (is_u16_view()) {
        auto const& utf16 = u16_view();
        auto code_unit_offset = utf16.find_last_code_point_offset(code_point, end_code_unit_index);
        if (!code_unit_offset.has_value())
            return {};
        return FoundIndex { code_unit_offset.value(), utf16.code_point_offset_of(code_unit_offset.value()) };
    }

    auto const& bytes = m_view.get<StringView>();

    if (!unicode()) {
        auto byte_offset = bytes.substring_view(0, min(end_code_unit_index, bytes.length())).find_last(code_point);
        if (!byte_offset.has_value())
            return {};
        return FoundIndex { byte_offset.value(), byte_offset.value() };
    }

    Utf8View utf8_view { bytes };
    Optional<FoundIndex> last_hit;
    size_t code_point_index = 0;
    for (auto it = utf8_view.begin(); it != utf8_view.end() && code_point_index < end_code_point_index; ++it, ++code_point_index) {
        if (*it == code_point) {
            auto byte_index = utf8_view.byte_offset_of(it);
            last_hit = { byte_index, code_point_index };
        }
    }
    return last_hit;
}
// Scans forward from the given start position for the first line terminator
// ('\n', '\r', U+2028 or U+2029) and returns its position. If none is found, the returned
// position is the end of the view.
FoundIndex find_end_of_line(size_t start_code_point_index, size_t start_code_unit_index) const
{
constexpr auto is_newline = [](u32 ch) { return ch == '\n' || ch == '\r' || ch == 0x2028 || ch == 0x2029; };
return m_view.visit(
[&](Utf16View const& view) -> FoundIndex {
// Walk code units while keeping the code point index in step.
size_t code_unit_index = start_code_unit_index;
size_t code_point_index = start_code_point_index;
while (code_unit_index < view.length_in_code_units()) {
auto code_unit = view.code_unit_at(code_unit_index);
u32 ch = code_unit;
size_t code_units_for_this = 1;
// A high surrogate followed by a low surrogate is one code point spanning two code units;
// an unpaired surrogate is treated as a single unit.
if (AK::UnicodeUtils::is_utf16_high_surrogate(code_unit) && code_unit_index + 1 < view.length_in_code_units()) {
auto next_code_unit = view.code_unit_at(code_unit_index + 1);
if (AK::UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) {
ch = AK::UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
code_units_for_this = 2;
}
}
if (is_newline(ch))
return FoundIndex { code_unit_index, code_point_index };
code_unit_index += code_units_for_this;
++code_point_index;
}
return FoundIndex { view.length_in_code_units(), code_point_index };
},
[&](StringView const& view) -> FoundIndex {
if (unicode()) {
// UTF-8 walk from the beginning; only start_code_point_index is used in this branch.
Utf8View utf8_view { view };
auto it = utf8_view.begin();
size_t current_code_point_index = 0;
// Skip to start position
while (it != utf8_view.end() && current_code_point_index < start_code_point_index) {
++it;
++current_code_point_index;
}
for (; it != utf8_view.end(); ++it, ++current_code_point_index) {
if (is_newline(*it)) {
return FoundIndex { utf8_view.byte_offset_of(it), current_code_point_index };
}
}
return FoundIndex { view.length(), utf8_view.length() };
}
// Byte-wise scan: the byte index doubles as the code point index. Since single bytes are
// compared, only '\n' and '\r' can match in this branch.
for (size_t i = start_code_unit_index; i < view.length(); ++i) {
if (is_newline(static_cast<u8>(view[i])))
return FoundIndex { i, i };
}
return FoundIndex { view.length(), view.length() };
});
}
private:
// The wrapped view: either byte based (StringView) or UTF-16 based. Defaults to an empty StringView.
NO_UNIQUE_ADDRESS Variant<StringView, Utf16View> m_view { StringView {} };
// Whether lengths and offsets are interpreted in code points rather than code units (see length()).
NO_UNIQUE_ADDRESS bool m_unicode { false };
};
// A single match (or capture group match): the matched text plus where it was found.
class Match final {
public:
Match() = default;
~Match() = default;
// Unnamed match; left_column starts out equal to column.
Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
: view(view_)
, line(line_)
, column(column_)
, global_offset(global_offset_)
, left_column(column_)
{
}
// Match belonging to a named capture group; `capture_group_name_` is stored in capture_group_name.
Match(RegexStringView const view_, size_t capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
: view(view_)
, capture_group_name(capture_group_name_)
, line(line_)
, column(column_)
, global_offset(global_offset_)
, left_column(column_)
{
}
// Clears the match: the view becomes a default-constructed view of the same underlying type,
// the name becomes "none" (-1) and all positions become 0.
void reset()
{
view = view.typed_null_view();
capture_group_name = -1;
line = 0;
column = 0;
global_offset = 0;
left_column = 0;
}
RegexStringView view {};
// This is a string table index. -1 if none. Not using Optional to keep the struct trivially copyable.
ssize_t capture_group_name { -1 };
size_t line { 0 };
size_t column { 0 };
size_t global_offset { 0 };
// Internal bookkeeping, not meant for users: remembers the column at which the left paren was
// found, so that no extra vectors are needed to store it.
size_t left_column { 0 };
};
// Inputs of one matcher run: the text to match, the options in effect and position bookkeeping.
// The `mutable` members can be modified even through the const reference that
// Matcher::execute() receives.
struct MatchInput {
RegexStringView view {};
AllOptions regex_options {};
size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
size_t match_index { 0 };
size_t line { 0 };
size_t column { 0 };
size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
mutable size_t fail_counter { 0 };
mutable Vector<size_t> saved_positions;
mutable Vector<size_t> saved_code_unit_positions;
mutable Vector<size_t> saved_forks_since_last_save;
mutable Optional<size_t> fork_to_replace;
bool in_the_middle_of_a_line { false };
StringView pattern {};
};
// Mutable state of a match attempt: current string and instruction positions, fork bookkeeping,
// and the matches / capture group matches collected so far.
struct MatchState {
    // Number of capture groups per match; this is the row width of flat_capture_group_matches.
    size_t capture_group_count;
    size_t string_position_before_match { 0 };
    size_t string_position { 0 };
    size_t string_position_in_code_units { 0 };
    size_t instruction_position { 0 };
    size_t fork_at_position { 0 };
    size_t forks_since_last_save { 0 };
    // NumericLimits<size_t>::max() is the initial value of the two *_before_rseek fields.
    size_t string_position_before_rseek { NumericLimits<size_t>::max() };
    size_t string_position_in_code_units_before_rseek { NumericLimits<size_t>::max() };
    Optional<size_t> initiating_fork;
    COWVector<Match> matches;
    COWVector<Match> flat_capture_group_matches; // Vector<Vector<Match>> indexed by match index, then by capture group id; flattened for performance
    COWVector<u64> repetition_marks;
    Vector<u64, 64> checkpoints;
    Vector<i64> step_backs;
    Vector<FlagsUnderlyingType, 1> modifier_stack;
    AllOptions current_options;

    explicit MatchState(size_t capture_group_count, AllOptions options = {})
        : capture_group_count(capture_group_count)
        , current_options(options)
    {
    }

    MatchState(MatchState const&) = default;
    MatchState(MatchState&&) = default;
    MatchState& operator=(MatchState const&) = default;
    MatchState& operator=(MatchState&&) = default;

    // A state with zero capture groups and default options.
    static MatchState only_for_enumeration() { return MatchState { 0 }; }

    // Number of rows (one per match) stored in flat_capture_group_matches.
    // With zero capture groups there are no rows; returning 0 here avoids dividing by zero,
    // e.g. for states created by only_for_enumeration().
    size_t capture_group_matches_size() const
    {
        if (capture_group_count == 0)
            return 0;
        return flat_capture_group_matches.size() / capture_group_count;
    }

    // The capture group matches belonging to match number `match_index` (read-only).
    Span<Match const> capture_group_matches(size_t match_index) const
    {
        return flat_capture_group_matches.span().slice(match_index * capture_group_count, capture_group_count);
    }

    // The capture group matches belonging to match number `match_index` (writable).
    Span<Match> mutable_capture_group_matches(size_t match_index)
    {
        return flat_capture_group_matches.mutable_span().slice(match_index * capture_group_count, capture_group_count);
    }

    // Hash over the positions, the initiating fork, repetition marks, checkpoints and step backs.
    // Matches, capture group matches, the modifier stack and the options do not take part.
    // For size_t in {0..300}, ips in {0..750} and repetitions in {0..50}, there are zero collisions.
    u64 u64_hash() const
    {
        u64 hash = 0xcbf29ce484222325;
        auto combine = [&hash](auto value) {
            hash ^= static_cast<u64>(value);
            hash *= 0x9e3779b97f4a7c15;
        };
        // Each vector is mixed in as: a distinguishing tag, its size, then its elements.
        auto combine_vector = [&combine](auto const& vector, auto tag) {
            combine(tag);
            combine(vector.size());
            for (auto& value : vector)
                combine(value);
        };
        combine(string_position_before_match);
        combine(string_position);
        combine(string_position_in_code_units);
        combine(instruction_position);
        combine(fork_at_position);
        combine(initiating_fork.value_or(0) + initiating_fork.has_value());
        combine_vector(repetition_marks, 0xbeefbeefbeefbeef);
        combine_vector(checkpoints, 0xfacefacefaceface);
        combine_vector(step_backs, 0xfedefedefedefede);
        return hash;
    }
};
}
using regex::RegexStringView;
// Lets RegexStringView be used with AK's formatting functions; it is printed as its ByteString conversion.
template<>
struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
    ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
    {
        // The converted text lives in a named local so it stays alive while the base formatter reads it.
        ByteString const rendered = value.to_byte_string();
        return Formatter<StringView>::format(builder, rendered);
    }
};
// Reports regex::Match as trivial through AK::Traits.
// NOTE(review): presumably this lets AK containers copy/move Match values bitwise — confirm against AK::Traits.
template<>
struct AK::Traits<regex::Match> : public AK::DefaultTraits<regex::Match> {
constexpr static bool is_trivial() { return true; }
};

File diff suppressed because it is too large Load Diff

View File

@@ -1,312 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "RegexByteCode.h"
#include "RegexMatch.h"
#include "RegexOptions.h"
#include "RegexParser.h"
#include <AK/Forward.h>
#include <AK/GenericLexer.h>
#include <AK/Vector.h>
#include <ctype.h>
#include <stdio.h>
namespace regex {
namespace Detail {
// A range with an optional comment; element type of Regex::BasicBlockList.
// NOTE(review): whether `end` is inclusive cannot be told from this header — confirm at the use sites.
struct Block {
size_t start;
size_t end;
StringView comment { "N/A"sv };
};
}
// NOTE(review): presumably the limit behind Error::ReachedMaxRecursion — confirm where it is checked.
static constexpr size_t const c_max_recursion = 5000;
// Outcome of running a pattern against some input.
struct REGEX_API RegexResult final {
// True when the pattern matched.
bool success { false };
size_t count { 0 };
// One entry per match.
Vector<Match> matches;
Vector<Match> flat_capture_group_matches;
// Indexed by match index, then by capture group index (see Regex::replace()).
// NOTE(review): the spans presumably point into flat_capture_group_matches — confirm in the matcher.
Vector<Span<Match>> capture_group_matches;
size_t n_operations { 0 };
size_t n_capture_groups { 0 };
size_t n_named_capture_groups { 0 };
};
template<class Parser>
class REGEX_API Regex;
// Executes a compiled Regex<Parser> against input views.
// The matcher only stores a raw pointer to its pattern; Regex can re-point it through reset_pattern().
template<class Parser>
class REGEX_API Matcher final {
public:
// `regex_options` defaults to a default-constructed options value when not given.
Matcher(Regex<Parser> const* pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
: m_pattern(pattern)
, m_regex_options(regex_options.value_or({}))
{
}
~Matcher() = default;
// Match against a single view / against several views.
RegexResult match(RegexStringView, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
RegexResult match(Vector<RegexStringView> const&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
typename ParserTraits<Parser>::OptionsType options() const
{
return m_regex_options;
}
// Only callable by Regex<Parser> (Badge): points this matcher at a different pattern object.
void reset_pattern(Badge<Regex<Parser>>, Regex<Parser> const* pattern)
{
m_pattern = pattern;
}
private:
enum class ExecuteResult {
DidNotMatch,
Matched,
DidNotMatchAndNoFurtherPossibleMatchesInView,
};
ExecuteResult execute(MatchInput const& input, MatchState& state, size_t& operations) const;
Regex<Parser> const* m_pattern;
typename ParserTraits<Parser>::OptionsType const m_regex_options;
};
template<class Parser>
class REGEX_API Regex final {
public:
// The pattern source text.
ByteString pattern_value;
// Result of parsing the pattern; its `error` field is checked before every match.
regex::Parser::Result parser_result;
// The matcher used to run this pattern; null means the regex cannot be used.
OwnPtr<Matcher<Parser>> matcher { nullptr };
// Mutable so it can be updated from const member functions (MatchInput::start_offset notes
// that it is saved to and restored from here for stateful matches).
mutable size_t start_offset { 0 };
// Parses `pattern` without constructing a Regex.
static regex::Parser::Result parse_pattern(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
explicit Regex(ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
// Constructs from an already available parse result.
Regex(regex::Parser::Result parse_result, ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
Regex(Regex const&);
~Regex() = default;
Regex(Regex&&);
Regex& operator=(Regex&&);
typename ParserTraits<Parser>::OptionsType options() const;
ByteString error_string(Optional<ByteString> message = {}) const;
// Runs the pattern against a single view.
// Returns a default (unsuccessful) result if there is no matcher or the pattern failed to parse.
RegexResult match(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (matcher && parser_result.error == Error::NoError)
        return matcher->match(view, regex_options);
    return {};
}
// Runs the pattern against several views.
// Returns a default (unsuccessful) result if there is no matcher or the pattern failed to parse.
RegexResult match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (matcher && parser_result.error == Error::NoError)
        return matcher->match(views, regex_options);
    return {};
}
// Returns `view` with every match replaced by the expansion of `replacement_pattern`.
//
// Inside the replacement pattern:
//   \\   inserts a single backslash,
//   \N   (N decimal, 1 <= N <= number of capture groups) inserts capture group N of the current match,
//   any other backslash sequence (including \0 and out-of-range numbers) is emitted verbatim.
//
// Returns an empty string if there is no matcher or the pattern failed to parse, and the
// unmodified input if nothing matched.
ByteString replace(RegexStringView view, StringView replacement_pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (!matcher || parser_result.error != Error::NoError)
        return {};

    StringBuilder builder;
    // End of the input that has already been copied to the output.
    // Deliberately not called `start_offset`, which would shadow the member of that name.
    size_t copied_up_to = 0;

    RegexResult result = matcher->match(view, regex_options);
    if (!result.success)
        return view.to_byte_string();

    for (size_t i = 0; i < result.matches.size(); ++i) {
        auto& match = result.matches[i];

        // Copy the text between the previous match and this one, then skip over the match itself.
        builder.append(view.substring_view(copied_up_to, match.global_offset - copied_up_to).to_byte_string());
        copied_up_to = match.global_offset + match.view.length();

        GenericLexer lexer(replacement_pattern);
        while (!lexer.is_eof()) {
            if (!lexer.consume_specific('\\')) {
                // Literal run up to the next backslash.
                builder.append(lexer.consume_while([](auto ch) { return ch != '\\'; }));
                continue;
            }
            if (lexer.consume_specific('\\')) {
                builder.append('\\');
                continue;
            }

            auto number = lexer.consume_while(isdigit);
            auto index = number.to_number<unsigned>();
            // Group references are 1-based. Index 0 must not reach the `- 1` below, where it would
            // wrap around and index far outside the capture group span.
            bool const is_valid_group_reference = index.has_value() && index.value() > 0 && result.n_capture_groups >= index.value();
            if (is_valid_group_reference)
                builder.append(result.capture_group_matches[i][index.value() - 1].view.to_byte_string());
            else
                builder.appendff("\\{}", number);
        }
    }

    // Copy whatever follows the last match.
    builder.append(view.substring_view(copied_up_to, view.length() - copied_up_to).to_byte_string());
    return builder.to_byte_string();
}
// FIXME: replace(Vector<RegexStringView> const , ...)
// Like match(), but with adjusted options: the Global flag is forced on, Internal_Stateful is
// forced off, and if both MatchNotBeginOfLine and MatchNotEndOfLine are set, both are cleared.
// Returns a default (unsuccessful) result if there is no matcher or the pattern failed to parse.
RegexResult search(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (matcher && parser_result.error == Error::NoError) {
        AllOptions search_options = (AllOptions)regex_options.value_or({});
        if ((search_options & AllFlags::MatchNotBeginOfLine) && (search_options & AllFlags::MatchNotEndOfLine)) {
            search_options.reset_flag(AllFlags::MatchNotEndOfLine);
            search_options.reset_flag(AllFlags::MatchNotBeginOfLine);
        }
        search_options.reset_flag(AllFlags::Internal_Stateful);
        search_options |= AllFlags::Global;
        return matcher->match(view, search_options);
    }
    return {};
}
// Multi-view variant of search(); applies the same option adjustments.
RegexResult search(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (matcher && parser_result.error == Error::NoError) {
        AllOptions search_options = (AllOptions)regex_options.value_or({});
        if ((search_options & AllFlags::MatchNotBeginOfLine) && (search_options & AllFlags::MatchNotEndOfLine)) {
            search_options.reset_flag(AllFlags::MatchNotEndOfLine);
            search_options.reset_flag(AllFlags::MatchNotBeginOfLine);
        }
        search_options.reset_flag(AllFlags::Internal_Stateful);
        search_options |= AllFlags::Global;
        return matcher->match(views, search_options);
    }
    return {};
}
// Out-parameter variants: store the full result in `m` and return whether the operation succeeded.
bool match(RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
m = match(view, regex_options);
return m.success;
}
bool match(Vector<RegexStringView> const& views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
m = match(views, regex_options);
return m.success;
}
// Same as above, but going through search() instead of match().
bool search(RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
m = search(view, regex_options);
return m.success;
}
bool search(Vector<RegexStringView> const& views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
m = search(views, regex_options);
return m.success;
}
// Reports whether the pattern matches `view`. The SkipSubExprResults flag is added to the options.
// Returns false if there is no matcher or the pattern failed to parse.
bool has_match(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (matcher && parser_result.error == Error::NoError)
        return matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults).success;
    return false;
}
// Multi-view variant of has_match().
bool has_match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (matcher && parser_result.error == Error::NoError)
        return matcher->match(views, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults).success;
    return false;
}
// List of Detail::Block ranges, as produced by split_basic_blocks() from a ByteCode.
using BasicBlockList = Vector<Detail::Block>;
static BasicBlockList split_basic_blocks(ByteCode const&);
private:
// Optimization passes; only declared here, their definitions are not part of this header.
void run_optimization_passes();
void rewrite_with_useless_jumps_removed();
void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&);
bool attempt_rewrite_entire_match_as_substring_search(BasicBlockList const&);
void attempt_rewrite_adjacent_compares_as_string_compare(BasicBlockList const&);
void attempt_rewrite_dot_star_sequences_as_seek(BasicBlockList const&);
void rewrite_simple_compares(BasicBlockList const&);
void fill_optimization_data(BasicBlockList const&);
};
// free standing functions for match, search and has_match
// Free-function spelling of Regex::match(): forwards `view` and `regex_options`
// to `pattern.match()` and returns whatever that call produces.
template<typename Parser>
RegexResult match(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    return pattern.match(view, regex_options);
}
// Free-function spelling of the multi-view Regex::match(): forwards the views and
// `regex_options` to `pattern.match()` and returns whatever that call produces.
template<typename Parser>
RegexResult match(Vector<RegexStringView> const& view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    return pattern.match(view, regex_options);
}
template<class Parser>
bool match(RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
return pattern.match(view, regex_options);
}
template<class Parser>
bool match(Vector<RegexStringView> const& view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
return pattern.match(view, regex_options);
}
// Free-function spelling of Regex::search(): forwards `view` and `regex_options`
// to `pattern.search()` and returns whatever that call produces.
template<typename Parser>
RegexResult search(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    return pattern.search(view, regex_options);
}
// Free-function spelling of the multi-view Regex::search(): forwards `views` and
// `regex_options` to `pattern.search()` and returns whatever that call produces.
template<typename Parser>
RegexResult search(Vector<RegexStringView> const& views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    return pattern.search(views, regex_options);
}
template<class Parser>
bool search(RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
return pattern.search(view, regex_options);
}
template<class Parser>
bool search(Vector<RegexStringView> const& views, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
return pattern.search(views, regex_options);
}
// Free-function spelling of Regex::has_match(): forwards `view` and `regex_options`
// to `pattern.has_match()` and returns its verdict.
template<typename Parser>
bool has_match(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    return pattern.has_match(view, regex_options);
}
// Free-function spelling of the multi-view Regex::has_match(): forwards `views` and
// `regex_options` to `pattern.has_match()` and returns its verdict.
template<typename Parser>
bool has_match(Vector<RegexStringView> const& views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    return pattern.has_match(views, regex_options);
}
}
using regex::has_match;
using regex::match;
using regex::Regex;
using regex::RegexResult;

File diff suppressed because it is too large Load Diff

View File

@@ -1,143 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "RegexDefs.h"
#include <AK/Types.h>
#include <stdio.h>
namespace regex {
// Integer type used for all flag bit arithmetic in this header; also the declared
// underlying type of PosixFlags and ECMAScriptFlags.
using FlagsUnderlyingType = u32;
// Every flag the engine knows about. PosixFlags and ECMAScriptFlags re-export subsets of
// these with the same numeric values. The values themselves come from the __Regex_*
// constants (presumably provided by RegexDefs.h — confirm).
// Note that, unlike the two derived enums, this enum does not name an underlying type.
enum class AllFlags {
    Default = 0,
    Global = __Regex_Global, // All matches (don't return after the first match).
    Insensitive = __Regex_Insensitive, // Case-insensitive match (ignores case of [a-zA-Z]).
    Ungreedy = __Regex_Ungreedy, // Matching becomes lazy by default; a ? following a quantifier makes it greedy.
    Unicode = __Regex_Unicode, // Enable all Unicode features and interpret all Unicode escape sequences as such.
    Extended = __Regex_Extended, // Ignore whitespace. Spaces and text after a # in the pattern are ignored.
    Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is an error.
    MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // The pattern is not forced to ^; search in the whole string.
    MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't force the dollar sign, $, to always match the end of the string instead of the end of the line. Ignored if the Multiline flag is set.
    SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub-expressions in the result.
    SingleLine = __Regex_SingleLine, // Dot matches newline characters.
    Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
    Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
    SingleMatch = __Regex_SingleMatch, // Stop after acquiring a single match.
    UnicodeSets = __Regex_UnicodeSets, // Only for ECMA262: allow set operations in character classes.
    Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time; further match() calls on the same instance continue where the previous one left off.
    Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262: enable the behaviors defined in section B.1.4 of the ECMA262 spec.
    Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262: allow multiline matches to consider newlines as line boundaries.
    Internal_ECMA262DotSemantics = __Regex_Internal_ECMA262DotSemantics, // Use ECMA262 dot semantics: disallow matching CR/LF/LS/PS instead of just CR.
    // NOTE(review): Last aliases Internal_BrowserExtended even though two more Internal_*
    // enumerators are declared after it; confirm whether Last is meant to track the final flag.
    Last = Internal_BrowserExtended,
};
// POSIX-facing subset of AllFlags. Each enumerator carries the numeric value of the
// AllFlags enumerator with the same name, so the two can be converted by value.
enum class PosixFlags : FlagsUnderlyingType {
    Default = 0,
    Global = static_cast<FlagsUnderlyingType>(AllFlags::Global),
    Insensitive = static_cast<FlagsUnderlyingType>(AllFlags::Insensitive),
    Ungreedy = static_cast<FlagsUnderlyingType>(AllFlags::Ungreedy),
    Unicode = static_cast<FlagsUnderlyingType>(AllFlags::Unicode),
    Extended = static_cast<FlagsUnderlyingType>(AllFlags::Extended),
    Extra = static_cast<FlagsUnderlyingType>(AllFlags::Extra),
    MatchNotBeginOfLine = static_cast<FlagsUnderlyingType>(AllFlags::MatchNotBeginOfLine),
    MatchNotEndOfLine = static_cast<FlagsUnderlyingType>(AllFlags::MatchNotEndOfLine),
    SkipSubExprResults = static_cast<FlagsUnderlyingType>(AllFlags::SkipSubExprResults),
    Multiline = static_cast<FlagsUnderlyingType>(AllFlags::Multiline),
    SingleMatch = static_cast<FlagsUnderlyingType>(AllFlags::SingleMatch),
};
// ECMAScript-facing subset of AllFlags. Enumerators reuse the numeric values of their
// AllFlags counterparts; Default and Global differ from a plain one-to-one mapping:
//  - Default is the Internal_ECMA262DotSemantics bit rather than 0.
//  - Global also carries the Internal_Stateful bit.
enum class ECMAScriptFlags : FlagsUnderlyingType {
    Default = static_cast<FlagsUnderlyingType>(AllFlags::Internal_ECMA262DotSemantics),
    // Note: ECMAScript "Global" creates a stateful regex.
    Global = static_cast<FlagsUnderlyingType>(AllFlags::Global) | static_cast<FlagsUnderlyingType>(AllFlags::Internal_Stateful),
    Insensitive = static_cast<FlagsUnderlyingType>(AllFlags::Insensitive),
    Ungreedy = static_cast<FlagsUnderlyingType>(AllFlags::Ungreedy),
    Unicode = static_cast<FlagsUnderlyingType>(AllFlags::Unicode),
    Extended = static_cast<FlagsUnderlyingType>(AllFlags::Extended),
    Extra = static_cast<FlagsUnderlyingType>(AllFlags::Extra),
    SingleLine = static_cast<FlagsUnderlyingType>(AllFlags::SingleLine),
    Sticky = static_cast<FlagsUnderlyingType>(AllFlags::Sticky),
    Multiline = static_cast<FlagsUnderlyingType>(AllFlags::Multiline),
    UnicodeSets = static_cast<FlagsUnderlyingType>(AllFlags::UnicodeSets),
    BrowserExtended = static_cast<FlagsUnderlyingType>(AllFlags::Internal_BrowserExtended),
};
template<class T>
class RegexOptions {
public:
using FlagsType = T;
RegexOptions() = default;
constexpr RegexOptions(T flags)
: m_flags(static_cast<T>(to_underlying(flags) | to_underlying(T::Default)))
{
}
template<class U>
constexpr RegexOptions(RegexOptions<U> other)
: RegexOptions(static_cast<T>(to_underlying(other.value())))
{
}
operator bool() const { return !!*this; }
bool operator!() const { return (FlagsUnderlyingType)m_flags == 0; }
constexpr RegexOptions<T> operator|(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag) }; }
constexpr RegexOptions<T> operator&(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag) }; }
constexpr RegexOptions<T>& operator|=(T flag)
{
m_flags = (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag);
return *this;
}
constexpr RegexOptions<T>& operator&=(T flag)
{
m_flags = (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag);
return *this;
}
void reset_flags() { m_flags = (T)0; }
void reset_flag(T flag) { m_flags = (T)((FlagsUnderlyingType)m_flags & ~(FlagsUnderlyingType)flag); }
void set_flag(T flag) { *this |= flag; }
bool has_flag_set(T flag) const { return (FlagsUnderlyingType)flag == ((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag); }
constexpr T value() const { return m_flags; }
private:
T m_flags { T::Default };
};
// Combines two raw flag values into a RegexOptions<T>: wraps `lhs` (which ORs in
// T::Default via the constructor) and then ORs in `rhs`.
// The template is unconstrained, so it is a candidate for any type T.
template<class T>
constexpr RegexOptions<T> operator|(T lhs, T rhs)
{
    RegexOptions<T> combined { lhs };
    combined |= rhs;
    return combined;
}
// Intersects two raw flag values into a RegexOptions<T>: wraps `lhs` (which ORs in
// T::Default via the constructor) and then masks the stored value with `rhs`.
// The template is unconstrained, so it is a candidate for any type T.
template<class T>
constexpr RegexOptions<T> operator&(T lhs, T rhs)
{
    RegexOptions<T> masked { lhs };
    masked &= rhs;
    return masked;
}
template<class T>
constexpr T operator~(T flag)
{
return (T) ~((FlagsUnderlyingType)flag);
}
using AllOptions = RegexOptions<AllFlags>;
using ECMAScriptOptions = RegexOptions<ECMAScriptFlags>;
using PosixOptions = RegexOptions<PosixFlags>;
}
using regex::ECMAScriptFlags;
using regex::ECMAScriptOptions;
using regex::PosixFlags;
using regex::PosixOptions;

File diff suppressed because it is too large Load Diff

View File

@@ -1,360 +0,0 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "RegexByteCode.h"
#include "RegexError.h"
#include "RegexLexer.h"
#include "RegexOptions.h"
#include <AK/FlyString.h>
#include <AK/Forward.h>
#include <AK/HashMap.h>
#include <AK/HashTable.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <LibUnicode/Forward.h>
namespace regex {
// Forward declarations so the trait specializations below can name the parsers
// before their class definitions appear later in this header.
class PosixExtendedParser;
class PosixBasicParser;
class ECMA262Parser;
// Building block for ParserTraits: exposes its template argument as OptionsType.
template<typename T>
struct GenericParserTraits {
    using OptionsType = T;
};
// Maps a parser type to the options type callers pass for it.
// The primary template simply yields the parser type itself as OptionsType;
// the specializations below pick the real option types.
template<typename T>
struct ParserTraits : public GenericParserTraits<T> {
};
// Both POSIX parsers take PosixOptions.
template<>
struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
};
template<>
struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
};
// The ECMA262 parser takes ECMAScriptOptions.
template<>
struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
};
// One occurrence of a named capture group; ParserState keeps a list of these per name.
struct NamedCaptureGroup {
    // presumably the numeric index of the capture group — confirm against the parser implementation
    size_t group_index;
    // presumably identifies the alternative the group was declared in (compare
    // ECMA262Parser::m_current_alternative_id) — confirm
    size_t alternative_id;
};
// Common base class of the regex parsers declared in this header.
// It owns the per-parse ParserState and declares the token-level helpers; each
// concrete grammar supplies parse_internal(). Most member functions are only
// declared here and are defined out of line.
class REGEX_API Parser {
public:
    // What parse() returns to its caller.
    struct Result {
        Variant<ByteCode, FlatByteCode> bytecode;
        size_t capture_groups_count { 0 };
        size_t named_capture_groups_count { 0 };
        size_t match_length_minimum { 0 };
        // Error::NoError unless parsing failed.
        Error error { Error::NoError };
        Token error_token {};
        Vector<FlyString> capture_groups {};
        AllOptions options {};
        // Extra data attached to the result for the matcher.
        // NOTE(review): presumably filled in by the optimization passes — confirm.
        struct {
            Optional<Vector<u16>> pure_substring_search;
            // If populated, the pattern only accepts strings that start with a character in these ranges.
            Vector<CharRange> starting_ranges;
            Vector<CharRange> starting_ranges_insensitive;
            bool only_start_of_line = false;
        } optimization_data {};
    };
    // Both constructors build the parser state from `lexer`; constructing that state
    // already pulls the first token via lexer.next().
    explicit Parser(Lexer& lexer)
        : m_parser_state(lexer)
    {
    }
    Parser(Lexer& lexer, AllOptions regex_options)
        : m_parser_state(lexer, regex_options)
    {
    }
    virtual ~Parser() = default;
    // Runs the parse and returns the Result; optionally takes options for this run.
    Result parse(Optional<AllOptions> regex_options = {});
    // Read-only views onto the parser state.
    bool has_error() const { return m_parser_state.error != Error::NoError; }
    Error error() const { return m_parser_state.error; }
    AllOptions options() const { return m_parser_state.regex_options; }
protected:
    // Grammar entry point that every concrete parser must implement.
    virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
    bool resolve_forward_named_references();
    // Token-stream helpers shared by the concrete parsers (declared here, defined elsewhere).
    ALWAYS_INLINE bool match(TokenType type) const;
    ALWAYS_INLINE bool match(char ch) const;
    ALWAYS_INLINE bool match_ordinary_characters();
    ALWAYS_INLINE Token consume();
    ALWAYS_INLINE Token consume(TokenType type, Error error);
    ALWAYS_INLINE bool consume(ByteString const&);
    ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
    ALWAYS_INLINE bool try_skip(StringView);
    ALWAYS_INLINE bool lookahead_any(StringView);
    ALWAYS_INLINE unsigned char skip();
    ALWAYS_INLINE void back(size_t = 1);
    ALWAYS_INLINE void reset();
    ALWAYS_INLINE bool done() const;
    ALWAYS_INLINE bool set_error(Error error);
    // Position of the current token.
    size_t tell() const { return m_parser_state.current_token.position(); }
    // Mutable state of one parse. Holds the lexer by reference, so the lexer has to
    // stay alive for as long as the parser is used.
    struct ParserState {
        Lexer& lexer;
        Token current_token;
        Error error = Error::NoError;
        Token error_token { TokenType::Eof, 0, {} };
        ByteCode bytecode;
        size_t capture_groups_count { 0 };
        size_t named_capture_groups_count { 0 };
        size_t match_length_minimum { 0 };
        bool greedy_lookaround { true };
        size_t repetition_mark_count { 0 };
        bool in_negated_character_class { false };
        AllOptions regex_options;
        HashMap<size_t, size_t> capture_group_minimum_lengths;
        HashTable<size_t> optional_capture_groups;
        // Keyed by group name; a name can map to several NamedCaptureGroup entries.
        OrderedHashMap<FlyString, Vector<NamedCaptureGroup>> named_capture_groups;
        // A named reference recorded together with the bytecode offset it belongs to.
        // NOTE(review): presumably consumed by resolve_forward_named_references() — confirm.
        struct UnresolvedNamedReference {
            FlyString name;
            size_t bytecode_offset;
        };
        Vector<UnresolvedNamedReference> unresolved_named_references;
        // Both constructors fetch the first token from the lexer right away.
        explicit ParserState(Lexer& lexer)
            : lexer(lexer)
            , current_token(lexer.next())
        {
        }
        explicit ParserState(Lexer& lexer, AllOptions regex_options)
            : lexer(lexer)
            , current_token(lexer.next())
            , regex_options(regex_options)
        {
        }
    };
    ParserState m_parser_state;
};
// Shared base of the two POSIX parsers (PosixBasicParser and PosixExtendedParser).
// Its constructors are protected, so it can only be created through a subclass.
class REGEX_API AbstractPosixParser : public Parser {
protected:
    explicit AbstractPosixParser(Lexer& lexer)
        : Parser(lexer)
    {
    }
    // Uses default-constructed options when the caller passes none.
    AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
        : Parser(lexer, regex_options.value_or({}))
    {
    }
    // Bracket-expression parsing shared by both POSIX grammars (declared here, defined elsewhere).
    ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
};
// Parser for the POSIX "basic" regex flavour. The grammar itself lives in the
// private parse_* functions, which are declared here and defined out of line.
class REGEX_API PosixBasicParser final : public AbstractPosixParser {
public:
    explicit PosixBasicParser(Lexer& lexer)
        : AbstractPosixParser(lexer)
    {
    }
    // Uses default-constructed options when the caller passes none.
    PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
        : AbstractPosixParser(lexer, regex_options.value_or({}))
    {
    }
    ~PosixBasicParser() = default;
private:
    bool parse_internal(ByteCode&, size_t&) override;
    bool parse_root(ByteCode&, size_t&);
    bool parse_re_expression(ByteCode&, size_t&);
    bool parse_simple_re(ByteCode&, size_t&);
    bool parse_nonduplicating_re(ByteCode&, size_t&);
    bool parse_one_char_or_collation_element(ByteCode&, size_t&);
    // Size of the two per-group bookkeeping arrays below.
    // NOTE(review): presumably 9 because back-references in this flavour are \1..\9 — confirm.
    constexpr static size_t number_of_addressable_capture_groups = 9;
    size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
    bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
    size_t m_current_capture_group_depth { 0 };
};
// Parser for the POSIX "extended" regex flavour. The grammar itself lives in the
// private functions, which are declared here and defined out of line.
class REGEX_API PosixExtendedParser final : public AbstractPosixParser {
    // Flags this parser always runs with: both constructors pass them on to the base,
    // either alone or OR-ed into the caller's options.
    constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);
public:
    explicit PosixExtendedParser(Lexer& lexer)
        : AbstractPosixParser(lexer, default_options)
    {
    }
    // Caller options (or default-constructed ones when absent) plus default_options.
    PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
        : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
    {
    }
    ~PosixExtendedParser() = default;
private:
    ALWAYS_INLINE bool match_repetition_symbol();
    bool parse_internal(ByteCode&, size_t&) override;
    bool parse_root(ByteCode&, size_t&);
    ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
    // Note: takes ByteCode&, unlike the base-class function of the same name.
    ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
    ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
};
// Parser for the ECMA-262 (JavaScript) regex flavour. The grammar productions are
// the private parse_* functions, declared here and defined out of line; the small
// capture-group scope helpers are defined inline below.
class REGEX_API ECMA262Parser final : public Parser {
    // Flag this parser always runs with: both constructors pass it on to the base,
    // either alone or OR-ed into the caller's options.
    constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);
public:
    // Both constructors open one initial capture-group scope.
    explicit ECMA262Parser(Lexer& lexer)
        : Parser(lexer, default_options)
    {
        m_capture_groups_in_scope.empend();
    }
    ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
        : Parser(lexer, regex_options.value_or({}) | default_options.value())
    {
        // The extended grammar is only used when options were passed and they have BrowserExtended set.
        m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
        m_capture_groups_in_scope.empend();
    }
    ~ECMA262Parser() = default;
private:
    bool parse_internal(ByteCode&, size_t&) override;
    // Per-pattern switches handed by value to every parse_* production below.
    struct ParseFlags {
        bool unicode { false };
        bool named { false };
        bool unicode_sets { false };
    };
    // Selects how read_digits*() treats a leading zero.
    enum class ReadDigitsInitialZeroState {
        Allow,
        Disallow,
    };
    // Digit readers; -1 is the default for both count limits.
    // NOTE(review): -1 presumably means "no limit" — confirm against the definitions.
    StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
    Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
    FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);
    // A Unicode script together with a flag marking it as an "extension".
    struct Script {
        Unicode::Script script {};
        bool is_extension { false };
    };
    // The kinds of value a Unicode property escape can resolve to.
    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>;
    Optional<PropertyEscape> read_unicode_property_escape();
    // Grammar productions.
    bool parse_pattern(ByteCode&, size_t&, ParseFlags);
    bool parse_disjunction(ByteCode&, size_t&, ParseFlags);
    bool parse_alternative(ByteCode&, size_t&, ParseFlags);
    bool parse_term(ByteCode&, size_t&, ParseFlags);
    bool parse_assertion(ByteCode&, size_t&, ParseFlags);
    bool parse_atom(ByteCode&, size_t&, ParseFlags);
    bool parse_quantifier(ByteCode&, size_t&, ParseFlags);
    bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
    bool parse_atom_escape(ByteCode&, size_t&, ParseFlags);
    bool parse_character_class(ByteCode&, size_t&, ParseFlags);
    bool parse_capture_group(ByteCode&, size_t&, ParseFlags);
    Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
    bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, ParseFlags);
    bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
    bool parse_character_escape(Vector<CompareTypeAndValuePair>&, size_t&, ParseFlags);
    // Class-set productions (these take no ParseFlags).
    bool parse_class_set_expression(Vector<CompareTypeAndValuePair>&);
    bool parse_class_union(Vector<CompareTypeAndValuePair>&);
    bool parse_class_intersection(Vector<CompareTypeAndValuePair>&);
    bool parse_class_subtraction(Vector<CompareTypeAndValuePair>&);
    bool parse_class_set_range(Vector<CompareTypeAndValuePair>&);
    bool parse_class_set_operand(Vector<CompareTypeAndValuePair>&);
    bool parse_nested_class(Vector<CompareTypeAndValuePair>&);
    Optional<u32> parse_class_set_character();
    // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
    bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags);
    bool parse_extended_atom(ByteCode&, size_t&, ParseFlags);
    bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, ParseFlags);
    bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
    Optional<u8> parse_legacy_octal_escape();
    bool has_duplicate_in_current_alternative(FlyString const& name);
    size_t ensure_total_number_of_capturing_parenthesis();
    // Snapshots the current token and the lexer position and returns a scope guard whose
    // callback restores both (rewinding the lexer by however far it has advanced since).
    // NOTE(review): the guard presumably fires on scope exit unless the caller disarms it — confirm.
    auto save_parser_state()
    {
        auto saved_token = m_parser_state.current_token;
        auto saved_lexer_index = m_parser_state.lexer.tell();
        return ArmedScopeGuard { [this, saved_token, saved_lexer_index] {
            m_parser_state.current_token = saved_token;
            m_parser_state.lexer.back(m_parser_state.lexer.tell() - saved_lexer_index);
        } };
    }
    // Opens a new, empty capture-group scope.
    void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
    // Closes the innermost scope and appends its groups to the enclosing scope.
    // Requires an enclosing scope to exist; the constructors open the first one.
    void exit_capture_group_scope()
    {
        auto last = m_capture_groups_in_scope.take_last();
        m_capture_groups_in_scope.last().extend(move(last));
    }
    // Inserts a clear-capture-group instruction into `stack` for every group in the innermost scope.
    void clear_all_capture_groups_in_scope(ByteCode& stack)
    {
        for (auto& index : m_capture_groups_in_scope.last())
            stack.insert_bytecode_clear_capture_group(index);
    }
    // Marks every group numbered above `first_group`, up to the current group count, as optional.
    void mark_capture_groups_as_optional_from(size_t first_group)
    {
        for (size_t i = first_group + 1; i <= m_parser_state.capture_groups_count; ++i)
            m_parser_state.optional_capture_groups.set(i);
    }
    // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
    // always match the empty string. So we have to know how many capturing parentheses there are, but we don't want to always
    // parse it twice, so we'll just do so when it's actually needed.
    // Most patterns should have no need to ever populate this field.
    Optional<size_t> m_total_number_of_capturing_parenthesis;
    // We need to keep track of the current alternative's named capture groups, so we can check for duplicates.
    size_t m_current_alternative_id { 0 };
    // Keep the Annex B behavior behind a flag; users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
    bool m_should_use_browser_extended_grammar { false };
    // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
    // by requiring that (...)+ only contain the matches for the last iteration.
    // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
    Vector<Vector<size_t>> m_capture_groups_in_scope;
};
using PosixExtended = PosixExtendedParser;
using PosixBasic = PosixBasicParser;
using ECMA262 = ECMA262Parser;
}
using regex::ECMA262;
using regex::PosixBasic;
using regex::PosixExtended;