Files
ladybird/Libraries/LibRegex/RegexByteCode.h

1028 lines
37 KiB
C++

/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "RegexBytecodeStreamOptimizer.h"
#include "RegexMatch.h"
#include <AK/Concepts.h>
#include <AK/DisjointChunks.h>
#include <AK/Forward.h>
#include <AK/HashMap.h>
#include <AK/Trie.h>
#include <AK/Types.h>
#include <AK/Utf16FlyString.h>
#include <AK/Vector.h>
#include <LibUnicode/Forward.h>
namespace regex {
// Storage type of a single bytecode slot; also used as the underlying type of the bytecode enums.
using ByteCodeValueType = u64;
// X-macro listing every opcode. Define __ENUMERATE_OPCODE(x) before expanding
// ENUMERATE_OPCODES to generate one item per opcode, in list order.
#define ENUMERATE_OPCODES \
__ENUMERATE_OPCODE(Compare) \
__ENUMERATE_OPCODE(Jump) \
__ENUMERATE_OPCODE(JumpNonEmpty) \
__ENUMERATE_OPCODE(ForkJump) \
__ENUMERATE_OPCODE(ForkStay) \
__ENUMERATE_OPCODE(ForkReplaceJump) \
__ENUMERATE_OPCODE(ForkReplaceStay) \
__ENUMERATE_OPCODE(ForkIf) \
__ENUMERATE_OPCODE(FailForks) \
__ENUMERATE_OPCODE(PopSaved) \
__ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
__ENUMERATE_OPCODE(SaveRightCaptureGroup) \
__ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
__ENUMERATE_OPCODE(RSeekTo) \
__ENUMERATE_OPCODE(CheckBegin) \
__ENUMERATE_OPCODE(CheckEnd) \
__ENUMERATE_OPCODE(CheckBoundary) \
__ENUMERATE_OPCODE(Save) \
__ENUMERATE_OPCODE(Restore) \
__ENUMERATE_OPCODE(GoBack) \
__ENUMERATE_OPCODE(SetStepBack) \
__ENUMERATE_OPCODE(IncStepBack) \
__ENUMERATE_OPCODE(CheckStepBack) \
__ENUMERATE_OPCODE(CheckSavedPosition) \
__ENUMERATE_OPCODE(ClearCaptureGroup) \
__ENUMERATE_OPCODE(FailIfEmpty) \
__ENUMERATE_OPCODE(Repeat) \
__ENUMERATE_OPCODE(ResetRepeat) \
__ENUMERATE_OPCODE(Checkpoint) \
__ENUMERATE_OPCODE(CompareSimple) \
__ENUMERATE_OPCODE(SaveModifiers) \
__ENUMERATE_OPCODE(RestoreModifiers) \
__ENUMERATE_OPCODE(Exit)
// clang-format off
// One enumerator per ENUMERATE_OPCODES entry, numbered from 0 in list order.
// First and Last alias the first (Compare) and last (Exit) entries of the list.
enum class OpCodeId : ByteCodeValueType {
#define __ENUMERATE_OPCODE(x) x,
ENUMERATE_OPCODES
#undef __ENUMERATE_OPCODE
First = Compare,
Last = Exit,
};
// clang-format on
// X-macro listing every kind of comparison argument. Define
// __ENUMERATE_CHARACTER_COMPARE_TYPE(x) before expanding it.
#define ENUMERATE_CHARACTER_COMPARE_TYPES \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(And) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Or) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Subtract) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(StringSet)
// One enumerator per ENUMERATE_CHARACTER_COMPARE_TYPES entry, numbered from 0 in list order.
// Stored in bytecode slots, hence the ByteCodeValueType underlying type.
enum class CharacterCompareType : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
ENUMERATE_CHARACTER_COMPARE_TYPES
#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
};
// X-macro listing the supported character classes. Define
// __ENUMERATE_CHARACTER_CLASS(x) before expanding it.
#define ENUMERATE_CHARACTER_CLASSES \
__ENUMERATE_CHARACTER_CLASS(Alnum) \
__ENUMERATE_CHARACTER_CLASS(Cntrl) \
__ENUMERATE_CHARACTER_CLASS(Lower) \
__ENUMERATE_CHARACTER_CLASS(Space) \
__ENUMERATE_CHARACTER_CLASS(Alpha) \
__ENUMERATE_CHARACTER_CLASS(Digit) \
__ENUMERATE_CHARACTER_CLASS(Print) \
__ENUMERATE_CHARACTER_CLASS(Upper) \
__ENUMERATE_CHARACTER_CLASS(Blank) \
__ENUMERATE_CHARACTER_CLASS(Graph) \
__ENUMERATE_CHARACTER_CLASS(Punct) \
__ENUMERATE_CHARACTER_CLASS(Word) \
__ENUMERATE_CHARACTER_CLASS(Xdigit)
// One enumerator per ENUMERATE_CHARACTER_CLASSES entry, numbered from 0 in list order.
enum class CharClass : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_CLASS(x) x,
ENUMERATE_CHARACTER_CLASSES
#undef __ENUMERATE_CHARACTER_CLASS
};
// X-macro listing the boundary check kinds. Define
// __ENUMERATE_BOUNDARY_CHECK_TYPE(x) before expanding it.
#define ENUMERATE_BOUNDARY_CHECK_TYPES \
__ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
__ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
// Operand of the CheckBoundary opcode (see ByteCode::insert_bytecode_check_boundary).
enum class BoundaryCheckType : ByteCodeValueType {
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
ENUMERATE_BOUNDARY_CHECK_TYPES
#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
};
// X-macro listing the ForkIf conditions. Define __ENUMERATE_FORK_IF_CONDITION(x)
// before expanding it. New conditions go before Invalid, which must stay last.
#define ENUMERATE_FORK_IF_CONDITIONS \
__ENUMERATE_FORK_IF_CONDITION(AtStartOfLine) \
__ENUMERATE_FORK_IF_CONDITION(Invalid) /* Must be last */
// One enumerator per ENUMERATE_FORK_IF_CONDITIONS entry, numbered from 0 in list order.
enum class ForkIfCondition : ByteCodeValueType {
#define __ENUMERATE_FORK_IF_CONDITION(x) x,
ENUMERATE_FORK_IF_CONDITIONS
#undef __ENUMERATE_FORK_IF_CONDITION
};
// Inclusive code point range that round-trips through a single bytecode slot:
// `from` lives in the upper 32 bits, `to` in the lower 32 bits.
struct CharRange {
    u32 from;
    u32 to;

    // Unpacks a range from its single-slot encoding.
    CharRange(u64 value)
        : from(static_cast<u32>(value >> 32))
        , to(static_cast<u32>(value & 0xffffffff))
    {
    }

    // Builds a range from its two endpoints.
    CharRange(u32 from, u32 to)
        : from(from)
        , to(to)
    {
    }

    // Packs the range into its single-slot encoding.
    operator ByteCodeValueType() const
    {
        auto const upper_half = static_cast<u64>(from) << 32;
        return upper_half | to;
    }
};
// A single comparison argument: the kind of comparison and its raw bytecode operand.
struct CompareTypeAndValuePair {
CharacterCompareType type;
ByteCodeValueType value;
};
// Counter from which StringTable instances draw their serial (post-incremented on use);
// defined outside this header.
REGEX_API extern u32 s_next_string_table_serial;
template<typename StringType>
struct StringTable {
StringTable()
: m_serial(s_next_string_table_serial++)
{
}
~StringTable()
{
if (m_serial != 0) {
if (m_serial == s_next_string_table_serial - 1 && m_table.is_empty())
--s_next_string_table_serial; // We didn't use this serial, put it back.
}
}
StringTable(StringTable const& other)
{
// Pull a new serial for this copy
m_serial = s_next_string_table_serial++;
m_table = other.m_table;
m_inverse_table = other.m_inverse_table;
}
StringTable(StringTable&& other)
{
m_serial = other.m_serial;
m_table = move(other.m_table);
m_inverse_table = move(other.m_inverse_table);
// Clear other's data to avoid double-deletion of serial
other.m_serial = 0;
}
StringTable& operator=(StringTable const& other)
{
if (this != &other) {
m_serial = s_next_string_table_serial++;
m_table = other.m_table;
m_inverse_table = other.m_inverse_table;
}
return *this;
}
StringTable& operator=(StringTable&& other)
{
if (this != &other) {
m_serial = other.m_serial;
m_table = move(other.m_table);
m_inverse_table = move(other.m_inverse_table);
// Clear other's data to avoid double-deletion of serial
other.m_serial = 0;
}
return *this;
}
ByteCodeValueType set(StringType string)
{
u32 local_index = m_table.size() + 0x4242;
ByteCodeValueType global_index;
if (auto maybe_local_index = m_table.get(string); maybe_local_index.has_value()) {
local_index = maybe_local_index.value();
global_index = static_cast<ByteCodeValueType>(m_serial) << 32 | static_cast<ByteCodeValueType>(local_index);
} else {
global_index = static_cast<ByteCodeValueType>(m_serial) << 32 | static_cast<ByteCodeValueType>(local_index);
m_table.set(string, global_index);
m_inverse_table.set(global_index, string);
}
return global_index;
}
StringType get(ByteCodeValueType index) const
{
return m_inverse_table.get(index).value();
}
u32 m_serial { 0 };
HashMap<StringType, ByteCodeValueType> m_table;
HashMap<ByteCodeValueType, StringType> m_inverse_table;
};
// Trie keyed by u32 units whose per-node flag is set to true on inserted strings.
using StringSetTrie = Trie<u32, bool>;

// Table of string sets, each stored twice: as a trie over code points and as a trie
// over UTF-16 code units. Sets are addressed by ids of the form (serial << 32) | index.
struct REGEX_API StringSetTable {
    StringSetTable();
    ~StringSetTable();
    StringSetTable(StringSetTable const& other);
    StringSetTable(StringSetTable&&) = default;
    StringSetTable& operator=(StringSetTable const& other);
    StringSetTable& operator=(StringSetTable&&) = default;

    // Registers a new set made of `strings` and returns its id.
    ByteCodeValueType set(Vector<String> const& strings)
    {
        u32 const local_index = m_u8_tries.size();
        ByteCodeValueType const global_index = static_cast<ByteCodeValueType>(m_serial) << 32 | static_cast<ByteCodeValueType>(local_index);

        StringSetTrie code_point_trie { 0, false };
        StringSetTrie code_unit_trie { 0, false };

        for (auto const& string : strings) {
            // Decode the string into code points for the first trie.
            Vector<u32> code_points;
            Utf8View code_point_view { string.bytes_as_string_view() };
            for (auto code_point : code_point_view)
                code_points.append(code_point);
            (void)code_point_trie.insert(code_points.begin(), code_points.end(), true, [](auto&, auto) { return false; });

            // Re-encode as UTF-16 and store the individual code units in the second trie.
            auto const utf16_string = Utf16String::from_utf32({ code_points.data(), code_points.size() });
            auto const code_unit_view = utf16_string.utf16_view();
            Vector<u32> code_units;
            for (size_t unit_index = 0; unit_index < code_unit_view.length_in_code_units(); ++unit_index)
                code_units.append(code_unit_view.code_unit_at(unit_index));
            (void)code_unit_trie.insert(code_units.begin(), code_units.end(), true, [](auto&, auto) { return false; });
        }

        m_u8_tries.set(global_index, move(code_point_trie));
        m_u16_tries.set(global_index, move(code_unit_trie));
        return global_index;
    }

    // Returns the code point trie of the set with the given id; the id must exist.
    StringSetTrie const& get_u8_trie(ByteCodeValueType index) const
    {
        return m_u8_tries.get(index).value();
    }

    // Returns the UTF-16 code unit trie of the set with the given id; the id must exist.
    StringSetTrie const& get_u16_trie(ByteCodeValueType index) const
    {
        return m_u16_tries.get(index).value();
    }

    u32 m_serial { 0 };
    HashMap<ByteCodeValueType, StringSetTrie> m_u8_tries;
    HashMap<ByteCodeValueType, StringSetTrie> m_u16_tries;
};
struct ByteCodeBase {
FlyString get_string(size_t index) const { return m_string_table.get(index); }
auto const& string_table() const { return m_string_table; }
auto get_u16_string(size_t index) const { return m_u16_string_table.get(index); }
auto const& u16_string_table() const { return m_u16_string_table; }
auto const& string_set_table() const { return m_string_set_table; }
auto& string_set_table() { return m_string_set_table; }
Optional<size_t> get_group_name_index(size_t group_index) const
{
return m_group_name_mappings.get(group_index);
}
protected:
StringTable<FlyString> m_string_table;
StringTable<Utf16FlyString> m_u16_string_table;
StringSetTable m_string_set_table;
HashMap<size_t, size_t> m_group_name_mappings;
};
// Bytecode under construction: a DisjointChunks<ByteCodeValueType> holding the
// instruction stream, plus the tables inherited from ByteCodeBase.
class REGEX_API ByteCode : public ByteCodeBase
, public DisjointChunks<ByteCodeValueType> {
using Base = DisjointChunks<ByteCodeValueType>;
// FlatByteCode::from() reads the protected tables directly.
friend class FlatByteCode;
public:
using Base::append;
ByteCode() = default;
ByteCode(ByteCode const&) = default;
ByteCode(ByteCode&&) = default;
// Construction and assignment from a bare chunk list are disabled.
ByteCode(Base&&) = delete;
ByteCode(Base const&) = delete;
~ByteCode() = default;
ByteCode& operator=(ByteCode const&) = default;
ByteCode& operator=(ByteCode&&) = default;
ByteCode& operator=(Base&& value) = delete;
ByteCode& operator=(Base const& value) = delete;
// Merges the tables of `other` into this bytecode, then takes over its chunks.
void extend(ByteCode&& other)
{
    Span<ByteCode const> const source { &other, 1 };
    merge_string_tables_from(source);
    Base::extend(move(other));
}

// Merges the tables of `other` into this bytecode, then copies its chunks.
void extend(ByteCode const& other)
{
    Span<ByteCode const> const source { &other, 1 };
    merge_string_tables_from(source);
    Base::extend(other);
}

// Appends a raw vector of values as a chunk of its own; no tables are involved.
template<SameAs<Vector<ByteCodeValueType>> T>
void extend(T other)
{
    Base::append(move(other));
}
// Constructs a value in place at the end of the last chunk, creating a chunk if none exists.
template<typename... Args>
void empend(Args&&... args)
{
    if (is_empty())
        Base::append({});
    auto& tail_chunk = Base::last_chunk();
    tail_chunk.empend(forward<Args>(args)...);
}

// Appends a value to the last chunk, creating a chunk if none exists.
template<typename T>
void append(T&& value)
{
    if (is_empty())
        Base::append({});
    auto& tail_chunk = Base::last_chunk();
    tail_chunk.append(forward<T>(value));
}

// Inserts a value at the front of the first chunk; on empty bytecode this is an append.
template<typename T>
void prepend(T&& value)
{
    if (is_empty()) {
        append(forward<T>(value));
        return;
    }
    auto& head_chunk = Base::first_chunk();
    head_chunk.prepend(forward<T>(value));
}
// Appends every value of `value` to the last chunk, creating a chunk if none exists.
void append(Span<ByteCodeValueType const> value)
{
    if (is_empty())
        Base::append({});
    auto& last = Base::last_chunk();
    // ensure_capacity() takes the required *total* capacity, so the elements already
    // stored in the chunk must be accounted for; otherwise the unchecked appends
    // below could run past the reserved storage of a non-empty chunk.
    last.ensure_capacity(last.size() + value.size());
    for (auto v : value)
        last.unchecked_append(v);
}
// Forwards the capacity request to the last chunk, creating a chunk if none exists.
void ensure_capacity(size_t capacity)
{
    if (is_empty())
        Base::append({});
    auto& tail_chunk = Base::last_chunk();
    tail_chunk.ensure_capacity(capacity);
}

// Direct chunk access is not part of ByteCode's public interface.
void last_chunk() const = delete;
void first_chunk() const = delete;
// Imports the string tables, string set tries and group name mappings of every
// bytecode in `others`, so that ids embedded in their instructions stay resolvable
// once those instructions are appended to this bytecode.
//
// Aborts (after logging both strings) if an imported id is already bound to a
// different string in this bytecode.
void merge_string_tables_from(Span<ByteCode const> others)
{
    // Merges one string table into another. Shared by the FlyString and the
    // Utf16FlyString tables, which only differ in their string type.
    auto merge_string_table = [](auto& into, auto const& from) {
        for (auto const& entry : from.m_table) {
            // Look the id up *before* inserting: HashMap::set() replaces an existing
            // entry, so checking afterwards would compare the new string with itself
            // and could never detect a clash.
            if (auto existing = into.m_inverse_table.get(entry.value); existing.has_value()) {
                if (existing == entry.key) // Already in inverse table.
                    continue;
                dbgln("StringTable: Detected ID clash in string tables! ID {} seems to be reused", entry.value);
                dbgln("Old: {}, New: {}", existing, entry.key);
                VERIFY_NOT_REACHED();
            }
            into.m_inverse_table.set(entry.value, entry.key);
            into.m_table.set(entry.key, entry.value);
        }
        // Also carry over ids that only exist in the inverse table of `from`.
        into.m_inverse_table.update(from.m_inverse_table);
    };

    for (auto const& other : others) {
        merge_string_table(m_string_table, other.m_string_table);
        merge_string_table(m_u16_string_table, other.m_u16_string_table);

        // deep_copy() is not callable on a const trie, hence the const_cast.
        for (auto const& entry : other.m_string_set_table.m_u8_tries)
            m_string_set_table.m_u8_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
        for (auto const& entry : other.m_string_set_table.m_u16_tries)
            m_string_set_table.m_u16_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));

        for (auto const& mapping : other.m_group_name_mappings)
            m_group_name_mappings.set(mapping.key, mapping.value);
    }
}
// Appends a comparison built from `pairs`; the actual encoding is delegated to
// Optimizer::append_character_class().
void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
{
Optimizer::append_character_class(*this, move(pairs));
}
// Appends `CheckBoundary <type>`, built as a bytecode of its own and then merged in.
void insert_bytecode_check_boundary(BoundaryCheckType type)
{
    ByteCode boundary_check;
    boundary_check.empend(static_cast<ByteCodeValueType>(OpCodeId::CheckBoundary));
    boundary_check.empend(static_cast<ByteCodeValueType>(type));
    extend(move(boundary_check));
}
// Appends `ClearCaptureGroup <index>`.
void insert_bytecode_clear_capture_group(size_t index)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup);
    empend(opcode);
    empend(index);
}
// Appends a Compare instruction with a single String argument; the string itself is
// registered in the UTF-16 string table and referenced by id.
void insert_bytecode_compare_string(Utf16FlyString string)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::Compare);
    auto const compare_type = static_cast<ByteCodeValueType>(CharacterCompareType::String);

    empend(opcode);
    empend(static_cast<u64>(1)); // number of arguments
    empend(static_cast<u64>(2)); // size of arguments
    empend(compare_type);
    auto const string_id = m_u16_string_table.set(move(string));
    empend(string_id);
}
// Appends `SaveLeftCaptureGroup <capture_groups_count>`.
void insert_bytecode_group_capture_left(size_t capture_groups_count)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup);
    empend(opcode);
    empend(capture_groups_count);
}

// Appends `SaveRightCaptureGroup <capture_groups_count>`.
void insert_bytecode_group_capture_right(size_t capture_groups_count)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup);
    empend(opcode);
    empend(capture_groups_count);
}

// Appends `SaveRightNamedCaptureGroup <name id> <capture_groups_count>` and records
// the name id under group index `capture_groups_count - 1`.
void insert_bytecode_group_capture_right(size_t capture_groups_count, FlyString name)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup);
    empend(opcode);
    auto const name_id = m_string_table.set(move(name));
    empend(name_id);
    empend(capture_groups_count);
    m_group_name_mappings.set(capture_groups_count - 1, name_id);
}
// Appends `SaveModifiers <new_modifiers>`.
void insert_bytecode_save_modifiers(FlagsUnderlyingType new_modifiers)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::SaveModifiers);
    auto const operand = static_cast<ByteCodeValueType>(new_modifiers);
    empend(opcode);
    empend(operand);
}

// Appends the operand-less `RestoreModifiers` instruction.
void insert_bytecode_restore_modifiers()
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::RestoreModifiers);
    empend(opcode);
}
// Selects which instruction layout insert_bytecode_lookaround() emits.
enum class LookAroundType {
LookAhead,
LookBehind,
NegatedLookAhead,
NegatedLookBehind,
};
// Appends the instructions implementing a lookaround assertion around `lookaround_body`.
//
// - type: which of the four layouts (documented in each case below) to emit.
// - match_length: only used by the look-behind layouts; emitted as the operand of
//   SetStepBack (minus one) for LookBehind and of GoBack for NegatedLookBehind.
// - greedy_lookaround: only used by LookBehind; appends an extra ForkJump after the body.
//
// All jump operands are relative offsets, so the exact emission order below matters.
void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0, bool greedy_lookaround = true)
{
// FIXME: The save stack will grow infinitely with repeated failures
// as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
switch (type) {
case LookAroundType::LookAhead: {
// SAVE
// FORKJUMP _BODY
// POPSAVED
// LABEL _BODY
// REGEXP BODY
// RESTORE
empend((ByteCodeValueType)OpCodeId::Save);
empend((ByteCodeValueType)OpCodeId::ForkJump);
empend((ByteCodeValueType)1);
empend((ByteCodeValueType)OpCodeId::PopSaved);
extend(move(lookaround_body));
empend((ByteCodeValueType)OpCodeId::Restore);
return;
}
case LookAroundType::NegatedLookAhead: {
// JUMP _A
// LABEL _L
// REGEXP BODY
// FAIL
// LABEL _A
// SAVE
// FORKJUMP _L
// RESTORE
// The body size must be read before the body is moved into this bytecode.
auto body_length = lookaround_body.size();
empend((ByteCodeValueType)OpCodeId::Jump);
empend((ByteCodeValueType)body_length + 1); // JUMP to label _A
extend(move(lookaround_body));
empend((ByteCodeValueType)OpCodeId::FailForks);
empend((ByteCodeValueType)OpCodeId::Save);
empend((ByteCodeValueType)OpCodeId::ForkJump);
empend((ByteCodeValueType) - (body_length + 4)); // JUMP to label _L
empend((ByteCodeValueType)OpCodeId::Restore);
return;
}
case LookAroundType::LookBehind: {
// SAVE
// SET_STEPBACK match_length(BODY)-1
// LABEL _START
// INC_STEPBACK
// FORK_JUMP _BODY
// CHECK_STEPBACK
// JUMP _START
// LABEL _BODY
// REGEX BODY
// CHECK_SAVED_POSITION
// RESTORE
// The body size must be read before the body is moved into this bytecode.
auto body_length = lookaround_body.size();
empend((ByteCodeValueType)OpCodeId::Save);
empend((ByteCodeValueType)OpCodeId::SetStepBack);
// NOTE(review): this is an unsigned subtraction, so the default match_length of 0
// wraps around to the maximum value - confirm callers always pass a length here.
empend((ByteCodeValueType)match_length - 1);
empend((ByteCodeValueType)OpCodeId::IncStepBack);
empend((ByteCodeValueType)OpCodeId::ForkJump);
empend((ByteCodeValueType)1 + 2); // JUMP to label _BODY
empend((ByteCodeValueType)OpCodeId::CheckStepBack);
empend((ByteCodeValueType)OpCodeId::Jump);
empend((ByteCodeValueType)-6); // JUMP to label _START
extend(move(lookaround_body));
// Not shown in the layout above: with greedy_lookaround, a FORKJUMP with a negative
// offset follows the body. The offset skips back over this instruction (2), the body
// and the same 6 slots as the JUMP above, i.e. it appears to target _START as well.
if (greedy_lookaround) {
empend((ByteCodeValueType)OpCodeId::ForkJump);
empend((ByteCodeValueType)(0 - 2 - body_length - 6));
}
empend((ByteCodeValueType)OpCodeId::CheckSavedPosition);
empend((ByteCodeValueType)OpCodeId::Restore);
return;
}
case LookAroundType::NegatedLookBehind: {
// JUMP _A
// LABEL _L
// GOBACK match_length(BODY)
// REGEXP BODY
// FAIL
// LABEL _A
// SAVE
// FORKJUMP _L
// RESTORE
// The body size must be read before the body is moved into this bytecode.
auto body_length = lookaround_body.size();
empend((ByteCodeValueType)OpCodeId::Jump);
empend((ByteCodeValueType)body_length + 3); // JUMP to label _A
empend((ByteCodeValueType)OpCodeId::GoBack);
empend((ByteCodeValueType)match_length);
extend(move(lookaround_body));
empend((ByteCodeValueType)OpCodeId::FailForks);
empend((ByteCodeValueType)OpCodeId::Save);
empend((ByteCodeValueType)OpCodeId::ForkJump);
empend((ByteCodeValueType) - (body_length + 6)); // JUMP to label _L
empend((ByteCodeValueType)OpCodeId::Restore);
return;
}
}
// Every enumerator returns above; reaching this point means `type` held an invalid value.
VERIFY_NOT_REACHED();
}
// Appends an alternation between `left` and `right`. The layout below is the
// conceptual one; the actual emission is delegated to Optimizer::append_alternation().
void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
{
// FORKJUMP _ALT
// REGEXP ALT2
// JUMP _END
// LABEL _ALT
// REGEXP ALT1
// LABEL _END
// Optimisation: Eliminate extra work by unifying common pre-and-postfix exprs.
Optimizer::append_alternation(*this, move(left), move(right));
}
// Rewrites `bytecode_to_repeat` in place so that it matches between `minimum` and
// `maximum` repetitions of its original contents (unbounded if `maximum` is empty).
//
// - {0,}, {1,} and {0,1} are forwarded to the dedicated transforms.
// - min_repetition_mark_id / max_repetition_mark_id: ids emitted as operands of the
//   Repeat / ResetRepeat instructions of the mandatory and the optional part.
// - greedy: selects which fork opcode is emitted for the optional iterations.
//
// Fork targets are emitted as placeholders and patched once the final size is known,
// so the emission order below must not be changed.
template<Integral T>
static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t min_repetition_mark_id, size_t max_repetition_mark_id, bool greedy = true)
{
if (!maximum.has_value()) {
if (minimum == 0)
return transform_bytecode_repetition_any(bytecode_to_repeat, greedy);
if (minimum == 1)
return transform_bytecode_repetition_min_one(bytecode_to_repeat, greedy);
}
if (minimum == 0 && maximum.has_value() && maximum.value() == 1) {
return transform_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy);
}
ByteCode new_bytecode;
// Mandatory part: `minimum` copies' worth of matching (nothing is emitted for 0).
new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id);
if (maximum.has_value()) {
// (REPEAT REGEXP MIN)
// LABEL _MAX_LOOP |
// FORK END |
// CHECKPOINT (if min==0) |
// REGEXP |
// FAILIFEMPTY (if min==0) |
// REPEAT _MAX_LOOP MAX-MIN | if max > min
// FORK END |
// CHECKPOINT (if min==0) |
// REGEXP |
// FAILIFEMPTY (if min==0) |
// LABEL END |
// RESET _MAX_LOOP |
auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump);
if (maximum.value() > minimum) {
new_bytecode.empend(jump_kind);
new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
// Index of the slot right after the placeholder; the fork offset is relative to it.
auto pre_loop_fork_jump_index = new_bytecode.size();
auto checkpoint1 = minimum == 0 ? s_next_checkpoint_serial_id++ : 0;
if (minimum == 0) {
new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint));
new_bytecode.empend(static_cast<ByteCodeValueType>(checkpoint1));
}
new_bytecode.extend(bytecode_to_repeat);
if (minimum == 0) {
new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::FailIfEmpty));
new_bytecode.empend(checkpoint1);
}
auto repetitions = maximum.value() - minimum;
// End of the optional part if only one optional iteration is needed; updated below otherwise.
auto fork_jump_address = new_bytecode.size();
if (repetitions > 1) {
auto repeated_bytecode_size = bytecode_to_repeat.size();
if (minimum == 0)
repeated_bytecode_size += 4; // Checkpoint + FailIfEmpty
new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat);
new_bytecode.empend(repeated_bytecode_size + 2);
new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1));
new_bytecode.empend(max_repetition_mark_id);
new_bytecode.empend(jump_kind);
new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
auto post_loop_fork_jump_index = new_bytecode.size();
auto checkpoint2 = minimum == 0 ? s_next_checkpoint_serial_id++ : 0;
if (minimum == 0) {
new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint));
new_bytecode.empend(static_cast<ByteCodeValueType>(checkpoint2));
}
new_bytecode.extend(bytecode_to_repeat);
if (minimum == 0) {
new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::FailIfEmpty));
new_bytecode.empend(checkpoint2);
}
fork_jump_address = new_bytecode.size();
// Patch the second fork's placeholder now that END is known.
new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index);
new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat);
new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id);
}
// Patch the first fork's placeholder with the distance to END.
new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index);
}
} else {
// no maximum value set, repeat finding if possible:
// (REPEAT REGEXP MIN)
// LABEL _START
// CHECKPOINT _C
// REGEXP
// JUMP_NONEMPTY _C _START FORK
// Note: This is only safe because REPEAT will leave one iteration outside (see repetition_n)
auto checkpoint = s_next_checkpoint_serial_id++;
// Both inserts go in front of the trailing copy of the body; the second insert
// lands right after the first because the size has grown by one in between.
new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)OpCodeId::Checkpoint);
new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)checkpoint);
auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay);
new_bytecode.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 2); // Jump to the last iteration
new_bytecode.empend(checkpoint); // if _C is not empty.
new_bytecode.empend(jump_kind);
}
bytecode_to_repeat = move(new_bytecode);
}
// Appends bytecode matching `bytecode_to_repeat` exactly `n` times; emits nothing for n == 0.
//
// Layout:
//     LABEL _LOOP
//     REGEXP
//     REPEAT _LOOP N-1
//     REGEXP
//
// The trailing REGEXP sits outside the REPEAT loop on purpose: callers may loop over
// that last iteration themselves without REPEAT forcing another pass.
template<Integral T>
void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, T n, size_t repetition_mark_id)
{
    if (n == 0)
        return;

    extend(bytecode_to_repeat);

    // A single repetition needs neither the REPEAT instruction nor a second copy.
    if (!(n > 1))
        return;

    auto const repeat_opcode = static_cast<ByteCodeValueType>(OpCodeId::Repeat);
    empend(repeat_opcode);
    empend(bytecode_to_repeat.size());
    empend(static_cast<ByteCodeValueType>(n - 1));
    empend(repetition_mark_id);
    extend(bytecode_to_repeat);
}
// Rewrites `bytecode_to_repeat` in place into a "one or more" loop:
//
//     LABEL _START
//     CHECKPOINT _C
//     REGEXP
//     JUMP_NONEMPTY _C _START <fork kind>
//
// The fork kind is ForkJump when greedy and ForkStay otherwise.
static void transform_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
{
    auto const checkpoint_id = s_next_checkpoint_serial_id++;

    // Prepending the operand first leaves `Checkpoint <id>` at the very front.
    bytecode_to_repeat.prepend(static_cast<ByteCodeValueType>(checkpoint_id));
    bytecode_to_repeat.prepend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint));

    bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::JumpNonEmpty));
    // Measured with the JumpNonEmpty opcode already emitted; the extra 3 accounts for
    // its three operands, so the jump lands on _START.
    auto const offset_to_start = -bytecode_to_repeat.size() - 3;
    bytecode_to_repeat.empend(offset_to_start);
    bytecode_to_repeat.empend(checkpoint_id); // ...only taken if _C is not empty

    auto const fork_kind = greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay;
    bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(fork_kind));
}
// Rewrites `bytecode_to_repeat` in place into a "zero or more" loop:
//
//     LABEL _START
//     FORKJUMP _END  (FORKSTAY -> Greedy)
//     CHECKPOINT _C
//     REGEXP
//     FAILIFEMPTY _C
//     JUMP_NONEMPTY _C _START JUMP
//     LABEL _END
static void transform_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
{
    ByteCode bytecode;

    if (greedy)
        bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
    else
        bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));

    // Skips Checkpoint (2) + body + FailIfEmpty (2) + JumpNonEmpty (4).
    // The body size has to be read here, before the body is moved below.
    bytecode.empend(bytecode_to_repeat.size() + 2 + 4 + 2); // Jump to the _END label

    auto checkpoint = s_next_checkpoint_serial_id++;
    bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint));
    bytecode.empend(static_cast<ByteCodeValueType>(checkpoint));

    // The old body is overwritten at the end of this function, so move it instead of
    // deep-copying it (same as transform_bytecode_repetition_zero_or_one()).
    bytecode.extend(move(bytecode_to_repeat));

    bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::FailIfEmpty));
    bytecode.empend(checkpoint);

    bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::JumpNonEmpty));
    bytecode.empend(-bytecode.size() - 3); // Jump(...) to the _START label...
    bytecode.empend(checkpoint);           // ...only if _C passes.
    bytecode.empend((ByteCodeValueType)OpCodeId::Jump);
    // LABEL _END = bytecode.size()

    bytecode_to_repeat = move(bytecode);
}
// Rewrites `bytecode_to_repeat` in place into an optional match:
//
//     <fork kind> _END
//     CHECKPOINT _C
//     REGEXP
//     FAILIFEMPTY _C
//     LABEL _END
//
// The fork kind is ForkStay when greedy and ForkJump otherwise.
static void transform_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
{
    ByteCode optional_bytecode;

    auto const fork_kind = greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump;
    optional_bytecode.empend(static_cast<ByteCodeValueType>(fork_kind));
    // Skips Checkpoint (2) + body + FailIfEmpty (2); read before the body is moved away.
    optional_bytecode.empend(bytecode_to_repeat.size() + 4);

    auto checkpoint_id = s_next_checkpoint_serial_id++;
    optional_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint));
    optional_bytecode.empend(static_cast<ByteCodeValueType>(checkpoint_id));

    optional_bytecode.extend(move(bytecode_to_repeat));

    optional_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::FailIfEmpty));
    optional_bytecode.empend(checkpoint_id);
    // _END is the end of the emitted bytecode.

    bytecode_to_repeat = move(optional_bytecode);
}
// Restarts checkpoint id allocation at 0.
static void reset_checkpoint_serial_id() { s_next_checkpoint_serial_id = 0; }
private:
// Next checkpoint id handed out by the repetition transforms (post-incremented on use).
static size_t s_next_checkpoint_serial_id;
};
// Bytecode stored as one contiguous vector, together with the tables taken over from
// the ByteCode it was built from.
class REGEX_API FlatByteCode : public ByteCodeBase {
public:
    // Takes over the first chunk and all tables of `bytecode`.
    // NOTE(review): only the first chunk is used, so `bytecode` presumably has to be
    // flattened into a single chunk by the caller beforehand - confirm.
    static FlatByteCode from(ByteCode&& bytecode)
    {
        FlatByteCode result;
        if (!bytecode.is_empty()) {
            // ByteCode deletes first_chunk(), so go through the chunk list base class.
            auto& chunks = static_cast<DisjointChunks<ByteCodeValueType>&>(bytecode);
            result.m_data = move(chunks.first_chunk());
        }
        result.m_string_table = move(bytecode.m_string_table);
        result.m_u16_string_table = move(bytecode.m_u16_string_table);
        result.m_string_set_table = move(bytecode.m_string_set_table);
        result.m_group_name_mappings = move(bytecode.m_group_name_mappings);
        return result;
    }

    Span<ByteCodeValueType const> flat_data() const { return m_data.span(); }

    // Element access through the raw data pointer.
    auto& at(size_t index) { return m_data.data()[index]; }
    auto const& at(size_t index) const { return m_data.data()[index]; }
    auto& operator[](size_t index) { return at(index); }
    auto const& operator[](size_t index) const { return at(index); }

    auto size() const { return m_data.size(); }
    auto begin() const { return m_data.begin(); }
    auto end() const { return m_data.end(); }

private:
    Vector<ByteCodeValueType> m_data;
};
// X-macro listing the possible execution results. Define
// __ENUMERATE_EXECUTION_RESULT(x) before expanding it.
#define ENUMERATE_EXECUTION_RESULTS \
__ENUMERATE_EXECUTION_RESULT(Continue) \
__ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
__ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
__ENUMERATE_EXECUTION_RESULT(Failed) \
__ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
__ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForksButNoFurtherPossibleMatches) \
__ENUMERATE_EXECUTION_RESULT(Succeeded)
// One enumerator per ENUMERATE_EXECUTION_RESULTS entry, numbered from 0 in list order.
enum class ExecutionResult : u8 {
#define __ENUMERATE_EXECUTION_RESULT(x) x,
ENUMERATE_EXECUTION_RESULTS
#undef __ENUMERATE_EXECUTION_RESULT
};
// Name lookups for the enums above; defined outside this header.
StringView execution_result_name(ExecutionResult result);
REGEX_API StringView opcode_id_name(OpCodeId opcode_id);
StringView boundary_check_type_name(BoundaryCheckType);
StringView character_compare_type_name(CharacterCompareType result);
StringView character_class_name(CharClass ch_class);
StringView fork_if_condition_name(ForkIfCondition condition);
// Slot offsets of each opcode's operands, relative to the slot holding the opcode
// itself (e.g. opcode_size() reads data[ip + OpArgs::Compare::arguments_size]).
// The offsets follow the order in which the ByteCode emitters append the operands.
namespace OpArgs {
struct Jump {
static constexpr size_t offset = 1;
};
struct GoBack {
static constexpr size_t count = 1;
};
struct SetStepBack {
static constexpr size_t step = 1;
};
struct CheckBoundary {
static constexpr size_t type = 1;
};
struct ClearCaptureGroup {
static constexpr size_t id = 1;
};
struct FailIfEmpty {
static constexpr size_t checkpoint = 1;
};
struct SaveLeftCaptureGroup {
static constexpr size_t id = 1;
};
struct SaveRightCaptureGroup {
static constexpr size_t id = 1;
};
struct SaveModifiers {
static constexpr size_t new_modifiers = 1;
};
struct ResetRepeat {
static constexpr size_t id = 1;
};
struct Checkpoint {
static constexpr size_t id = 1;
};
struct RSeekTo {
static constexpr size_t ch = 1;
};
// Two operands: the name's string id, then the group id.
struct SaveRightNamedCaptureGroup {
static constexpr size_t name_index = 1;
static constexpr size_t id = 2;
};
// Three operands: jump offset, repetition count, repetition mark id.
struct Repeat {
static constexpr size_t offset = 1;
static constexpr size_t count = 2;
static constexpr size_t id = 3;
};
// Three operands: jump offset, checkpoint id, and the opcode to jump with.
struct JumpNonEmpty {
static constexpr size_t offset = 1;
static constexpr size_t checkpoint = 2;
static constexpr size_t form = 3;
};
struct ForkIf {
static constexpr size_t offset = 1;
static constexpr size_t form = 2;
static constexpr size_t condition = 3;
};
// Variable-length: two header operands, then arguments_size slots of argument data.
struct Compare {
static constexpr size_t arguments_count = 1;
static constexpr size_t arguments_size = 2;
static constexpr size_t data_start = 3;
};
// Variable-length: one header operand, then arguments_size slots of argument data.
struct CompareSimple {
static constexpr size_t arguments_size = 1;
static constexpr size_t data_start = 2;
};
}
// Returns the number of slots occupied by the instruction `id` located at `data[ip]`,
// opcode slot included. Only the two Compare variants need to look at `data`: their
// size is the header plus the argument size stored in the instruction itself.
inline size_t opcode_size(OpCodeId id, ByteCodeValueType const* data, size_t ip)
{
    constexpr size_t opcode_only = 1;
    constexpr size_t one_operand = 2;
    constexpr size_t two_operands = 3;
    constexpr size_t three_operands = 4;

    switch (id) {
    // Variable-length instructions.
    case OpCodeId::Compare: {
        auto const arguments_size = data[ip + OpArgs::Compare::arguments_size];
        return arguments_size + OpArgs::Compare::data_start;
    }
    case OpCodeId::CompareSimple: {
        auto const arguments_size = data[ip + OpArgs::CompareSimple::arguments_size];
        return OpArgs::CompareSimple::data_start + arguments_size;
    }

    // Fixed-length instructions, grouped by operand count.
    case OpCodeId::Repeat:
    case OpCodeId::JumpNonEmpty:
    case OpCodeId::ForkIf:
        return three_operands;

    case OpCodeId::SaveRightNamedCaptureGroup:
        return two_operands;

    case OpCodeId::Jump:
    case OpCodeId::ForkJump:
    case OpCodeId::ForkStay:
    case OpCodeId::ForkReplaceJump:
    case OpCodeId::ForkReplaceStay:
    case OpCodeId::GoBack:
    case OpCodeId::SetStepBack:
    case OpCodeId::CheckBoundary:
    case OpCodeId::ClearCaptureGroup:
    case OpCodeId::FailIfEmpty:
    case OpCodeId::SaveLeftCaptureGroup:
    case OpCodeId::SaveRightCaptureGroup:
    case OpCodeId::SaveModifiers:
    case OpCodeId::ResetRepeat:
    case OpCodeId::Checkpoint:
    case OpCodeId::RSeekTo:
        return one_operand;

    case OpCodeId::Exit:
    case OpCodeId::FailForks:
    case OpCodeId::PopSaved:
    case OpCodeId::Save:
    case OpCodeId::Restore:
    case OpCodeId::IncStepBack:
    case OpCodeId::CheckStepBack:
    case OpCodeId::CheckSavedPosition:
    case OpCodeId::CheckBegin:
    case OpCodeId::CheckEnd:
    case OpCodeId::RestoreModifiers:
        return opcode_only;
    }
    // Every enumerator is handled above; an out-of-range id ends up here.
    VERIFY_NOT_REACHED();
}
// Helpers operating on raw bytecode (`data`, with `ip` the index of the instruction);
// all of them are defined outside this header.
Vector<CompareTypeAndValuePair> flat_compares_at(ByteCodeValueType const* data, size_t ip, bool is_simple);
bool matches_character_class(CharClass, u32 ch, bool insensitive, bool unicode_mode);
REGEX_API ByteString opcode_arguments_string(OpCodeId id, ByteCodeValueType const* data, size_t ip, MatchState const& state, ByteCodeBase const& bytecode);
REGEX_API Vector<ByteString> compare_variable_arguments_to_byte_string(ByteCodeValueType const* data, size_t ip, MatchState const& state, ByteCodeBase const& bytecode, Optional<MatchInput const&> input = {});
}