mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-26 01:35:08 +02:00
Compared to the old boost-style combine, this reduces collisions by 36% after folding for ranges SP=0..300, IP=0..750 and reps=0..50.
577 lines
20 KiB
C++
577 lines
20 KiB
C++
/*
|
|
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "Forward.h"
|
|
#include "RegexOptions.h"
|
|
|
|
#include <AK/ByteString.h>
|
|
#include <AK/COWVector.h>
|
|
#include <AK/Error.h>
|
|
#include <AK/FlyString.h>
|
|
#include <AK/MemMem.h>
|
|
#include <AK/StringBuilder.h>
|
|
#include <AK/StringView.h>
|
|
#include <AK/UnicodeUtils.h>
|
|
#include <AK/Utf16String.h>
|
|
#include <AK/Utf16View.h>
|
|
#include <AK/Utf32View.h>
|
|
#include <AK/Utf8View.h>
|
|
#include <AK/Variant.h>
|
|
#include <AK/Vector.h>
|
|
#include <LibUnicode/CharacterTypes.h>
|
|
|
|
namespace regex {
|
|
|
|
class RegexStringView {
|
|
public:
|
|
RegexStringView() = default;
|
|
|
|
RegexStringView(String const& string)
|
|
: m_view(string.bytes_as_string_view())
|
|
{
|
|
}
|
|
|
|
RegexStringView(StringView const view)
|
|
: m_view(view)
|
|
{
|
|
}
|
|
|
|
RegexStringView(Utf16View view)
|
|
: m_view(view)
|
|
{
|
|
}
|
|
|
|
RegexStringView(String&&) = delete;
|
|
|
|
Utf16View const& u16_view() const
|
|
{
|
|
return m_view.get<Utf16View>();
|
|
}
|
|
|
|
bool is_u16_view() const
|
|
{
|
|
return m_view.has<Utf16View>();
|
|
}
|
|
|
|
bool unicode() const { return m_unicode; }
|
|
void set_unicode(bool unicode) { m_unicode = unicode; }
|
|
|
|
bool is_empty() const
|
|
{
|
|
return m_view.visit([](auto& view) { return view.is_empty(); });
|
|
}
|
|
|
|
bool is_null() const
|
|
{
|
|
return m_view.visit([](auto& view) { return view.is_null(); });
|
|
}
|
|
|
|
size_t length() const
|
|
{
|
|
if (unicode()) {
|
|
return m_view.visit(
|
|
[](Utf16View const& view) { return view.length_in_code_points(); },
|
|
[](auto const& view) { return view.length(); });
|
|
}
|
|
|
|
return length_in_code_units();
|
|
}
|
|
|
|
size_t length_in_code_units() const
|
|
{
|
|
return m_view.visit(
|
|
[](Utf16View const& view) { return view.length_in_code_units(); },
|
|
[](auto const& view) { return view.length(); });
|
|
}
|
|
|
|
size_t length_of_code_point(u32 code_point) const
|
|
{
|
|
return m_view.visit(
|
|
[&](Utf16View const&) {
|
|
if (code_point < 0x10000)
|
|
return 1;
|
|
return 2;
|
|
},
|
|
[&](auto const&) {
|
|
if (code_point <= 0x7f)
|
|
return 1;
|
|
if (code_point <= 0x07ff)
|
|
return 2;
|
|
if (code_point <= 0xffff)
|
|
return 3;
|
|
return 4;
|
|
});
|
|
}
|
|
|
|
RegexStringView typed_null_view()
|
|
{
|
|
auto view = m_view.visit(
|
|
[&]<typename T>(T const&) {
|
|
return RegexStringView { T {} };
|
|
});
|
|
view.set_unicode(unicode());
|
|
return view;
|
|
}
|
|
|
|
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16String& optional_utf16_storage) const
|
|
{
|
|
auto view = m_view.visit(
|
|
[&optional_string_storage, data]<typename T>(T const&) {
|
|
StringBuilder builder;
|
|
for (auto ch : data)
|
|
builder.append(ch); // Note: The type conversion is intentional.
|
|
optional_string_storage = builder.to_byte_string();
|
|
return RegexStringView { T { *optional_string_storage } };
|
|
},
|
|
[&optional_utf16_storage, data](Utf16View) {
|
|
optional_utf16_storage = Utf16String::from_utf32({ data.data(), data.size() });
|
|
return RegexStringView { optional_utf16_storage.utf16_view() };
|
|
});
|
|
|
|
view.set_unicode(unicode());
|
|
return view;
|
|
}
|
|
|
|
Vector<RegexStringView> lines() const
|
|
{
|
|
return m_view.visit(
|
|
[](StringView view) {
|
|
auto views = view.lines(StringView::ConsiderCarriageReturn::No);
|
|
Vector<RegexStringView> new_views;
|
|
for (auto& view : views)
|
|
new_views.empend(view);
|
|
return new_views;
|
|
},
|
|
[](Utf16View view) {
|
|
if (view.is_empty())
|
|
return Vector<RegexStringView> { view };
|
|
|
|
Vector<RegexStringView> views;
|
|
while (!view.is_empty()) {
|
|
auto position = view.find_code_unit_offset(u'\n');
|
|
if (!position.has_value())
|
|
break;
|
|
auto offset = position.value() / sizeof(u16);
|
|
views.empend(view.substring_view(0, offset));
|
|
view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
|
|
}
|
|
if (!view.is_empty())
|
|
views.empend(view);
|
|
return views;
|
|
});
|
|
}
|
|
|
|
RegexStringView substring_view(size_t offset, size_t length) const
|
|
{
|
|
if (unicode()) {
|
|
auto view = m_view.visit(
|
|
[&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
|
|
[&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
|
|
|
|
view.set_unicode(unicode());
|
|
return view;
|
|
}
|
|
|
|
auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
|
|
view.set_unicode(unicode());
|
|
return view;
|
|
}
|
|
|
|
ByteString to_byte_string() const
|
|
{
|
|
return m_view.visit(
|
|
[](StringView view) { return view.to_byte_string(); },
|
|
[](Utf16View view) { return view.to_byte_string().release_value_but_fixme_should_propagate_errors(); });
|
|
}
|
|
|
|
ErrorOr<String> to_string() const
|
|
{
|
|
return m_view.visit(
|
|
[](StringView view) { return String::from_utf8(view); },
|
|
[](Utf16View view) { return view.to_utf8(); });
|
|
}
|
|
|
|
u32 code_point_at(size_t code_unit_index) const
|
|
{
|
|
return m_view.visit(
|
|
[&](StringView view) -> u32 {
|
|
auto ch = view[code_unit_index];
|
|
if constexpr (IsSigned<char>) {
|
|
if (ch < 0)
|
|
return 256u + ch;
|
|
return ch;
|
|
}
|
|
},
|
|
[&](Utf16View const& view) -> u32 { return view.code_point_at(code_unit_index); });
|
|
}
|
|
|
|
// Returns the code point at the code unit offset if the Unicode flag is set. Otherwise, returns the code unit.
|
|
u32 unicode_aware_code_point_at(size_t code_unit_index) const
|
|
{
|
|
if (unicode())
|
|
return code_point_at(code_unit_index);
|
|
|
|
return m_view.visit(
|
|
[&](StringView view) -> u32 {
|
|
auto ch = view[code_unit_index];
|
|
if constexpr (IsSigned<char>) {
|
|
if (ch < 0)
|
|
return 256u + ch;
|
|
return ch;
|
|
}
|
|
},
|
|
[&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); });
|
|
}
|
|
|
|
size_t code_unit_offset_of(size_t code_point_index) const
|
|
{
|
|
return m_view.visit(
|
|
[&](StringView view) -> u32 {
|
|
Utf8View utf8_view { view };
|
|
return utf8_view.byte_offset_of(code_point_index);
|
|
},
|
|
[&](Utf16View const& view) -> u32 {
|
|
return view.code_unit_offset_of(code_point_index);
|
|
});
|
|
}
|
|
|
|
bool operator==(char const* cstring) const
|
|
{
|
|
return m_view.visit(
|
|
[&](Utf16View) { return to_byte_string() == cstring; },
|
|
[&](StringView view) { return view == cstring; });
|
|
}
|
|
|
|
bool operator==(StringView string) const
|
|
{
|
|
return m_view.visit(
|
|
[&](Utf16View) { return to_byte_string() == string; },
|
|
[&](StringView view) { return view == string; });
|
|
}
|
|
|
|
bool operator==(Utf16View const& other) const
|
|
{
|
|
return m_view.visit(
|
|
[&](Utf16View const& view) { return view == other; },
|
|
[&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
|
|
}
|
|
|
|
bool equals(RegexStringView other) const
|
|
{
|
|
return other.m_view.visit([this](auto const& view) { return operator==(view); });
|
|
}
|
|
|
|
bool equals_ignoring_case(RegexStringView other, bool unicode_mode) const
|
|
{
|
|
return m_view.visit(
|
|
[&](StringView view) {
|
|
return other.m_view.visit(
|
|
[&](StringView other_view) {
|
|
if (!unicode_mode)
|
|
return view.equals_ignoring_ascii_case(other_view);
|
|
|
|
Utf8View view_utf8(view);
|
|
Utf8View other_utf8(other_view);
|
|
return Unicode::ranges_equal_ignoring_case(view_utf8, other_utf8, unicode_mode);
|
|
},
|
|
[&](Utf16View other_view) {
|
|
Utf8View view_utf8(view);
|
|
return Unicode::ranges_equal_ignoring_case(view_utf8, other_view, unicode_mode);
|
|
},
|
|
[](auto&) -> bool { TODO(); });
|
|
},
|
|
[&](Utf16View view) {
|
|
return other.m_view.visit(
|
|
[&](StringView other_view) {
|
|
Utf8View other_utf8(other_view);
|
|
return Unicode::ranges_equal_ignoring_case(view, other_utf8, unicode_mode);
|
|
},
|
|
[&](Utf16View other_view) {
|
|
if (!unicode_mode)
|
|
return view.equals_ignoring_ascii_case(other_view);
|
|
|
|
return Unicode::ranges_equal_ignoring_case(view, other_view, unicode_mode);
|
|
},
|
|
[](auto&) -> bool { TODO(); });
|
|
},
|
|
[](auto&) -> bool { TODO(); });
|
|
}
|
|
|
|
bool starts_with(StringView str) const
|
|
{
|
|
return m_view.visit(
|
|
[&](Utf16View) -> bool {
|
|
TODO();
|
|
},
|
|
[&](StringView view) { return view.starts_with(str); });
|
|
}
|
|
|
|
struct FoundIndex {
|
|
size_t code_unit_index;
|
|
size_t code_point_index;
|
|
};
|
|
Optional<FoundIndex> find_index_of_previous(u32 code_point, size_t end_code_point_index, size_t end_code_unit_index) const
|
|
{
|
|
return m_view.visit(
|
|
[&](Utf16View const& view) -> Optional<FoundIndex> {
|
|
auto result = view.find_last_code_point_offset(code_point, end_code_unit_index);
|
|
if (!result.has_value())
|
|
return {};
|
|
return FoundIndex { result.value(), view.code_point_offset_of(result.value()) };
|
|
},
|
|
[&](StringView const& view) -> Optional<FoundIndex> {
|
|
if (unicode()) {
|
|
Utf8View utf8_view { view };
|
|
auto it = utf8_view.begin();
|
|
size_t current_code_point_index = 0;
|
|
Optional<FoundIndex> found_index;
|
|
|
|
for (; it != utf8_view.end(); ++it, ++current_code_point_index) {
|
|
if (current_code_point_index >= end_code_point_index)
|
|
break;
|
|
if (*it == code_point) {
|
|
auto byte_index = utf8_view.byte_offset_of(it);
|
|
found_index = { byte_index, current_code_point_index };
|
|
}
|
|
}
|
|
|
|
return found_index;
|
|
}
|
|
|
|
auto byte_index = view.substring_view(0, min(end_code_unit_index, view.length())).find_last(code_point);
|
|
if (!byte_index.has_value())
|
|
return {};
|
|
return FoundIndex { byte_index.value(), byte_index.value() };
|
|
});
|
|
}
|
|
|
|
FoundIndex find_end_of_line(size_t start_code_point_index, size_t start_code_unit_index) const
|
|
{
|
|
constexpr auto is_newline = [](u32 ch) { return ch == '\n' || ch == '\r' || ch == 0x2028 || ch == 0x2029; };
|
|
|
|
return m_view.visit(
|
|
[&](Utf16View const& view) -> FoundIndex {
|
|
size_t code_unit_index = start_code_unit_index;
|
|
size_t code_point_index = start_code_point_index;
|
|
while (code_unit_index < view.length_in_code_units()) {
|
|
auto code_unit = view.code_unit_at(code_unit_index);
|
|
u32 ch = code_unit;
|
|
size_t code_units_for_this = 1;
|
|
if (AK::UnicodeUtils::is_utf16_high_surrogate(code_unit) && code_unit_index + 1 < view.length_in_code_units()) {
|
|
auto next_code_unit = view.code_unit_at(code_unit_index + 1);
|
|
if (AK::UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) {
|
|
ch = AK::UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
|
|
code_units_for_this = 2;
|
|
}
|
|
}
|
|
|
|
if (is_newline(ch))
|
|
return FoundIndex { code_unit_index, code_point_index };
|
|
code_unit_index += code_units_for_this;
|
|
++code_point_index;
|
|
}
|
|
return FoundIndex { view.length_in_code_units(), code_point_index };
|
|
},
|
|
[&](StringView const& view) -> FoundIndex {
|
|
if (unicode()) {
|
|
Utf8View utf8_view { view };
|
|
auto it = utf8_view.begin();
|
|
size_t current_code_point_index = 0;
|
|
|
|
// Skip to start position
|
|
while (it != utf8_view.end() && current_code_point_index < start_code_point_index) {
|
|
++it;
|
|
++current_code_point_index;
|
|
}
|
|
|
|
for (; it != utf8_view.end(); ++it, ++current_code_point_index) {
|
|
if (is_newline(*it)) {
|
|
return FoundIndex { utf8_view.byte_offset_of(it), current_code_point_index };
|
|
}
|
|
}
|
|
|
|
return FoundIndex { view.length(), utf8_view.length() };
|
|
}
|
|
|
|
for (size_t i = start_code_unit_index; i < view.length(); ++i) {
|
|
if (is_newline(static_cast<u8>(view[i])))
|
|
return FoundIndex { i, i };
|
|
}
|
|
return FoundIndex { view.length(), view.length() };
|
|
});
|
|
}
|
|
|
|
private:
|
|
NO_UNIQUE_ADDRESS Variant<StringView, Utf16View> m_view { StringView {} };
|
|
NO_UNIQUE_ADDRESS bool m_unicode { false };
|
|
};
|
|
|
|
class Match final {
|
|
public:
|
|
Match() = default;
|
|
~Match() = default;
|
|
|
|
Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
|
|
: view(view_)
|
|
, line(line_)
|
|
, column(column_)
|
|
, global_offset(global_offset_)
|
|
, left_column(column_)
|
|
{
|
|
}
|
|
|
|
Match(RegexStringView const view_, size_t capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
|
|
: view(view_)
|
|
, capture_group_name(capture_group_name_)
|
|
, line(line_)
|
|
, column(column_)
|
|
, global_offset(global_offset_)
|
|
, left_column(column_)
|
|
{
|
|
}
|
|
|
|
void reset()
|
|
{
|
|
view = view.typed_null_view();
|
|
capture_group_name = -1;
|
|
line = 0;
|
|
column = 0;
|
|
global_offset = 0;
|
|
left_column = 0;
|
|
}
|
|
|
|
RegexStringView view {};
|
|
|
|
// This is a string table index. -1 if none. Not using Optional to keep the struct trivially copyable.
|
|
ssize_t capture_group_name { -1 };
|
|
|
|
size_t line { 0 };
|
|
size_t column { 0 };
|
|
size_t global_offset { 0 };
|
|
|
|
// ugly, as not usable by user, but needed to prevent to create extra vectors that are
|
|
// able to store the column when the left paren has been found
|
|
size_t left_column { 0 };
|
|
};
|
|
|
|
struct MatchInput {
|
|
RegexStringView view {};
|
|
AllOptions regex_options {};
|
|
size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
|
|
|
|
size_t match_index { 0 };
|
|
size_t line { 0 };
|
|
size_t column { 0 };
|
|
|
|
size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
|
|
|
|
mutable size_t fail_counter { 0 };
|
|
mutable Vector<size_t> saved_positions;
|
|
mutable Vector<size_t> saved_code_unit_positions;
|
|
mutable Vector<size_t> saved_forks_since_last_save;
|
|
mutable Optional<size_t> fork_to_replace;
|
|
|
|
bool in_the_middle_of_a_line { false };
|
|
StringView pattern {};
|
|
};
|
|
|
|
struct MatchState {
|
|
size_t capture_group_count;
|
|
size_t string_position_before_match { 0 };
|
|
size_t string_position { 0 };
|
|
size_t string_position_in_code_units { 0 };
|
|
size_t instruction_position { 0 };
|
|
size_t fork_at_position { 0 };
|
|
size_t forks_since_last_save { 0 };
|
|
size_t string_position_before_rseek { NumericLimits<size_t>::max() };
|
|
size_t string_position_in_code_units_before_rseek { NumericLimits<size_t>::max() };
|
|
Optional<size_t> initiating_fork;
|
|
COWVector<Match> matches;
|
|
COWVector<Match> flat_capture_group_matches; // Vector<Vector<Match>> indexed by match index, then by capture group id; flattened for performance
|
|
COWVector<u64> repetition_marks;
|
|
Vector<u64, 64> checkpoints;
|
|
Vector<i64> step_backs;
|
|
Vector<FlagsUnderlyingType, 1> modifier_stack;
|
|
AllOptions current_options;
|
|
|
|
explicit MatchState(size_t capture_group_count, AllOptions options = {})
|
|
: capture_group_count(capture_group_count)
|
|
, current_options(options)
|
|
{
|
|
}
|
|
|
|
MatchState(MatchState const&) = default;
|
|
MatchState(MatchState&&) = default;
|
|
|
|
MatchState& operator=(MatchState const&) = default;
|
|
MatchState& operator=(MatchState&&) = default;
|
|
|
|
static MatchState only_for_enumeration() { return MatchState { 0 }; }
|
|
|
|
size_t capture_group_matches_size() const
|
|
{
|
|
return flat_capture_group_matches.size() / capture_group_count;
|
|
}
|
|
|
|
Span<Match const> capture_group_matches(size_t match_index) const
|
|
{
|
|
return flat_capture_group_matches.span().slice(match_index * capture_group_count, capture_group_count);
|
|
}
|
|
|
|
Span<Match> mutable_capture_group_matches(size_t match_index)
|
|
{
|
|
return flat_capture_group_matches.mutable_span().slice(match_index * capture_group_count, capture_group_count);
|
|
}
|
|
|
|
// For size_t in {0..300}, ips in {0..750} and repetitions in {0..50}, there are zero collisions.
|
|
u64 u64_hash() const
|
|
{
|
|
u64 hash = 0xcbf29ce484222325;
|
|
auto combine = [&hash](auto value) {
|
|
hash ^= static_cast<u64>(value);
|
|
hash *= 0x9e3779b97f4a7c15;
|
|
};
|
|
auto combine_vector = [&combine](auto const& vector, auto tag) {
|
|
combine(tag);
|
|
combine(vector.size());
|
|
for (auto& value : vector)
|
|
combine(value);
|
|
};
|
|
|
|
combine(string_position_before_match);
|
|
combine(string_position);
|
|
combine(string_position_in_code_units);
|
|
combine(instruction_position);
|
|
combine(fork_at_position);
|
|
combine(initiating_fork.value_or(0) + initiating_fork.has_value());
|
|
combine_vector(repetition_marks, 0xbeefbeefbeefbeef);
|
|
combine_vector(checkpoints, 0xfacefacefaceface);
|
|
combine_vector(step_backs, 0xfedefedefedefede);
|
|
|
|
return hash;
|
|
}
|
|
};
|
|
|
|
}
|
|
|
|
using regex::RegexStringView;
|
|
|
|
template<>
|
|
struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
|
|
ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
|
|
{
|
|
auto string = value.to_byte_string();
|
|
return Formatter<StringView>::format(builder, string);
|
|
}
|
|
};
|
|
|
|
template<>
|
|
struct AK::Traits<regex::Match> : public AK::DefaultTraits<regex::Match> {
|
|
constexpr static bool is_trivial() { return true; }
|
|
};
|