mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-26 01:35:08 +02:00
Previously, a character class containing any builtin (\d, \w, \s) forced the compiler down the slow "complex class" path, which emits a disjunction of alternatives and backtracks at runtime. For non-unicode, non-unicode-sets, non-negated classes, \w and \d can be inlined as their raw ASCII code-point ranges. The resulting class stays on the fast path and compiles into a single sorted CharClass instruction. The unicode/unicode_sets and negation guards are required for correctness: with the /u + /i flags, \w gains non-ASCII members via case folding (e.g. U+017F, U+212A), and negated classes have a separate, smarter compilation path.
731 lines
26 KiB
C++
731 lines
26 KiB
C++
/*
|
||
* Copyright (c) 2026-present, the Ladybird developers.
|
||
*
|
||
* SPDX-License-Identifier: BSD-2-Clause
|
||
*/
|
||
|
||
#include <AK/Array.h>
|
||
#include <AK/StringBuilder.h>
|
||
#include <AK/Utf16String.h>
|
||
#include <LibTest/TestCase.h>
|
||
|
||
#include <LibRegex/ECMAScriptRegex.h>
|
||
|
||
// Compiles `pattern` with `flags` and hands back the regex.
// Intended for patterns that are expected to compile: the result goes through MUST().
static regex::ECMAScriptRegex compile_regex(StringView pattern, regex::ECMAScriptCompileFlags flags = {})
{
    auto compiled = MUST(regex::ECMAScriptRegex::compile(pattern, flags));
    return compiled;
}
|
||
|
||
// Reports whether `pattern` compiles under `flags`; the compiled regex itself is discarded.
static bool compile_succeeds(StringView pattern, regex::ECMAScriptCompileFlags flags = {})
{
    auto compile_result = regex::ECMAScriptRegex::compile(pattern, flags);
    if (compile_result.is_error())
        return false;
    return true;
}
|
||
|
||
// Compiles `pattern` with `flags`, converts the UTF-8 `subject` to UTF-16 and runs test() from index 0.
// Returns true only for MatchResult::Match; hitting the engine's limit is flagged as a test failure.
static bool matches(StringView pattern, StringView subject, regex::ECMAScriptCompileFlags flags = {})
{
    auto compiled = compile_regex(pattern, flags);
    auto subject_utf16 = Utf16String::from_utf8(subject);

    auto outcome = compiled.test(subject_utf16, 0);
    EXPECT(outcome != regex::MatchResult::LimitExceeded);

    return outcome == regex::MatchResult::Match;
}
|
||
|
||
// Returns the text captured by group `group_index` as a view into `input`.
// Slots 2*N and 2*N+1 hold the start and end offsets of group N; a negative offset in either
// slot yields an empty Optional.
static Optional<Utf16View> capture_group(regex::ECMAScriptRegex const& regex, Utf16View input, unsigned group_index)
{
    auto const start_slot = group_index * 2;
    auto const start_offset = regex.capture_slot(start_slot);
    auto const end_offset = regex.capture_slot(start_slot + 1);

    bool const has_both_offsets = start_offset >= 0 && end_offset >= 0;
    if (!has_both_offsets)
        return {};

    return input.substring_view(start_offset, end_offset - start_offset);
}
|
||
|
||
// Expects that group `group_index` has a capture and that its text within `input` equals `expected`.
static void expect_capture_eq(regex::ECMAScriptRegex const& regex, Utf16View input, unsigned group_index, StringView expected)
{
    auto captured = capture_group(regex, input, group_index);
    EXPECT(captured.has_value());

    // Nothing to compare if the group has no capture; the failure was already recorded above.
    if (!captured.has_value())
        return;

    EXPECT(*captured == expected);
}
|
||
|
||
// Expects that both slots of group `group_index` hold -1.
static void expect_capture_unmatched(regex::ECMAScriptRegex const& regex, unsigned group_index)
{
    auto const start_slot = group_index * 2;
    auto const end_slot = start_slot + 1;

    EXPECT_EQ(regex.capture_slot(start_slot), -1);
    EXPECT_EQ(regex.capture_slot(end_slot), -1);
}
|
||
|
||
// A lone opening parenthesis must be reported as a compile error.
TEST_CASE(compile_rejects_invalid_pattern)
{
    auto compile_result = regex::ECMAScriptRegex::compile("("sv, {});
    EXPECT(compile_result.is_error());
}
|
||
|
||
// Checks group bookkeeping (counts, named-group metadata) and the slot offsets written by exec().
TEST_CASE(exec_tracks_named_capture_slots)
{
    auto re = compile_regex("(?<word>foo)(bar)"sv);

    // Two explicit groups; total_groups() is expected to be one higher.
    EXPECT_EQ(re.capture_count(), 2u);
    EXPECT_EQ(re.total_groups(), 3u);

    auto const& named_groups = re.named_groups();
    EXPECT_EQ(named_groups.size(), 1u);
    EXPECT_EQ(named_groups[0].name, "word"sv);
    EXPECT_EQ(named_groups[0].index, 1u);

    EXPECT_EQ(re.exec(u"foobar"sv, 0), regex::MatchResult::Match);

    // Slots 0/1: whole match.
    EXPECT_EQ(re.capture_slot(0), 0);
    EXPECT_EQ(re.capture_slot(1), 6);
    // Slots 2/3: group 1, "foo".
    EXPECT_EQ(re.capture_slot(2), 0);
    EXPECT_EQ(re.capture_slot(3), 3);
    // Slots 4/5: group 2, "bar".
    EXPECT_EQ(re.capture_slot(4), 3);
    EXPECT_EQ(re.capture_slot(5), 6);
}
|
||
|
||
// An optional group that is skipped must leave -1 in both of its slots.
TEST_CASE(exec_reports_unmatched_optional_groups)
{
    auto re = compile_regex("(foo)?bar"sv);

    EXPECT_EQ(re.exec(u"bar"sv, 0), regex::MatchResult::Match);

    // Whole match covers "bar".
    EXPECT_EQ(re.capture_slot(0), 0);
    EXPECT_EQ(re.capture_slot(1), 3);
    // Group 1 was skipped.
    EXPECT_EQ(re.capture_slot(2), -1);
    EXPECT_EQ(re.capture_slot(3), -1);
}
|
||
|
||
// Same expectations as the UTF-16 capture test, but fed with narrow (`"..."sv`) subjects
// through exec(), test() and find_all().
TEST_CASE(ascii_backed_inputs_preserve_match_results)
{
    auto re = compile_regex("(?<word>foo)(bar)"sv);

    EXPECT_EQ(re.exec("foobar"sv, 0), regex::MatchResult::Match);
    // Whole match, then "foo", then "bar".
    EXPECT_EQ(re.capture_slot(0), 0);
    EXPECT_EQ(re.capture_slot(1), 6);
    EXPECT_EQ(re.capture_slot(2), 0);
    EXPECT_EQ(re.capture_slot(3), 3);
    EXPECT_EQ(re.capture_slot(4), 3);
    EXPECT_EQ(re.capture_slot(5), 6);

    EXPECT_EQ(re.test("foobar"sv, 0), regex::MatchResult::Match);

    // Two occurrences separated by a single space.
    EXPECT_EQ(re.find_all("foobar foobar"sv, 0), 2);
    EXPECT_EQ(re.find_all_match(0).start, 0);
    EXPECT_EQ(re.find_all_match(0).end, 6);
    EXPECT_EQ(re.find_all_match(1).start, 7);
    EXPECT_EQ(re.find_all_match(1).end, 13);
}
|
||
|
||
// With ignore_case set, a lowercase literal matches a mixed-case subject and still rejects unrelated text.
TEST_CASE(test_honors_ignore_case)
{
    auto re = compile_regex("casesensitive"sv, { .ignore_case = true });

    EXPECT_EQ(re.test(u"CaseSensitive"sv, 0), regex::MatchResult::Match);
    EXPECT_EQ(re.test(u"something else"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// Case-insensitive search for a plain ASCII literal: checks the offsets from exec() and find_all().
TEST_CASE(ascii_ignore_case_literal_search_preserves_behavior)
{
    auto re = compile_regex("zfvr"sv, { .ignore_case = true });

    // The literal sits after three leading dots.
    EXPECT_EQ(re.exec("...ZFVR..."sv, 0), regex::MatchResult::Match);
    EXPECT_EQ(re.capture_slot(0), 3);
    EXPECT_EQ(re.capture_slot(1), 7);

    // Every casing variant is found, in order.
    EXPECT_EQ(re.find_all("zfvr ZFVR zFVr"sv, 0), 3);
    EXPECT_EQ(re.find_all_match(0).start, 0);
    EXPECT_EQ(re.find_all_match(1).start, 5);
    EXPECT_EQ(re.find_all_match(2).start, 10);
}
|
||
|
||
// Case-insensitive literal that starts and ends with punctuation, which has no case variants.
TEST_CASE(ascii_ignore_case_literal_search_handles_punctuation_prefixes)
{
    auto re = compile_regex("##yv22##"sv, { .ignore_case = true });

    EXPECT_EQ(re.find_all("##YV22## and ##yv22##"sv, 0), 2);

    // Uppercase occurrence.
    EXPECT_EQ(re.find_all_match(0).start, 0);
    EXPECT_EQ(re.find_all_match(0).end, 8);
    // Lowercase occurrence.
    EXPECT_EQ(re.find_all_match(1).start, 13);
    EXPECT_EQ(re.find_all_match(1).end, 21);
}
|
||
|
||
// Case-insensitive alternation of three ASCII literals; each alternative is found once.
TEST_CASE(ascii_ignore_case_literal_alternation_preserves_behavior)
{
    auto re = compile_regex("##yv22##|zfvr|puebzr"sv, { .ignore_case = true });

    EXPECT_EQ(re.find_all("##YV22## zFVr PUEBZR"sv, 0), 3);

    // "##YV22##"
    EXPECT_EQ(re.find_all_match(0).start, 0);
    EXPECT_EQ(re.find_all_match(0).end, 8);
    // "zFVr"
    EXPECT_EQ(re.find_all_match(1).start, 9);
    EXPECT_EQ(re.find_all_match(1).end, 13);
    // "PUEBZR"
    EXPECT_EQ(re.find_all_match(2).start, 14);
    EXPECT_EQ(re.find_all_match(2).end, 20);
}
|
||
|
||
// The first alternative ("foo") must win over the shorter "f", so the match spans all three characters.
TEST_CASE(ascii_ignore_case_literal_alternation_respects_source_order)
{
    auto re = compile_regex("foo|f"sv, { .ignore_case = true });

    EXPECT_EQ(re.exec("FoO"sv, 0), regex::MatchResult::Match);
    EXPECT_EQ(re.capture_slot(0), 0);
    EXPECT_EQ(re.capture_slot(1), 3);
}
|
||
|
||
// With ignore_case and unicode both set, U+017F and U+212A must match `s|k`.
TEST_CASE(unicode_ignore_case_literal_alternation_preserves_behavior)
{
    auto re = compile_regex("s|k"sv, { .ignore_case = true, .unicode = true });

    EXPECT_EQ(re.test(u"\u017F"sv, 0), regex::MatchResult::Match);
    EXPECT_EQ(re.test(u"\u212A"sv, 0), regex::MatchResult::Match);
}
|
||
|
||
// `\bfoo\b` must match only the stand-alone "foo" and the "foo" in "foo-bar"; the occurrences
// glued to letters, digits or underscores are rejected.
TEST_CASE(word_boundary_literal_preserves_behavior)
{
    auto re = compile_regex("\\bfoo\\b"sv);

    EXPECT_EQ(re.find_all("foo foo-bar barfoo foo2 _foo foo_"sv, 0), 2);

    EXPECT_EQ(re.find_all_match(0).start, 0);
    EXPECT_EQ(re.find_all_match(0).end, 3);
    EXPECT_EQ(re.find_all_match(1).start, 4);
    EXPECT_EQ(re.find_all_match(1).end, 7);
}
|
||
|
||
// Case-insensitive variant of the word-boundary test: "zfvr1" and "_ZFVR" must not count.
TEST_CASE(ascii_ignore_case_word_boundary_literal_preserves_behavior)
{
    auto re = compile_regex("\\bzfvr\\b"sv, { .ignore_case = true });

    EXPECT_EQ(re.find_all("ZFVR zfvr1 _ZFVR zFVr"sv, 0), 2);

    // Leading "ZFVR".
    EXPECT_EQ(re.find_all_match(0).start, 0);
    EXPECT_EQ(re.find_all_match(0).end, 4);
    // Trailing "zFVr".
    EXPECT_EQ(re.find_all_match(1).start, 17);
    EXPECT_EQ(re.find_all_match(1).end, 21);
}
|
||
|
||
// With ignore_case and unicode both set, a lone U+212A must match `\bk\b`.
TEST_CASE(unicode_ignore_case_word_boundary_literal_preserves_behavior)
{
    auto re = compile_regex("\\bk\\b"sv, { .ignore_case = true, .unicode = true });

    EXPECT_EQ(re.test(u"\u212A"sv, 0), regex::MatchResult::Match);
}
|
||
|
||
// A positive class mixing `\w` with a literal, compiled with ignore_case but without unicode:
// ASCII word characters and '$' match, while U+017F and U+212A must not.
TEST_CASE(mixed_positive_class_with_word_builtin_preserves_legacy_ignore_case_behavior)
{
    auto re = compile_regex("[\\w\\$]+"sv, { .ignore_case = true });

    EXPECT_EQ(re.test("AZ_09$"sv, 0), regex::MatchResult::Match);
    EXPECT_EQ(re.test(u"\u017F"sv, 0), regex::MatchResult::NoMatch);
    EXPECT_EQ(re.test(u"\u212A"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// A positive class mixing a range, `\d` and a literal '-', compiled with ignore_case.
TEST_CASE(mixed_positive_class_with_digit_builtin_preserves_behavior)
{
    auto re = compile_regex("[A-Z\\d-]+"sv, { .ignore_case = true });

    EXPECT_EQ(re.test("ABC-123"sv, 0), regex::MatchResult::Match);
    // Lowercase letters match the A-Z range because of ignore_case.
    EXPECT_EQ(re.test("abc"sv, 0), regex::MatchResult::Match);
    EXPECT_EQ(re.test("!"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// find_all() on a UTF-16 subject reports both occurrences of the literal with their offsets.
TEST_CASE(find_all_returns_non_overlapping_matches)
{
    auto re = compile_regex("aba"sv);

    EXPECT_EQ(re.find_all(u"aba aba"sv, 0), 2);

    EXPECT_EQ(re.find_all_match(0).start, 0);
    EXPECT_EQ(re.find_all_match(0).end, 3);
    EXPECT_EQ(re.find_all_match(1).start, 4);
    EXPECT_EQ(re.find_all_match(1).end, 7);
}
|
||
|
||
// `\p{ASCII}+` in unicode mode matches ASCII text and rejects an emoji.
TEST_CASE(unicode_property_matching_works)
{
    auto re = compile_regex("\\p{ASCII}+"sv, { .unicode = true });

    EXPECT_EQ(re.test(u"ASCII"sv, 0), regex::MatchResult::Match);
    EXPECT_EQ(re.test(u"😀"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// A pattern anchored with `$` must match only when the subject actually ends with the suffix.
TEST_CASE(end_anchored_suffix_patterns_preserve_behavior)
{
    auto re = compile_regex("(.*)\\/client-(.*)\\.js$"sv);

    // Ends in ".js": match.
    EXPECT_EQ(re.test(u"https://cdn.example.com/assets/client-main.js"sv, 0), regex::MatchResult::Match);
    // ".js" is followed by more text: no match.
    EXPECT_EQ(re.test(u"<script src=\"/assets/client-main.js\"></script>"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// Cookie-style extractor whose prefix is either start-of-input or a ';' separator.
TEST_CASE(leading_start_or_separator_prefix_preserves_behavior)
{
    auto re = compile_regex("(?:^|;)\\s*foo=([^;]*)"sv);

    // Key at the very start of the subject.
    {
        auto input = Utf16String::from_utf8("foo=bar"sv);
        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 1, "bar"sv);
    }

    // Key after a ';' separator and whitespace.
    {
        auto input = Utf16String::from_utf8("a=1; foo=bar; baz=qux"sv);
        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 1, "bar"sv);
    }

    // Key absent.
    EXPECT_EQ(re.test(u"a=1; baz=qux"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// Same extractor pattern as the separator-prefix test, with the key appearing last in the subject.
TEST_CASE(required_literal_prefilter_preserves_assignment_extractors)
{
    auto re = compile_regex("(?:^|;)\\s*foo=([^;]*)"sv);

    {
        auto input = Utf16String::from_utf8("a=1; bar=baz; foo=qux"sv);
        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 1, "qux"sv);
    }

    // No "foo=" anywhere in the subject.
    EXPECT_EQ(re.test(u"a=1; bar=baz; quux=7"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// Case-insensitive assignment extractor: the key matches in uppercase, the captured value keeps its casing.
TEST_CASE(ascii_ignore_case_required_literal_prefilter_preserves_behavior)
{
    auto re = compile_regex("\\bfoo\\s*=\\s*([^;]*)"sv, { .ignore_case = true });

    {
        auto input = Utf16String::from_utf8("FOO = Bar"sv);
        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 1, "Bar"sv);
    }

    EXPECT_EQ(re.test(u"bar = baz"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// Two alternatives that share the substring "name": both forms are found, unrelated names are not.
TEST_CASE(required_literal_prefilter_handles_common_substrings_across_alternatives)
{
    auto re = compile_regex("(\\$\\{name\\})|(\\$name\\b)"sv);

    EXPECT_EQ(re.find_all("${name} $name"sv, 0), 2);

    // "${name}"
    EXPECT_EQ(re.find_all_match(0).start, 0);
    EXPECT_EQ(re.find_all_match(0).end, 7);
    // "$name"
    EXPECT_EQ(re.find_all_match(1).start, 8);
    EXPECT_EQ(re.find_all_match(1).end, 13);

    EXPECT_EQ(re.test(u"${other} $other"sv, 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// A group repeated exactly one million times must still compile.
TEST_CASE(required_literal_prefilter_compiles_large_exact_quantifiers)
{
    // Assemble "(?:ab){1000000}" piece by piece.
    StringBuilder builder;
    builder.append("(?:ab){"sv);
    builder.appendff("{}", 1'000'000);
    builder.append("}"sv);
    auto large_quantifier_pattern = MUST(builder.to_string());

    EXPECT(compile_succeeds(large_quantifier_pattern));
}
|
||
|
||
// Two alternatives that share a 1024-character prefix and differ only in the final character.
TEST_CASE(required_literal_prefilter_compiles_long_literal_alternations)
{
    StringBuilder prefix_builder;
    prefix_builder.append_repeated("a"sv, 1'024);
    auto shared_prefix = MUST(prefix_builder.to_string());

    // Pattern: <prefix>b|<prefix>c
    StringBuilder alternation_builder;
    alternation_builder.append(shared_prefix);
    alternation_builder.append("b|"sv);
    alternation_builder.append(shared_prefix);
    alternation_builder.append("c"sv);
    auto pattern = MUST(alternation_builder.to_string());

    auto re = compile_regex(pattern);

    // Matching subject: <prefix>c
    StringBuilder input_builder;
    input_builder.append(shared_prefix);
    input_builder.append("c"sv);
    auto matching_subject = MUST(input_builder.to_string());

    // Non-matching subject: swap the trailing "c" for "d" in the same builder.
    input_builder.trim(1);
    input_builder.append("d"sv);
    auto missing_subject = MUST(input_builder.to_string());

    EXPECT_EQ(re.test(Utf16String::from_utf8(matching_subject), 0), regex::MatchResult::Match);
    EXPECT_EQ(re.test(Utf16String::from_utf8(missing_subject), 0), regex::MatchResult::NoMatch);
}
|
||
|
||
// Table-driven parser coverage. Each row holds a pattern, whether it is expected to compile
// (defaults to true) and the flags to compile it with (defaults to none).
TEST_CASE(restored_ecmascript_parse_coverage)
{
    struct Test {
        StringView pattern;
        bool should_compile { true };
        regex::ECMAScriptCompileFlags flags {};
    };

    static constexpr Test tests[] {
        { "^hello.$"sv },
        { "\\x"sv },
        { "\\x1"sv },
        { "\\x1"sv, false, { .unicode = true } },
        { "\\x11"sv, true, { .unicode = true } },
        { "\\"sv, false },
        { "(?"sv, false },
        { "\\u1234"sv, true, { .unicode = true } },
        { "[\\u1234]"sv, true, { .unicode = true } },
        { "\\u1"sv, false, { .unicode = true } },
        { "[\\u1]"sv, false, { .unicode = true } },
        { "{1}"sv, false },
        { "{1,2}"sv, false },
        { "\\uxxxx"sv, false, { .unicode = true } },
        { "\\u{10ffff}"sv, true, { .unicode = true } },
        { "\\u{110000}"sv, false, { .unicode = true } },
        { "\\p{ASCII}"sv, true, { .unicode = true } },
        { "\\p{}"sv, false, { .unicode = true } },
        { "\\p{AsCiI}"sv, false, { .unicode = true } },
        { "(?<a>a)(?<a>b)"sv, false },
        { "(?:(?<x>a)|(?<y>a)(?<x>b))(?:(?<z>c)|(?<z>d))"sv },
        { "(?<1a>a)"sv, false },
        { "(?<$$_$$>a)"sv },
        { "(?<ÿ>a)"sv },
        { "(?<𝓑𝓻𝓸𝔀𝓷>a)"sv },
        { "(?ii:a)"sv, false },
        { "(?-:a)"sv, false },
        { "(?i)"sv, false },
        { "(?-i)"sv, false },
        { "["sv, false },
        { "[ -"sv, false },
        { "[[x[]]]"sv, true, { .unicode_sets = true } },
        { "[\\w--x]"sv, true, { .unicode_sets = true } },
    };

    for (auto const& test : tests) {
        auto const compiled = compile_succeeds(test.pattern, test.flags);
        // Every row goes through the same EXPECT_EQ call site, so name the offending pattern;
        // otherwise a failure cannot be traced back to a row.
        if (compiled != test.should_compile)
            warnln("restored_ecmascript_parse_coverage: pattern '{}' compiled={}, expected {}", test.pattern, compiled, test.should_compile);
        EXPECT_EQ(compiled, test.should_compile);
    }
}
|
||
|
||
// Table-driven matcher coverage. Each row holds a pattern, a UTF-8 subject, whether a match is
// expected (defaults to true) and the compile flags (defaults to none).
TEST_CASE(restored_ecmascript_match_coverage)
{
    struct Test {
        StringView pattern;
        StringView subject;
        bool should_match { true };
        regex::ECMAScriptCompileFlags flags {};
    };

    static constexpr Test tests[] {
        { "^hello.$"sv, "hello1"sv },
        { "^h{0,1}ello.$"sv, "ello1"sv },
        { "^hell\\x6f1$"sv, "hello1"sv },
        { "^hel(?<LO>l.)1$"sv, "hello1"sv },
        { "\\b.*\\b"sv, "hello1"sv },
        { "bar(?=f.)foo"sv, "barfoo"sv },
        { "bar(?=foo)bar"sv, "barbar"sv, false },
        { "bar(?!foo)bar"sv, "barbar"sv },
        { "bar(?!bar)bar"sv, "barbar"sv, false },
        { "bar.*(?<=foo)"sv, "barbar"sv, false },
        { "bar.*(?<!foo)"sv, "barbar"sv },
        { "(?:)"sv, ""sv },
        { "(?<=.{3})f"sv, "abcdef"sv },
        { "(?<=.{3})f"sv, "abc😀ef"sv, true, { .unicode = true } },
        { "a(?=.(?=c)|b)b"sv, "ab"sv },
        { "(?=)(?=\\d)"sv, "smart"sv, false },
        { "(?<!.*q.*?)(?<=h.*)THIS(?=.*!)"sv, "hey THIS does match!"sv },
        { "(.*a)?(x)"sv, "x"sv },
        { "^\\w*[\\u212A]"sv, "K"sv, true, { .ignore_case = true, .unicode = true } },
        { "^a*A\\d"sv, "aaaa5"sv, true, { .ignore_case = true } },
        { "^\\u{017f}*s$"sv, "ſs"sv, true, { .ignore_case = true, .unicode = true } },
        { "(a+)+b"sv, "aaaaaaaaaaaaaaaaaaaaaaaaa"sv, false },
    };

    for (auto const& test : tests) {
        auto const matched = matches(test.pattern, test.subject, test.flags);
        // Every row goes through the same EXPECT_EQ call site, so name the offending row;
        // otherwise a failure cannot be traced back to a pattern/subject pair.
        if (matched != test.should_match)
            warnln("restored_ecmascript_match_coverage: pattern '{}' on subject '{}' matched={}, expected {}", test.pattern, test.subject, matched, test.should_match);
        EXPECT_EQ(matched, test.should_match);
    }
}
|
||
|
||
// Capture groups that live inside (or next to) lookbehind assertions.
TEST_CASE(restored_lookbehind_capture_coverage)
{
    // Alternation inside a lookbehind: the two-character alternative is the one captured.
    {
        auto re = compile_regex("(?<=(a|cc))b"sv);
        auto input = Utf16String::from_utf8("ccb"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "b"sv);
        expect_capture_eq(re, input, 1, "cc"sv);
    }
    // Zero-width lookbehind on a word boundary, wrapped in a capturing group.
    {
        auto re = compile_regex("((?<=\\b)[d-f]{3})"sv);
        auto input = Utf16String::from_utf8("abc def"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "def"sv);
        expect_capture_eq(re, input, 1, "def"sv);
    }
    // Greedy quantifier inside a lookbehind captures the whole run of 'b'.
    {
        auto re = compile_regex("(?<=(b+))c"sv);
        auto input = Utf16String::from_utf8("abbbbbbc"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "c"sv);
        expect_capture_eq(re, input, 1, "bbbbbb"sv);
    }
    // Repeated multi-character group inside a lookbehind.
    {
        auto re = compile_regex("(?<=((?:b\\d{2})+))c"sv);
        auto input = Utf16String::from_utf8("ab12b23b34c"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "c"sv);
        expect_capture_eq(re, input, 1, "b12b23b34"sv);
    }
}
|
||
|
||
// Negated builtins (`\S`) inside plain and negated character classes.
TEST_CASE(restored_inversion_state_in_char_class_coverage)
{
    // `[\S\s]` accepts the first character of the subject.
    {
        auto re = compile_regex("[\\S\\s]"sv);
        auto input = Utf16String::from_utf8("hello"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "h"sv);
    }
    // `[^\S\n]` must reject a newline.
    {
        auto re = compile_regex("[^\\S\\n]"sv);
        auto input = Utf16String::from_utf8("\n"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::NoMatch);
    }
    // `[^\S]` must accept a tab.
    {
        auto re = compile_regex("[^\\S]"sv);
        auto input = Utf16String::from_utf8("\t"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "\t"sv);
    }
}
|
||
|
||
// `^(a|a?)+$`: group 1 must hold the last non-empty iteration for one- and two-character subjects.
TEST_CASE(restored_quantified_alternation_capture_coverage)
{
    {
        auto re = compile_regex("^(a|a?)+$"sv);
        auto input = Utf16String::from_utf8("a"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "a"sv);
        expect_capture_eq(re, input, 1, "a"sv);
    }
    {
        auto re = compile_regex("^(a|a?)+$"sv);
        auto input = Utf16String::from_utf8("aa"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "aa"sv);
        expect_capture_eq(re, input, 1, "a"sv);
    }
}
|
||
|
||
// Backreferences to groups that captured the empty string or did not participate at all.
TEST_CASE(restored_zero_width_backreference_coverage)
{
    // Group 1 captures the empty string, so `\1+` matches without consuming anything.
    {
        auto re = compile_regex("(a*)b\\1+"sv);
        auto input = Utf16String::from_utf8("baaac"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "b"sv);
        expect_capture_eq(re, input, 1, ""sv);
    }
    // Group 1 is skipped; the backreference to it still lets "y" match.
    {
        auto re = compile_regex("(x)?\\1y"sv);
        auto input = Utf16String::from_utf8("y"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "y"sv);
        expect_capture_unmatched(re, 1);
    }
    // Group 1 lives inside a negative lookahead and stays unmatched; group 2 wraps the empty backreference.
    {
        auto re = compile_regex("(?!(y)y)(\\1)z"sv);
        auto input = Utf16String::from_utf8("xyyz"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "z"sv);
        expect_capture_unmatched(re, 1);
        expect_capture_eq(re, input, 2, ""sv);
    }
}
|
||
|
||
// Backreferences (numbered and named, including duplicate names across alternatives) where
// some of the referenced groups did not participate in the match.
TEST_CASE(restored_backreference_to_undefined_capture_groups)
{
    // Second `x` alternative taken: group 1 unmatched, group 2 holds "b".
    {
        auto re = compile_regex("(?:(?<x>a)|(?<x>b))\\k<x>"sv);
        auto input = Utf16String::from_utf8("bb"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "bb"sv);
        expect_capture_unmatched(re, 1);
        expect_capture_eq(re, input, 2, "b"sv);
    }
    // Repeated twice: only the captures of the final iteration remain.
    {
        auto re = compile_regex("(?:(?:(?<x>a)|(?<x>b))\\k<x>){2}"sv);
        auto input = Utf16String::from_utf8("aabb"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "aabb"sv);
        expect_capture_unmatched(re, 1);
        expect_capture_eq(re, input, 2, "b"sv);
    }
    // First `x` alternative taken: group 1 holds "a", group 2 unmatched.
    {
        auto re = compile_regex("(?:(?<x>a)|(?<x>b))\\k<x>"sv);
        auto input = Utf16String::from_utf8("aa"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "aa"sv);
        expect_capture_eq(re, input, 1, "a"sv);
        expect_capture_unmatched(re, 2);
    }
    // Group 2 is only set inside a negative lookahead, so it is unmatched afterwards.
    {
        auto re = compile_regex("(.*?)a(?!(a+)b\\2c)\\2(.*)"sv);
        auto input = Utf16String::from_utf8("baaabaac"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "baaabaac"sv);
        expect_capture_eq(re, input, 1, "ba"sv);
        expect_capture_unmatched(re, 2);
        expect_capture_eq(re, input, 3, "abaac"sv);
    }
    // The unnamed alternative is taken, so neither `a` group participates.
    {
        auto re = compile_regex("^(?:(?<a>x)|(?<a>y)|z)\\k<a>$"sv);
        auto input = Utf16String::from_utf8("z"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "z"sv);
        expect_capture_unmatched(re, 1);
        expect_capture_unmatched(re, 2);
    }
    // The final iteration takes the unnamed alternative, leaving both `a` groups unmatched.
    {
        auto re = compile_regex("^(?:(?<a>x)|(?<a>y)|z){2}\\k<a>$"sv);
        auto input = Utf16String::from_utf8("xz"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "xz"sv);
        expect_capture_unmatched(re, 1);
        expect_capture_unmatched(re, 2);
    }
}
|
||
|
||
// Optional or repeated groups whose body can match the empty string.
TEST_CASE(restored_optional_groups_with_empty_matches)
{
    // The first group takes everything, leaving the optional second group unmatched.
    {
        auto re = compile_regex("^(.*)(.*)?$"sv);
        auto input = Utf16String::from_utf8("a"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 1, "a"sv);
        expect_capture_unmatched(re, 2);
    }
    // An optional empty group on an empty subject stays unmatched.
    {
        auto re = compile_regex("()?"sv);
        auto input = Utf16String::from_utf8(""sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_unmatched(re, 1);
    }
    // Inner captures reflect the last iteration only; group 4 did not take part in it.
    {
        auto re = compile_regex("(z)((a+)?(b+)?(c))*"sv);
        auto input = Utf16String::from_utf8("zaacbbbcac"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 1, "z"sv);
        expect_capture_eq(re, input, 2, "ac"sv);
        expect_capture_eq(re, input, 3, "a"sv);
        expect_capture_unmatched(re, 4);
        expect_capture_eq(re, input, 5, "c"sv);
    }
    // An optional group holding only a lookahead leaves its inner capture unmatched.
    {
        auto re = compile_regex("(?:(?=(abc)))?a"sv);
        auto input = Utf16String::from_utf8("abc"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "a"sv);
        expect_capture_unmatched(re, 1);
    }
    // Same as above, spelled with an explicit {0,1} quantifier and a start anchor.
    {
        auto re = compile_regex("^(?:(?=(abc))){0,1}a"sv);
        auto input = Utf16String::from_utf8("abc"sv);

        EXPECT_EQ(re.exec(input, 0), regex::MatchResult::Match);
        expect_capture_eq(re, input, 0, "a"sv);
        expect_capture_unmatched(re, 1);
    }
}
|
||
|
||
// Table-driven coverage for inline modifier groups such as `(?i:...)` and `(?-i:...)`.
// Each row holds a pattern, a UTF-8 subject, whether a match is expected (defaults to true)
// and the compile flags (defaults to none).
TEST_CASE(restored_ecmascript_modifier_coverage)
{
    struct Test {
        StringView pattern;
        StringView subject;
        bool should_match { true };
        regex::ECMAScriptCompileFlags flags {};
    };

    static constexpr Test tests[] {
        { "a(?i:b)c"sv, "aBc"sv },
        { "a(?i:b)c"sv, "aBC"sv, false },
        { "a(?s:.)c"sv, "a\nc"sv },
        { "(?ims:a.b)"sv, "A\nB"sv },
        { "(?i:a(?-i:b)c)"sv, "AbC"sv },
        { "(?i:a(?-i:b)c)"sv, "ABC"sv, false },
        { "a(?-i:b)c"sv, "AbC"sv, true, { .ignore_case = true } },
        { "a(?-i:b)c"sv, "ABC"sv, false, { .ignore_case = true } },
        { "x.(?m:^a)"sv, "x\na"sv, true, { .dot_all = true } },
    };

    for (auto const& test : tests) {
        auto const matched = matches(test.pattern, test.subject, test.flags);
        // Every row goes through the same EXPECT_EQ call site, so name the offending row;
        // otherwise a failure cannot be traced back to a pattern/subject pair.
        if (matched != test.should_match)
            warnln("restored_ecmascript_modifier_coverage: pattern '{}' on subject '{}' matched={}, expected {}", test.pattern, test.subject, matched, test.should_match);
        EXPECT_EQ(matched, test.should_match);
    }
}
|
||
|
||
// Table-driven coverage for `\p{...}` / `\P{...}` property escapes and unicode_sets class
// operators (`--`, `&&`). Each row holds a pattern, a UTF-8 subject, whether a match is
// expected (defaults to true) and the compile flags (defaults to none).
TEST_CASE(restored_unicode_property_and_sets_coverage)
{
    struct Test {
        StringView pattern;
        StringView subject;
        bool should_match { true };
        regex::ECMAScriptCompileFlags flags {};
    };

    static constexpr Test tests[] {
        { "\\p{ASCII}"sv, "a"sv, false },
        { "\\p{ASCII}"sv, "p{ASCII}"sv },
        { "\\p{ASCII}"sv, "a"sv, true, { .unicode = true } },
        { "\\p{ASCII}"sv, "😀"sv, false, { .unicode = true } },
        { "\\P{ASCII}"sv, "a"sv, false, { .unicode = true } },
        { "\\P{ASCII}"sv, "😀"sv, true, { .unicode = true } },
        { "\\p{ASCII_Hex_Digit}"sv, "1"sv, true, { .unicode = true } },
        { "\\P{ASCII_Hex_Digit}"sv, "x"sv, true, { .unicode = true } },
        { "\\p{General_Category=Cased_Letter}"sv, "A"sv, true, { .unicode = true } },
        { "\\P{Cased_Letter}"sv, "9"sv, true, { .unicode = true } },
        { "\\p{sc=Latin}"sv, "A"sv, true, { .unicode = true } },
        { "\\u{1f600}"sv, "😀"sv, true, { .unicode = true } },
        { "[\\w--x]"sv, "x"sv, false, { .unicode_sets = true } },
        { "[\\w--x]"sv, "y"sv, true, { .unicode_sets = true } },
        { "[\\w&&x]"sv, "x"sv, true, { .unicode_sets = true } },
        { "[[0-9\\w]--x--6]"sv, "6"sv, false, { .unicode_sets = true } },
        { "[[0-9\\w]--x--6]"sv, "9"sv, true, { .unicode_sets = true } },
    };

    for (auto const& test : tests) {
        auto const matched = matches(test.pattern, test.subject, test.flags);
        // Every row goes through the same EXPECT_EQ call site, so name the offending row;
        // otherwise a failure cannot be traced back to a pattern/subject pair.
        if (matched != test.should_match)
            warnln("restored_unicode_property_and_sets_coverage: pattern '{}' on subject '{}' matched={}, expected {}", test.pattern, test.subject, matched, test.should_match);
        EXPECT_EQ(matched, test.should_match);
    }
}
|
||
|
||
// Patterns whose loop bodies can match the empty string must still match an empty subject,
// and a global `.*` over an empty subject must report exactly one empty match.
TEST_CASE(restored_empty_match_and_loop_coverage)
{
    static constexpr StringView patterns[] {
        "(a*)*"sv,
        "(a*?)*"sv,
        "(a*)*?"sv,
        "(?:)*?"sv,
        "(a?)+$"sv,
    };

    for (auto pattern : patterns) {
        auto const matched = matches(pattern, ""sv);
        // Every pattern goes through the same EXPECT call site, so name the one that failed.
        if (!matched)
            warnln("restored_empty_match_and_loop_coverage: pattern '{}' did not match the empty string", pattern);
        EXPECT(matched);
    }

    auto regex = compile_regex(".*"sv, { .global = true });
    auto subject = Utf16String::from_utf8(""sv);
    EXPECT_EQ(regex.find_all(subject, 0), 1);
    // The single match is the empty range at offset 0.
    EXPECT_EQ(regex.find_all_match(0).start, 0);
    EXPECT_EQ(regex.find_all_match(0).end, 0);
}
|
||
|
||
// `(?:aa)*` over a run of 1000 'a' characters must still report a match.
TEST_CASE(restored_long_fork_chain_coverage)
{
    auto re = compile_regex("(?:aa)*"sv);

    auto long_run = MUST(String::repeated('a', 1000));
    auto long_run_utf16 = Utf16String::from_utf8(long_run.bytes_as_string_view());

    EXPECT_EQ(re.test(long_run_utf16, 0), regex::MatchResult::Match);
}
|