mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-25 17:25:08 +02:00
Delete Lexer.cpp/h and Token.cpp, replacing all tokenization with a new rust_tokenize() FFI function that calls back for each token. Rewrite SyntaxHighlighter.cpp and js.cpp REPL to use the Rust tokenizer. The token type and category enums in Token.h now mirror the Rust definitions in token.rs. Move is_syntax_character/is_whitespace/is_line_terminator helpers into RegExpConstructor.cpp as static functions, since they were only used there.
414 lines
18 KiB
C++
414 lines
18 KiB
C++
/*
|
||
* Copyright (c) 2020, Matthew Olsson <mattco@serenityos.org>
|
||
*
|
||
* SPDX-License-Identifier: BSD-2-Clause
|
||
*/
|
||
|
||
#include <AK/CharacterTypes.h>
|
||
#include <AK/Find.h>
|
||
#include <LibJS/Runtime/Error.h>
|
||
#include <LibJS/Runtime/GlobalObject.h>
|
||
#include <LibJS/Runtime/RegExpConstructor.h>
|
||
#include <LibJS/Runtime/RegExpObject.h>
|
||
#include <LibJS/Runtime/Value.h>
|
||
#include <LibUnicode/CharacterTypes.h>
|
||
|
||
namespace JS {
|
||
|
||
// Returns true when `code_point` is an ASCII code point that appears in the
// regular expression SyntaxCharacter set: ^ $ \ . * + ? ( ) [ ] { } |
static bool is_syntax_character(u32 code_point)
{
    // Non-ASCII code points can never be syntax characters; rejecting them first
    // also keeps the narrowing cast to `char` below lossless.
    if (!is_ascii(code_point))
        return false;

    static constexpr auto syntax_characters = "^$\\.*+?()[]{}|"sv;
    return syntax_characters.contains(static_cast<char>(code_point));
}
|
||
|
||
// Returns true when `code_point` counts as white space for RegExp.escape:
// an ASCII space character, U+00A0 (NO-BREAK SPACE), U+FEFF (ZERO WIDTH NO-BREAK SPACE),
// or any code point in the Unicode "Space_Separator" general category.
static bool is_whitespace(u32 code_point)
{
    // The cheap checks come first; the Unicode category lookup only runs when none of them match.
    bool const is_explicitly_listed = is_ascii_space(code_point) || code_point == 0x00A0 || code_point == 0xFEFF;
    return is_explicitly_listed || Unicode::code_point_has_space_separator_general_category(code_point);
}
|
||
|
||
// Returns true when `code_point` is one of the four line terminator code points:
// LINE FEED, CARRIAGE RETURN, U+2028 (LINE SEPARATOR) or U+2029 (PARAGRAPH SEPARATOR).
static bool is_line_terminator(u32 code_point)
{
    switch (code_point) {
    case '\n':
    case '\r':
    case 0x2028:
    case 0x2029:
        return true;
    default:
        return false;
    }
}
|
||
|
||
// NOTE(review): presumably expands to the out-of-line GC allocator definition for
// RegExpConstructor - confirm against the GC_DEFINE_ALLOCATOR macro.
GC_DEFINE_ALLOCATOR(RegExpConstructor);
|
||
|
||
// Builds the constructor as a NativeFunction named after the "RegExp" entry of the VM's
// common names, with the realm's intrinsic function prototype as its prototype.
// Properties are installed later, in initialize().
RegExpConstructor::RegExpConstructor(Realm& realm)
    : NativeFunction(realm.vm().names.RegExp.as_string(), realm.intrinsics().function_prototype())
{
}
|
||
|
||
// Installs the constructor's own properties: `prototype`, `escape`, the species accessor,
// `length`, and the accessors from the legacy static properties proposal.
// The definitions are kept in their original order.
void RegExpConstructor::initialize(Realm& realm)
{
    auto& vm = this->vm();
    Base::initialize(realm);

    // 22.2.5.2 RegExp.prototype, https://tc39.es/ecma262/#sec-regexp.prototype
    define_direct_property(vm.names.prototype, realm.intrinsics().regexp_prototype(), 0);

    u8 const writable_and_configurable = Attribute::Writable | Attribute::Configurable;
    define_native_function(realm, vm.names.escape, escape, 1, writable_and_configurable);
    define_native_accessor(realm, vm.well_known_symbol_species(), symbol_species_getter, {}, Attribute::Configurable);

    define_direct_property(vm.names.length, Value(2), Attribute::Configurable);

    // Additional properties of the RegExp constructor, https://github.com/tc39/proposal-regexp-legacy-features#additional-properties-of-the-regexp-constructor

    // Most legacy properties are getter-only and configurable; this helper removes the repetition.
    auto define_legacy_getter = [&](auto const& name, auto getter) {
        define_native_accessor(realm, name, getter, {}, Attribute::Configurable);
    };

    // `input` and its alias are the only legacy properties that also have a setter.
    define_native_accessor(realm, vm.names.input, input_getter, input_setter, Attribute::Configurable);
    define_native_accessor(realm, vm.names.inputAlias, input_alias_getter, input_alias_setter, Attribute::Configurable);

    define_legacy_getter(vm.names.lastMatch, last_match_getter);
    define_legacy_getter(vm.names.lastMatchAlias, last_match_alias_getter);
    define_legacy_getter(vm.names.lastParen, last_paren_getter);
    define_legacy_getter(vm.names.lastParenAlias, last_paren_alias_getter);
    define_legacy_getter(vm.names.leftContext, left_context_getter);
    define_legacy_getter(vm.names.leftContextAlias, left_context_alias_getter);
    define_legacy_getter(vm.names.rightContext, right_context_getter);
    define_legacy_getter(vm.names.rightContextAlias, right_context_alias_getter);

    // Capture group accessors $1 through $9.
    define_legacy_getter(vm.names.$1, group_1_getter);
    define_legacy_getter(vm.names.$2, group_2_getter);
    define_legacy_getter(vm.names.$3, group_3_getter);
    define_legacy_getter(vm.names.$4, group_4_getter);
    define_legacy_getter(vm.names.$5, group_5_getter);
    define_legacy_getter(vm.names.$6, group_6_getter);
    define_legacy_getter(vm.names.$7, group_7_getter);
    define_legacy_getter(vm.names.$8, group_8_getter);
    define_legacy_getter(vm.names.$9, group_9_getter);
}
|
||
|
||
// 22.2.4.1 RegExp ( pattern, flags ), https://tc39.es/ecma262/#sec-regexp-pattern-flags
|
||
ThrowCompletionOr<Value> RegExpConstructor::call()
|
||
{
|
||
auto& vm = this->vm();
|
||
|
||
auto pattern = vm.argument(0);
|
||
auto flags = vm.argument(1);
|
||
|
||
// 1. Let patternIsRegExp be ? IsRegExp(pattern).
|
||
bool pattern_is_regexp = TRY(pattern.is_regexp(vm));
|
||
|
||
// 2. If NewTarget is undefined, then
|
||
// a. Let newTarget be the active function object.
|
||
auto& new_target = *this;
|
||
|
||
// b. If patternIsRegExp is true and flags is undefined, then
|
||
if (pattern_is_regexp && flags.is_undefined()) {
|
||
// i. Let patternConstructor be ? Get(pattern, "constructor").
|
||
auto pattern_constructor = TRY(pattern.as_object().get(vm.names.constructor));
|
||
|
||
// ii. If SameValue(newTarget, patternConstructor) is true, return pattern.
|
||
if (same_value(&new_target, pattern_constructor))
|
||
return pattern;
|
||
}
|
||
|
||
return TRY(construct(new_target));
|
||
}
|
||
|
||
// 22.2.4.1 RegExp ( pattern, flags ), https://tc39.es/ecma262/#sec-regexp-pattern-flags
|
||
ThrowCompletionOr<GC::Ref<Object>> RegExpConstructor::construct(FunctionObject& new_target)
|
||
{
|
||
auto& vm = this->vm();
|
||
|
||
auto pattern = vm.argument(0);
|
||
auto flags = vm.argument(1);
|
||
|
||
// 1. Let patternIsRegExp be ? IsRegExp(pattern).
|
||
bool pattern_is_regexp = TRY(pattern.is_regexp(vm));
|
||
|
||
// NOTE: Step 2 is handled in call() above.
|
||
// 3. Else, let newTarget be NewTarget.
|
||
|
||
Value pattern_value;
|
||
Value flags_value;
|
||
|
||
// 4. If pattern is an Object and pattern has a [[RegExpMatcher]] internal slot, then
|
||
if (auto regexp_pattern = pattern.as_if<RegExpObject>()) {
|
||
// a. Let P be pattern.[[OriginalSource]].
|
||
pattern_value = PrimitiveString::create(vm, regexp_pattern->pattern());
|
||
|
||
// b. If flags is undefined, let F be pattern.[[OriginalFlags]].
|
||
if (flags.is_undefined())
|
||
flags_value = PrimitiveString::create(vm, regexp_pattern->flags());
|
||
// c. Else, let F be flags.
|
||
else
|
||
flags_value = flags;
|
||
}
|
||
// 5. Else if patternIsRegExp is true, then
|
||
else if (pattern_is_regexp) {
|
||
// a. Let P be ? Get(pattern, "source").
|
||
pattern_value = TRY(pattern.as_object().get(vm.names.source));
|
||
|
||
// b. If flags is undefined, then
|
||
if (flags.is_undefined()) {
|
||
// i. Let F be ? Get(pattern, "flags").
|
||
flags_value = TRY(pattern.as_object().get(vm.names.flags));
|
||
}
|
||
// c. Else, let F be flags.
|
||
else {
|
||
flags_value = flags;
|
||
}
|
||
}
|
||
// 6. Else,
|
||
else {
|
||
// a. Let P be pattern.
|
||
pattern_value = pattern;
|
||
|
||
// b. Let F be flags.
|
||
flags_value = flags;
|
||
}
|
||
|
||
// 7. Let O be ? RegExpAlloc(newTarget).
|
||
auto regexp_object = TRY(regexp_alloc(vm, new_target));
|
||
|
||
// 8. Return ? RegExpInitialize(O, P, F).
|
||
return TRY(regexp_object->regexp_initialize(vm, pattern_value, flags_value));
|
||
}
|
||
|
||
// 22.2.5.1.1 EncodeForRegExpEscape ( cp ), https://tc39.es/ecma262/#sec-encodeforregexpescape
|
||
static String encode_for_regexp_escape(u32 code_point)
|
||
{
|
||
// https://tc39.es/ecma262/#table-controlescape-code-point-values
|
||
// Table 63: ControlEscape Code Point Values
|
||
struct ControlEscape {
|
||
u32 code_point { 0 };
|
||
char control_escape { 0 };
|
||
};
|
||
static constexpr auto control_escapes = to_array<ControlEscape>({
|
||
{ 0x09, 't' },
|
||
{ 0x0A, 'n' },
|
||
{ 0x0B, 'v' },
|
||
{ 0x0C, 'f' },
|
||
{ 0x0D, 'r' },
|
||
});
|
||
|
||
// 1. If c is matched by SyntaxCharacter or c is U+002F (SOLIDUS), then
|
||
if (is_syntax_character(code_point) || code_point == '/') {
|
||
// a. Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and UTF16EncodeCodePoint(c).
|
||
return MUST(String::formatted("\\{}", String::from_code_point(code_point)));
|
||
}
|
||
|
||
// 2. Else if c is the code point listed in some cell of the “Code Point” column of Table 63, then
|
||
auto it = find_if(control_escapes.begin(), control_escapes.end(), [&](auto const& escape) {
|
||
return escape.code_point == code_point;
|
||
});
|
||
|
||
if (it != control_escapes.end()) {
|
||
// a. Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and the string in the “ControlEscape” column
|
||
// of the row whose “Code Point” column contains c.
|
||
return MUST(String::formatted("\\{}", it->control_escape));
|
||
}
|
||
|
||
// 3. Let otherPunctuators be the string-concatenation of ",-=<>#&!%:;@~'`" and the code unit 0x0022 (QUOTATION MARK).
|
||
// 4. Let toEscape be StringToCodePoints(otherPunctuators).
|
||
static constexpr Utf8View to_escape { ",-=<>#&!%:;@~'`\""sv };
|
||
|
||
// 5. If toEscape contains c, c is matched by either WhiteSpace or LineTerminator, or c has the same numeric value
|
||
// as a leading surrogate or trailing surrogate, then
|
||
if (to_escape.contains(code_point) || is_whitespace(code_point) || is_line_terminator(code_point) || is_unicode_surrogate(code_point)) {
|
||
// a. Let cNum be the numeric value of c.
|
||
// b. If cNum ≤ 0xFF, then
|
||
if (code_point <= 0xFF) {
|
||
// i. Let hex be Number::toString(𝔽(cNum), 16).
|
||
// ii. Return the string-concatenation of the code unit 0x005C (REVERSE SOLIDUS), "x", and
|
||
// StringPad(hex, 2, "0", START).
|
||
return MUST(String::formatted("\\x{:02x}", code_point));
|
||
}
|
||
|
||
// c. Let escaped be the empty String.
|
||
// d. Let codeUnits be UTF16EncodeCodePoint(c).
|
||
// e. For each code unit cu of codeUnits, do
|
||
// i. Set escaped to the string-concatenation of escaped and UnicodeEscape(cu).
|
||
// f. Return escaped.
|
||
return MUST(String::formatted("\\u{:04x}", code_point));
|
||
}
|
||
|
||
// 6. Return UTF16EncodeCodePoint(c).
|
||
return String::from_code_point(code_point);
|
||
}
|
||
|
||
// 22.2.5.1 RegExp.escape ( S ), https://tc39.es/ecma262/#sec-regexp.escape
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::escape)
|
||
{
|
||
auto string = vm.argument(0);
|
||
|
||
// 1. If S is not a String, throw a TypeError exception.
|
||
if (!string.is_string())
|
||
return vm.throw_completion<TypeError>(ErrorType::NotAString, string);
|
||
|
||
// 2. Let escaped be the empty String.
|
||
StringBuilder escaped(string.as_string().utf8_string().byte_count());
|
||
|
||
// 3. Let cpList be StringToCodePoints(S).
|
||
auto code_point_list = string.as_string().utf8_string();
|
||
|
||
// 4. For each code point c of cpList, do
|
||
for (auto code_point : code_point_list.code_points()) {
|
||
// a. If escaped is the empty String and c is matched by either DecimalDigit or AsciiLetter, then
|
||
if (escaped.is_empty() && is_ascii_alphanumeric(code_point)) {
|
||
// i. NOTE: Escaping a leading digit ensures that output corresponds with pattern text which may be used
|
||
// after a \0 character escape or a DecimalEscape such as \1 and still match S rather than be interpreted
|
||
// as an extension of the preceding escape sequence. Escaping a leading ASCII letter does the same for
|
||
// the context after \c.
|
||
|
||
// ii. Let numericValue be the numeric value of c.
|
||
// iii. Let hex be Number::toString(𝔽(numericValue), 16).
|
||
// iv. Assert: The length of hex is 2.
|
||
// v. Set escaped to the string-concatenation of the code unit 0x005C (REVERSE SOLIDUS), "x", and hex.
|
||
escaped.appendff("\\x{:02x}", code_point);
|
||
}
|
||
// b. Else,
|
||
else {
|
||
// i. Set escaped to the string-concatenation of escaped and EncodeForRegExpEscape(c).
|
||
escaped.append(encode_for_regexp_escape(code_point));
|
||
}
|
||
}
|
||
|
||
// 5. Return escaped.
|
||
return JS::PrimitiveString::create(vm, MUST(escaped.to_string()));
|
||
}
|
||
|
||
// 22.2.5.3 get RegExp [ %Symbol.species% ], https://tc39.es/ecma262/#sec-get-regexp-@@species
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::symbol_species_getter)
|
||
{
|
||
// 1. Return the this value.
|
||
return vm.this_value();
|
||
}
|
||
|
||
// get RegExp.input, https://github.com/tc39/proposal-regexp-legacy-features#get-regexpinput
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::input_getter)
|
||
{
|
||
auto regexp_constructor = vm.current_realm()->intrinsics().regexp_constructor();
|
||
|
||
// 1. Return ? GetLegacyRegExpStaticProperty(%RegExp%, this value, [[RegExpInput]]).
|
||
auto property_getter = &RegExpLegacyStaticProperties::input;
|
||
return TRY(get_legacy_regexp_static_property(vm, regexp_constructor, vm.this_value(), property_getter));
|
||
}
|
||
|
||
// get RegExp.$_, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp_
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::input_alias_getter)
|
||
{
|
||
// Keep the same implementation with `get RegExp.input`
|
||
return input_getter(vm);
|
||
}
|
||
|
||
// set RegExp.input, https://github.com/tc39/proposal-regexp-legacy-features#set-regexpinput--val
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::input_setter)
|
||
{
|
||
auto regexp_constructor = vm.current_realm()->intrinsics().regexp_constructor();
|
||
|
||
// 1. Perform ? SetLegacyRegExpStaticProperty(%RegExp%, this value, [[RegExpInput]], val).
|
||
auto property_setter = &RegExpLegacyStaticProperties::set_input;
|
||
TRY(set_legacy_regexp_static_property(vm, regexp_constructor, vm.this_value(), property_setter, vm.argument(0)));
|
||
return js_undefined();
|
||
}
|
||
|
||
// set RegExp.$_, https://github.com/tc39/proposal-regexp-legacy-features#set-regexp_---val
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::input_alias_setter)
|
||
{
|
||
// Keep the same implementation with `set RegExp.input`
|
||
return input_setter(vm);
|
||
}
|
||
|
||
// get RegExp.lastMatch, https://github.com/tc39/proposal-regexp-legacy-features#get-regexplastmatch
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::last_match_getter)
|
||
{
|
||
auto regexp_constructor = vm.current_realm()->intrinsics().regexp_constructor();
|
||
|
||
// 1. Return ? GetLegacyRegExpStaticProperty(%RegExp%, this value, [[RegExpLastMatch]]).
|
||
auto property_getter = &RegExpLegacyStaticProperties::last_match;
|
||
return TRY(get_legacy_regexp_static_property(vm, regexp_constructor, vm.this_value(), property_getter));
|
||
}
|
||
|
||
// get RegExp.$&, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::last_match_alias_getter)
|
||
{
|
||
// Keep the same implementation with `get RegExp.lastMatch`
|
||
return last_match_getter(vm);
|
||
}
|
||
|
||
// get RegExp.lastParen, https://github.com/tc39/proposal-regexp-legacy-features#get-regexplastparen
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::last_paren_getter)
|
||
{
|
||
auto regexp_constructor = vm.current_realm()->intrinsics().regexp_constructor();
|
||
|
||
// 1. Return ? GetLegacyRegExpStaticProperty(%RegExp%, this value, [[RegExpLastParen]]).
|
||
auto property_getter = &RegExpLegacyStaticProperties::last_paren;
|
||
return TRY(get_legacy_regexp_static_property(vm, regexp_constructor, vm.this_value(), property_getter));
|
||
}
|
||
|
||
// get RegExp.$+, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp-1
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::last_paren_alias_getter)
|
||
{
|
||
// Keep the same implementation with `get RegExp.lastParen`
|
||
return last_paren_getter(vm);
|
||
}
|
||
|
||
// get RegExp.leftContext, https://github.com/tc39/proposal-regexp-legacy-features#get-regexpleftcontext
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::left_context_getter)
|
||
{
|
||
auto regexp_constructor = vm.current_realm()->intrinsics().regexp_constructor();
|
||
|
||
// 1. Return ? GetLegacyRegExpStaticProperty(%RegExp%, this value, [[RegExpLeftContext]]).
|
||
auto property_getter = &RegExpLegacyStaticProperties::left_context;
|
||
return TRY(get_legacy_regexp_static_property(vm, regexp_constructor, vm.this_value(), property_getter));
|
||
}
|
||
|
||
// get RegExp.$`, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp-2
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::left_context_alias_getter)
|
||
{
|
||
// Keep the same implementation with `get RegExp.leftContext`
|
||
return left_context_getter(vm);
|
||
}
|
||
|
||
// get RegExp.rightContext, https://github.com/tc39/proposal-regexp-legacy-features#get-regexprightcontext
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::right_context_getter)
|
||
{
|
||
auto regexp_constructor = vm.current_realm()->intrinsics().regexp_constructor();
|
||
|
||
// 1. Return ? GetLegacyRegExpStaticProperty(%RegExp%, this value, [[RegExpRightContext]]).
|
||
auto property_getter = &RegExpLegacyStaticProperties::right_context;
|
||
return TRY(get_legacy_regexp_static_property(vm, regexp_constructor, vm.this_value(), property_getter));
|
||
}
|
||
|
||
// get RegExp.$', https://github.com/tc39/proposal-regexp-legacy-features#get-regexp-3
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::right_context_alias_getter)
|
||
{
|
||
// Keep the same implementation with `get RegExp.rightContext`
|
||
return right_context_getter(vm);
|
||
}
|
||
|
||
#define DEFINE_REGEXP_GROUP_GETTER(n) \
|
||
JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::group_##n##_getter) \
|
||
{ \
|
||
auto regexp_constructor = vm.current_realm()->intrinsics().regexp_constructor(); \
|
||
\
|
||
/* 1. Return ? GetLegacyRegExpStaticProperty(%RegExp%, this value, [[RegExpParen##n##]]).*/ \
|
||
auto property_getter = &RegExpLegacyStaticProperties::$##n; \
|
||
return TRY(get_legacy_regexp_static_property(vm, regexp_constructor, vm.this_value(), property_getter)); \
|
||
}
|
||
|
||
// get RegExp.$1, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp1
|
||
DEFINE_REGEXP_GROUP_GETTER(1);
|
||
// get RegExp.$2, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp2
|
||
DEFINE_REGEXP_GROUP_GETTER(2);
|
||
// get RegExp.$3, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp3
|
||
DEFINE_REGEXP_GROUP_GETTER(3);
|
||
// get RegExp.$4, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp4
|
||
DEFINE_REGEXP_GROUP_GETTER(4);
|
||
// get RegExp.$5, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp5
|
||
DEFINE_REGEXP_GROUP_GETTER(5);
|
||
// get RegExp.$6, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp6
|
||
DEFINE_REGEXP_GROUP_GETTER(6);
|
||
// get RegExp.$7, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp7
|
||
DEFINE_REGEXP_GROUP_GETTER(7);
|
||
// get RegExp.$8, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp8
|
||
DEFINE_REGEXP_GROUP_GETTER(8);
|
||
// get RegExp.$9, https://github.com/tc39/proposal-regexp-legacy-features#get-regexp9
|
||
DEFINE_REGEXP_GROUP_GETTER(9);
|
||
|
||
#undef DEFINE_REGEXP_GROUP_GETTER
|
||
|
||
}
|