diff --git a/Libraries/LibJS/CMakeLists.txt b/Libraries/LibJS/CMakeLists.txt index cc24d060700..39f8c99ff1b 100644 --- a/Libraries/LibJS/CMakeLists.txt +++ b/Libraries/LibJS/CMakeLists.txt @@ -18,7 +18,6 @@ set(SOURCES Contrib/Test262/IsHTMLDDA.cpp CyclicModule.cpp Heap/Cell.cpp - Lexer.cpp Module.cpp ParserError.cpp Print.cpp @@ -265,7 +264,6 @@ set(SOURCES SourceTextModule.cpp SyntaxHighlighter.cpp SyntheticModule.cpp - Token.cpp ) generate_bytecode_def_derived() diff --git a/Libraries/LibJS/Lexer.cpp b/Libraries/LibJS/Lexer.cpp deleted file mode 100644 index 4f13ca21652..00000000000 --- a/Libraries/LibJS/Lexer.cpp +++ /dev/null @@ -1,1049 +0,0 @@ -/* - * Copyright (c) 2020, Stephan Unverwerth - * Copyright (c) 2020-2021, Linus Groh - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include -#include -#include -#include -#include - -namespace JS { - -HashMap Lexer::s_keywords; - -static constexpr TokenType parse_two_char_token(Utf16View const& view) -{ - if (view.length_in_code_units() != 2) - return TokenType::Invalid; - - auto ch0 = view.code_unit_at(0); - auto ch1 = view.code_unit_at(1); - - switch (ch0) { - case '=': - switch (ch1) { - case '>': - return TokenType::Arrow; - case '=': - return TokenType::EqualsEquals; - default: - return TokenType::Invalid; - } - case '+': - switch (ch1) { - case '=': - return TokenType::PlusEquals; - case '+': - return TokenType::PlusPlus; - default: - return TokenType::Invalid; - } - case '-': - switch (ch1) { - case '=': - return TokenType::MinusEquals; - case '-': - return TokenType::MinusMinus; - default: - return TokenType::Invalid; - } - case '*': - switch (ch1) { - case '=': - return TokenType::AsteriskEquals; - case '*': - return TokenType::DoubleAsterisk; - default: - return TokenType::Invalid; - } - case '/': - switch (ch1) { - case '=': - return TokenType::SlashEquals; - default: - return TokenType::Invalid; - } - case '%': - switch (ch1) { - case '=': - return TokenType::PercentEquals; - default: - return TokenType::Invalid; - } - case '&': - switch (ch1) { - case '=': - return TokenType::AmpersandEquals; - case '&': - return TokenType::DoubleAmpersand; - default: - return TokenType::Invalid; - } - case '|': - switch (ch1) { - case '=': - return TokenType::PipeEquals; - case '|': - return TokenType::DoublePipe; - default: - return TokenType::Invalid; - } - case '^': - switch (ch1) { - case '=': - return TokenType::CaretEquals; - default: - return TokenType::Invalid; - } - case '<': - switch (ch1) { - case '=': - return TokenType::LessThanEquals; - case '<': - return TokenType::ShiftLeft; - default: - return TokenType::Invalid; - } - case '>': - switch (ch1) { - case '=': - return TokenType::GreaterThanEquals; - case '>': - return TokenType::ShiftRight; - default: - return TokenType::Invalid; - } - case '?': - switch (ch1) { - case '?': - return TokenType::DoubleQuestionMark; - case '.': - return TokenType::QuestionMarkPeriod; - default: - return TokenType::Invalid; - } - case '!': - switch (ch1) { - case '=': - return TokenType::ExclamationMarkEquals; - default: - return TokenType::Invalid; - } - default: - return TokenType::Invalid; - } -} - -static constexpr TokenType parse_three_char_token(Utf16View const& view) -{ - if (view.length_in_code_units() != 3) - return TokenType::Invalid; - - auto ch0 = view.code_unit_at(0); - auto ch1 = view.code_unit_at(1); - auto ch2 = view.code_unit_at(2); - - switch (ch0) { - case '<': - if (ch1 == '<' && ch2 == '=') - return TokenType::ShiftLeftEquals; - return TokenType::Invalid; - case '>': - if (ch1 == '>' && ch2 == '=') - return TokenType::ShiftRightEquals; - if (ch1 == '>' && ch2 == '>') - return TokenType::UnsignedShiftRight; - return TokenType::Invalid; - case '=': - if (ch1 == '=' && ch2 == '=') - return TokenType::EqualsEqualsEquals; - return TokenType::Invalid; - case '!': - if (ch1 == '=' && ch2 == '=') - return TokenType::ExclamationMarkEqualsEquals; - return TokenType::Invalid; - case '.': - if (ch1 == '.' && ch2 == '.') - return TokenType::TripleDot; - return TokenType::Invalid; - case '*': - if (ch1 == '*' && ch2 == '=') - return TokenType::DoubleAsteriskEquals; - return TokenType::Invalid; - case '&': - if (ch1 == '&' && ch2 == '=') - return TokenType::DoubleAmpersandEquals; - return TokenType::Invalid; - case '|': - if (ch1 == '|' && ch2 == '=') - return TokenType::DoublePipeEquals; - return TokenType::Invalid; - case '?': - if (ch1 == '?' && ch2 == '=') - return TokenType::DoubleQuestionMarkEquals; - return TokenType::Invalid; - default: - return TokenType::Invalid; - } -} - -static consteval AK::Array make_single_char_tokens_array() -{ - AK::Array array; - array.fill(TokenType::Invalid); - array['&'] = TokenType::Ampersand; - array['*'] = TokenType::Asterisk; - array['['] = TokenType::BracketOpen; - array[']'] = TokenType::BracketClose; - array['^'] = TokenType::Caret; - array[':'] = TokenType::Colon; - array[','] = TokenType::Comma; - array['{'] = TokenType::CurlyOpen; - array['}'] = TokenType::CurlyClose; - array['='] = TokenType::Equals; - array['!'] = TokenType::ExclamationMark; - array['-'] = TokenType::Minus; - array['('] = TokenType::ParenOpen; - array[')'] = TokenType::ParenClose; - array['%'] = TokenType::Percent; - array['.'] = TokenType::Period; - array['|'] = TokenType::Pipe; - array['+'] = TokenType::Plus; - array['?'] = TokenType::QuestionMark; - array[';'] = TokenType::Semicolon; - array['/'] = TokenType::Slash; - array['~'] = TokenType::Tilde; - array['<'] = TokenType::LessThan; - array['>'] = TokenType::GreaterThan; - return array; -} - -static constexpr auto s_single_char_tokens = make_single_char_tokens_array(); - -Lexer::Lexer(NonnullRefPtr source_code, size_t line_number, size_t line_column) - : m_source_code(move(source_code)) - , m_current_token(TokenType::Eof, {}, {}, {}, 0, 0, 0) - , m_line_number(line_number) - , m_line_column(line_column) -{ - if (s_keywords.is_empty()) { - s_keywords.set("async"_utf16_fly_string, TokenType::Async); - s_keywords.set("await"_utf16_fly_string, TokenType::Await); - s_keywords.set("break"_utf16_fly_string, TokenType::Break); - s_keywords.set("case"_utf16_fly_string, TokenType::Case); - s_keywords.set("catch"_utf16_fly_string, TokenType::Catch); - s_keywords.set("class"_utf16_fly_string, TokenType::Class); - s_keywords.set("const"_utf16_fly_string, TokenType::Const); - s_keywords.set("continue"_utf16_fly_string, TokenType::Continue); - s_keywords.set("debugger"_utf16_fly_string, TokenType::Debugger); - s_keywords.set("default"_utf16_fly_string, TokenType::Default); - s_keywords.set("delete"_utf16_fly_string, TokenType::Delete); - s_keywords.set("do"_utf16_fly_string, TokenType::Do); - s_keywords.set("else"_utf16_fly_string, TokenType::Else); - s_keywords.set("enum"_utf16_fly_string, TokenType::Enum); - s_keywords.set("export"_utf16_fly_string, TokenType::Export); - s_keywords.set("extends"_utf16_fly_string, TokenType::Extends); - s_keywords.set("false"_utf16_fly_string, TokenType::BoolLiteral); - s_keywords.set("finally"_utf16_fly_string, TokenType::Finally); - s_keywords.set("for"_utf16_fly_string, TokenType::For); - s_keywords.set("function"_utf16_fly_string, TokenType::Function); - s_keywords.set("if"_utf16_fly_string, TokenType::If); - s_keywords.set("import"_utf16_fly_string, TokenType::Import); - s_keywords.set("in"_utf16_fly_string, TokenType::In); - s_keywords.set("instanceof"_utf16_fly_string, TokenType::Instanceof); - s_keywords.set("let"_utf16_fly_string, TokenType::Let); - s_keywords.set("new"_utf16_fly_string, TokenType::New); - s_keywords.set("null"_utf16_fly_string, TokenType::NullLiteral); - s_keywords.set("return"_utf16_fly_string, TokenType::Return); - s_keywords.set("super"_utf16_fly_string, TokenType::Super); - s_keywords.set("switch"_utf16_fly_string, TokenType::Switch); - s_keywords.set("this"_utf16_fly_string, TokenType::This); - s_keywords.set("throw"_utf16_fly_string, TokenType::Throw); - s_keywords.set("true"_utf16_fly_string, TokenType::BoolLiteral); - s_keywords.set("try"_utf16_fly_string, TokenType::Try); - s_keywords.set("typeof"_utf16_fly_string, TokenType::Typeof); - s_keywords.set("var"_utf16_fly_string, TokenType::Var); - s_keywords.set("void"_utf16_fly_string, TokenType::Void); - s_keywords.set("while"_utf16_fly_string, TokenType::While); - s_keywords.set("with"_utf16_fly_string, TokenType::With); - s_keywords.set("yield"_utf16_fly_string, TokenType::Yield); - } - - consume(); -} - -void Lexer::consume() -{ - auto did_reach_eof = [this] { - if (m_position < source_code().length_in_code_units()) - return false; - m_eof = true; - m_current_code_unit = '\0'; - m_position = source_code().length_in_code_units() + 1; - m_line_column++; - return true; - }; - - if (m_position > source_code().length_in_code_units()) - return; - - if (did_reach_eof()) - return; - - if (is_line_terminator()) { - if constexpr (LEXER_DEBUG) { - StringView type; - if (m_current_code_unit == '\n') - type = "LINE FEED"sv; - else if (m_current_code_unit == '\r') - type = "CARRIAGE RETURN"sv; - else if (m_current_code_unit == LINE_SEPARATOR) - type = "LINE SEPARATOR"sv; - else if (m_current_code_unit == PARAGRAPH_SEPARATOR) - type = "PARAGRAPH SEPARATOR"sv; - else - VERIFY_NOT_REACHED(); - dbgln("Found a line terminator: {}", type); - } - - // If the previous character is \r and the current one \n we already updated line number - // and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators: - // The sequence is commonly used as a line terminator. - // It should be considered a single SourceCharacter for the purpose of reporting line numbers. - auto second_char_of_crlf = m_position > 1 && source().code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n'; - - if (!second_char_of_crlf) { - m_line_number++; - m_line_column = 1; - dbgln_if(LEXER_DEBUG, "Incremented line number, now at: line {}, column 1", m_line_number); - } else { - dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again."); - } - } else { - if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < source_code().length_in_code_units()) { - if (AK::UnicodeUtils::is_utf16_low_surrogate(source().code_unit_at(m_position))) { - ++m_position; - - if (did_reach_eof()) - return; - } - } - - ++m_line_column; - } - - m_current_code_unit = source().code_unit_at(m_position++); -} - -bool Lexer::consume_decimal_number() -{ - if (!is_ascii_digit(m_current_code_unit)) - return false; - - while (is_ascii_digit(m_current_code_unit) || match_numeric_literal_separator_followed_by(is_ascii_digit)) { - consume(); - } - return true; -} - -bool Lexer::consume_exponent() -{ - consume(); - if (m_current_code_unit == '-' || m_current_code_unit == '+') - consume(); - - if (!is_ascii_digit(m_current_code_unit)) - return false; - - return consume_decimal_number(); -} - -static constexpr bool is_octal_digit(char16_t ch) -{ - return ch >= '0' && ch <= '7'; -} - -bool Lexer::consume_octal_number() -{ - consume(); - if (!is_octal_digit(m_current_code_unit)) - return false; - - while (is_octal_digit(m_current_code_unit) || match_numeric_literal_separator_followed_by(is_octal_digit)) - consume(); - - return true; -} - -bool Lexer::consume_hexadecimal_number() -{ - consume(); - if (!is_ascii_hex_digit(m_current_code_unit)) - return false; - - while (is_ascii_hex_digit(m_current_code_unit) || match_numeric_literal_separator_followed_by(is_ascii_hex_digit)) - consume(); - - return true; -} - -static constexpr bool is_binary_digit(char16_t ch) -{ - return ch == '0' || ch == '1'; -} - -bool Lexer::consume_binary_number() -{ - consume(); - if (!is_binary_digit(m_current_code_unit)) - return false; - - while (is_binary_digit(m_current_code_unit) || match_numeric_literal_separator_followed_by(is_binary_digit)) - consume(); - - return true; -} - -template -bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const -{ - if (m_position >= source_code().length_in_code_units()) - return false; - return m_current_code_unit == '_' - && callback(source().code_unit_at(m_position)); -} - -bool Lexer::match(char16_t a, char16_t b) const -{ - if (m_position >= source_code().length_in_code_units()) - return false; - - return m_current_code_unit == a - && source().code_unit_at(m_position) == b; -} - -bool Lexer::match(char16_t a, char16_t b, char16_t c) const -{ - if (m_position + 1 >= source_code().length_in_code_units()) - return false; - - return m_current_code_unit == a - && source().code_unit_at(m_position) == b - && source().code_unit_at(m_position + 1) == c; -} - -bool Lexer::match(char16_t a, char16_t b, char16_t c, char16_t d) const -{ - if (m_position + 2 >= source_code().length_in_code_units()) - return false; - - return m_current_code_unit == a - && source().code_unit_at(m_position) == b - && source().code_unit_at(m_position + 1) == c - && source().code_unit_at(m_position + 2) == d; -} - -bool Lexer::is_eof() const -{ - return m_eof; -} - -ALWAYS_INLINE bool Lexer::is_line_terminator() const -{ - // OPTIMIZATION: Fast-path for ASCII characters. - if (m_current_code_unit == '\n' || m_current_code_unit == '\r') - return true; - if (is_ascii(m_current_code_unit)) - return false; - - return JS::is_line_terminator(current_code_point()); -} - -ALWAYS_INLINE u32 Lexer::current_code_point() const -{ - if (m_position == 0) - return AK::UnicodeUtils::REPLACEMENT_CODE_POINT; - - auto substring = source().substring_view(m_position - 1); - if (substring.is_empty()) - return AK::UnicodeUtils::REPLACEMENT_CODE_POINT; - - return *substring.begin(); -} - -bool Lexer::is_whitespace() const -{ - // OPTIMIZATION: Fast-path for ASCII characters. - if (is_ascii_space(m_current_code_unit)) - return true; - if (is_ascii(m_current_code_unit)) - return false; - - return JS::is_whitespace(current_code_point()); -} - -// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence -// u Hex4Digits -// u{ CodePoint } -Optional Lexer::is_identifier_unicode_escape(size_t& identifier_length) const -{ - Utf16GenericLexer lexer(source().substring_view(m_position - 1)); - - if (auto code_point_or_error = lexer.consume_escaped_code_point(false); !code_point_or_error.is_error()) { - identifier_length = lexer.tell(); - return code_point_or_error.value(); - } - - return {}; -} - -// IdentifierStart :: https://tc39.es/ecma262/#prod-IdentifierStart -// UnicodeIDStart -// $ -// _ -// \ UnicodeEscapeSequence -Optional Lexer::is_identifier_start(size_t& identifier_length) const -{ - u32 code_point = current_code_point(); - identifier_length = 1; - - if (code_point == '\\') { - if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value()) - code_point = *maybe_code_point; - else - return {}; - } - - if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$') - return code_point; - - // Optimization: the first codepoint with the ID_Start property after A-Za-z is outside the - // ASCII range (0x00AA), so we can skip code_point_has_property() for any ASCII characters. - if (is_ascii(code_point)) - return {}; - - if (Unicode::code_point_has_identifier_start_property(code_point)) - return code_point; - - return {}; -} - -// IdentifierPart :: https://tc39.es/ecma262/#prod-IdentifierPart -// UnicodeIDContinue -// $ -// \ UnicodeEscapeSequence -// -// -Optional Lexer::is_identifier_middle(size_t& identifier_length) const -{ - u32 code_point = current_code_point(); - identifier_length = 1; - - if (code_point == '\\') { - if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value()) - code_point = *maybe_code_point; - else - return {}; - } - - if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER)) - return code_point; - - // Optimization: the first codepoint with the ID_Continue property after A-Za-z0-9_ is outside the - // ASCII range (0x00AA), so we can skip code_point_has_property() for any ASCII characters. - if (code_point == '_') - return code_point; - if (is_ascii(code_point)) - return {}; - - if (Unicode::code_point_has_identifier_continue_property(code_point)) - return code_point; - - return {}; -} - -bool Lexer::is_line_comment_start(bool line_has_token_yet) const -{ - return match('/', '/') - || (m_allow_html_comments && match('<', '!', '-', '-')) - // "-->" is considered a line comment start if the current line is only whitespace and/or - // other block comment(s); or in other words: the current line does not have a token or - // ongoing line comment yet - || (m_allow_html_comments && !line_has_token_yet && match('-', '-', '>')) - // https://tc39.es/ecma262/#sec-hashbang - || (match('#', '!') && m_position == 1); -} - -bool Lexer::is_block_comment_start() const -{ - return match('/', '*'); -} - -bool Lexer::is_block_comment_end() const -{ - return match('*', '/'); -} - -bool Lexer::is_numeric_literal_start() const -{ - return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < source_code().length_in_code_units() && is_ascii_digit(source().code_unit_at(m_position))); -} - -bool Lexer::slash_means_division() const -{ - auto type = m_current_token.type(); - return m_current_token.is_identifier_name() - || type == TokenType::BigIntLiteral - || type == TokenType::BracketClose - || type == TokenType::CurlyClose - || type == TokenType::MinusMinus - || type == TokenType::NumericLiteral - || type == TokenType::ParenClose - || type == TokenType::PlusPlus - || type == TokenType::PrivateIdentifier - || type == TokenType::RegexLiteral - || type == TokenType::StringLiteral - || type == TokenType::TemplateLiteralEnd; -} - -Token const& Lexer::next() -{ - auto trivia_start = m_position; - auto in_template = !m_template_states.is_empty(); - bool line_has_token_yet = m_line_column > 1; - bool unterminated_comment = false; - - if (!in_template || m_template_states.last().in_expr) { - // consume whitespace and comments - while (true) { - if (is_line_terminator()) { - line_has_token_yet = false; - do { - consume(); - } while (is_line_terminator()); - } else if (is_whitespace()) { - do { - consume(); - } while (is_whitespace()); - } else if (is_line_comment_start(line_has_token_yet)) { - consume(); - do { - consume(); - } while (!is_eof() && !is_line_terminator()); - } else if (is_block_comment_start()) { - size_t start_line_number = m_line_number; - consume(); - do { - consume(); - } while (!is_eof() && !is_block_comment_end()); - if (is_eof()) - unterminated_comment = true; - consume(); // consume * - if (is_eof()) - unterminated_comment = true; - consume(); // consume / - - if (start_line_number != m_line_number) - line_has_token_yet = false; - } else { - break; - } - } - } - - size_t value_start = m_position; - size_t value_start_line_number = m_line_number; - size_t value_start_column_number = m_line_column; - auto token_type = TokenType::Invalid; - auto did_consume_whitespace_or_comments = trivia_start != value_start; - // This is being used to communicate info about invalid tokens to the parser, which then - // can turn that into more specific error messages - instead of us having to make up a - // bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.) - Token::Message token_message = Token::Message::None; - - Optional identifier; - size_t identifier_length = 0; - - if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_code_unit) && !did_consume_whitespace_or_comments) { - token_type = TokenType::RegexFlags; - while (!is_eof() && is_ascii_alpha(m_current_code_unit)) - consume(); - } else if (m_current_code_unit == '`') { - consume(); - - if (!in_template) { - token_type = TokenType::TemplateLiteralStart; - m_template_states.append({ false, 0 }); - } else { - if (m_template_states.last().in_expr) { - m_template_states.append({ false, 0 }); - token_type = TokenType::TemplateLiteralStart; - } else { - m_template_states.take_last(); - token_type = TokenType::TemplateLiteralEnd; - } - } - } else if (in_template && m_template_states.last().in_expr && m_template_states.last().open_bracket_count == 0 && m_current_code_unit == '}') { - consume(); - token_type = TokenType::TemplateLiteralExprEnd; - m_template_states.last().in_expr = false; - } else if (in_template && !m_template_states.last().in_expr) { - if (is_eof()) { - token_type = TokenType::UnterminatedTemplateLiteral; - m_template_states.take_last(); - } else if (match('$', '{')) { - token_type = TokenType::TemplateLiteralExprStart; - consume(); - consume(); - m_template_states.last().in_expr = true; - } else { - // TemplateCharacter :: - // $ [lookahead ≠ {] - // \ TemplateEscapeSequence - // \ NotEscapeSequence - // LineContinuation - // LineTerminatorSequence - // SourceCharacter but not one of ` or \ or $ or LineTerminator - while (!match('$', '{') && m_current_code_unit != '`' && !is_eof()) { - if (match('\\', '$') || match('\\', '`') || match('\\', '\\')) - consume(); - consume(); - } - if (is_eof() && !m_template_states.is_empty()) - token_type = TokenType::UnterminatedTemplateLiteral; - else - token_type = TokenType::TemplateLiteralString; - } - } else if (m_current_code_unit == '#') { - // Note: This has some duplicated code with the identifier lexing below - consume(); - auto code_point = is_identifier_start(identifier_length); - if (code_point.has_value()) { - StringBuilder builder(StringBuilder::Mode::UTF16); - builder.append_code_point('#'); - do { - builder.append_code_point(*code_point); - for (size_t i = 0; i < identifier_length; ++i) - consume(); - - code_point = is_identifier_middle(identifier_length); - } while (code_point.has_value()); - - identifier = builder.to_utf16_string(); - token_type = TokenType::PrivateIdentifier; - } else { - token_type = TokenType::Invalid; - token_message = Token::Message::StartOfPrivateNameNotFollowedByValidIdentifier; - } - } else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) { - bool has_escaped_character = false; - // identifier or keyword - StringBuilder builder(StringBuilder::Mode::UTF16); - do { - builder.append_code_point(*code_point); - for (size_t i = 0; i < identifier_length; ++i) - consume(); - - has_escaped_character |= identifier_length > 1; - - code_point = is_identifier_middle(identifier_length); - } while (code_point.has_value()); - - identifier = builder.to_utf16_string(); - - auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; }); - if (it == s_keywords.end()) - token_type = TokenType::Identifier; - else - token_type = has_escaped_character ? TokenType::EscapedKeyword : it->value; - } else if (is_numeric_literal_start()) { - token_type = TokenType::NumericLiteral; - bool is_invalid_numeric_literal = false; - if (m_current_code_unit == '0') { - consume(); - if (m_current_code_unit == '.') { - // decimal - consume(); - while (is_ascii_digit(m_current_code_unit)) - consume(); - if (m_current_code_unit == 'e' || m_current_code_unit == 'E') - is_invalid_numeric_literal = !consume_exponent(); - } else if (m_current_code_unit == 'e' || m_current_code_unit == 'E') { - is_invalid_numeric_literal = !consume_exponent(); - } else if (m_current_code_unit == 'o' || m_current_code_unit == 'O') { - // octal - is_invalid_numeric_literal = !consume_octal_number(); - if (m_current_code_unit == 'n') { - consume(); - token_type = TokenType::BigIntLiteral; - } - } else if (m_current_code_unit == 'b' || m_current_code_unit == 'B') { - // binary - is_invalid_numeric_literal = !consume_binary_number(); - if (m_current_code_unit == 'n') { - consume(); - token_type = TokenType::BigIntLiteral; - } - } else if (m_current_code_unit == 'x' || m_current_code_unit == 'X') { - // hexadecimal - is_invalid_numeric_literal = !consume_hexadecimal_number(); - if (m_current_code_unit == 'n') { - consume(); - token_type = TokenType::BigIntLiteral; - } - } else if (m_current_code_unit == 'n') { - consume(); - token_type = TokenType::BigIntLiteral; - } else if (is_ascii_digit(m_current_code_unit)) { - // octal without '0o' prefix. Forbidden in 'strict mode' - do { - consume(); - } while (is_ascii_digit(m_current_code_unit)); - } - } else { - // 1...9 or period - while (is_ascii_digit(m_current_code_unit) || match_numeric_literal_separator_followed_by(is_ascii_digit)) - consume(); - if (m_current_code_unit == 'n') { - consume(); - token_type = TokenType::BigIntLiteral; - } else { - if (m_current_code_unit == '.') { - consume(); - if (m_current_code_unit == '_') - is_invalid_numeric_literal = true; - - while (is_ascii_digit(m_current_code_unit) || match_numeric_literal_separator_followed_by(is_ascii_digit)) { - consume(); - } - } - if (m_current_code_unit == 'e' || m_current_code_unit == 'E') { - if (!consume_exponent()) - is_invalid_numeric_literal = true; - } - } - } - if (is_invalid_numeric_literal) { - token_type = TokenType::Invalid; - token_message = Token::Message::InvalidNumericLiteral; - } - } else if (m_current_code_unit == '"' || m_current_code_unit == '\'') { - auto stop_char = m_current_code_unit; - consume(); - // Note: LS/PS line terminators are allowed in string literals. - while (m_current_code_unit != stop_char && m_current_code_unit != '\r' && m_current_code_unit != '\n' && !is_eof()) { - if (m_current_code_unit == '\\') { - consume(); - if (m_current_code_unit == '\r' && m_position < source_code().length_in_code_units() && source().code_unit_at(m_position) == '\n') { - consume(); - } - } - consume(); - } - if (m_current_code_unit != stop_char) { - token_type = TokenType::UnterminatedStringLiteral; - } else { - consume(); - token_type = TokenType::StringLiteral; - } - } else if (m_current_code_unit == '/' && !slash_means_division()) { - consume(); - token_type = consume_regex_literal(); - } else if (m_eof) { - if (unterminated_comment) { - token_type = TokenType::Invalid; - token_message = Token::Message::UnterminatedMultiLineComment; - } else { - token_type = TokenType::Eof; - } - } else { - bool found_token = false; - - // There is only one four-char operator: >>>= - if (match('>', '>', '>', '=')) { - found_token = true; - token_type = TokenType::UnsignedShiftRightEquals; - consume(); - consume(); - consume(); - consume(); - } - - if (!found_token && m_position + 1 < source_code().length_in_code_units()) { - auto three_chars_view = source().substring_view(m_position - 1, 3); - if (auto type = parse_three_char_token(three_chars_view); type != TokenType::Invalid) { - found_token = true; - token_type = type; - consume(); - consume(); - consume(); - } - } - - if (!found_token && m_position < source_code().length_in_code_units()) { - auto two_chars_view = source().substring_view(m_position - 1, 2); - if (auto type = parse_two_char_token(two_chars_view); type != TokenType::Invalid) { - // OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit] - if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < source_code().length_in_code_units() && is_ascii_digit(source().code_unit_at(m_position + 1)))) { - found_token = true; - token_type = type; - consume(); - consume(); - } - } - } - - if (!found_token && is_ascii(m_current_code_unit)) { - if (auto type = s_single_char_tokens[static_cast(m_current_code_unit)]; type != TokenType::Invalid) { - found_token = true; - token_type = type; - consume(); - } - } - - if (!found_token) { - token_type = TokenType::Invalid; - consume(); - } - } - - if (!m_template_states.is_empty() && m_template_states.last().in_expr) { - if (token_type == TokenType::CurlyOpen) { - m_template_states.last().open_bracket_count++; - } else if (token_type == TokenType::CurlyClose) { - m_template_states.last().open_bracket_count--; - } - } - - m_current_token = Token( - token_type, - token_message, - source().substring_view(trivia_start - 1, value_start - trivia_start), - source().substring_view(value_start - 1, m_position - value_start), - value_start_line_number, - value_start_column_number, - value_start - 1); - - if (identifier.has_value()) - m_current_token.set_identifier_value(identifier.release_value()); - - if constexpr (LEXER_DEBUG) { - dbgln("------------------------------"); - dbgln("Token: {}", m_current_token.name()); - dbgln("Trivia: _{}_", m_current_token.trivia()); - dbgln("Value: _{}_", m_current_token.value()); - dbgln("Line: {}, Column: {}", m_current_token.line_number(), m_current_token.line_column()); - dbgln("------------------------------"); - } - - return m_current_token; -} - -Token const& Lexer::force_slash_as_regex() -{ - VERIFY(m_current_token.type() == TokenType::Slash || m_current_token.type() == TokenType::SlashEquals); - - bool has_equals = m_current_token.type() == TokenType::SlashEquals; - - VERIFY(m_position > 0); - size_t value_start = m_position - 1; - - if (has_equals) { - VERIFY(source().code_unit_at(value_start - 1) == '='); - --value_start; - --m_position; - m_current_code_unit = '='; - } - - TokenType token_type = consume_regex_literal(); - - m_current_token = Token( - token_type, - Token::Message::None, - m_current_token.trivia(), - source().substring_view(value_start - 1, m_position - value_start), - m_current_token.line_number(), - m_current_token.line_column(), - value_start - 1); - - if constexpr (LEXER_DEBUG) { - dbgln("------------------------------"); - dbgln("Token: {}", m_current_token.name()); - dbgln("Trivia: _{}_", m_current_token.trivia()); - dbgln("Value: _{}_", m_current_token.value()); - dbgln("Line: {}, Column: {}", m_current_token.line_number(), m_current_token.line_column()); - dbgln("------------------------------"); - } - - return m_current_token; -} - -TokenType Lexer::consume_regex_literal() -{ - while (!is_eof()) { - if (is_line_terminator() || (!m_regex_is_in_character_class && m_current_code_unit == '/')) - break; - - if (m_current_code_unit == '[') { - m_regex_is_in_character_class = true; - } else if (m_current_code_unit == ']') { - m_regex_is_in_character_class = false; - } else if (!m_regex_is_in_character_class && m_current_code_unit == '/') { - break; - } - - if (match('\\', '/') || match('\\', '[') || match('\\', '\\') || (m_regex_is_in_character_class && match('\\', ']'))) - consume(); - consume(); - } - - if (m_current_code_unit == '/') { - consume(); - return TokenType::RegexLiteral; - } - - return TokenType::UnterminatedRegexLiteral; -} - -// https://tc39.es/ecma262/#prod-SyntaxCharacter -bool is_syntax_character(u32 code_point) -{ - // SyntaxCharacter :: one of - // ^ $ \ . * + ? ( ) [ ] { } | - static constexpr auto syntax_characters = "^$\\.*+?()[]{}|"sv; - return is_ascii(code_point) && syntax_characters.contains(static_cast(code_point)); -} - -// https://tc39.es/ecma262/#prod-WhiteSpace -bool is_whitespace(u32 code_point) -{ - // WhiteSpace :: - // - // - // - // - // - if (is_ascii_space(code_point)) - return true; - if (code_point == NO_BREAK_SPACE || code_point == ZERO_WIDTH_NO_BREAK_SPACE) - return true; - return Unicode::code_point_has_space_separator_general_category(code_point); -} - -// https://tc39.es/ecma262/#prod-LineTerminator -bool is_line_terminator(u32 code_point) -{ - // LineTerminator :: - // - // - // - // - return code_point == '\n' || code_point == '\r' || code_point == LINE_SEPARATOR || code_point == PARAGRAPH_SEPARATOR; -} - -} diff --git a/Libraries/LibJS/Lexer.h b/Libraries/LibJS/Lexer.h deleted file mode 100644 index 1810be2b0f5..00000000000 --- a/Libraries/LibJS/Lexer.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2020, Stephan Unverwerth - * Copyright (c) 2020-2025, Andreas Kling - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace JS { - -class JS_API Lexer { -public: - explicit Lexer(NonnullRefPtr, size_t line_number = 1, size_t line_column = 0); - - // These both advance the lexer and return a reference to the current token. - Token const& next(); - Token const& force_slash_as_regex(); - - [[nodiscard]] Token const& current_token() const { return m_current_token; } - - SourceCode const& source_code() const { return m_source_code; } - Utf16View const& source() const { return m_source_code->code_view(); } - Utf16String const& source_string() const { return m_source_code->code(); } - String const& filename() const { return m_source_code->filename(); } - - void disallow_html_comments() { m_allow_html_comments = false; } - -private: - void consume(); - bool consume_exponent(); - bool consume_octal_number(); - bool consume_hexadecimal_number(); - bool consume_binary_number(); - bool consume_decimal_number(); - - u32 current_code_point() const; - - bool is_eof() const; - bool is_line_terminator() const; - bool is_whitespace() const; - Optional is_identifier_unicode_escape(size_t& identifier_length) const; - Optional is_identifier_start(size_t& identifier_length) const; - Optional is_identifier_middle(size_t& identifier_length) const; - bool is_line_comment_start(bool line_has_token_yet) const; - bool is_block_comment_start() const; - bool is_block_comment_end() const; - bool is_numeric_literal_start() const; - bool match(char16_t, char16_t) const; - bool match(char16_t, char16_t, char16_t) const; - bool match(char16_t, char16_t, char16_t, char16_t) const; - template - bool match_numeric_literal_separator_followed_by(Callback) const; - bool slash_means_division() const; - - TokenType consume_regex_literal(); - - NonnullRefPtr m_source_code; - size_t m_position { 0 }; - Token m_current_token; - char16_t m_current_code_unit { 0 }; - bool m_eof { false }; - bool m_regex_is_in_character_class { false }; - bool m_allow_html_comments { true }; - - size_t m_line_number { 1 }; - size_t m_line_column { 0 }; - - struct TemplateState { - bool in_expr; - u8 open_bracket_count; - }; - Vector m_template_states; - - static HashMap s_keywords; -}; - -bool is_syntax_character(u32 code_point); -bool is_whitespace(u32 code_point); -bool is_line_terminator(u32 code_point); - -} diff --git a/Libraries/LibJS/Runtime/RegExpConstructor.cpp b/Libraries/LibJS/Runtime/RegExpConstructor.cpp index 2e89a16100c..6d23f32fffb 100644 --- a/Libraries/LibJS/Runtime/RegExpConstructor.cpp +++ b/Libraries/LibJS/Runtime/RegExpConstructor.cpp @@ -6,15 +6,35 @@ #include #include -#include #include #include #include #include #include +#include namespace JS { +static bool is_syntax_character(u32 code_point) +{ + static constexpr auto syntax_characters = "^$\\.*+?()[]{}|"sv; + return is_ascii(code_point) && syntax_characters.contains(static_cast(code_point)); +} + +static bool is_whitespace(u32 code_point) +{ + if (is_ascii_space(code_point)) + return true; + if (code_point == 0x00A0 || code_point == 0xFEFF) + return true; + return Unicode::code_point_has_space_separator_general_category(code_point); +} + +static bool is_line_terminator(u32 code_point) +{ + return code_point == '\n' || code_point == '\r' || code_point == 0x2028 || code_point == 0x2029; +} + GC_DEFINE_ALLOCATOR(RegExpConstructor); RegExpConstructor::RegExpConstructor(Realm& realm) @@ -164,7 +184,7 @@ static String encode_for_regexp_escape(u32 code_point) }); // 1. If c is matched by SyntaxCharacter or c is U+002F (SOLIDUS), then - if (JS::is_syntax_character(code_point) || code_point == '/') { + if (is_syntax_character(code_point) || code_point == '/') { // a. Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and UTF16EncodeCodePoint(c). return MUST(String::formatted("\\{}", String::from_code_point(code_point))); } @@ -186,7 +206,7 @@ static String encode_for_regexp_escape(u32 code_point) // 5. If toEscape contains c, c is matched by either WhiteSpace or LineTerminator, or c has the same numeric value // as a leading surrogate or trailing surrogate, then - if (to_escape.contains(code_point) || JS::is_whitespace(code_point) || JS::is_line_terminator(code_point) || is_unicode_surrogate(code_point)) { + if (to_escape.contains(code_point) || is_whitespace(code_point) || is_line_terminator(code_point) || is_unicode_surrogate(code_point)) { // a. Let cNum be the numeric value of c. // b. If cNum ≤ 0xFF, then if (code_point <= 0xFF) { diff --git a/Libraries/LibJS/Rust/cbindgen.toml b/Libraries/LibJS/Rust/cbindgen.toml index 8b47b1c09f8..0da5bf09e07 100644 --- a/Libraries/LibJS/Rust/cbindgen.toml +++ b/Libraries/LibJS/Rust/cbindgen.toml @@ -14,7 +14,7 @@ sys_includes = ["stdint.h", "stddef.h"] usize_is_size_t = true [export] -include = ["ConstantTag", "LiteralValueKind", "WellKnownSymbolKind"] +include = ["ConstantTag", "LiteralValueKind", "WellKnownSymbolKind", "FFIToken"] [export.mangle] rename_types = "PascalCase" diff --git a/Libraries/LibJS/Rust/src/lib.rs b/Libraries/LibJS/Rust/src/lib.rs index 3d9b46deaa6..f75bc3d3ed3 100644 --- a/Libraries/LibJS/Rust/src/lib.rs +++ b/Libraries/LibJS/Rust/src/lib.rs @@ -2646,3 +2646,53 @@ unsafe extern "C" { contains_direct_call_to_eval: bool, ); } + +/// C-compatible token info for the tokenize callback. +#[repr(C)] +pub struct FFIToken { + pub token_type: u8, + pub category: u8, + pub offset: u32, + pub length: u32, + pub trivia_offset: u32, + pub trivia_length: u32, +} + +/// Tokenize a UTF-16 source string, calling `callback` for each token. +/// +/// # Safety +/// - `source` must point to a valid UTF-16 buffer of `source_len` elements. +/// - `callback` must be a valid function pointer. +/// - `ctx` is passed through to the callback. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn rust_tokenize( + source: *const u16, + source_len: usize, + ctx: *mut c_void, + callback: unsafe extern "C" fn(ctx: *mut c_void, token: *const FFIToken), +) { + unsafe { + abort_on_panic(|| { + let Some(source_slice) = source_from_raw(source, source_len) else { + return; + }; + let mut lex = lexer::Lexer::new(source_slice, 1, 0); + loop { + let tok = lex.next(); + let is_eof = tok.token_type == token::TokenType::Eof; + let ffi_tok = FFIToken { + token_type: tok.token_type as u8, + category: tok.token_type.category() as u8, + offset: tok.value_start, + length: tok.value_len, + trivia_offset: tok.trivia_start, + trivia_length: tok.trivia_len, + }; + callback(ctx, &raw const ffi_tok); + if is_eof { + break; + } + } + }); + } +} diff --git a/Libraries/LibJS/Rust/src/token.rs b/Libraries/LibJS/Rust/src/token.rs index 0a381ecc9dd..bae3856b596 100644 --- a/Libraries/LibJS/Rust/src/token.rs +++ b/Libraries/LibJS/Rust/src/token.rs @@ -7,6 +7,7 @@ //! Token types and Token struct for the lexer. #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[repr(C)] pub enum TokenCategory { Invalid, Trivia, @@ -25,6 +26,7 @@ pub enum TokenCategory { macro_rules! define_tokens { ( $( $variant:ident => $category:ident ),* $(,)? ) => { #[derive(Debug, Clone, Copy, PartialEq, Eq)] + #[repr(C)] pub enum TokenType { $( $variant, )* } diff --git a/Libraries/LibJS/SyntaxHighlighter.cpp b/Libraries/LibJS/SyntaxHighlighter.cpp index 24c496a68d8..3d8eadc0cad 100644 --- a/Libraries/LibJS/SyntaxHighlighter.cpp +++ b/Libraries/LibJS/SyntaxHighlighter.cpp @@ -6,16 +6,21 @@ */ #include +#include #include -#include +#include #include #include +#ifdef ENABLE_RUST +# include +#endif + namespace JS { -static Gfx::TextAttributes style_for_token_type(Gfx::Palette const& palette, TokenType type) +static Gfx::TextAttributes style_for_token_category(Gfx::Palette const& palette, TokenCategory category) { - switch (Token::category(type)) { + switch (category) { case TokenCategory::Invalid: return { palette.syntax_comment() }; case TokenCategory::Number: @@ -39,8 +44,7 @@ static Gfx::TextAttributes style_for_token_type(Gfx::Palette const& palette, Tok bool SyntaxHighlighter::is_identifier(u64 token) const { - auto js_token = static_cast(static_cast(token)); - return js_token == TokenType::Identifier; + return token_type_from_packed(token) == TokenType::Identifier; } bool SyntaxHighlighter::is_navigatable([[maybe_unused]] u64 token) const @@ -48,80 +52,103 @@ bool SyntaxHighlighter::is_navigatable([[maybe_unused]] u64 token) const return false; } +struct RehighlightState { + Gfx::Palette const& palette; + Vector& spans; + Vector& folding_regions; + u16 const* source; + Syntax::TextPosition position { 0, 0 }; + + struct FoldStart { + Syntax::TextRange range; + }; + Vector folding_region_starts; +}; + +static void advance_position(Syntax::TextPosition& position, u16 const* source, u32 start, u32 len) +{ + for (u32 i = 0; i < len; ++i) { + if (source[start + i] == '\n') { + position.set_line(position.line() + 1); + position.set_column(0); + } else { + position.set_column(position.column() + 1); + } + } +} + +static void on_token(void* ctx, FFI::FFIToken const* ffi_token) +{ + auto& state = *static_cast(ctx); + auto token_type = static_cast(ffi_token->token_type); + auto category = static_cast(ffi_token->category); + + // Emit trivia span + if (ffi_token->trivia_length > 0) { + auto trivia_start = state.position; + advance_position(state.position, state.source, ffi_token->trivia_offset, ffi_token->trivia_length); + Syntax::TextDocumentSpan span; + span.range.set_start(trivia_start); + span.range.set_end({ state.position.line(), state.position.column() }); + span.attributes = style_for_token_category(state.palette, TokenCategory::Trivia); + span.is_skippable = true; + span.data = pack_token_data(TokenType::Trivia, TokenCategory::Trivia); + state.spans.append(span); + } + + // Emit token span + auto token_start = state.position; + if (ffi_token->length > 0) { + advance_position(state.position, state.source, ffi_token->offset, ffi_token->length); + Syntax::TextDocumentSpan span; + span.range.set_start(token_start); + span.range.set_end({ state.position.line(), state.position.column() }); + span.attributes = style_for_token_category(state.palette, category); + span.is_skippable = false; + span.data = pack_token_data(token_type, category); + state.spans.append(span); + } + + // Track folding regions for {} blocks + if (token_type == TokenType::CurlyOpen) { + state.folding_region_starts.append({ .range = { token_start, state.position } }); + } else if (token_type == TokenType::CurlyClose) { + if (!state.folding_region_starts.is_empty()) { + auto curly_open = state.folding_region_starts.take_last(); + Syntax::TextDocumentFoldingRegion region; + region.range.set_start(curly_open.range.end()); + region.range.set_end(token_start); + state.folding_regions.append(region); + } + } +} + void SyntaxHighlighter::rehighlight(Palette const& palette) { auto text = m_client->get_text(); - - Lexer lexer(SourceCode::create({}, Utf16String::from_utf8(text))); + auto source_utf16 = Utf16String::from_utf8(text); + auto source_code = SourceCode::create({}, move(source_utf16)); + auto const* source_data = source_code->utf16_data(); + auto source_len = source_code->length_in_code_units(); Vector spans; Vector folding_regions; - Syntax::TextPosition position { 0, 0 }; - Syntax::TextPosition start { 0, 0 }; - auto advance_position = [&position](u32 code_point) { - if (code_point == '\n') { - position.set_line(position.line() + 1); - position.set_column(0); - } else - position.set_column(position.column() + 1); + RehighlightState state { + .palette = palette, + .spans = spans, + .folding_regions = folding_regions, + .source = source_data, + .position = {}, + .folding_region_starts = {}, }; - auto append_token = [&](Utf16View const& str, Token const& token, bool is_trivia) { - if (str.is_empty()) - return; - - start = position; - for (auto code_point : str) - advance_position(code_point); - - Syntax::TextDocumentSpan span; - span.range.set_start(start); - span.range.set_end({ position.line(), position.column() }); - auto type = is_trivia ? TokenType::Trivia : token.type(); - span.attributes = style_for_token_type(palette, type); - span.is_skippable = is_trivia; - span.data = static_cast(type); - spans.append(span); - - dbgln_if(SYNTAX_HIGHLIGHTING_DEBUG, "{}{} @ '{}' {}:{} - {}:{}", - token.name(), - is_trivia ? " (trivia)" : "", - token.value(), - span.range.start().line(), span.range.start().column(), - span.range.end().line(), span.range.end().column()); - }; - - struct TokenData { - Token token; - Syntax::TextRange range; - }; - Vector folding_region_start_tokens; - - bool was_eof = false; - for (auto token = lexer.next(); !was_eof; token = lexer.next()) { - append_token(token.trivia(), token, true); - - auto token_start_position = position; - append_token(token.value(), token, false); - - if (token.type() == TokenType::Eof) - was_eof = true; - - // Create folding regions for {} blocks - if (token.type() == TokenType::CurlyOpen) { - folding_region_start_tokens.append({ .token = token, - .range = { token_start_position, position } }); - } else if (token.type() == TokenType::CurlyClose) { - if (!folding_region_start_tokens.is_empty()) { - auto curly_open = folding_region_start_tokens.take_last(); - Syntax::TextDocumentFoldingRegion region; - region.range.set_start(curly_open.range.end()); - region.range.set_end(token_start_position); - folding_regions.append(region); - } - } - } +#ifdef ENABLE_RUST + FFI::rust_tokenize(source_data, source_len, &state, + [](void* ctx, FFI::FFIToken const* token) { on_token(ctx, token); }); +#else + (void)source_len; +#endif m_client->do_set_spans(move(spans)); m_client->do_set_folding_regions(move(folding_regions)); @@ -136,9 +163,9 @@ Vector SyntaxHighlighter::matching_token { static Vector pairs; if (pairs.is_empty()) { - pairs.append({ static_cast(TokenType::CurlyOpen), static_cast(TokenType::CurlyClose) }); - pairs.append({ static_cast(TokenType::ParenOpen), static_cast(TokenType::ParenClose) }); - pairs.append({ static_cast(TokenType::BracketOpen), static_cast(TokenType::BracketClose) }); + pairs.append({ pack_token_data(TokenType::CurlyOpen, TokenCategory::Punctuation), pack_token_data(TokenType::CurlyClose, TokenCategory::Punctuation) }); + pairs.append({ pack_token_data(TokenType::ParenOpen, TokenCategory::Punctuation), pack_token_data(TokenType::ParenClose, TokenCategory::Punctuation) }); + pairs.append({ pack_token_data(TokenType::BracketOpen, TokenCategory::Punctuation), pack_token_data(TokenType::BracketClose, TokenCategory::Punctuation) }); } return pairs; } diff --git a/Libraries/LibJS/Token.cpp b/Libraries/LibJS/Token.cpp deleted file mode 100644 index bc5d086290a..00000000000 --- a/Libraries/LibJS/Token.cpp +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Copyright (c) 2020, Stephan Unverwerth - * Copyright (c) 2020-2021, Linus Groh - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include -#include -#include -#include - -namespace JS { - -char const* Token::name(TokenType type) -{ - switch (type) { -#define __ENUMERATE_JS_TOKEN(type, category) \ - case TokenType::type: \ - return #type; - ENUMERATE_JS_TOKENS -#undef __ENUMERATE_JS_TOKEN - default: - VERIFY_NOT_REACHED(); - return ""; - } -} - -char const* Token::name() const -{ - return name(m_type); -} - -TokenCategory Token::category(TokenType type) -{ - switch (type) { -#define __ENUMERATE_JS_TOKEN(type, category) \ - case TokenType::type: \ - return TokenCategory::category; - ENUMERATE_JS_TOKENS -#undef __ENUMERATE_JS_TOKEN - default: - VERIFY_NOT_REACHED(); - } -} - -TokenCategory Token::category() const -{ - return category(m_type); -} - -double Token::double_value() const -{ - VERIFY(type() == TokenType::NumericLiteral); - - auto value = this->value(); - Utf16String buffer; - - if (value.contains('_')) { - buffer = value.replace("_"sv, {}, ReplaceMode::All); - value = buffer; - } - - auto parse_integer_digits = [](Utf16View digits, u8 radix) -> double { - if (auto v = digits.to_number(TrimWhitespace::No, radix); v.has_value()) - return static_cast(v.value()); - double result = 0.0; - for (size_t i = 0; i < digits.length_in_code_units(); ++i) { - auto digit = parse_ascii_hex_digit(digits.code_unit_at(i)); - result = result * radix + digit; - } - return result; - }; - - if (value.length_in_code_units() >= 2 && value.starts_with('0')) { - auto next = value.code_unit_at(1); - - // hexadecimal - if (next == 'x' || next == 'X') - return parse_integer_digits(value.substring_view(2), 16); - - // octal - if (next == 'o' || next == 'O') - return parse_integer_digits(value.substring_view(2), 8); - - // binary - if (next == 'b' || next == 'B') - return parse_integer_digits(value.substring_view(2), 2); - - // also octal, but syntax error in strict mode - if (is_ascii_digit(next) && !value.contains_any_of({ { '8', '9' } })) - return parse_integer_digits(value.substring_view(1), 8); - } - - // This should always be a valid double - return value.to_number(TrimWhitespace::No).value(); -} - -Utf16String Token::string_value(StringValueStatus& status) const -{ - VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString); - - auto is_template = type() == TokenType::TemplateLiteralString; - auto value = this->value(); - - Utf16GenericLexer lexer(is_template ? value : value.substring_view(1, value.length_in_code_units() - 2)); - - auto encoding_failure = [&status](StringValueStatus parse_status) -> Utf16String { - status = parse_status; - return {}; - }; - - StringBuilder builder(StringBuilder::Mode::UTF16); - - while (!lexer.is_eof()) { - // No escape, consume one char and continue - if (!lexer.next_is('\\')) { - if (is_template && lexer.next_is('\r')) { - lexer.ignore(); - if (lexer.next_is('\n')) - lexer.ignore(); - - builder.append('\n'); - continue; - } - - builder.append_code_unit(lexer.consume()); - continue; - } - - // Unicode escape - if (lexer.next_is("\\u"sv)) { - auto code_point_or_error = lexer.consume_escaped_code_point(); - - if (code_point_or_error.is_error()) { - switch (code_point_or_error.error()) { - case AK::UnicodeEscapeError::MalformedUnicodeEscape: - return encoding_failure(StringValueStatus::MalformedUnicodeEscape); - case AK::UnicodeEscapeError::UnicodeEscapeOverflow: - return encoding_failure(StringValueStatus::UnicodeEscapeOverflow); - } - } - - builder.append_code_point(code_point_or_error.value()); - continue; - } - - lexer.ignore(); - VERIFY(!lexer.is_eof()); - - // Line continuation - if (lexer.next_is('\n') || lexer.next_is('\r')) { - if (lexer.next_is("\r\n"sv)) - lexer.ignore(); - lexer.ignore(); - continue; - } - // Line continuation - if (lexer.next_is(LINE_SEPARATOR) || lexer.next_is(PARAGRAPH_SEPARATOR)) { - lexer.ignore(); - continue; - } - // Null-byte escape - if (lexer.next_is('0') && !is_ascii_digit(lexer.peek(1))) { - lexer.ignore(); - builder.append('\0'); - continue; - } - // Hex escape - if (lexer.next_is('x')) { - lexer.ignore(); - if (!is_ascii_hex_digit(lexer.peek()) || !is_ascii_hex_digit(lexer.peek(1))) - return encoding_failure(StringValueStatus::MalformedHexEscape); - - auto code_point = lexer.consume(2).to_number(TrimWhitespace::No, 16).value(); - VERIFY(code_point <= 255); - - builder.append_code_point(code_point); - continue; - } - - // In non-strict mode LegacyOctalEscapeSequence is allowed in strings: - // https://tc39.es/ecma262/#sec-additional-syntax-string-literals - Optional octal_str; - - auto is_octal_digit = [](auto ch) { return ch >= '0' && ch <= '7'; }; - auto is_zero_to_three = [](auto ch) { return ch >= '0' && ch <= '3'; }; - auto is_four_to_seven = [](auto ch) { return ch >= '4' && ch <= '7'; }; - - // OctalDigit [lookahead ∉ OctalDigit] - if (is_octal_digit(lexer.peek()) && !is_octal_digit(lexer.peek(1))) - octal_str = lexer.consume(1); - // ZeroToThree OctalDigit [lookahead ∉ OctalDigit] - else if (is_zero_to_three(lexer.peek()) && is_octal_digit(lexer.peek(1)) && !is_octal_digit(lexer.peek(2))) - octal_str = lexer.consume(2); - // FourToSeven OctalDigit - else if (is_four_to_seven(lexer.peek()) && is_octal_digit(lexer.peek(1))) - octal_str = lexer.consume(2); - // ZeroToThree OctalDigit OctalDigit - else if (is_zero_to_three(lexer.peek()) && is_octal_digit(lexer.peek(1)) && is_octal_digit(lexer.peek(2))) - octal_str = lexer.consume(3); - - if (octal_str.has_value()) { - status = StringValueStatus::LegacyOctalEscapeSequence; - - auto code_point = octal_str->to_number(TrimWhitespace::No, 8).value(); - VERIFY(code_point <= 255); - - builder.append_code_point(code_point); - continue; - } - - if (lexer.next_is('8') || lexer.next_is('9')) { - status = StringValueStatus::LegacyOctalEscapeSequence; - builder.append_code_unit(lexer.consume()); - continue; - } - - lexer.retreat(); - builder.append_code_unit(lexer.consume_escaped_character('\\', "b\bf\fn\nr\rt\tv\v"sv)); - } - - return builder.to_utf16_string(); -} - -// 12.8.6.2 Static Semantics: TRV, https://tc39.es/ecma262/#sec-static-semantics-trv -Utf16String Token::raw_template_value() const -{ - return value().replace("\r\n"sv, "\n"sv, ReplaceMode::All).replace("\r"sv, "\n"sv, ReplaceMode::All); -} - -bool Token::bool_value() const -{ - VERIFY(type() == TokenType::BoolLiteral); - return value() == "true"sv; -} - -bool Token::is_identifier_name() const -{ - // IdentifierNames are Identifiers + ReservedWords - // The standard defines this reversed: Identifiers are IdentifierNames except reserved words - // https://tc39.es/ecma262/#prod-Identifier - return m_type == TokenType::Identifier - || m_type == TokenType::EscapedKeyword - || m_type == TokenType::Await - || m_type == TokenType::Async - || m_type == TokenType::BoolLiteral - || m_type == TokenType::Break - || m_type == TokenType::Case - || m_type == TokenType::Catch - || m_type == TokenType::Class - || m_type == TokenType::Const - || m_type == TokenType::Continue - || m_type == TokenType::Debugger - || m_type == TokenType::Default - || m_type == TokenType::Delete - || m_type == TokenType::Do - || m_type == TokenType::Else - || m_type == TokenType::Enum - || m_type == TokenType::Export - || m_type == TokenType::Extends - || m_type == TokenType::Finally - || m_type == TokenType::For - || m_type == TokenType::Function - || m_type == TokenType::If - || m_type == TokenType::Import - || m_type == TokenType::In - || m_type == TokenType::Instanceof - || m_type == TokenType::Let - || m_type == TokenType::New - || m_type == TokenType::NullLiteral - || m_type == TokenType::Return - || m_type == TokenType::Super - || m_type == TokenType::Switch - || m_type == TokenType::This - || m_type == TokenType::Throw - || m_type == TokenType::Try - || m_type == TokenType::Typeof - || m_type == TokenType::Var - || m_type == TokenType::Void - || m_type == TokenType::While - || m_type == TokenType::With - || m_type == TokenType::Yield; -} - -bool Token::trivia_contains_line_terminator() const -{ - return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR) || m_trivia.contains(PARAGRAPH_SEPARATOR); -} - -String Token::message() const -{ - switch (m_message) { - case Message::StartOfPrivateNameNotFollowedByValidIdentifier: - return "Start of private name '#' but not followed by valid identifier"_string; - case Message::InvalidNumericLiteral: - return "Invalid numeric literal"_string; - case Message::UnterminatedMultiLineComment: - return "Unterminated multi-line comment"_string; - case Message::None: - return {}; - } - VERIFY_NOT_REACHED(); - return {}; -} - -} diff --git a/Libraries/LibJS/Token.h b/Libraries/LibJS/Token.h index a1a344291b6..d0f185b70e3 100644 --- a/Libraries/LibJS/Token.h +++ b/Libraries/LibJS/Token.h @@ -1,169 +1,22 @@ /* - * Copyright (c) 2020, Stephan Unverwerth + * Copyright (c) 2026-present, the Ladybird developers. * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once -#include -#include -#include -#include +#include namespace JS { -// U+00A0 NO BREAK SPACE -constexpr inline char16_t const NO_BREAK_SPACE { 0x00A0 }; +constexpr u32 LINE_SEPARATOR = 0x2028; +constexpr u32 PARAGRAPH_SEPARATOR = 0x2029; -// U+200C ZERO WIDTH NON-JOINER -constexpr inline char16_t const ZERO_WIDTH_NON_JOINER { 0x200C }; +// NB: These enums must match the Rust token::TokenCategory and +// token::TokenType enums in Libraries/LibJS/Rust/src/token.rs. -// U+200D ZERO WIDTH JOINER -constexpr inline char16_t const ZERO_WIDTH_JOINER { 0x200D }; - -// U+2028 LINE SEPARATOR -constexpr inline char16_t const LINE_SEPARATOR { 0x2028 }; - -// U+2029 PARAGRAPH SEPARATOR -constexpr inline char16_t const PARAGRAPH_SEPARATOR { 0x2029 }; - -// U+FEFF ZERO WIDTH NO-BREAK SPACE -constexpr inline char16_t const ZERO_WIDTH_NO_BREAK_SPACE { 0xFEFF }; - -#define ENUMERATE_JS_TOKENS \ - __ENUMERATE_JS_TOKEN(Ampersand, Operator) \ - __ENUMERATE_JS_TOKEN(AmpersandEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Arrow, Operator) \ - __ENUMERATE_JS_TOKEN(Asterisk, Operator) \ - __ENUMERATE_JS_TOKEN(AsteriskEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Async, Keyword) \ - __ENUMERATE_JS_TOKEN(Await, Keyword) \ - __ENUMERATE_JS_TOKEN(BigIntLiteral, Number) \ - __ENUMERATE_JS_TOKEN(BoolLiteral, Keyword) \ - __ENUMERATE_JS_TOKEN(BracketClose, Punctuation) \ - __ENUMERATE_JS_TOKEN(BracketOpen, Punctuation) \ - __ENUMERATE_JS_TOKEN(Break, Keyword) \ - __ENUMERATE_JS_TOKEN(Caret, Operator) \ - __ENUMERATE_JS_TOKEN(CaretEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Case, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Catch, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Class, Keyword) \ - __ENUMERATE_JS_TOKEN(Colon, Punctuation) \ - __ENUMERATE_JS_TOKEN(Comma, Punctuation) \ - __ENUMERATE_JS_TOKEN(Const, Keyword) \ - __ENUMERATE_JS_TOKEN(Continue, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(CurlyClose, Punctuation) \ - __ENUMERATE_JS_TOKEN(CurlyOpen, Punctuation) \ - __ENUMERATE_JS_TOKEN(Debugger, Keyword) \ - __ENUMERATE_JS_TOKEN(Default, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Delete, Keyword) \ - __ENUMERATE_JS_TOKEN(Do, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(DoubleAmpersand, Operator) \ - __ENUMERATE_JS_TOKEN(DoubleAmpersandEquals, Operator) \ - __ENUMERATE_JS_TOKEN(DoubleAsterisk, Operator) \ - __ENUMERATE_JS_TOKEN(DoubleAsteriskEquals, Operator) \ - __ENUMERATE_JS_TOKEN(DoublePipe, Operator) \ - __ENUMERATE_JS_TOKEN(DoublePipeEquals, Operator) \ - __ENUMERATE_JS_TOKEN(DoubleQuestionMark, Operator) \ - __ENUMERATE_JS_TOKEN(DoubleQuestionMarkEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Else, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Enum, Keyword) \ - __ENUMERATE_JS_TOKEN(Eof, Invalid) \ - __ENUMERATE_JS_TOKEN(Equals, Operator) \ - __ENUMERATE_JS_TOKEN(EqualsEquals, Operator) \ - __ENUMERATE_JS_TOKEN(EqualsEqualsEquals, Operator) \ - __ENUMERATE_JS_TOKEN(EscapedKeyword, Identifier) \ - __ENUMERATE_JS_TOKEN(ExclamationMark, Operator) \ - __ENUMERATE_JS_TOKEN(ExclamationMarkEquals, Operator) \ - __ENUMERATE_JS_TOKEN(ExclamationMarkEqualsEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Export, Keyword) \ - __ENUMERATE_JS_TOKEN(Extends, Keyword) \ - __ENUMERATE_JS_TOKEN(Finally, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(For, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Function, Keyword) \ - __ENUMERATE_JS_TOKEN(GreaterThan, Operator) \ - __ENUMERATE_JS_TOKEN(GreaterThanEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Identifier, Identifier) \ - __ENUMERATE_JS_TOKEN(If, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Implements, Keyword) \ - __ENUMERATE_JS_TOKEN(Import, Keyword) \ - __ENUMERATE_JS_TOKEN(In, Keyword) \ - __ENUMERATE_JS_TOKEN(Instanceof, Keyword) \ - __ENUMERATE_JS_TOKEN(Interface, Keyword) \ - __ENUMERATE_JS_TOKEN(Invalid, Invalid) \ - __ENUMERATE_JS_TOKEN(LessThan, Operator) \ - __ENUMERATE_JS_TOKEN(LessThanEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Let, Keyword) \ - __ENUMERATE_JS_TOKEN(Minus, Operator) \ - __ENUMERATE_JS_TOKEN(MinusEquals, Operator) \ - __ENUMERATE_JS_TOKEN(MinusMinus, Operator) \ - __ENUMERATE_JS_TOKEN(New, Keyword) \ - __ENUMERATE_JS_TOKEN(NullLiteral, Keyword) \ - __ENUMERATE_JS_TOKEN(NumericLiteral, Number) \ - __ENUMERATE_JS_TOKEN(Package, Keyword) \ - __ENUMERATE_JS_TOKEN(ParenClose, Punctuation) \ - __ENUMERATE_JS_TOKEN(ParenOpen, Punctuation) \ - __ENUMERATE_JS_TOKEN(Percent, Operator) \ - __ENUMERATE_JS_TOKEN(PercentEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Period, Operator) \ - __ENUMERATE_JS_TOKEN(Pipe, Operator) \ - __ENUMERATE_JS_TOKEN(PipeEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Plus, Operator) \ - __ENUMERATE_JS_TOKEN(PlusEquals, Operator) \ - __ENUMERATE_JS_TOKEN(PlusPlus, Operator) \ - __ENUMERATE_JS_TOKEN(Private, Keyword) \ - __ENUMERATE_JS_TOKEN(PrivateIdentifier, Identifier) \ - __ENUMERATE_JS_TOKEN(Protected, Keyword) \ - __ENUMERATE_JS_TOKEN(Public, Keyword) \ - __ENUMERATE_JS_TOKEN(QuestionMark, Operator) \ - __ENUMERATE_JS_TOKEN(QuestionMarkPeriod, Operator) \ - __ENUMERATE_JS_TOKEN(RegexFlags, String) \ - __ENUMERATE_JS_TOKEN(RegexLiteral, String) \ - __ENUMERATE_JS_TOKEN(Return, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Semicolon, Punctuation) \ - __ENUMERATE_JS_TOKEN(ShiftLeft, Operator) \ - __ENUMERATE_JS_TOKEN(ShiftLeftEquals, Operator) \ - __ENUMERATE_JS_TOKEN(ShiftRight, Operator) \ - __ENUMERATE_JS_TOKEN(ShiftRightEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Slash, Operator) \ - __ENUMERATE_JS_TOKEN(SlashEquals, Operator) \ - __ENUMERATE_JS_TOKEN(Static, Keyword) \ - __ENUMERATE_JS_TOKEN(StringLiteral, String) \ - __ENUMERATE_JS_TOKEN(Super, Keyword) \ - __ENUMERATE_JS_TOKEN(Switch, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(TemplateLiteralEnd, String) \ - __ENUMERATE_JS_TOKEN(TemplateLiteralExprEnd, Punctuation) \ - __ENUMERATE_JS_TOKEN(TemplateLiteralExprStart, Punctuation) \ - __ENUMERATE_JS_TOKEN(TemplateLiteralStart, String) \ - __ENUMERATE_JS_TOKEN(TemplateLiteralString, String) \ - __ENUMERATE_JS_TOKEN(This, Keyword) \ - __ENUMERATE_JS_TOKEN(Throw, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Tilde, Operator) \ - __ENUMERATE_JS_TOKEN(TripleDot, Operator) \ - __ENUMERATE_JS_TOKEN(Trivia, Trivia) \ - __ENUMERATE_JS_TOKEN(Try, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Typeof, Keyword) \ - __ENUMERATE_JS_TOKEN(UnsignedShiftRight, Operator) \ - __ENUMERATE_JS_TOKEN(UnsignedShiftRightEquals, Operator) \ - __ENUMERATE_JS_TOKEN(UnterminatedRegexLiteral, String) \ - __ENUMERATE_JS_TOKEN(UnterminatedStringLiteral, String) \ - __ENUMERATE_JS_TOKEN(UnterminatedTemplateLiteral, String) \ - __ENUMERATE_JS_TOKEN(Var, Keyword) \ - __ENUMERATE_JS_TOKEN(Void, Keyword) \ - __ENUMERATE_JS_TOKEN(While, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(With, ControlKeyword) \ - __ENUMERATE_JS_TOKEN(Yield, ControlKeyword) - -enum class TokenType { -#define __ENUMERATE_JS_TOKEN(type, category) type, - ENUMERATE_JS_TOKENS -#undef __ENUMERATE_JS_TOKEN - _COUNT_OF_TOKENS -}; -constexpr size_t cs_num_of_js_tokens = static_cast(TokenType::_COUNT_OF_TOKENS); - -enum class TokenCategory { +enum class TokenCategory : u8 { Invalid, Trivia, Number, @@ -172,91 +25,151 @@ enum class TokenCategory { Operator, Keyword, ControlKeyword, - Identifier + Identifier, }; -class JS_API Token { -public: - enum class Message { - None, - StartOfPrivateNameNotFollowedByValidIdentifier, - InvalidNumericLiteral, - UnterminatedMultiLineComment, - }; - - Token() = default; - - Token(TokenType type, Message message, Utf16View const& trivia, Utf16View const& value, size_t line_number, size_t line_column, size_t offset) - : m_type(type) - , m_message(message) - , m_trivia(trivia) - , m_original_value(value) - , m_value(value) - , m_line_number(line_number) - , m_line_column(line_column) - , m_offset(offset) - { - } - - TokenType type() const { return m_type; } - TokenCategory category() const; - static TokenCategory category(TokenType); - char const* name() const; - static char const* name(TokenType); - - String message() const; - Utf16View const& trivia() const { return m_trivia; } - Utf16View const& original_value() const { return m_original_value; } - - Utf16View value() const - { - return m_value.visit( - [](Utf16View const& view) { return view; }, - [](Utf16FlyString const& identifier) { return identifier.view(); }, - [](Empty) -> Utf16View { VERIFY_NOT_REACHED(); }); - } - - Utf16FlyString fly_string_value() const - { - return m_value.visit( - [](Utf16View const& view) { return Utf16FlyString::from_utf16(view); }, - [](Utf16FlyString const& identifier) { return identifier; }, - [](Empty) -> Utf16FlyString { VERIFY_NOT_REACHED(); }); - } - - u32 line_number() const { return m_line_number; } - u32 line_column() const { return m_line_column; } - u32 offset() const { return m_offset; } - double double_value() const; - bool bool_value() const; - - enum class StringValueStatus { - Ok, - MalformedHexEscape, - MalformedUnicodeEscape, - UnicodeEscapeOverflow, - LegacyOctalEscapeSequence, - }; - Utf16String string_value(StringValueStatus& status) const; - Utf16String raw_template_value() const; - - void set_identifier_value(Utf16FlyString value) - { - m_value = move(value); - } - - bool is_identifier_name() const; - bool trivia_contains_line_terminator() const; - -private: - TokenType m_type { TokenType::Invalid }; - Message m_message { Message::None }; - Utf16View m_trivia; - Utf16View m_original_value; - Variant m_value; - u32 m_line_number { 0 }; - u32 m_line_column { 0 }; - u32 m_offset { 0 }; +// NB: Keep in sync with define_tokens! in token.rs. +// The order must be identical (alphabetical by variant name). +enum class TokenType : u8 { + Ampersand, + AmpersandEquals, + Arrow, + Asterisk, + AsteriskEquals, + Async, + Await, + BigIntLiteral, + BoolLiteral, + BracketClose, + BracketOpen, + Break, + Caret, + CaretEquals, + Case, + Catch, + Class, + Colon, + Comma, + Const, + Continue, + CurlyClose, + CurlyOpen, + Debugger, + Default, + Delete, + Do, + DoubleAmpersand, + DoubleAmpersandEquals, + DoubleAsterisk, + DoubleAsteriskEquals, + DoublePipe, + DoublePipeEquals, + DoubleQuestionMark, + DoubleQuestionMarkEquals, + Else, + Enum, + Eof, + Equals, + EqualsEquals, + EqualsEqualsEquals, + EscapedKeyword, + ExclamationMark, + ExclamationMarkEquals, + ExclamationMarkEqualsEquals, + Export, + Extends, + Finally, + For, + Function, + GreaterThan, + GreaterThanEquals, + Identifier, + If, + Implements, + Import, + In, + Instanceof, + Interface, + Invalid, + LessThan, + LessThanEquals, + Let, + Minus, + MinusEquals, + MinusMinus, + New, + NullLiteral, + NumericLiteral, + Package, + ParenClose, + ParenOpen, + Percent, + PercentEquals, + Period, + Pipe, + PipeEquals, + Plus, + PlusEquals, + PlusPlus, + Private, + PrivateIdentifier, + Protected, + Public, + QuestionMark, + QuestionMarkPeriod, + RegexFlags, + RegexLiteral, + Return, + Semicolon, + ShiftLeft, + ShiftLeftEquals, + ShiftRight, + ShiftRightEquals, + Slash, + SlashEquals, + Static, + StringLiteral, + Super, + Switch, + TemplateLiteralEnd, + TemplateLiteralExprEnd, + TemplateLiteralExprStart, + TemplateLiteralStart, + TemplateLiteralString, + This, + Throw, + Tilde, + TripleDot, + Trivia, + Try, + Typeof, + UnsignedShiftRight, + UnsignedShiftRightEquals, + UnterminatedRegexLiteral, + UnterminatedStringLiteral, + UnterminatedTemplateLiteral, + Var, + Void, + While, + With, + Yield, + _COUNT_OF_TOKENS, }; +// Pack token type and category into a u64 for span data storage. +inline u64 pack_token_data(TokenType type, TokenCategory category) +{ + return (static_cast(category) << 8) | static_cast(type); +} + +inline TokenType token_type_from_packed(u64 data) +{ + return static_cast(data & 0xFF); +} + +inline TokenCategory token_category_from_packed(u64 data) +{ + return static_cast((data >> 8) & 0xFF); +} + } diff --git a/Libraries/LibWebView/SourceHighlighter.cpp b/Libraries/LibWebView/SourceHighlighter.cpp index f4c810486eb..a548465ead8 100644 --- a/Libraries/LibWebView/SourceHighlighter.cpp +++ b/Libraries/LibWebView/SourceHighlighter.cpp @@ -201,7 +201,7 @@ StringView SourceHighlighterClient::class_for_token(u64 token_type) const }; auto class_for_js_token = [](u64 token_type) { - auto category = JS::Token::category(static_cast(token_type)); + auto category = JS::token_category_from_packed(token_type); switch (category) { case JS::TokenCategory::Invalid: return "invalid"sv; diff --git a/Utilities/js.cpp b/Utilities/js.cpp index e23583af4bf..212f440db2e 100644 --- a/Utilities/js.cpp +++ b/Utilities/js.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -25,8 +24,11 @@ #include #include #include +#include #include +#include #include +#include #include #include #include @@ -509,7 +511,8 @@ static ErrorOr read_next_piece() piece.append(line); piece.append('\n'); - auto lexer = JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line))); + + auto source_code = JS::SourceCode::create({}, Utf16String::from_utf8(line)); enum { NotInLabelOrObjectKey, @@ -517,38 +520,45 @@ static ErrorOr read_next_piece() InLabelOrObjectKey } label_state { NotInLabelOrObjectKey }; - for (JS::Token token = lexer.next(); token.type() != JS::TokenType::Eof; token = lexer.next()) { - switch (token.type()) { - case JS::TokenType::BracketOpen: - case JS::TokenType::CurlyOpen: - case JS::TokenType::ParenOpen: - label_state = NotInLabelOrObjectKey; - s_repl_line_level++; - break; - case JS::TokenType::BracketClose: - case JS::TokenType::CurlyClose: - case JS::TokenType::ParenClose: - label_state = NotInLabelOrObjectKey; - s_repl_line_level--; - break; + struct BracketState { + decltype(label_state)* label; + int* level; + } bracket_state { &label_state, &s_repl_line_level }; - case JS::TokenType::Identifier: - case JS::TokenType::StringLiteral: - if (label_state == NotInLabelOrObjectKey) - label_state = InLabelOrObjectKeyIdentifier; - else - label_state = NotInLabelOrObjectKey; - break; - case JS::TokenType::Colon: - if (label_state == InLabelOrObjectKeyIdentifier) - label_state = InLabelOrObjectKey; - else - label_state = NotInLabelOrObjectKey; - break; - default: - break; - } - } + JS::FFI::rust_tokenize(source_code->utf16_data(), source_code->length_in_code_units(), &bracket_state, + [](void* ctx, JS::FFI::FFIToken const* tok) { + auto& state = *static_cast(ctx); + auto type = static_cast(tok->token_type); + switch (type) { + case JS::TokenType::BracketOpen: + case JS::TokenType::CurlyOpen: + case JS::TokenType::ParenOpen: + *state.label = NotInLabelOrObjectKey; + (*state.level)++; + break; + case JS::TokenType::BracketClose: + case JS::TokenType::CurlyClose: + case JS::TokenType::ParenClose: + *state.label = NotInLabelOrObjectKey; + (*state.level)--; + break; + case JS::TokenType::Identifier: + case JS::TokenType::StringLiteral: + if (*state.label == NotInLabelOrObjectKey) + *state.label = InLabelOrObjectKeyIdentifier; + else + *state.label = NotInLabelOrObjectKey; + break; + case JS::TokenType::Colon: + if (*state.label == InLabelOrObjectKeyIdentifier) + *state.label = InLabelOrObjectKey; + else + *state.label = NotInLabelOrObjectKey; + break; + default: + break; + } + }); if (label_state == InLabelOrObjectKey) { // If there's a label or object literal key at the end of this line, @@ -618,63 +628,69 @@ static ErrorOr run_repl(bool gc_on_every_allocation, bool syntax_highlight) size_t open_indents = s_repl_line_level; auto line = editor.line(); - JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line))); - bool indenters_starting_line = true; - for (JS::Token token = lexer.next(); token.type() != JS::TokenType::Eof; token = lexer.next()) { - auto length = token.value().length_in_code_units(); - auto start = token.offset(); - auto end = start + length; - if (indenters_starting_line) { - if (token.type() != JS::TokenType::ParenClose && token.type() != JS::TokenType::BracketClose && token.type() != JS::TokenType::CurlyClose) { - indenters_starting_line = false; - } else { - --open_indents; - } - } + auto source_code = JS::SourceCode::create({}, Utf16String::from_utf8(line)); - switch (token.category()) { - case JS::TokenCategory::Invalid: - stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Red), Line::Style::Underline }); - break; - case JS::TokenCategory::Number: - stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Magenta) }); - break; - case JS::TokenCategory::String: - stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Green), Line::Style::Bold }); - break; - case JS::TokenCategory::Punctuation: - break; - case JS::TokenCategory::Operator: - break; - case JS::TokenCategory::Keyword: - switch (token.type()) { - case JS::TokenType::BoolLiteral: - case JS::TokenType::NullLiteral: - stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Yellow), Line::Style::Bold }); + struct HighlightState { + decltype(stylize)* stylize_fn; + size_t* open_indents; + bool indenters_starting_line { true }; + } highlight_state { &stylize, &open_indents }; + + JS::FFI::rust_tokenize(source_code->utf16_data(), source_code->length_in_code_units(), &highlight_state, + [](void* ctx, JS::FFI::FFIToken const* tok) { + auto& state = *static_cast(ctx); + auto type = static_cast(tok->token_type); + auto category = static_cast(tok->category); + auto start = static_cast(tok->offset); + auto end = start + tok->length; + if (type == JS::TokenType::Eof) + return; + + if (state.indenters_starting_line) { + if (type != JS::TokenType::ParenClose && type != JS::TokenType::BracketClose && type != JS::TokenType::CurlyClose) + state.indenters_starting_line = false; + else + --(*state.open_indents); + } + + switch (category) { + case JS::TokenCategory::Invalid: + (*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Red), Line::Style::Underline }); + break; + case JS::TokenCategory::Number: + (*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Magenta) }); + break; + case JS::TokenCategory::String: + (*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Green), Line::Style::Bold }); + break; + case JS::TokenCategory::Punctuation: + case JS::TokenCategory::Operator: + break; + case JS::TokenCategory::Keyword: + if (type == JS::TokenType::BoolLiteral || type == JS::TokenType::NullLiteral) + (*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Yellow), Line::Style::Bold }); + else + (*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Blue), Line::Style::Bold }); + break; + case JS::TokenCategory::ControlKeyword: + (*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Cyan), Line::Style::Italic }); + break; + case JS::TokenCategory::Identifier: + (*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::White), Line::Style::Bold }); break; default: - stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Blue), Line::Style::Bold }); break; } - break; - case JS::TokenCategory::ControlKeyword: - stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Cyan), Line::Style::Italic }); - break; - case JS::TokenCategory::Identifier: - stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::White), Line::Style::Bold }); - break; - default: - break; - } - } + }); editor.set_prompt(prompt_for_level(open_indents).release_value_but_fixme_should_propagate_errors().to_byte_string()); }; auto complete = [&realm, &global_environment](Line::Editor const& editor) -> Vector { auto line = editor.line(editor.cursor()); + auto source_code = JS::SourceCode::create({}, Utf16String::from_utf8(line)); + auto const& code_view = source_code->code_view(); - JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line))); enum { Initial, CompleteVariable, @@ -684,6 +700,15 @@ static ErrorOr run_repl(bool gc_on_every_allocation, bool syntax_highlight) Utf16FlyString variable_name; Utf16FlyString property_name; + bool last_token_has_trivia = false; + + struct CompleteState { + decltype(mode)* current_mode; + Utf16FlyString* variable_name; + Utf16FlyString* property_name; + bool* last_token_has_trivia; + Utf16View const* code_view; + } complete_state { &mode, &variable_name, &property_name, &last_token_has_trivia, &code_view }; // we're only going to complete either // - @@ -691,45 +716,48 @@ static ErrorOr run_repl(bool gc_on_every_allocation, bool syntax_highlight) // - .

// where N is the complete name of a variable and // P is part of the name of one of its properties - auto js_token = lexer.next(); - for (; js_token.type() != JS::TokenType::Eof; js_token = lexer.next()) { - switch (mode) { - case CompleteVariable: - switch (js_token.type()) { - case JS::TokenType::Period: - // ... - mode = CompleteNullProperty; - break; - default: - // not a dot, reset back to initial - mode = Initial; - break; + JS::FFI::rust_tokenize(source_code->utf16_data(), source_code->length_in_code_units(), &complete_state, + [](void* ctx, JS::FFI::FFIToken const* tok) { + auto& s = *static_cast(ctx); + auto type = static_cast(tok->token_type); + auto category = static_cast(tok->category); + if (type == JS::TokenType::Eof) { + *s.last_token_has_trivia = tok->trivia_length > 0; + return; } - break; - case CompleteNullProperty: - if (js_token.is_identifier_name()) { - // ... - mode = CompleteProperty; - property_name = js_token.fly_string_value(); - } else { - mode = Initial; - } - break; - case CompleteProperty: - // something came after the property access, reset to initial - case Initial: - if (js_token.type() == JS::TokenType::Identifier) { - // ...... - mode = CompleteVariable; - variable_name = js_token.fly_string_value(); - } else { - mode = Initial; - } - break; - } - } - bool last_token_has_trivia = !js_token.trivia().is_empty(); + auto token_value = [&]() { + return Utf16FlyString::from_utf16(s.code_view->substring_view(tok->offset, tok->length)); + }; + bool is_identifier_name = type != JS::TokenType::PrivateIdentifier + && (category == JS::TokenCategory::Identifier || category == JS::TokenCategory::Keyword || category == JS::TokenCategory::ControlKeyword); + + switch (*s.current_mode) { + case CompleteVariable: + if (type == JS::TokenType::Period) + *s.current_mode = CompleteNullProperty; + else + *s.current_mode = Initial; + break; + case CompleteNullProperty: + if (is_identifier_name) { + *s.current_mode = CompleteProperty; + *s.property_name = token_value(); + } else { + *s.current_mode = Initial; + } + break; + case CompleteProperty: + case Initial: + if (type == JS::TokenType::Identifier) { + *s.current_mode = CompleteVariable; + *s.variable_name = token_value(); + } else { + *s.current_mode = Initial; + } + break; + } + }); if (mode == CompleteNullProperty) { mode = CompleteProperty;