LibJS: Remove C++ lexer, use Rust tokenizer for syntax highlighting

Delete Lexer.cpp/h and Token.cpp, replacing all tokenization with a
new rust_tokenize() FFI function that calls back for each token.

Rewrite SyntaxHighlighter.cpp and js.cpp REPL to use the Rust
tokenizer. The token type and category enums in Token.h now mirror
the Rust definitions in token.rs.

Move is_syntax_character/is_whitespace/is_line_terminator helpers
into RegExpConstructor.cpp as static functions, since they were only
used there.
This commit is contained in:
Andreas Kling
2026-03-19 14:03:42 -05:00
committed by Andreas Kling
parent 8ec7e7c07c
commit 30f108ba36
Notes: github-actions[bot] 2026-03-20 02:56:36 +00:00
12 changed files with 470 additions and 1877 deletions

View File

@@ -18,7 +18,6 @@ set(SOURCES
Contrib/Test262/IsHTMLDDA.cpp
CyclicModule.cpp
Heap/Cell.cpp
Lexer.cpp
Module.cpp
ParserError.cpp
Print.cpp
@@ -265,7 +264,6 @@ set(SOURCES
SourceTextModule.cpp
SyntaxHighlighter.cpp
SyntheticModule.cpp
Token.cpp
)
generate_bytecode_def_derived()

File diff suppressed because it is too large Load Diff

View File

@@ -1,88 +0,0 @@
/*
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
* Copyright (c) 2020-2025, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/HashMap.h>
#include <AK/Utf16String.h>
#include <LibJS/Export.h>
#include <LibJS/SourceCode.h>
#include <LibJS/Token.h>
namespace JS {
// Hand-written JavaScript lexer over a UTF-16 source buffer.
// Produces one Token at a time via next(); callers pull tokens until Eof.
class JS_API Lexer {
public:
    // line_number/line_column seed the position bookkeeping so embedded
    // sources (e.g. inline scripts) can report offsets into the outer document.
    explicit Lexer(NonnullRefPtr<SourceCode const>, size_t line_number = 1, size_t line_column = 0);

    // These both advance the lexer and return a reference to the current token.
    Token const& next();
    // Re-lexes treating '/' as the start of a regex literal rather than a
    // division operator; the caller supplies the grammar context the lexer
    // cannot know on its own.
    Token const& force_slash_as_regex();

    [[nodiscard]] Token const& current_token() const { return m_current_token; }

    SourceCode const& source_code() const { return m_source_code; }
    Utf16View const& source() const { return m_source_code->code_view(); }
    Utf16String const& source_string() const { return m_source_code->code(); }
    String const& filename() const { return m_source_code->filename(); }

    // HTML-style comments (<!-- ... -->) are allowed by default; callers
    // (e.g. module parsing) can opt out.
    void disallow_html_comments() { m_allow_html_comments = false; }

private:
    void consume();
    bool consume_exponent();
    bool consume_octal_number();
    bool consume_hexadecimal_number();
    bool consume_binary_number();
    bool consume_decimal_number();

    u32 current_code_point() const;

    bool is_eof() const;
    bool is_line_terminator() const;
    bool is_whitespace() const;
    // These return the decoded code point and advance identifier_length when
    // the current position can extend an identifier.
    Optional<u32> is_identifier_unicode_escape(size_t& identifier_length) const;
    Optional<u32> is_identifier_start(size_t& identifier_length) const;
    Optional<u32> is_identifier_middle(size_t& identifier_length) const;
    bool is_line_comment_start(bool line_has_token_yet) const;
    bool is_block_comment_start() const;
    bool is_block_comment_end() const;
    bool is_numeric_literal_start() const;
    // Lookahead helpers matching 2/3/4 consecutive code units.
    bool match(char16_t, char16_t) const;
    bool match(char16_t, char16_t, char16_t) const;
    bool match(char16_t, char16_t, char16_t, char16_t) const;
    template<typename Callback>
    bool match_numeric_literal_separator_followed_by(Callback) const;
    bool slash_means_division() const;

    TokenType consume_regex_literal();

    NonnullRefPtr<SourceCode const> m_source_code;
    size_t m_position { 0 };
    Token m_current_token;
    char16_t m_current_code_unit { 0 };
    bool m_eof { false };
    bool m_regex_is_in_character_class { false };
    bool m_allow_html_comments { true };

    size_t m_line_number { 1 };
    size_t m_line_column { 0 };

    // Tracks nesting of template literals and their ${ ... } expressions so
    // '}' can be disambiguated from a template continuation.
    struct TemplateState {
        bool in_expr;
        u8 open_bracket_count;
    };
    Vector<TemplateState> m_template_states;

    static HashMap<Utf16FlyString, TokenType> s_keywords;
};

// Character-class helpers shared with other LibJS code (e.g. RegExp).
bool is_syntax_character(u32 code_point);
bool is_whitespace(u32 code_point);
bool is_line_terminator(u32 code_point);
}

View File

@@ -6,15 +6,35 @@
#include <AK/CharacterTypes.h>
#include <AK/Find.h>
#include <LibJS/Lexer.h>
#include <LibJS/Runtime/Error.h>
#include <LibJS/Runtime/GlobalObject.h>
#include <LibJS/Runtime/RegExpConstructor.h>
#include <LibJS/Runtime/RegExpObject.h>
#include <LibJS/Runtime/Value.h>
#include <LibUnicode/CharacterTypes.h>
namespace JS {
// SyntaxCharacter: the ASCII characters that carry special meaning in a
// regular expression pattern.
static bool is_syntax_character(u32 code_point)
{
    static constexpr auto syntax_characters = "^$\\.*+?()[]{}|"sv;
    if (!is_ascii(code_point))
        return false;
    return syntax_characters.contains(static_cast<char>(code_point));
}
// ECMAScript WhiteSpace: ASCII whitespace, NO-BREAK SPACE (U+00A0),
// ZERO WIDTH NO-BREAK SPACE (U+FEFF), or any Unicode Space_Separator.
static bool is_whitespace(u32 code_point)
{
    return is_ascii_space(code_point)
        || code_point == 0x00A0
        || code_point == 0xFEFF
        || Unicode::code_point_has_space_separator_general_category(code_point);
}
// ECMAScript LineTerminator: LF, CR, LINE SEPARATOR, or PARAGRAPH SEPARATOR.
static bool is_line_terminator(u32 code_point)
{
    switch (code_point) {
    case '\n':
    case '\r':
    case 0x2028: // LINE SEPARATOR
    case 0x2029: // PARAGRAPH SEPARATOR
        return true;
    default:
        return false;
    }
}
GC_DEFINE_ALLOCATOR(RegExpConstructor);
RegExpConstructor::RegExpConstructor(Realm& realm)
@@ -164,7 +184,7 @@ static String encode_for_regexp_escape(u32 code_point)
});
// 1. If c is matched by SyntaxCharacter or c is U+002F (SOLIDUS), then
if (JS::is_syntax_character(code_point) || code_point == '/') {
if (is_syntax_character(code_point) || code_point == '/') {
// a. Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and UTF16EncodeCodePoint(c).
return MUST(String::formatted("\\{}", String::from_code_point(code_point)));
}
@@ -186,7 +206,7 @@ static String encode_for_regexp_escape(u32 code_point)
// 5. If toEscape contains c, c is matched by either WhiteSpace or LineTerminator, or c has the same numeric value
// as a leading surrogate or trailing surrogate, then
if (to_escape.contains(code_point) || JS::is_whitespace(code_point) || JS::is_line_terminator(code_point) || is_unicode_surrogate(code_point)) {
if (to_escape.contains(code_point) || is_whitespace(code_point) || is_line_terminator(code_point) || is_unicode_surrogate(code_point)) {
// a. Let cNum be the numeric value of c.
// b. If cNum ≤ 0xFF, then
if (code_point <= 0xFF) {

View File

@@ -14,7 +14,7 @@ sys_includes = ["stdint.h", "stddef.h"]
usize_is_size_t = true
[export]
include = ["ConstantTag", "LiteralValueKind", "WellKnownSymbolKind"]
include = ["ConstantTag", "LiteralValueKind", "WellKnownSymbolKind", "FFIToken"]
[export.mangle]
rename_types = "PascalCase"

View File

@@ -2646,3 +2646,53 @@ unsafe extern "C" {
contains_direct_call_to_eval: bool,
);
}
/// C-compatible token info for the tokenize callback.
#[repr(C)]
pub struct FFIToken {
    /// Discriminant of `token::TokenType` (mirrored by the C++ `TokenType` enum).
    pub token_type: u8,
    /// Discriminant of `token::TokenCategory` (mirrored by the C++ `TokenCategory` enum).
    pub category: u8,
    /// Start of the token's value, as an offset into the UTF-16 source.
    pub offset: u32,
    /// Length of the token's value, in UTF-16 code units.
    pub length: u32,
    /// Start of the leading trivia (whitespace/comments) preceding the token.
    pub trivia_offset: u32,
    /// Length of the leading trivia, in UTF-16 code units.
    pub trivia_length: u32,
}
/// Tokenize a UTF-16 source string, calling `callback` for each token.
///
/// The callback fires once per token, including the final `Eof` token,
/// after which tokenization stops.
///
/// # Safety
/// - `source` must point to a valid UTF-16 buffer of `source_len` elements.
/// - `callback` must be a valid function pointer.
/// - `ctx` is passed through to the callback.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn rust_tokenize(
    source: *const u16,
    source_len: usize,
    ctx: *mut c_void,
    callback: unsafe extern "C" fn(ctx: *mut c_void, token: *const FFIToken),
) {
    unsafe {
        abort_on_panic(|| {
            // If the raw buffer cannot be turned into a source slice
            // (presumably a null pointer case — see source_from_raw), emit
            // no tokens at all.
            let Some(source_slice) = source_from_raw(source, source_len) else {
                return;
            };
            let mut lex = lexer::Lexer::new(source_slice, 1, 0);
            loop {
                let tok = lex.next();
                // Note EOF before building the FFI struct so we can still
                // deliver the Eof token itself to the callback.
                let is_eof = tok.token_type == token::TokenType::Eof;
                let ffi_tok = FFIToken {
                    token_type: tok.token_type as u8,
                    category: tok.token_type.category() as u8,
                    offset: tok.value_start,
                    length: tok.value_len,
                    trivia_offset: tok.trivia_start,
                    trivia_length: tok.trivia_len,
                };
                // The struct lives on our stack; the callback must not
                // retain the pointer past the call.
                callback(ctx, &raw const ffi_tok);
                if is_eof {
                    break;
                }
            }
        });
    }
}

View File

@@ -7,6 +7,7 @@
//! Token types and Token struct for the lexer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum TokenCategory {
Invalid,
Trivia,
@@ -25,6 +26,7 @@ pub enum TokenCategory {
macro_rules! define_tokens {
( $( $variant:ident => $category:ident ),* $(,)? ) => {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum TokenType {
$( $variant, )*
}

View File

@@ -6,16 +6,21 @@
*/
#include <AK/Debug.h>
#include <AK/Utf16String.h>
#include <LibGfx/Palette.h>
#include <LibJS/Lexer.h>
#include <LibJS/SourceCode.h>
#include <LibJS/SyntaxHighlighter.h>
#include <LibJS/Token.h>
#ifdef ENABLE_RUST
# include <LibJS/RustFFI.h>
#endif
namespace JS {
static Gfx::TextAttributes style_for_token_type(Gfx::Palette const& palette, TokenType type)
static Gfx::TextAttributes style_for_token_category(Gfx::Palette const& palette, TokenCategory category)
{
switch (Token::category(type)) {
switch (category) {
case TokenCategory::Invalid:
return { palette.syntax_comment() };
case TokenCategory::Number:
@@ -39,8 +44,7 @@ static Gfx::TextAttributes style_for_token_type(Gfx::Palette const& palette, Tok
bool SyntaxHighlighter::is_identifier(u64 token) const
{
auto js_token = static_cast<TokenType>(static_cast<size_t>(token));
return js_token == TokenType::Identifier;
return token_type_from_packed(token) == TokenType::Identifier;
}
bool SyntaxHighlighter::is_navigatable([[maybe_unused]] u64 token) const
@@ -48,80 +52,103 @@ bool SyntaxHighlighter::is_navigatable([[maybe_unused]] u64 token) const
return false;
}
// Mutable state threaded through the rust_tokenize() C callback while
// rebuilding the highlight spans and folding regions for a document.
struct RehighlightState {
    Gfx::Palette const& palette;
    Vector<Syntax::TextDocumentSpan>& spans;
    Vector<Syntax::TextDocumentFoldingRegion>& folding_regions;
    // UTF-16 code units of the document being highlighted.
    u16 const* source;
    // Current line/column; advanced as each token's trivia and value are consumed.
    Syntax::TextPosition position { 0, 0 };

    struct FoldStart {
        Syntax::TextRange range;
    };
    // Stack of '{' tokens still waiting for their matching '}'.
    Vector<FoldStart> folding_region_starts;
};
// Advances `position` across source[start .. start + len), starting a new
// line (column 0) at every '\n' and otherwise bumping the column by one
// per UTF-16 code unit.
static void advance_position(Syntax::TextPosition& position, u16 const* source, u32 start, u32 len)
{
    auto line = position.line();
    auto column = position.column();
    for (u32 index = start; index < start + len; ++index) {
        if (source[index] == '\n') {
            ++line;
            column = 0;
        } else {
            ++column;
        }
    }
    position.set_line(line);
    position.set_column(column);
}
// Callback invoked by rust_tokenize() once per token: appends a highlight
// span for the token's leading trivia (if any) and one for its value, and
// records matched {} pairs as folding regions.
static void on_token(void* ctx, FFI::FFIToken const* ffi_token)
{
    auto& state = *static_cast<RehighlightState*>(ctx);
    auto token_type = static_cast<TokenType>(ffi_token->token_type);
    auto category = static_cast<TokenCategory>(ffi_token->category);

    // Emit trivia span
    if (ffi_token->trivia_length > 0) {
        auto trivia_start = state.position;
        advance_position(state.position, state.source, ffi_token->trivia_offset, ffi_token->trivia_length);

        Syntax::TextDocumentSpan span;
        span.range.set_start(trivia_start);
        span.range.set_end({ state.position.line(), state.position.column() });
        span.attributes = style_for_token_category(state.palette, TokenCategory::Trivia);
        span.is_skippable = true;
        span.data = pack_token_data(TokenType::Trivia, TokenCategory::Trivia);
        state.spans.append(span);
    }

    // Emit token span
    // NB: token_start is captured even for zero-length tokens (e.g. Eof) so
    // the folding-region logic below sees a valid position.
    auto token_start = state.position;
    if (ffi_token->length > 0) {
        advance_position(state.position, state.source, ffi_token->offset, ffi_token->length);

        Syntax::TextDocumentSpan span;
        span.range.set_start(token_start);
        span.range.set_end({ state.position.line(), state.position.column() });
        span.attributes = style_for_token_category(state.palette, category);
        span.is_skippable = false;
        span.data = pack_token_data(token_type, category);
        state.spans.append(span);
    }

    // Track folding regions for {} blocks
    if (token_type == TokenType::CurlyOpen) {
        state.folding_region_starts.append({ .range = { token_start, state.position } });
    } else if (token_type == TokenType::CurlyClose) {
        // An unbalanced '}' with no pending '{' is simply ignored.
        if (!state.folding_region_starts.is_empty()) {
            auto curly_open = state.folding_region_starts.take_last();
            Syntax::TextDocumentFoldingRegion region;
            region.range.set_start(curly_open.range.end());
            region.range.set_end(token_start);
            state.folding_regions.append(region);
        }
    }
}
void SyntaxHighlighter::rehighlight(Palette const& palette)
{
auto text = m_client->get_text();
Lexer lexer(SourceCode::create({}, Utf16String::from_utf8(text)));
auto source_utf16 = Utf16String::from_utf8(text);
auto source_code = SourceCode::create({}, move(source_utf16));
auto const* source_data = source_code->utf16_data();
auto source_len = source_code->length_in_code_units();
Vector<Syntax::TextDocumentSpan> spans;
Vector<Syntax::TextDocumentFoldingRegion> folding_regions;
Syntax::TextPosition position { 0, 0 };
Syntax::TextPosition start { 0, 0 };
auto advance_position = [&position](u32 code_point) {
if (code_point == '\n') {
position.set_line(position.line() + 1);
position.set_column(0);
} else
position.set_column(position.column() + 1);
RehighlightState state {
.palette = palette,
.spans = spans,
.folding_regions = folding_regions,
.source = source_data,
.position = {},
.folding_region_starts = {},
};
auto append_token = [&](Utf16View const& str, Token const& token, bool is_trivia) {
if (str.is_empty())
return;
start = position;
for (auto code_point : str)
advance_position(code_point);
Syntax::TextDocumentSpan span;
span.range.set_start(start);
span.range.set_end({ position.line(), position.column() });
auto type = is_trivia ? TokenType::Trivia : token.type();
span.attributes = style_for_token_type(palette, type);
span.is_skippable = is_trivia;
span.data = static_cast<u64>(type);
spans.append(span);
dbgln_if(SYNTAX_HIGHLIGHTING_DEBUG, "{}{} @ '{}' {}:{} - {}:{}",
token.name(),
is_trivia ? " (trivia)" : "",
token.value(),
span.range.start().line(), span.range.start().column(),
span.range.end().line(), span.range.end().column());
};
struct TokenData {
Token token;
Syntax::TextRange range;
};
Vector<TokenData> folding_region_start_tokens;
bool was_eof = false;
for (auto token = lexer.next(); !was_eof; token = lexer.next()) {
append_token(token.trivia(), token, true);
auto token_start_position = position;
append_token(token.value(), token, false);
if (token.type() == TokenType::Eof)
was_eof = true;
// Create folding regions for {} blocks
if (token.type() == TokenType::CurlyOpen) {
folding_region_start_tokens.append({ .token = token,
.range = { token_start_position, position } });
} else if (token.type() == TokenType::CurlyClose) {
if (!folding_region_start_tokens.is_empty()) {
auto curly_open = folding_region_start_tokens.take_last();
Syntax::TextDocumentFoldingRegion region;
region.range.set_start(curly_open.range.end());
region.range.set_end(token_start_position);
folding_regions.append(region);
}
}
}
#ifdef ENABLE_RUST
FFI::rust_tokenize(source_data, source_len, &state,
[](void* ctx, FFI::FFIToken const* token) { on_token(ctx, token); });
#else
(void)source_len;
#endif
m_client->do_set_spans(move(spans));
m_client->do_set_folding_regions(move(folding_regions));
@@ -136,9 +163,9 @@ Vector<Syntax::Highlighter::MatchingTokenPair> SyntaxHighlighter::matching_token
{
static Vector<Syntax::Highlighter::MatchingTokenPair> pairs;
if (pairs.is_empty()) {
pairs.append({ static_cast<u64>(TokenType::CurlyOpen), static_cast<u64>(TokenType::CurlyClose) });
pairs.append({ static_cast<u64>(TokenType::ParenOpen), static_cast<u64>(TokenType::ParenClose) });
pairs.append({ static_cast<u64>(TokenType::BracketOpen), static_cast<u64>(TokenType::BracketClose) });
pairs.append({ pack_token_data(TokenType::CurlyOpen, TokenCategory::Punctuation), pack_token_data(TokenType::CurlyClose, TokenCategory::Punctuation) });
pairs.append({ pack_token_data(TokenType::ParenOpen, TokenCategory::Punctuation), pack_token_data(TokenType::ParenClose, TokenCategory::Punctuation) });
pairs.append({ pack_token_data(TokenType::BracketOpen, TokenCategory::Punctuation), pack_token_data(TokenType::BracketClose, TokenCategory::Punctuation) });
}
return pairs;
}

View File

@@ -1,308 +0,0 @@
/*
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
* Copyright (c) 2020-2021, Linus Groh <linusg@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Assertions.h>
#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h>
#include <AK/StringBuilder.h>
#include <LibJS/Token.h>
namespace JS {
// Returns the human-readable name of a token type, e.g. "CurlyOpen".
// Generated from the ENUMERATE_JS_TOKENS X-macro so it stays in sync with
// the TokenType enum.
char const* Token::name(TokenType type)
{
    switch (type) {
#define __ENUMERATE_JS_TOKEN(type, category) \
    case TokenType::type:                    \
        return #type;
        ENUMERATE_JS_TOKENS
#undef __ENUMERATE_JS_TOKEN
    default:
        // The X-macro covers every enumerator; anything else is a corrupt value.
        VERIFY_NOT_REACHED();
        return "<Unknown>";
    }
}

// Convenience overload for this token's own type.
char const* Token::name() const
{
    return name(m_type);
}

// Maps a token type to its highlighting/classification category via the
// category column of the ENUMERATE_JS_TOKENS X-macro.
TokenCategory Token::category(TokenType type)
{
    switch (type) {
#define __ENUMERATE_JS_TOKEN(type, category) \
    case TokenType::type:                    \
        return TokenCategory::category;
        ENUMERATE_JS_TOKENS
#undef __ENUMERATE_JS_TOKEN
    default:
        VERIFY_NOT_REACHED();
    }
}

// Convenience overload for this token's own type.
TokenCategory Token::category() const
{
    return category(m_type);
}
// Computes the numeric value of a NumericLiteral token, handling the
// 0x/0o/0b radix prefixes, legacy octal literals, and '_' numeric separators.
double Token::double_value() const
{
    VERIFY(type() == TokenType::NumericLiteral);

    auto value = this->value();

    // Numeric separators ('_') are purely syntactic; strip them before
    // parsing. `buffer` owns the stripped copy while `value` views it.
    Utf16String buffer;
    if (value.contains('_')) {
        buffer = value.replace("_"sv, {}, ReplaceMode::All);
        value = buffer;
    }

    auto parse_integer_digits = [](Utf16View digits, u8 radix) -> double {
        // Fast path: the digits fit in a u64.
        if (auto v = digits.to_number<u64>(TrimWhitespace::No, radix); v.has_value())
            return static_cast<double>(v.value());

        // Fallback for literals that overflow u64: accumulate directly in a
        // double, accepting the precision loss inherent to the result type.
        double result = 0.0;
        for (size_t i = 0; i < digits.length_in_code_units(); ++i) {
            auto digit = parse_ascii_hex_digit(digits.code_unit_at(i));
            result = result * radix + digit;
        }
        return result;
    };

    // A leading '0' followed by another character selects a non-decimal radix.
    if (value.length_in_code_units() >= 2 && value.starts_with('0')) {
        auto next = value.code_unit_at(1);

        // hexadecimal
        if (next == 'x' || next == 'X')
            return parse_integer_digits(value.substring_view(2), 16);

        // octal
        if (next == 'o' || next == 'O')
            return parse_integer_digits(value.substring_view(2), 8);

        // binary
        if (next == 'b' || next == 'B')
            return parse_integer_digits(value.substring_view(2), 2);

        // also octal, but syntax error in strict mode
        if (is_ascii_digit(next) && !value.contains_any_of({ { '8', '9' } }))
            return parse_integer_digits(value.substring_view(1), 8);
    }

    // This should always be a valid double
    return value.to_number<double>(TrimWhitespace::No).value();
}
// Decodes the cooked value of a string or template-string token, resolving
// all escape sequences. On an encoding failure, sets `status` accordingly
// and returns an empty string; a LegacyOctalEscapeSequence status is
// reported via `status` but decoding continues.
Utf16String Token::string_value(StringValueStatus& status) const
{
    VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);

    auto is_template = type() == TokenType::TemplateLiteralString;
    auto value = this->value();

    // Plain string literals still carry their surrounding quotes; strip them.
    Utf16GenericLexer lexer(is_template ? value : value.substring_view(1, value.length_in_code_units() - 2));

    auto encoding_failure = [&status](StringValueStatus parse_status) -> Utf16String {
        status = parse_status;
        return {};
    };

    StringBuilder builder(StringBuilder::Mode::UTF16);
    while (!lexer.is_eof()) {
        // No escape, consume one char and continue
        if (!lexer.next_is('\\')) {
            // In templates, CR and CRLF are normalized to LF (TV semantics).
            if (is_template && lexer.next_is('\r')) {
                lexer.ignore();
                if (lexer.next_is('\n'))
                    lexer.ignore();
                builder.append('\n');
                continue;
            }
            builder.append_code_unit(lexer.consume());
            continue;
        }
        // Unicode escape
        if (lexer.next_is("\\u"sv)) {
            auto code_point_or_error = lexer.consume_escaped_code_point();
            if (code_point_or_error.is_error()) {
                switch (code_point_or_error.error()) {
                case AK::UnicodeEscapeError::MalformedUnicodeEscape:
                    return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
                case AK::UnicodeEscapeError::UnicodeEscapeOverflow:
                    return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
                }
            }
            builder.append_code_point(code_point_or_error.value());
            continue;
        }
        // Skip the backslash; everything below inspects the escaped character.
        lexer.ignore();
        VERIFY(!lexer.is_eof());
        // Line continuation
        if (lexer.next_is('\n') || lexer.next_is('\r')) {
            if (lexer.next_is("\r\n"sv))
                lexer.ignore();
            lexer.ignore();
            continue;
        }
        // Line continuation
        if (lexer.next_is(LINE_SEPARATOR) || lexer.next_is(PARAGRAPH_SEPARATOR)) {
            lexer.ignore();
            continue;
        }
        // Null-byte escape ('\0' not followed by a digit)
        if (lexer.next_is('0') && !is_ascii_digit(lexer.peek(1))) {
            lexer.ignore();
            builder.append('\0');
            continue;
        }
        // Hex escape ('\xNN', exactly two hex digits)
        if (lexer.next_is('x')) {
            lexer.ignore();
            if (!is_ascii_hex_digit(lexer.peek()) || !is_ascii_hex_digit(lexer.peek(1)))
                return encoding_failure(StringValueStatus::MalformedHexEscape);
            auto code_point = lexer.consume(2).to_number<u32>(TrimWhitespace::No, 16).value();
            VERIFY(code_point <= 255);
            builder.append_code_point(code_point);
            continue;
        }
        // In non-strict mode LegacyOctalEscapeSequence is allowed in strings:
        // https://tc39.es/ecma262/#sec-additional-syntax-string-literals
        Optional<Utf16View> octal_str;
        auto is_octal_digit = [](auto ch) { return ch >= '0' && ch <= '7'; };
        auto is_zero_to_three = [](auto ch) { return ch >= '0' && ch <= '3'; };
        auto is_four_to_seven = [](auto ch) { return ch >= '4' && ch <= '7'; };
        // OctalDigit [lookahead ∉ OctalDigit]
        if (is_octal_digit(lexer.peek()) && !is_octal_digit(lexer.peek(1)))
            octal_str = lexer.consume(1);
        // ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
        else if (is_zero_to_three(lexer.peek()) && is_octal_digit(lexer.peek(1)) && !is_octal_digit(lexer.peek(2)))
            octal_str = lexer.consume(2);
        // FourToSeven OctalDigit
        else if (is_four_to_seven(lexer.peek()) && is_octal_digit(lexer.peek(1)))
            octal_str = lexer.consume(2);
        // ZeroToThree OctalDigit OctalDigit
        else if (is_zero_to_three(lexer.peek()) && is_octal_digit(lexer.peek(1)) && is_octal_digit(lexer.peek(2)))
            octal_str = lexer.consume(3);
        if (octal_str.has_value()) {
            status = StringValueStatus::LegacyOctalEscapeSequence;
            auto code_point = octal_str->to_number<u32>(TrimWhitespace::No, 8).value();
            VERIFY(code_point <= 255);
            builder.append_code_point(code_point);
            continue;
        }
        // '\8' and '\9' are also legacy escapes; they decode to the digit itself.
        if (lexer.next_is('8') || lexer.next_is('9')) {
            status = StringValueStatus::LegacyOctalEscapeSequence;
            builder.append_code_unit(lexer.consume());
            continue;
        }
        // Simple escapes (\b \f \n \r \t \v) and identity escapes: re-read the
        // backslash and let consume_escaped_character map the pair.
        lexer.retreat();
        builder.append_code_unit(lexer.consume_escaped_character('\\', "b\bf\fn\nr\rt\tv\v"sv));
    }
    return builder.to_utf16_string();
}
// 12.8.6.2 Static Semantics: TRV, https://tc39.es/ecma262/#sec-static-semantics-trv
// The "raw" template value keeps escapes verbatim but still normalizes
// CRLF and lone CR to LF.
Utf16String Token::raw_template_value() const
{
    return value().replace("\r\n"sv, "\n"sv, ReplaceMode::All).replace("\r"sv, "\n"sv, ReplaceMode::All);
}

// A BoolLiteral's value text is either "true" or "false".
bool Token::bool_value() const
{
    VERIFY(type() == TokenType::BoolLiteral);
    return value() == "true"sv;
}
bool Token::is_identifier_name() const
{
    // IdentifierNames are Identifiers + ReservedWords
    // The standard defines this reversed: Identifiers are IdentifierNames except reserved words
    // https://tc39.es/ecma262/#prod-Identifier
    switch (m_type) {
    case TokenType::Identifier:
    case TokenType::EscapedKeyword:
    case TokenType::Await:
    case TokenType::Async:
    case TokenType::BoolLiteral:
    case TokenType::Break:
    case TokenType::Case:
    case TokenType::Catch:
    case TokenType::Class:
    case TokenType::Const:
    case TokenType::Continue:
    case TokenType::Debugger:
    case TokenType::Default:
    case TokenType::Delete:
    case TokenType::Do:
    case TokenType::Else:
    case TokenType::Enum:
    case TokenType::Export:
    case TokenType::Extends:
    case TokenType::Finally:
    case TokenType::For:
    case TokenType::Function:
    case TokenType::If:
    case TokenType::Import:
    case TokenType::In:
    case TokenType::Instanceof:
    case TokenType::Let:
    case TokenType::New:
    case TokenType::NullLiteral:
    case TokenType::Return:
    case TokenType::Super:
    case TokenType::Switch:
    case TokenType::This:
    case TokenType::Throw:
    case TokenType::Try:
    case TokenType::Typeof:
    case TokenType::Var:
    case TokenType::Void:
    case TokenType::While:
    case TokenType::With:
    case TokenType::Yield:
        return true;
    default:
        return false;
    }
}
// Returns true if any line terminator (LF, CR, LS, PS) appears in the
// trivia preceding this token. NOTE(review): presumably consulted for
// automatic-semicolon-insertion decisions — confirm at call sites.
bool Token::trivia_contains_line_terminator() const
{
    return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR) || m_trivia.contains(PARAGRAPH_SEPARATOR);
}

// Converts this token's diagnostic code into a user-facing error message;
// returns an empty string when there is no message.
String Token::message() const
{
    switch (m_message) {
    case Message::StartOfPrivateNameNotFollowedByValidIdentifier:
        return "Start of private name '#' but not followed by valid identifier"_string;
    case Message::InvalidNumericLiteral:
        return "Invalid numeric literal"_string;
    case Message::UnterminatedMultiLineComment:
        return "Unterminated multi-line comment"_string;
    case Message::None:
        return {};
    }
    VERIFY_NOT_REACHED();
    return {};
}
}

View File

@@ -1,169 +1,22 @@
/*
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
* Copyright (c) 2026-present, the Ladybird developers.
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Utf16FlyString.h>
#include <AK/Utf16String.h>
#include <AK/Variant.h>
#include <LibJS/Export.h>
#include <AK/Types.h>
namespace JS {
// U+00A0 NO BREAK SPACE
constexpr inline char16_t const NO_BREAK_SPACE { 0x00A0 };
constexpr u32 LINE_SEPARATOR = 0x2028;
constexpr u32 PARAGRAPH_SEPARATOR = 0x2029;
// U+200C ZERO WIDTH NON-JOINER
constexpr inline char16_t const ZERO_WIDTH_NON_JOINER { 0x200C };
// NB: These enums must match the Rust token::TokenCategory and
// token::TokenType enums in Libraries/LibJS/Rust/src/token.rs.
// U+200D ZERO WIDTH JOINER
constexpr inline char16_t const ZERO_WIDTH_JOINER { 0x200D };
// U+2028 LINE SEPARATOR
constexpr inline char16_t const LINE_SEPARATOR { 0x2028 };
// U+2029 PARAGRAPH SEPARATOR
constexpr inline char16_t const PARAGRAPH_SEPARATOR { 0x2029 };
// U+FEFF ZERO WIDTH NO-BREAK SPACE
constexpr inline char16_t const ZERO_WIDTH_NO_BREAK_SPACE { 0xFEFF };
#define ENUMERATE_JS_TOKENS \
__ENUMERATE_JS_TOKEN(Ampersand, Operator) \
__ENUMERATE_JS_TOKEN(AmpersandEquals, Operator) \
__ENUMERATE_JS_TOKEN(Arrow, Operator) \
__ENUMERATE_JS_TOKEN(Asterisk, Operator) \
__ENUMERATE_JS_TOKEN(AsteriskEquals, Operator) \
__ENUMERATE_JS_TOKEN(Async, Keyword) \
__ENUMERATE_JS_TOKEN(Await, Keyword) \
__ENUMERATE_JS_TOKEN(BigIntLiteral, Number) \
__ENUMERATE_JS_TOKEN(BoolLiteral, Keyword) \
__ENUMERATE_JS_TOKEN(BracketClose, Punctuation) \
__ENUMERATE_JS_TOKEN(BracketOpen, Punctuation) \
__ENUMERATE_JS_TOKEN(Break, Keyword) \
__ENUMERATE_JS_TOKEN(Caret, Operator) \
__ENUMERATE_JS_TOKEN(CaretEquals, Operator) \
__ENUMERATE_JS_TOKEN(Case, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Catch, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Class, Keyword) \
__ENUMERATE_JS_TOKEN(Colon, Punctuation) \
__ENUMERATE_JS_TOKEN(Comma, Punctuation) \
__ENUMERATE_JS_TOKEN(Const, Keyword) \
__ENUMERATE_JS_TOKEN(Continue, ControlKeyword) \
__ENUMERATE_JS_TOKEN(CurlyClose, Punctuation) \
__ENUMERATE_JS_TOKEN(CurlyOpen, Punctuation) \
__ENUMERATE_JS_TOKEN(Debugger, Keyword) \
__ENUMERATE_JS_TOKEN(Default, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Delete, Keyword) \
__ENUMERATE_JS_TOKEN(Do, ControlKeyword) \
__ENUMERATE_JS_TOKEN(DoubleAmpersand, Operator) \
__ENUMERATE_JS_TOKEN(DoubleAmpersandEquals, Operator) \
__ENUMERATE_JS_TOKEN(DoubleAsterisk, Operator) \
__ENUMERATE_JS_TOKEN(DoubleAsteriskEquals, Operator) \
__ENUMERATE_JS_TOKEN(DoublePipe, Operator) \
__ENUMERATE_JS_TOKEN(DoublePipeEquals, Operator) \
__ENUMERATE_JS_TOKEN(DoubleQuestionMark, Operator) \
__ENUMERATE_JS_TOKEN(DoubleQuestionMarkEquals, Operator) \
__ENUMERATE_JS_TOKEN(Else, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Enum, Keyword) \
__ENUMERATE_JS_TOKEN(Eof, Invalid) \
__ENUMERATE_JS_TOKEN(Equals, Operator) \
__ENUMERATE_JS_TOKEN(EqualsEquals, Operator) \
__ENUMERATE_JS_TOKEN(EqualsEqualsEquals, Operator) \
__ENUMERATE_JS_TOKEN(EscapedKeyword, Identifier) \
__ENUMERATE_JS_TOKEN(ExclamationMark, Operator) \
__ENUMERATE_JS_TOKEN(ExclamationMarkEquals, Operator) \
__ENUMERATE_JS_TOKEN(ExclamationMarkEqualsEquals, Operator) \
__ENUMERATE_JS_TOKEN(Export, Keyword) \
__ENUMERATE_JS_TOKEN(Extends, Keyword) \
__ENUMERATE_JS_TOKEN(Finally, ControlKeyword) \
__ENUMERATE_JS_TOKEN(For, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Function, Keyword) \
__ENUMERATE_JS_TOKEN(GreaterThan, Operator) \
__ENUMERATE_JS_TOKEN(GreaterThanEquals, Operator) \
__ENUMERATE_JS_TOKEN(Identifier, Identifier) \
__ENUMERATE_JS_TOKEN(If, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Implements, Keyword) \
__ENUMERATE_JS_TOKEN(Import, Keyword) \
__ENUMERATE_JS_TOKEN(In, Keyword) \
__ENUMERATE_JS_TOKEN(Instanceof, Keyword) \
__ENUMERATE_JS_TOKEN(Interface, Keyword) \
__ENUMERATE_JS_TOKEN(Invalid, Invalid) \
__ENUMERATE_JS_TOKEN(LessThan, Operator) \
__ENUMERATE_JS_TOKEN(LessThanEquals, Operator) \
__ENUMERATE_JS_TOKEN(Let, Keyword) \
__ENUMERATE_JS_TOKEN(Minus, Operator) \
__ENUMERATE_JS_TOKEN(MinusEquals, Operator) \
__ENUMERATE_JS_TOKEN(MinusMinus, Operator) \
__ENUMERATE_JS_TOKEN(New, Keyword) \
__ENUMERATE_JS_TOKEN(NullLiteral, Keyword) \
__ENUMERATE_JS_TOKEN(NumericLiteral, Number) \
__ENUMERATE_JS_TOKEN(Package, Keyword) \
__ENUMERATE_JS_TOKEN(ParenClose, Punctuation) \
__ENUMERATE_JS_TOKEN(ParenOpen, Punctuation) \
__ENUMERATE_JS_TOKEN(Percent, Operator) \
__ENUMERATE_JS_TOKEN(PercentEquals, Operator) \
__ENUMERATE_JS_TOKEN(Period, Operator) \
__ENUMERATE_JS_TOKEN(Pipe, Operator) \
__ENUMERATE_JS_TOKEN(PipeEquals, Operator) \
__ENUMERATE_JS_TOKEN(Plus, Operator) \
__ENUMERATE_JS_TOKEN(PlusEquals, Operator) \
__ENUMERATE_JS_TOKEN(PlusPlus, Operator) \
__ENUMERATE_JS_TOKEN(Private, Keyword) \
__ENUMERATE_JS_TOKEN(PrivateIdentifier, Identifier) \
__ENUMERATE_JS_TOKEN(Protected, Keyword) \
__ENUMERATE_JS_TOKEN(Public, Keyword) \
__ENUMERATE_JS_TOKEN(QuestionMark, Operator) \
__ENUMERATE_JS_TOKEN(QuestionMarkPeriod, Operator) \
__ENUMERATE_JS_TOKEN(RegexFlags, String) \
__ENUMERATE_JS_TOKEN(RegexLiteral, String) \
__ENUMERATE_JS_TOKEN(Return, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Semicolon, Punctuation) \
__ENUMERATE_JS_TOKEN(ShiftLeft, Operator) \
__ENUMERATE_JS_TOKEN(ShiftLeftEquals, Operator) \
__ENUMERATE_JS_TOKEN(ShiftRight, Operator) \
__ENUMERATE_JS_TOKEN(ShiftRightEquals, Operator) \
__ENUMERATE_JS_TOKEN(Slash, Operator) \
__ENUMERATE_JS_TOKEN(SlashEquals, Operator) \
__ENUMERATE_JS_TOKEN(Static, Keyword) \
__ENUMERATE_JS_TOKEN(StringLiteral, String) \
__ENUMERATE_JS_TOKEN(Super, Keyword) \
__ENUMERATE_JS_TOKEN(Switch, ControlKeyword) \
__ENUMERATE_JS_TOKEN(TemplateLiteralEnd, String) \
__ENUMERATE_JS_TOKEN(TemplateLiteralExprEnd, Punctuation) \
__ENUMERATE_JS_TOKEN(TemplateLiteralExprStart, Punctuation) \
__ENUMERATE_JS_TOKEN(TemplateLiteralStart, String) \
__ENUMERATE_JS_TOKEN(TemplateLiteralString, String) \
__ENUMERATE_JS_TOKEN(This, Keyword) \
__ENUMERATE_JS_TOKEN(Throw, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Tilde, Operator) \
__ENUMERATE_JS_TOKEN(TripleDot, Operator) \
__ENUMERATE_JS_TOKEN(Trivia, Trivia) \
__ENUMERATE_JS_TOKEN(Try, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Typeof, Keyword) \
__ENUMERATE_JS_TOKEN(UnsignedShiftRight, Operator) \
__ENUMERATE_JS_TOKEN(UnsignedShiftRightEquals, Operator) \
__ENUMERATE_JS_TOKEN(UnterminatedRegexLiteral, String) \
__ENUMERATE_JS_TOKEN(UnterminatedStringLiteral, String) \
__ENUMERATE_JS_TOKEN(UnterminatedTemplateLiteral, String) \
__ENUMERATE_JS_TOKEN(Var, Keyword) \
__ENUMERATE_JS_TOKEN(Void, Keyword) \
__ENUMERATE_JS_TOKEN(While, ControlKeyword) \
__ENUMERATE_JS_TOKEN(With, ControlKeyword) \
__ENUMERATE_JS_TOKEN(Yield, ControlKeyword)
enum class TokenType {
#define __ENUMERATE_JS_TOKEN(type, category) type,
ENUMERATE_JS_TOKENS
#undef __ENUMERATE_JS_TOKEN
_COUNT_OF_TOKENS
};
constexpr size_t cs_num_of_js_tokens = static_cast<size_t>(TokenType::_COUNT_OF_TOKENS);
enum class TokenCategory {
enum class TokenCategory : u8 {
Invalid,
Trivia,
Number,
@@ -172,91 +25,151 @@ enum class TokenCategory {
Operator,
Keyword,
ControlKeyword,
Identifier
Identifier,
};
class JS_API Token {
public:
enum class Message {
None,
StartOfPrivateNameNotFollowedByValidIdentifier,
InvalidNumericLiteral,
UnterminatedMultiLineComment,
};
Token() = default;
Token(TokenType type, Message message, Utf16View const& trivia, Utf16View const& value, size_t line_number, size_t line_column, size_t offset)
: m_type(type)
, m_message(message)
, m_trivia(trivia)
, m_original_value(value)
, m_value(value)
, m_line_number(line_number)
, m_line_column(line_column)
, m_offset(offset)
{
}
TokenType type() const { return m_type; }
TokenCategory category() const;
static TokenCategory category(TokenType);
char const* name() const;
static char const* name(TokenType);
String message() const;
Utf16View const& trivia() const { return m_trivia; }
Utf16View const& original_value() const { return m_original_value; }
Utf16View value() const
{
return m_value.visit(
[](Utf16View const& view) { return view; },
[](Utf16FlyString const& identifier) { return identifier.view(); },
[](Empty) -> Utf16View { VERIFY_NOT_REACHED(); });
}
Utf16FlyString fly_string_value() const
{
return m_value.visit(
[](Utf16View const& view) { return Utf16FlyString::from_utf16(view); },
[](Utf16FlyString const& identifier) { return identifier; },
[](Empty) -> Utf16FlyString { VERIFY_NOT_REACHED(); });
}
u32 line_number() const { return m_line_number; }
u32 line_column() const { return m_line_column; }
u32 offset() const { return m_offset; }
double double_value() const;
bool bool_value() const;
enum class StringValueStatus {
Ok,
MalformedHexEscape,
MalformedUnicodeEscape,
UnicodeEscapeOverflow,
LegacyOctalEscapeSequence,
};
Utf16String string_value(StringValueStatus& status) const;
Utf16String raw_template_value() const;
void set_identifier_value(Utf16FlyString value)
{
m_value = move(value);
}
bool is_identifier_name() const;
bool trivia_contains_line_terminator() const;
private:
TokenType m_type { TokenType::Invalid };
Message m_message { Message::None };
Utf16View m_trivia;
Utf16View m_original_value;
Variant<Empty, Utf16View, Utf16FlyString> m_value;
u32 m_line_number { 0 };
u32 m_line_column { 0 };
u32 m_offset { 0 };
// Token types produced by the Rust tokenizer and consumed on the C++ side.
// NB: Keep in sync with define_tokens! in token.rs.
// The order must be identical (alphabetical by variant name), since values
// cross the FFI boundary as raw integers.
enum class TokenType : u8 {
Ampersand,
AmpersandEquals,
Arrow,
Asterisk,
AsteriskEquals,
Async,
Await,
BigIntLiteral,
BoolLiteral,
BracketClose,
BracketOpen,
Break,
Caret,
CaretEquals,
Case,
Catch,
Class,
Colon,
Comma,
Const,
Continue,
CurlyClose,
CurlyOpen,
Debugger,
Default,
Delete,
Do,
DoubleAmpersand,
DoubleAmpersandEquals,
DoubleAsterisk,
DoubleAsteriskEquals,
DoublePipe,
DoublePipeEquals,
DoubleQuestionMark,
DoubleQuestionMarkEquals,
Else,
Enum,
Eof,
Equals,
EqualsEquals,
EqualsEqualsEquals,
EscapedKeyword,
ExclamationMark,
ExclamationMarkEquals,
ExclamationMarkEqualsEquals,
Export,
Extends,
Finally,
For,
Function,
GreaterThan,
GreaterThanEquals,
Identifier,
If,
Implements,
Import,
In,
Instanceof,
Interface,
Invalid,
LessThan,
LessThanEquals,
Let,
Minus,
MinusEquals,
MinusMinus,
New,
NullLiteral,
NumericLiteral,
Package,
ParenClose,
ParenOpen,
Percent,
PercentEquals,
Period,
Pipe,
PipeEquals,
Plus,
PlusEquals,
PlusPlus,
Private,
PrivateIdentifier,
Protected,
Public,
QuestionMark,
QuestionMarkPeriod,
RegexFlags,
RegexLiteral,
Return,
Semicolon,
ShiftLeft,
ShiftLeftEquals,
ShiftRight,
ShiftRightEquals,
Slash,
SlashEquals,
Static,
StringLiteral,
Super,
Switch,
TemplateLiteralEnd,
TemplateLiteralExprEnd,
TemplateLiteralExprStart,
TemplateLiteralStart,
TemplateLiteralString,
This,
Throw,
Tilde,
TripleDot,
Trivia,
Try,
Typeof,
UnsignedShiftRight,
UnsignedShiftRightEquals,
UnterminatedRegexLiteral,
UnterminatedStringLiteral,
UnterminatedTemplateLiteral,
Var,
Void,
While,
With,
Yield,
// Sentinel: the number of real token types above. Not a valid token value.
_COUNT_OF_TOKENS,
};
// Pack token type and category into a u64 for span data storage.
// Layout: bits 0-7 hold the TokenType, bits 8-15 hold the TokenCategory;
// all higher bits are zero. Inverse operations are token_type_from_packed()
// and token_category_from_packed().
inline u64 pack_token_data(TokenType type, TokenCategory category)
{
    // Both enums use a u8 underlying type, but guard the 8-bit field width
    // explicitly so the packing cannot silently truncate if TokenType ever
    // outgrows 255 values.
    static_assert(static_cast<size_t>(TokenType::_COUNT_OF_TOKENS) <= 256);
    return (static_cast<u64>(category) << 8) | static_cast<u64>(type);
}
// Extract the TokenType stored in the low 8 bits of packed span data
// (the counterpart of pack_token_data()).
inline TokenType token_type_from_packed(u64 data)
{
    auto const type_bits = data & 0xFF;
    return static_cast<TokenType>(type_bits);
}
// Extract the TokenCategory stored in bits 8-15 of packed span data
// (the counterpart of pack_token_data()).
inline TokenCategory token_category_from_packed(u64 data)
{
    auto const category_bits = (data >> 8) & 0xFF;
    return static_cast<TokenCategory>(category_bits);
}
}

View File

@@ -201,7 +201,7 @@ StringView SourceHighlighterClient::class_for_token(u64 token_type) const
};
auto class_for_js_token = [](u64 token_type) {
auto category = JS::Token::category(static_cast<JS::TokenType>(token_type));
auto category = JS::token_category_from_packed(token_type);
switch (category) {
case JS::TokenCategory::Invalid:
return "invalid"sv;

View File

@@ -16,7 +16,6 @@
#include <LibJS/Bytecode/Interpreter.h>
#include <LibJS/Console.h>
#include <LibJS/Contrib/Test262/GlobalObject.h>
#include <LibJS/Lexer.h>
#include <LibJS/Print.h>
#include <LibJS/Runtime/ConsoleObject.h>
#include <LibJS/Runtime/DeclarativeEnvironment.h>
@@ -25,8 +24,11 @@
#include <LibJS/Runtime/Reference.h>
#include <LibJS/Runtime/StringPrototype.h>
#include <LibJS/Runtime/ValueInlines.h>
#include <LibJS/RustFFI.h>
#include <LibJS/Script.h>
#include <LibJS/SourceCode.h>
#include <LibJS/SourceTextModule.h>
#include <LibJS/Token.h>
#include <LibMain/Main.h>
#include <LibTextCodec/Decoder.h>
#include <signal.h>
@@ -509,7 +511,8 @@ static ErrorOr<String> read_next_piece()
piece.append(line);
piece.append('\n');
auto lexer = JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));
auto source_code = JS::SourceCode::create({}, Utf16String::from_utf8(line));
enum {
NotInLabelOrObjectKey,
@@ -517,38 +520,45 @@ static ErrorOr<String> read_next_piece()
InLabelOrObjectKey
} label_state { NotInLabelOrObjectKey };
for (JS::Token token = lexer.next(); token.type() != JS::TokenType::Eof; token = lexer.next()) {
switch (token.type()) {
case JS::TokenType::BracketOpen:
case JS::TokenType::CurlyOpen:
case JS::TokenType::ParenOpen:
label_state = NotInLabelOrObjectKey;
s_repl_line_level++;
break;
case JS::TokenType::BracketClose:
case JS::TokenType::CurlyClose:
case JS::TokenType::ParenClose:
label_state = NotInLabelOrObjectKey;
s_repl_line_level--;
break;
struct BracketState {
decltype(label_state)* label;
int* level;
} bracket_state { &label_state, &s_repl_line_level };
case JS::TokenType::Identifier:
case JS::TokenType::StringLiteral:
if (label_state == NotInLabelOrObjectKey)
label_state = InLabelOrObjectKeyIdentifier;
else
label_state = NotInLabelOrObjectKey;
break;
case JS::TokenType::Colon:
if (label_state == InLabelOrObjectKeyIdentifier)
label_state = InLabelOrObjectKey;
else
label_state = NotInLabelOrObjectKey;
break;
default:
break;
}
}
JS::FFI::rust_tokenize(source_code->utf16_data(), source_code->length_in_code_units(), &bracket_state,
[](void* ctx, JS::FFI::FFIToken const* tok) {
auto& state = *static_cast<BracketState*>(ctx);
auto type = static_cast<JS::TokenType>(tok->token_type);
switch (type) {
case JS::TokenType::BracketOpen:
case JS::TokenType::CurlyOpen:
case JS::TokenType::ParenOpen:
*state.label = NotInLabelOrObjectKey;
(*state.level)++;
break;
case JS::TokenType::BracketClose:
case JS::TokenType::CurlyClose:
case JS::TokenType::ParenClose:
*state.label = NotInLabelOrObjectKey;
(*state.level)--;
break;
case JS::TokenType::Identifier:
case JS::TokenType::StringLiteral:
if (*state.label == NotInLabelOrObjectKey)
*state.label = InLabelOrObjectKeyIdentifier;
else
*state.label = NotInLabelOrObjectKey;
break;
case JS::TokenType::Colon:
if (*state.label == InLabelOrObjectKeyIdentifier)
*state.label = InLabelOrObjectKey;
else
*state.label = NotInLabelOrObjectKey;
break;
default:
break;
}
});
if (label_state == InLabelOrObjectKey) {
// If there's a label or object literal key at the end of this line,
@@ -618,63 +628,69 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
size_t open_indents = s_repl_line_level;
auto line = editor.line();
JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));
bool indenters_starting_line = true;
for (JS::Token token = lexer.next(); token.type() != JS::TokenType::Eof; token = lexer.next()) {
auto length = token.value().length_in_code_units();
auto start = token.offset();
auto end = start + length;
if (indenters_starting_line) {
if (token.type() != JS::TokenType::ParenClose && token.type() != JS::TokenType::BracketClose && token.type() != JS::TokenType::CurlyClose) {
indenters_starting_line = false;
} else {
--open_indents;
}
}
auto source_code = JS::SourceCode::create({}, Utf16String::from_utf8(line));
switch (token.category()) {
case JS::TokenCategory::Invalid:
stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Red), Line::Style::Underline });
break;
case JS::TokenCategory::Number:
stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Magenta) });
break;
case JS::TokenCategory::String:
stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Green), Line::Style::Bold });
break;
case JS::TokenCategory::Punctuation:
break;
case JS::TokenCategory::Operator:
break;
case JS::TokenCategory::Keyword:
switch (token.type()) {
case JS::TokenType::BoolLiteral:
case JS::TokenType::NullLiteral:
stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Yellow), Line::Style::Bold });
struct HighlightState {
decltype(stylize)* stylize_fn;
size_t* open_indents;
bool indenters_starting_line { true };
} highlight_state { &stylize, &open_indents };
JS::FFI::rust_tokenize(source_code->utf16_data(), source_code->length_in_code_units(), &highlight_state,
[](void* ctx, JS::FFI::FFIToken const* tok) {
auto& state = *static_cast<HighlightState*>(ctx);
auto type = static_cast<JS::TokenType>(tok->token_type);
auto category = static_cast<JS::TokenCategory>(tok->category);
auto start = static_cast<size_t>(tok->offset);
auto end = start + tok->length;
if (type == JS::TokenType::Eof)
return;
if (state.indenters_starting_line) {
if (type != JS::TokenType::ParenClose && type != JS::TokenType::BracketClose && type != JS::TokenType::CurlyClose)
state.indenters_starting_line = false;
else
--(*state.open_indents);
}
switch (category) {
case JS::TokenCategory::Invalid:
(*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Red), Line::Style::Underline });
break;
case JS::TokenCategory::Number:
(*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Magenta) });
break;
case JS::TokenCategory::String:
(*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Green), Line::Style::Bold });
break;
case JS::TokenCategory::Punctuation:
case JS::TokenCategory::Operator:
break;
case JS::TokenCategory::Keyword:
if (type == JS::TokenType::BoolLiteral || type == JS::TokenType::NullLiteral)
(*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Yellow), Line::Style::Bold });
else
(*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Blue), Line::Style::Bold });
break;
case JS::TokenCategory::ControlKeyword:
(*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Cyan), Line::Style::Italic });
break;
case JS::TokenCategory::Identifier:
(*state.stylize_fn)({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::White), Line::Style::Bold });
break;
default:
stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Blue), Line::Style::Bold });
break;
}
break;
case JS::TokenCategory::ControlKeyword:
stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::Cyan), Line::Style::Italic });
break;
case JS::TokenCategory::Identifier:
stylize({ start, end, Line::Span::CodepointOriented }, { Line::Style::Foreground(Line::Style::XtermColor::White), Line::Style::Bold });
break;
default:
break;
}
}
});
editor.set_prompt(prompt_for_level(open_indents).release_value_but_fixme_should_propagate_errors().to_byte_string());
};
auto complete = [&realm, &global_environment](Line::Editor const& editor) -> Vector<Line::CompletionSuggestion> {
auto line = editor.line(editor.cursor());
auto source_code = JS::SourceCode::create({}, Utf16String::from_utf8(line));
auto const& code_view = source_code->code_view();
JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));
enum {
Initial,
CompleteVariable,
@@ -684,6 +700,15 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
Utf16FlyString variable_name;
Utf16FlyString property_name;
bool last_token_has_trivia = false;
struct CompleteState {
decltype(mode)* current_mode;
Utf16FlyString* variable_name;
Utf16FlyString* property_name;
bool* last_token_has_trivia;
Utf16View const* code_view;
} complete_state { &mode, &variable_name, &property_name, &last_token_has_trivia, &code_view };
// we're only going to complete either
// - <N>
@@ -691,45 +716,48 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
// - <N>.<P>
// where N is the complete name of a variable and
// P is part of the name of one of its properties
auto js_token = lexer.next();
for (; js_token.type() != JS::TokenType::Eof; js_token = lexer.next()) {
switch (mode) {
case CompleteVariable:
switch (js_token.type()) {
case JS::TokenType::Period:
// ...<name> <dot>
mode = CompleteNullProperty;
break;
default:
// not a dot, reset back to initial
mode = Initial;
break;
JS::FFI::rust_tokenize(source_code->utf16_data(), source_code->length_in_code_units(), &complete_state,
[](void* ctx, JS::FFI::FFIToken const* tok) {
auto& s = *static_cast<CompleteState*>(ctx);
auto type = static_cast<JS::TokenType>(tok->token_type);
auto category = static_cast<JS::TokenCategory>(tok->category);
if (type == JS::TokenType::Eof) {
*s.last_token_has_trivia = tok->trivia_length > 0;
return;
}
break;
case CompleteNullProperty:
if (js_token.is_identifier_name()) {
// ...<name> <dot> <name>
mode = CompleteProperty;
property_name = js_token.fly_string_value();
} else {
mode = Initial;
}
break;
case CompleteProperty:
// something came after the property access, reset to initial
case Initial:
if (js_token.type() == JS::TokenType::Identifier) {
// ...<name>...
mode = CompleteVariable;
variable_name = js_token.fly_string_value();
} else {
mode = Initial;
}
break;
}
}
bool last_token_has_trivia = !js_token.trivia().is_empty();
auto token_value = [&]() {
return Utf16FlyString::from_utf16(s.code_view->substring_view(tok->offset, tok->length));
};
bool is_identifier_name = type != JS::TokenType::PrivateIdentifier
&& (category == JS::TokenCategory::Identifier || category == JS::TokenCategory::Keyword || category == JS::TokenCategory::ControlKeyword);
switch (*s.current_mode) {
case CompleteVariable:
if (type == JS::TokenType::Period)
*s.current_mode = CompleteNullProperty;
else
*s.current_mode = Initial;
break;
case CompleteNullProperty:
if (is_identifier_name) {
*s.current_mode = CompleteProperty;
*s.property_name = token_value();
} else {
*s.current_mode = Initial;
}
break;
case CompleteProperty:
case Initial:
if (type == JS::TokenType::Identifier) {
*s.current_mode = CompleteVariable;
*s.variable_name = token_value();
} else {
*s.current_mode = Initial;
}
break;
}
});
if (mode == CompleteNullProperty) {
mode = CompleteProperty;