LibJS+LibUnicode: Use LibUnicode as appropriate for lexing JavaScript

Now that LibUnicode exports its character type APIs in Rust, we can use
them to lex identifiers and whitespace.

Fixes #8870.
This commit is contained in:
Timothy Flynn
2026-04-18 13:05:14 -04:00
committed by Shannon Booth
parent 11719369e8
commit 10ce847931
Notes: github-actions[bot] 2026-04-19 08:40:35 +00:00
6 changed files with 42 additions and 58 deletions

2
Cargo.lock generated
View File

@@ -375,10 +375,10 @@ version = "0.1.0"
dependencies = [
"bytecode_def",
"cbindgen",
"libunicode_rust",
"num-bigint",
"num-integer",
"num-traits",
"unicode-ident",
]
[[package]]

View File

@@ -9,7 +9,7 @@ crate-type = ["staticlib"]
# After changing dependencies, regenerate the Flatpak sources:
# python3 Meta/CMake/flatpak/generate-cargo-sources.py
[dependencies]
unicode-ident = "1.0"
libunicode_rust = { path = "../../LibUnicode/Rust", default-features = false }
num-bigint = "0.4"
num-traits = "0.2"
num-integer = "0.1"

View File

@@ -77,12 +77,12 @@ pub struct Lexer<'a> {
// Unicode constants used by the lexical grammar.
// https://tc39.es/ecma262/#sec-white-space
// https://tc39.es/ecma262/#sec-line-terminators
const NO_BREAK_SPACE: u16 = 0x00A0;
const ZERO_WIDTH_NON_JOINER: u32 = 0x200C;
const ZERO_WIDTH_JOINER: u32 = 0x200D;
const TAB: u32 = 0x0009;
const VERTICAL_TAB: u32 = 0x000B;
const FORM_FEED: u32 = 0x000C;
const LINE_SEPARATOR: u16 = 0x2028;
const PARAGRAPH_SEPARATOR: u16 = 0x2029;
const ZERO_WIDTH_NO_BREAK_SPACE: u16 = 0xFEFF;
const ZERO_WIDTH_NO_BREAK_SPACE: u32 = 0xFEFF;
/// Convert an ASCII byte literal to a UTF-16 code unit.
pub(crate) const fn ch(c: u8) -> u16 {
@@ -93,26 +93,14 @@ fn is_ascii(cu: u16) -> bool {
cu < 128
}
/// Returns `true` when the code point `cp` is an ASCII letter (`A`-`Z` or `a`-`z`).
///
/// Any value outside those two ranges, including everything at or above 128, yields `false`.
fn is_ascii_alpha(cp: u32) -> bool {
    matches!(cp, 0x41..=0x5A | 0x61..=0x7A)
}
/// Returns `true` when the UTF-16 code unit `cu` is one of the ASCII decimal digits `0`-`9`.
fn is_ascii_digit(cu: u16) -> bool {
    (ch(b'0')..=ch(b'9')).contains(&cu)
}
/// Code-point flavour of the ASCII digit check: `true` only for `0`-`9` (U+0030..=U+0039).
fn is_ascii_digit_cp(cp: u32) -> bool {
    matches!(cp, 0x30..=0x39)
}
/// Returns `true` when the UTF-16 code unit `cu` is an ASCII hexadecimal digit:
/// `0`-`9`, `a`-`f`, or `A`-`F`.
fn is_ascii_hex_digit(cu: u16) -> bool {
    let lowercase_hex = ch(b'a')..=ch(b'f');
    let uppercase_hex = ch(b'A')..=ch(b'F');
    is_ascii_digit(cu) || lowercase_hex.contains(&cu) || uppercase_hex.contains(&cu)
}
/// Returns `true` when the code point `cp` is an ASCII letter or an ASCII decimal digit.
fn is_ascii_alphanumeric(cp: u32) -> bool {
    // The range guard must come first: the narrowing cast would otherwise fold
    // larger code points onto ASCII values.
    cp < 128 && (cp as u8).is_ascii_alphanumeric()
}
/// Returns `true` when the UTF-16 code unit `cu` is U+0009..=U+000D
/// (tab, line feed, vertical tab, form feed, carriage return) or U+0020 (space).
fn is_ascii_space(cu: u16) -> bool {
    matches!(cu, 0x09..=0x0D | 0x20)
}
@@ -160,55 +148,25 @@ fn is_line_terminator_cp(cp: u32) -> bool {
// WhiteSpace :: <TAB> | <VT> | <FF> | <ZWNBSP> | <USP>
// where <USP> is any code point with General Category "Space_Separator" (Zs).
fn is_whitespace_cp(cp: u32) -> bool {
if cp < 128 {
return is_ascii_space(cp as u16);
}
if cp == NO_BREAK_SPACE as u32 || cp == ZERO_WIDTH_NO_BREAK_SPACE as u32 {
if matches!(cp, TAB | VERTICAL_TAB | FORM_FEED | ZERO_WIDTH_NO_BREAK_SPACE) {
return true;
}
// Unicode General Category "Space_Separator" (Zs)
matches!(cp, 0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000)
libunicode_rust::character_types::code_point_has_space_separator_general_category(cp)
}
// https://tc39.es/ecma262/#sec-identifier-names
// https://tc39.es/ecma262/#prod-IdentifierStartChar
// IdentifierStartChar :: UnicodeIDStart | $ | _
//
// ASCII input is decided locally: letters, `$` and `_` are accepted and every
// other ASCII code point is rejected. Only non-ASCII code points are passed
// on to `unicode_id_start`.
fn is_identifier_start_cp(cp: u32) -> bool {
    const DOLLAR_SIGN: u32 = '$' as u32;
    const LOW_LINE: u32 = '_' as u32;

    match cp {
        DOLLAR_SIGN | LOW_LINE => true,
        _ if is_ascii_alpha(cp) => true,
        0..=127 => false,
        _ => unicode_id_start(cp),
    }
}
// https://tc39.es/ecma262/#sec-identifier-names
// IdentifierPartChar :: UnicodeIDContinue | $ | <ZWNJ> | <ZWJ>
fn is_identifier_continue_cp(cp: u32) -> bool {
if is_ascii_alphanumeric(cp)
|| cp == '$' as u32
cp == '$' as u32
|| cp == '_' as u32
|| cp == ZERO_WIDTH_NON_JOINER
|| cp == ZERO_WIDTH_JOINER
{
return true;
}
if cp < 128 {
return false;
}
unicode_id_continue(cp)
|| libunicode_rust::character_types::code_point_has_identifier_start_property(cp)
}
/// Approximates the Unicode `ID_Start` property on top of the `XID_Start` data
/// exposed by `unicode_ident`.
fn unicode_id_start(cp: u32) -> bool {
    match cp {
        // NB: The ECMAScript spec requires ID_Start, not XID_Start.
        // U+309B and U+309C are Other_ID_Start (thus ID_Start) but not XID_Start.
        0x309B | 0x309C => true,
        // Values that are not valid Unicode scalar values fail the `char` conversion
        // and are reported as `false`.
        _ => char::from_u32(cp).is_some_and(unicode_ident::is_xid_start),
    }
}
fn unicode_id_continue(cp: u32) -> bool {
// NB: The ECMAScript spec requires ID_Continue, not XID_Continue.
// U+309B and U+309C are Other_ID_Start (thus ID_Continue) but not XID_Continue.
cp == 0x309B || cp == 0x309C || char::from_u32(cp).is_some_and(unicode_ident::is_xid_continue)
// https://tc39.es/ecma262/#prod-IdentifierPartChar
// IdentifierPartChar :: UnicodeIDContinue | $
//
// `$` is accepted directly; every other code point is decided by LibUnicode's
// identifier-continue property lookup.
fn is_identifier_continue_cp(cp: u32) -> bool {
    use libunicode_rust::character_types::code_point_has_identifier_continue_property as has_id_continue;

    const DOLLAR_SIGN: u32 = '$' as u32;

    cp == DOLLAR_SIGN || has_id_continue(cp)
}
// https://tc39.es/ecma262/#sec-keywords-and-reserved-words

View File

@@ -739,6 +739,8 @@ bool unicode_property_all_case_equivalents_match(u32, unsigned char const*, size
bool unicode_resolve_property(unsigned char const*, size_t, unsigned char const*, size_t, unsigned char*, u32*);
bool unicode_resolved_property_matches(u32, unsigned char, u32);
bool unicode_code_point_has_space_separator_general_category(u32);
bool unicode_code_point_has_identifier_start_property(u32);
bool unicode_code_point_has_identifier_continue_property(u32);
@@ -829,6 +831,11 @@ extern "C" bool unicode_resolved_property_matches(u32 code_point, unsigned char
VERIFY_NOT_REACHED();
}
// C-linkage entry point that forwards the query unchanged to
// Unicode::code_point_has_space_separator_general_category and returns its answer.
extern "C" bool unicode_code_point_has_space_separator_general_category(u32 code_point)
{
    auto const is_space_separator = Unicode::code_point_has_space_separator_general_category(code_point);
    return is_space_separator;
}
extern "C" bool unicode_code_point_has_identifier_start_property(u32 code_point)
{
return Unicode::code_point_has_identifier_start_property(code_point);

View File

@@ -67,6 +67,8 @@ unsafe extern "C" {
fn unicode_resolved_property_matches(code_point: u32, kind: u8, id: u32) -> bool;
fn unicode_code_point_has_space_separator_general_category(code_point: u32) -> bool;
fn unicode_code_point_has_identifier_start_property(code_point: u32) -> bool;
fn unicode_code_point_has_identifier_continue_property(code_point: u32) -> bool;
@@ -172,6 +174,16 @@ pub fn resolved_property_matches(code_point: u32, property: ResolvedProperty) ->
unsafe { unicode_resolved_property_matches(code_point, property.kind as u8, property.id) }
}
/// Reports whether `code_point` is in the Space_Separator general category.
///
/// Code points accepted by `is_ascii` are answered locally without crossing the
/// FFI boundary; everything else is delegated to the C++ helper.
#[inline(always)]
pub fn code_point_has_space_separator_general_category(code_point: u32) -> bool {
    if !is_ascii(code_point) {
        // SAFETY: This forwards only a scalar value to the C++ helper.
        return unsafe { unicode_code_point_has_space_separator_general_category(code_point) };
    }

    // NOTE(review): if `is_ascii` only accepts values below 0x80, the 0xa0 arm can
    // never match on this path — confirm against the definition of `is_ascii`.
    matches!(code_point, 0x20 | 0xa0)
}
#[inline(always)]
pub fn code_point_has_identifier_start_property(code_point: u32) -> bool {
if is_ascii(code_point) {

View File

@@ -0,0 +1,7 @@
test("basic functionality", () => {
const foo = {
: 12389,
};
expect(foo.).toBe(12389);
});