LibJS+LibUnicode: Use LibUnicode as appropriate for lexing JavaScript

Now that LibUnicode exports its character type APIs in Rust, we can use them to lex identifiers and whitespace. Fixes #8870.
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/10ce8479315 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/8975
2026-04-25 17:25:08 +02:00 · 2026-04-18 13:05:14 -04:00 · 2026-04-19 08:40:35 +00:00
parent 11719369e8
commit 10ce847931
6 changed files with 42 additions and 58 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -375,10 +375,10 @@ version = "0.1.0"
 dependencies = [
 "bytecode_def",
 "cbindgen",
+ "libunicode_rust",
 "num-bigint",
 "num-integer",
 "num-traits",
- "unicode-ident",
 ]

 [[package]]
--- a/Libraries/LibJS/Rust/Cargo.toml
+++ b/Libraries/LibJS/Rust/Cargo.toml
@@ -9,7 +9,7 @@ crate-type = ["staticlib"]
 # After changing dependencies, regenerate the Flatpak sources:
 #   python3 Meta/CMake/flatpak/generate-cargo-sources.py
 [dependencies]
-unicode-ident = "1.0"
+libunicode_rust = { path = "../../LibUnicode/Rust", default-features = false }
 num-bigint = "0.4"
 num-traits = "0.2"
 num-integer = "0.1"
--- a/Libraries/LibJS/Rust/src/lexer.rs
+++ b/Libraries/LibJS/Rust/src/lexer.rs
@@ -77,12 +77,12 @@ pub struct Lexer<'a> {
 // Unicode constants used by the lexical grammar.
 // https://tc39.es/ecma262/#sec-white-space
 // https://tc39.es/ecma262/#sec-line-terminators
-const NO_BREAK_SPACE: u16 = 0x00A0;
-const ZERO_WIDTH_NON_JOINER: u32 = 0x200C;
-const ZERO_WIDTH_JOINER: u32 = 0x200D;
+const TAB: u32 = 0x0009;
+const VERTICAL_TAB: u32 = 0x000B;
+const FORM_FEED: u32 = 0x000C;
 const LINE_SEPARATOR: u16 = 0x2028;
 const PARAGRAPH_SEPARATOR: u16 = 0x2029;
-const ZERO_WIDTH_NO_BREAK_SPACE: u16 = 0xFEFF;
+const ZERO_WIDTH_NO_BREAK_SPACE: u32 = 0xFEFF;

 /// Convert an ASCII byte literal to a UTF-16 code unit.
 pub(crate) const fn ch(c: u8) -> u16 {
@@ -93,26 +93,14 @@ fn is_ascii(cu: u16) -> bool {
    cu < 128
 }

-fn is_ascii_alpha(cp: u32) -> bool {
-    cp < 128 && (cp as u8).is_ascii_alphabetic()
-}
-
 fn is_ascii_digit(cu: u16) -> bool {
    cu >= ch(b'0') && cu <= ch(b'9')
 }

-fn is_ascii_digit_cp(cp: u32) -> bool {
-    cp >= b'0' as u32 && cp <= b'9' as u32
-}
-
 fn is_ascii_hex_digit(cu: u16) -> bool {
    is_ascii_digit(cu) || (cu >= ch(b'a') && cu <= ch(b'f')) || (cu >= ch(b'A') && cu <= ch(b'F'))
 }

-fn is_ascii_alphanumeric(cp: u32) -> bool {
-    is_ascii_alpha(cp) || is_ascii_digit_cp(cp)
-}
-
 fn is_ascii_space(cu: u16) -> bool {
    matches!(cu, 0x09 | 0x0A | 0x0B | 0x0C | 0x0D | 0x20)
 }
@@ -160,55 +148,25 @@ fn is_line_terminator_cp(cp: u32) -> bool {
 // WhiteSpace :: <TAB> | <VT> | <FF> | <ZWNBSP> | <USP>
 // where <USP> is any code point with General Category "Space_Separator" (Zs).
 fn is_whitespace_cp(cp: u32) -> bool {
-    if cp < 128 {
-        return is_ascii_space(cp as u16);
-    }
-    if cp == NO_BREAK_SPACE as u32 || cp == ZERO_WIDTH_NO_BREAK_SPACE as u32 {
+    if matches!(cp, TAB | VERTICAL_TAB | FORM_FEED | ZERO_WIDTH_NO_BREAK_SPACE) {
        return true;
    }
-    // Unicode General Category "Space_Separator" (Zs)
-    matches!(cp, 0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000)
+    // <USP>: any code point in General Category "Space_Separator" (Zs), as classified by LibUnicode.
+    libunicode_rust::character_types::code_point_has_space_separator_general_category(cp)
 }

-// https://tc39.es/ecma262/#sec-identifier-names
+// https://tc39.es/ecma262/#prod-IdentifierStartChar
 // IdentifierStartChar :: UnicodeIDStart | $ | _
 fn is_identifier_start_cp(cp: u32) -> bool {
-    if is_ascii_alpha(cp) || cp == '_' as u32 || cp == '$' as u32 {
-        return true;
-    }
-    if cp < 128 {
-        return false;
-    }
-    unicode_id_start(cp)
-}
-
-// https://tc39.es/ecma262/#sec-identifier-names
-// IdentifierPartChar :: UnicodeIDContinue | $ | <ZWNJ> | <ZWJ>
-fn is_identifier_continue_cp(cp: u32) -> bool {
-    if is_ascii_alphanumeric(cp)
-        || cp == '$' as u32
+    cp == '$' as u32
        || cp == '_' as u32
-        || cp == ZERO_WIDTH_NON_JOINER
-        || cp == ZERO_WIDTH_JOINER
-    {
-        return true;
-    }
-    if cp < 128 {
-        return false;
-    }
-    unicode_id_continue(cp)
+        || libunicode_rust::character_types::code_point_has_identifier_start_property(cp)
 }

-fn unicode_id_start(cp: u32) -> bool {
-    // NB: The ECMAScript spec requires ID_Start, not XID_Start.
-    //     U+309B and U+309C are Other_ID_Start (thus ID_Start) but not XID_Start.
-    cp == 0x309B || cp == 0x309C || char::from_u32(cp).is_some_and(unicode_ident::is_xid_start)
-}
-
-fn unicode_id_continue(cp: u32) -> bool {
-    // NB: The ECMAScript spec requires ID_Continue, not XID_Continue.
-    //     U+309B and U+309C are Other_ID_Start (thus ID_Continue) but not XID_Continue.
-    cp == 0x309B || cp == 0x309C || char::from_u32(cp).is_some_and(unicode_ident::is_xid_continue)
+// https://tc39.es/ecma262/#prod-IdentifierPartChar
+// IdentifierPartChar :: UnicodeIDContinue | $  (NOTE(review): `_`, <ZWNJ>, <ZWJ> now rely on ID_Continue data; confirm)
+fn is_identifier_continue_cp(cp: u32) -> bool {
+    cp == '$' as u32 || libunicode_rust::character_types::code_point_has_identifier_continue_property(cp)
 }

 // https://tc39.es/ecma262/#sec-keywords-and-reserved-words
--- a/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Libraries/LibUnicode/CharacterTypes.cpp
@@ -739,6 +739,8 @@ bool unicode_property_all_case_equivalents_match(u32, unsigned char const*, size
 bool unicode_resolve_property(unsigned char const*, size_t, unsigned char const*, size_t, unsigned char*, u32*);
 bool unicode_resolved_property_matches(u32, unsigned char, u32);

+bool unicode_code_point_has_space_separator_general_category(u32);
+
 bool unicode_code_point_has_identifier_start_property(u32);
 bool unicode_code_point_has_identifier_continue_property(u32);

@@ -829,6 +831,11 @@ extern "C" bool unicode_resolved_property_matches(u32 code_point, unsigned char
    VERIFY_NOT_REACHED();
 }

+extern "C" bool unicode_code_point_has_space_separator_general_category(u32 code_point)
+{ // C-linkage shim: forwards to the Unicode:: implementation of the same name.
+    return Unicode::code_point_has_space_separator_general_category(code_point);
+}
+
 extern "C" bool unicode_code_point_has_identifier_start_property(u32 code_point)
 {
    return Unicode::code_point_has_identifier_start_property(code_point);
--- a/Libraries/LibUnicode/Rust/src/character_types.rs
+++ b/Libraries/LibUnicode/Rust/src/character_types.rs
@@ -67,6 +67,8 @@ unsafe extern "C" {

    fn unicode_resolved_property_matches(code_point: u32, kind: u8, id: u32) -> bool;

+    fn unicode_code_point_has_space_separator_general_category(code_point: u32) -> bool;
+
    fn unicode_code_point_has_identifier_start_property(code_point: u32) -> bool;
    fn unicode_code_point_has_identifier_continue_property(code_point: u32) -> bool;

@@ -172,6 +174,16 @@ pub fn resolved_property_matches(code_point: u32, property: ResolvedProperty) ->
    unsafe { unicode_resolved_property_matches(code_point, property.kind as u8, property.id) }
 }

+#[inline(always)]
+pub fn code_point_has_space_separator_general_category(code_point: u32) -> bool {
+    if is_ascii(code_point) { // Fast path: answered locally, without calling into C++.
+        return code_point == ' ' as u32 || code_point == 0xa0;
+    }
+    // NOTE(review): 0xa0 is not ASCII; if is_ascii() means < 0x80 that comparison is dead. Confirm.
+    // SAFETY: Only a `u32` is passed by value and a `bool` returned; no pointers cross the FFI boundary.
+    unsafe { unicode_code_point_has_space_separator_general_category(code_point) }
+}
+
 #[inline(always)]
 pub fn code_point_has_identifier_start_property(code_point: u32) -> bool {
    if is_ascii(code_point) {
--- a/Tests/LibJS/Runtime/unicode-identifier-continue.js
+++ b/Tests/LibJS/Runtime/unicode-identifier-continue.js
@@ -0,0 +1,7 @@
+test("basic functionality", () => {
+    const foo = {
+        ำ: 12389,
+    };
+    // NOTE(review): the key looks like U+0E33 (ID_Start, but not XID_Start per UAX #31); confirm.
+    expect(foo.ำ).toBe(12389);
+});