mirror of
https://github.com/LadybirdBrowser/ladybird
synced 2026-04-26 01:35:08 +02:00
Add LibRegex's new Rust ECMAScript regular expression engine. Replace the old parser's direct pattern-to-bytecode pipeline with a split architecture: parse patterns into a lossless AST first, then lower that AST into bytecode for a dedicated backtracking VM. Keep the syntax tree as the place for validation, analysis, and optimization instead of teaching every transformation to rewrite partially built bytecode. Specialize this backend for the job LibJS actually needs. The old C++ engine shared one generic parser and matcher stack across ECMA-262 and POSIX modes and supported both byte-string and UTF-16 inputs. The new engine focuses on ECMA-262 semantics on WTF-16 data, which lets it model lone surrogates and other JavaScript-specific behavior directly instead of carrying POSIX and multi-encoding constraints through the whole implementation. Fill in the ECMAScript features needed to replace the old engine for real web workloads: Unicode properties and sets, lookahead and lookbehind, named groups and backreferences, modifier groups, string properties, large quantifiers, lone surrogates, and the parser and VM corner cases those features exercise. Reshape the runtime around compile-time pattern hints and a hotter VM loop. Pre-resolve Unicode properties, derive first-character, character-class, and simple-scan filters, extract safe trailing literals for anchored patterns, add literal and literal-alternation fast paths, and keep reusable scratch storage for registers, backtracking state, and modifier stacks. Teach `find_all` to stay inside one VM so global searches stop paying setup costs on every match. Make those shortcuts semantics-aware instead of merely fast. In Unicode mode, do not use literal fast paths for lone surrogates, since ECMA-262 must not let `/\ud83d/u` match inside a surrogate pair. 
Likewise, only derive end-anchor suffix hints when the suffix lies on every path to `Match`, so lookarounds and disjunctions cannot skip into a shared tail and produce false negatives. This commit lands the Rust crate, the C++ wrapper, the build integration, and the initial LibJS-side plumbing needed to exercise the new engine under real RegExp callers before removing the legacy backend.
84 lines
2.9 KiB
C++
84 lines
2.9 KiB
C++
/*
 * Copyright (c) 2026-present, the Ladybird developers.
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
|
|
|
|
#pragma once
|
|
|
|
#ifdef ENABLE_RUST
|
|
|
|
# include <AK/Error.h>
|
|
# include <AK/Noncopyable.h>
|
|
# include <AK/String.h>
|
|
# include <AK/Utf16View.h>
|
|
# include <AK/Vector.h>
|
|
# include <LibRegex/Export.h>
|
|
# include <RustFFI.h>
|
|
|
|
namespace regex {
|
|
|
|
struct RustNamedCaptureGroup {
|
|
String name;
|
|
unsigned int index;
|
|
};
|
|
|
|
class REGEX_API CompiledRustRegex {
|
|
AK_MAKE_NONCOPYABLE(CompiledRustRegex);
|
|
|
|
public:
|
|
static ErrorOr<CompiledRustRegex, String> compile(StringView pattern, RustRegexFlags flags);
|
|
|
|
~CompiledRustRegex();
|
|
CompiledRustRegex(CompiledRustRegex&& other);
|
|
CompiledRustRegex& operator=(CompiledRustRegex&& other);
|
|
|
|
/// Execute into internal capture buffer. Returns 1 on match, 0 on no match, -1 on limit exceeded.
|
|
/// After a successful call, read results via capture_slot().
|
|
int exec_internal(Utf16View input, size_t start_pos) const;
|
|
/// Read a capture slot from the internal buffer (after exec_internal).
|
|
/// Even slots are start positions, odd slots are end positions.
|
|
/// Returns -1 for unmatched captures.
|
|
int capture_slot(unsigned int slot) const { return m_capture_buffer[slot]; }
|
|
/// Test for a match. Returns 1 on match, 0 on no match, -1 on limit exceeded.
|
|
int test(Utf16View input, size_t start_pos = 0) const;
|
|
unsigned int capture_count() const;
|
|
/// Total number of capture groups including group 0.
|
|
unsigned int total_groups() const;
|
|
|
|
/// Find all non-overlapping matches. Returns number of matches found.
|
|
/// Results are written as (start, end) i32 pairs to the internal find_all buffer.
|
|
/// Access results via find_all_match(i) after calling.
|
|
int find_all(Utf16View input, size_t start_pos) const;
|
|
/// Get the i-th match from find_all results. Returns (start, end).
|
|
struct MatchPair {
|
|
int start;
|
|
int end;
|
|
};
|
|
MatchPair find_all_match(int index) const { return { m_find_all_buffer[index * 2], m_find_all_buffer[index * 2 + 1] }; }
|
|
|
|
Vector<RustNamedCaptureGroup> const& named_groups() const { return m_named_groups; }
|
|
|
|
private:
|
|
explicit CompiledRustRegex(RustRegex* regex);
|
|
|
|
/// Get u16 data pointer and length from a Utf16View.
|
|
/// For ASCII storage, widens to u16 using the cached buffer.
|
|
unsigned short const* get_u16_data(Utf16View input, size_t& out_len) const;
|
|
|
|
RustRegex* m_regex { nullptr };
|
|
Vector<RustNamedCaptureGroup> m_named_groups;
|
|
/// Reusable buffer for ASCII→u16 widening, avoiding per-call allocation.
|
|
mutable Vector<u16> m_u16_buffer;
|
|
/// Pre-allocated buffer for capture results to avoid per-exec allocation.
|
|
mutable Vector<int> m_capture_buffer;
|
|
mutable unsigned int m_capture_count { 0 };
|
|
mutable bool m_capture_count_cached { false };
|
|
/// Buffer for find_all results.
|
|
mutable Vector<int> m_find_all_buffer;
|
|
};
|
|
|
|
} // namespace regex
|
|
|
|
#endif // ENABLE_RUST
|