Files
ladybird/Libraries/LibUnicode/Segmenter.h
Andreas Kling 8cdfbfed49 LibUnicode+LibWeb: Add fast path grapheme segmenter for ASCII text
For ASCII text, every character is its own grapheme - there are no
combining characters or emoji sequences. This means grapheme boundary
detection is trivial: next_boundary(i) is simply i+1.

This commit adds AsciiGraphemeSegmenter, a simple Segmenter subclass
that performs O(1) boundary lookups without any ICU overhead.

TextNode::grapheme_segmenter() now checks if the text is ASCII and uses
this fast path, avoiding expensive ICU BreakIterator cloning and
boundary detection for the common case of ASCII-only text.
2026-01-11 11:10:19 +01:00

66 lines
2.0 KiB
C++

/*
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Function.h>
#include <AK/NonnullOwnPtr.h>
#include <AK/Optional.h>
#include <AK/StringView.h>
namespace Unicode {
// The unit of text that a Segmenter reports boundaries for.
// Enumerator order is significant: the implicit underlying values are
// Grapheme = 0, Sentence = 1, Word = 2.
enum class SegmenterGranularity {
    Grapheme,
    Sentence,
    Word,
};
// Converts a textual granularity name into the matching SegmenterGranularity.
// NOTE(review): the accepted spellings and the behavior for an unrecognized
// name are defined in the implementation file, not in this header - confirm
// before passing unvalidated input.
SegmenterGranularity segmenter_granularity_from_string(StringView segmenter_granularity);

// Converts a SegmenterGranularity into its textual name.
// NOTE(review): presumably the inverse of segmenter_granularity_from_string();
// verify against the implementation.
StringView segmenter_granularity_to_string(SegmenterGranularity segmenter_granularity);
// Abstract interface for locating segment boundaries (graphemes, sentences or
// words) in a piece of text. Concrete segmenters are obtained through the
// static create*() factories; every operation on the text is pure virtual.
//
// NOTE(review): this header does not state the unit of the size_t indices
// exchanged with the segmenter (presumably code units of the text that was
// set) - confirm against the implementation.
class Segmenter {
public:
    // --- Factories ---------------------------------------------------------

    // Creates a segmenter for the given granularity without naming a locale.
    // NOTE(review): presumably falls back to a default locale; confirm.
    static NonnullOwnPtr<Segmenter> create(SegmenterGranularity segmenter_granularity);

    // Creates a segmenter for the given locale and granularity.
    static NonnullOwnPtr<Segmenter> create(StringView locale, SegmenterGranularity segmenter_granularity);

    // Creates the fast-path grapheme segmenter intended for ASCII-only text of
    // `length` characters, where the next boundary after index i is simply
    // i + 1 and no ICU work is required.
    // NOTE(review): UAX #29 (rule GB3) keeps CR LF together as one grapheme
    // cluster even though both are ASCII; confirm that callers or the
    // implementation account for that case.
    static NonnullOwnPtr<Segmenter> create_for_ascii_grapheme(size_t length);

    virtual ~Segmenter() = default;

    // NOTE(review): semantics are defined in the implementation file; from the
    // name this presumably decides whether a word selection should extend
    // past the given text - confirm.
    static bool should_continue_beyond_word(Utf16View const&);

    // Returns the granularity stored in m_segmenter_granularity.
    SegmenterGranularity segmenter_granularity() const { return m_segmenter_granularity; }

    // Returns a new, independently owned segmenter.
    // NOTE(review): presumably an equivalent copy of this one, including its
    // segmented text - confirm per subclass.
    virtual NonnullOwnPtr<Segmenter> clone() const = 0;

    // --- Text under segmentation ------------------------------------------

    // Sets the text that subsequent boundary queries operate on.
    // NOTE(review): the Utf16View overload receives a non-owning view; whether
    // the viewed data must outlive the segmenter is not visible here.
    virtual void set_segmented_text(String) = 0;
    virtual void set_segmented_text(Utf16View const&) = 0;

    // --- Boundary queries --------------------------------------------------

    // Returns the segmenter's current boundary position.
    virtual size_t current_boundary() = 0;

    // Selects how boundary lookups treat the queried index itself.
    // NOTE(review): presumably Yes allows `index` to be returned when it is
    // already a boundary; confirm against the implementation.
    enum class Inclusive {
        No,
        Yes,
    };

    // Look up the boundary before / after `index`. The result may be empty.
    // NOTE(review): presumably empty when no boundary exists in that
    // direction; confirm.
    virtual Optional<size_t> previous_boundary(size_t index, Inclusive = Inclusive::No) = 0;
    virtual Optional<size_t> next_boundary(size_t index, Inclusive = Inclusive::No) = 0;

    // --- Iteration ---------------------------------------------------------

    // Callback invoked with a boundary position; its IterationDecision result
    // presumably controls whether iteration continues (confirm).
    using SegmentationCallback = Function<IterationDecision(size_t)>;

    // Segments the supplied text and reports boundaries through the callback.
    virtual void for_each_boundary(String, SegmentationCallback) = 0;
    virtual void for_each_boundary(Utf16View const&, SegmentationCallback) = 0;
    virtual void for_each_boundary(Utf32View const&, SegmentationCallback) = 0;

    // NOTE(review): presumably reports whether the segment at the current
    // boundary is a word rather than whitespace/punctuation; confirm.
    virtual bool is_current_boundary_word_like() const = 0;

protected:
    // Only subclasses may construct; records the granularity they implement.
    explicit Segmenter(SegmenterGranularity segmenter_granularity)
        : m_segmenter_granularity(segmenter_granularity)
    {
    }

    SegmenterGranularity m_segmenter_granularity { SegmenterGranularity::Grapheme };
};
}