Files
servo/components/shared/base/text.rs
Martin Robinson e4822c9c5d script: More thoroughly convert between UTF-16 and UTF-8 offsets in text inputs (#41588)
DOM APIs for interacting with selection and text in text inputs
(`<input type=text>` and `<textarea>`) all accept offsets and lengths in
UTF-16 code units. Servo was not converting all of these offsets into
UTF-8 code units. This change makes it so that this conversion is done
more thoroughly and makes it clear when the code is dealing with UTF-8
offsets and UTF-16 offsets.

Helper functions are added for doing this conversion in both directions
as it is necessary. In addition, a `char` iterator is added for
`TextInput` as it is useful for doing this conversion. It will be used
more completely in the future when a `Rope` data structure is extracted
from `TextInput`.

Finally, specification text is added to all of the DOM implementation
touched here.

Testing: This change includes a new WPT crash test as well as a series
of unit tests to verify conversion between UTF-8 and UTF-16 offsets.
Fixes #36719.
Fixes #20028.
Fixes #39184.

Signed-off-by: Martin Robinson <mrobinson@igalia.com>
2025-12-31 09:29:25 +00:00

151 lines
4.7 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
use std::iter::Sum;
use std::ops::{Add, AddAssign, Range, Sub, SubAssign};
use malloc_size_of_derive::MallocSizeOf;
pub use crate::unicode_block::{UnicodeBlock, UnicodeBlockMethod};
/// Returns `true` when `c` is one of the Unicode bidirectional formatting
/// characters: the directional embeddings/overrides (U+202A..=U+202E), the
/// directional isolates (U+2066..=U+2069), or one of the directional marks
/// U+200E (LRM), U+200F (RLM) and U+061C (ALM).
pub fn is_bidi_control(c: char) -> bool {
    // LRE, RLE, PDF, LRO, RLO.
    let is_embedding_or_override = ('\u{202A}'..='\u{202E}').contains(&c);
    // LRI, RLI, FSI, PDI.
    let is_isolate = ('\u{2066}'..='\u{2069}').contains(&c);
    // LRM, RLM, ALM.
    let is_mark = c == '\u{200E}' || c == '\u{200F}' || c == '\u{061C}';

    is_embedding_or_override || is_isolate || is_mark
}
/// Returns the index of the Unicode plane that contains `codepoint`.
///
/// A plane covers 0x10000 consecutive code points, so the plane index is the
/// scalar value with its low 16 bits shifted out.
pub fn unicode_plane(codepoint: char) -> u32 {
    const PLANE_SHIFT: u32 = 16;
    u32::from(codepoint) >> PLANE_SHIFT
}
/// Returns `true` if `codepoint` should be treated as CJK.
///
/// A character qualifies when its Unicode block (as reported by
/// `codepoint.block()`) is one of the CJK-related blocks listed below, or when
/// it lies in Unicode plane 2 or 3.
pub fn is_cjk(codepoint: char) -> bool {
    let in_cjk_block = matches!(
        codepoint.block(),
        Some(
            UnicodeBlock::CJKRadicalsSupplement |
                UnicodeBlock::KangxiRadicals |
                UnicodeBlock::IdeographicDescriptionCharacters |
                UnicodeBlock::CJKSymbolsandPunctuation |
                UnicodeBlock::Hiragana |
                UnicodeBlock::Katakana |
                UnicodeBlock::Bopomofo |
                UnicodeBlock::HangulCompatibilityJamo |
                UnicodeBlock::Kanbun |
                UnicodeBlock::BopomofoExtended |
                UnicodeBlock::CJKStrokes |
                UnicodeBlock::KatakanaPhoneticExtensions |
                UnicodeBlock::EnclosedCJKLettersandMonths |
                UnicodeBlock::CJKCompatibility |
                UnicodeBlock::CJKUnifiedIdeographsExtensionA |
                UnicodeBlock::YijingHexagramSymbols |
                UnicodeBlock::CJKUnifiedIdeographs |
                UnicodeBlock::CJKCompatibilityIdeographs |
                UnicodeBlock::CJKCompatibilityForms |
                UnicodeBlock::HalfwidthandFullwidthForms
        )
    );

    // Planes 2 and 3 are the Supplementary and Tertiary Ideographic Planes:
    // https://en.wikipedia.org/wiki/Plane_(Unicode)#Supplementary_Ideographic_Plane
    // https://en.wikipedia.org/wiki/Plane_(Unicode)#Tertiary_Ideographic_Plane
    // The plane is only computed when the block lookup did not already match.
    in_cjk_block || matches!(unicode_plane(codepoint), 2 | 3)
}
/// Generates a `usize` newtype named `$type_name` that represents a length (or
/// offset) counted in code units of one specific text encoding, together with
/// its constructors, conversions and arithmetic operator implementations.
macro_rules! unicode_length_type {
    ($type_name:ident) => {
        /// A length in code units of the given text encoding. For instance, `Utf8CodeUnitLength`
        /// is a length in UTF-8 code units (one byte each). `Utf16CodeUnitLength` is a length in
        /// UTF-16 code units (two bytes each). Using a distinct type per encoding makes it
        /// possible to work more reliably with lengths in different encodings.
        #[derive(Clone, Copy, Debug, Default, Eq, MallocSizeOf, Ord, PartialEq, PartialOrd)]
        pub struct $type_name(pub usize);

        impl $type_name {
            /// A length of zero code units.
            pub fn zero() -> Self {
                Self(0)
            }

            /// A length of exactly one code unit.
            pub fn one() -> Self {
                Self(1)
            }

            /// Strips the newtype from both ends of `byte_range`, producing a plain
            /// `Range<usize>`.
            pub fn unwrap_range(byte_range: Range<Self>) -> Range<usize> {
                let Range { start, end } = byte_range;
                start.0..end.0
            }

            /// Subtracts `value` from `self`, clamping the result at zero instead of
            /// underflowing.
            pub fn saturating_sub(self, value: Self) -> Self {
                let remaining = self.0.saturating_sub(value.0);
                Self(remaining)
            }
        }

        /// Conversion from a `u32` count of code units.
        impl From<u32> for $type_name {
            fn from(value: u32) -> Self {
                let units = value as usize;
                Self(units)
            }
        }

        /// Conversion from an `isize` count of code units.
        ///
        /// This is a plain `as` cast, so a negative input wraps around to a very
        /// large `usize` rather than being rejected or clamped.
        impl From<isize> for $type_name {
            fn from(value: isize) -> Self {
                let units = value as usize;
                Self(units)
            }
        }

        impl Add for $type_name {
            type Output = Self;

            fn add(self, other: Self) -> Self {
                let Self(lhs) = self;
                let Self(rhs) = other;
                Self(lhs + rhs)
            }
        }

        impl AddAssign for $type_name {
            fn add_assign(&mut self, other: Self) {
                self.0 += other.0;
            }
        }

        /// Plain `usize` subtraction; unlike `saturating_sub`, this does not clamp
        /// at zero.
        impl Sub for $type_name {
            type Output = Self;

            fn sub(self, value: Self) -> Self {
                let Self(lhs) = self;
                let Self(rhs) = value;
                Self(lhs - rhs)
            }
        }

        impl SubAssign for $type_name {
            fn sub_assign(&mut self, other: Self) {
                self.0 -= other.0;
            }
        }

        /// Sums an iterator of lengths, starting from `zero()`.
        impl Sum for $type_name {
            fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
                iter.fold(Self::zero(), |total, length| total + length)
            }
        }
    };
}
// Instantiate the length newtype once per encoding, so that UTF-8 and UTF-16
// lengths are distinct types that cannot be mixed up by accident.
unicode_length_type!(Utf8CodeUnitLength);
unicode_length_type!(Utf16CodeUnitLength);
#[cfg(test)]
mod test {
    use super::*;

    /// Checks `is_cjk` against sample characters that must be classified as CJK
    /// and a few that must not.
    #[test]
    fn test_is_cjk() {
        // Test characters from different CJK blocks
        // U+3007 IDEOGRAPHIC NUMBER ZERO (CJK Symbols and Punctuation). It is written
        // as an escape so the literal cannot be silently lost or confused with
        // another character.
        assert!(is_cjk('\u{3007}'));
        assert!(is_cjk('㐀'));
        assert!(is_cjk('あ'));
        assert!(is_cjk('ア'));
        assert!(is_cjk('㆒'));
        assert!(is_cjk('ㆣ'));
        assert!(is_cjk('龥'));
        assert!(is_cjk('𰾑'));
        assert!(is_cjk('𰻝'));

        // Test characters from outside CJK blocks
        assert!(!is_cjk('a'));
        assert!(!is_cjk('🙂'));
        assert!(!is_cjk('©'));
    }
}