servo/components/script_bindings/domstring.rs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */

#![allow(clippy::non_canonical_partial_ord_impl)]
use std::borrow::{Cow, ToOwned};
use std::cell::{Ref, RefCell, RefMut};
use std::default::Default;
use std::ops::Deref;
use std::ptr::{self, NonNull};
use std::str::FromStr;
use std::sync::LazyLock;
use std::{fmt, slice, str};

use base::text::{Utf8CodeUnitLength, Utf16CodeUnitLength};
use html5ever::{LocalName, Namespace};
use js::conversions::{ToJSValConvertible, jsstr_to_string};
use js::gc::MutableHandleValue;
use js::jsapi::{Heap, JS_GetLatin1StringCharsAndLength, JSContext, JSString};
use js::jsval::StringValue;
use js::rust::{Runtime, Trace};
use malloc_size_of::MallocSizeOfOps;
use num_traits::{ToPrimitive, Zero};
use regex::Regex;
use style::Atom;
use style::str::HTML_SPACE_CHARACTERS;

use crate::script_runtime::JSContext as SafeJSContext;
use crate::trace::RootedTraceableBox;

/// The highest ASCII code point (U+007F DELETE). Every Latin1 byte up to and including
/// this value is encoded as the very same single byte in UTF-8; every byte above it
/// needs two UTF-8 bytes.
///
/// This used to be `0x7E`, which made a Latin1 DELETE byte count as two UTF-8 bytes.
const ASCII_END: u8 = 0x7F;
/// `A`, the first ASCII uppercase letter.
const ASCII_CAPITAL_A: u8 = 0x41;
/// `Z`, the last ASCII uppercase letter.
const ASCII_CAPITAL_Z: u8 = 0x5A;
/// `a`, the first ASCII lowercase letter.
const ASCII_LOWERCASE_A: u8 = 0x61;
/// `z`, the last ASCII lowercase letter.
const ASCII_LOWERCASE_Z: u8 = 0x7A;
/// U+0009 CHARACTER TABULATION.
const ASCII_TAB: u8 = 0x09;
/// U+000A LINE FEED.
const ASCII_NEWLINE: u8 = 0x0A;
/// U+000C FORM FEED.
const ASCII_FORMFEED: u8 = 0x0C;
/// U+000D CARRIAGE RETURN.
const ASCII_CR: u8 = 0x0D;
/// U+0020 SPACE.
const ASCII_SPACE: u8 = 0x20;

/// Borrows the Latin1 character data of a JS string directly from the JS engine.
///
/// # Safety
///
/// The `*mut JSString` stored in `rooted_traceable_box` must not be null, and the
/// string must actually have Latin1 characters.
///
/// # Panics
///
/// Panics if the JS runtime is no longer available or if the engine returns a null
/// character pointer.
unsafe fn get_latin1_string_bytes(
    rooted_traceable_box: &RootedTraceableBox<Heap<*mut JSString>>,
) -> &[u8] {
    debug_assert!(!rooted_traceable_box.get().is_null());

    let runtime = Runtime::get().expect("JS runtime has shut down");
    let mut length = 0;
    // SAFETY: The caller guarantees that the string pointer is non-null.
    let chars = unsafe {
        JS_GetLatin1StringCharsAndLength(
            runtime.as_ptr(),
            ptr::null(),
            rooted_traceable_box.get(),
            &mut length,
        )
    };
    assert!(!chars.is_null());

    // SAFETY: `chars` was just checked to be non-null and `length` was filled in by the
    // same engine call that produced `chars`.
    unsafe { slice::from_raw_parts(chars, length) }
}

/// A type representing the underlying encoded bytes of a [`DOMString`].
#[derive(Debug)]
pub enum EncodedBytes<'a> {
    /// These bytes are Latin1 encoded.
    Latin1(Ref<'a, [u8]>),
    /// These bytes are UTF-8 encoded.
    Utf8(Ref<'a, [u8]>),
}

impl EncodedBytes<'_> {
    /// Return a reference to the raw bytes of this [`EncodedBytes`] without any information about
    /// the underlying encoding.
    pub fn bytes(&self) -> &[u8] {
        match self {
            Self::Latin1(bytes) | Self::Utf8(bytes) => bytes,
        }
    }

    /// Return the number of bytes these contents occupy once encoded as UTF-8.
    ///
    /// For UTF-8 data this is simply the number of stored bytes. For Latin1 data every
    /// ASCII byte (`0x00..=0x7F`) counts as one byte and every other byte as two, which
    /// is the size of its UTF-8 encoding.
    pub fn len(&self) -> usize {
        match self {
            Self::Latin1(bytes) => {
                // `is_ascii` covers the whole ASCII range including 0x7F (DELETE), which
                // is a single byte in UTF-8 as well.
                let two_byte_characters = bytes.iter().filter(|byte| !byte.is_ascii()).count();
                bytes.len() + two_byte_characters
            },
            Self::Utf8(bytes) => bytes.len(),
        }
    }

    /// Return whether or not there is any data in this collection of bytes.
    pub fn is_empty(&self) -> bool {
        self.bytes().is_empty()
    }
}

/// The internal storage of a `DOMString`: either an owned Rust string or a handle to
/// a string that still lives inside the JS engine.
enum DOMStringType {
    /// A simple Rust string (UTF-8).
    Rust(String),
    /// A JS string stored in mozjs. `DOMString::from_js_string` only creates this variant
    /// for strings that the engine reports as having Latin1 characters.
    JSString(RootedTraceableBox<Heap<*mut JSString>>),
    #[cfg(test)]
    /// This is used for testing of the bindings to give
    /// a raw u8 Latin1 encoded string without having a JS engine.
    Latin1Vec(Vec<u8>),
}

impl Default for DOMStringType {
    fn default() -> Self {
        Self::Rust(Default::default())
    }
}

impl DOMStringType {
    /// Returns the raw bytes backing this string.
    ///
    /// Warning:
    /// This function does no checking and just returns the raw bytes of the string,
    /// regardless of whether they are UTF-8 (the `Rust` variant) or Latin1 (all other
    /// variants). The caller needs to take care that these make sense in context.
    fn as_raw_bytes(&self) -> &[u8] {
        match self {
            DOMStringType::Rust(s) => s.as_bytes(),
            DOMStringType::JSString(rooted_traceable_box) => unsafe {
                get_latin1_string_bytes(rooted_traceable_box)
            },
            #[cfg(test)]
            DOMStringType::Latin1Vec(items) => items,
        }
    }

    /// Converts the storage into the `Rust` variant (if it is not already) and returns a
    /// mutable reference to the contained string.
    ///
    /// Panics if the JS runtime is unavailable or the stored JS string pointer is null.
    fn ensure_rust_string(&mut self) -> &mut String {
        let new_string = match self {
            // Already a Rust string: nothing to convert.
            DOMStringType::Rust(string) => return string,
            DOMStringType::JSString(rooted_traceable_box) => unsafe {
                jsstr_to_string(
                    Runtime::get().expect("JS runtime has shut down").as_ptr(),
                    NonNull::new(rooted_traceable_box.get()).unwrap(),
                )
            },
            #[cfg(test)]
            DOMStringType::Latin1Vec(items) => {
                // Every Latin1 byte needs at most two bytes in UTF-8.
                let mut v = vec![0; items.len() * 2];
                let real_size =
                    encoding_rs::mem::convert_latin1_to_utf8(items.as_slice(), v.as_mut_slice());
                v.truncate(real_size);

                // Safety: convert_latin1_to_utf8 converts the raw bytes to utf8 and the
                // buffer is the size specified in the documentation, so this should be safe.
                unsafe { String::from_utf8_unchecked(v) }
            },
        };
        // Replace the storage, then recurse once: the call now hits the `Rust` arm above
        // and hands out the reference.
        *self = DOMStringType::Rust(new_string);
        self.ensure_rust_string()
    }
}

/// A reference to a Rust `str` of UTF-8 encoded bytes, used to get a Rust
/// string from a [`DOMString`].
///
/// The view wraps a [`Ref`], so the originating `RefCell` stays immutably borrowed
/// for as long as the view is alive.
#[derive(Debug)]
pub struct StringView<'a>(Ref<'a, str>);

impl StringView<'_> {
    /// Splits the string at every character in `HTML_SPACE_CHARACTERS` and yields only
    /// the non-empty pieces.
    pub fn split_html_space_characters(&self) -> impl Iterator<Item = &str> {
        let pieces = self.split(HTML_SPACE_CHARACTERS);
        pieces.filter(|piece| !piece.is_empty())
    }
}

impl From<StringView<'_>> for String {
    /// Copies the viewed string data into a new owned `String`.
    fn from(string_view: StringView<'_>) -> Self {
        let borrowed: &str = &string_view.0;
        borrowed.to_string()
    }
}

impl Deref for StringView<'_> {
    type Target = str;

    /// Gives access to the viewed `str`.
    fn deref(&self) -> &str {
        &*self.0
    }
}

impl AsRef<str> for StringView<'_> {
    /// Gives access to the viewed `str`.
    fn as_ref(&self) -> &str {
        &*self.0
    }
}

impl PartialEq for StringView<'_> {
    /// Two views are equal when the strings they point at are equal.
    fn eq(&self, other: &Self) -> bool {
        *self.0 == *other.0
    }
}

impl PartialEq<&str> for StringView<'_> {
    /// Compares the viewed string with a plain string slice.
    fn eq(&self, other: &&str) -> bool {
        let viewed: &str = &self.0;
        viewed == *other
    }
}

// Equality on `StringView` is plain `str` equality (see the `PartialEq` impl).
impl Eq for StringView<'_> {}

impl PartialOrd for StringView<'_> {
    /// Orders two views by comparing the strings they point at.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let (ours, theirs): (&str, &str) = (&self.0, &other.0);
        ours.partial_cmp(theirs)
    }
}

impl Ord for StringView<'_> {
    /// Orders two views by comparing the strings they point at.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let (ours, theirs): (&str, &str) = (&self.0, &other.0);
        Ord::cmp(ours, theirs)
    }
}

/// Safety comment:
///
/// This method will _not_ trace the pointer if the rust string exists.
/// The js string could be garbage collected and, hence, violating this
/// could lead to undefined behavior
unsafe impl Trace for DOMStringType {
    unsafe fn trace(&self, tracer: *mut js::jsapi::JSTracer) {
        match self {
            // Only the JS-backed variant holds something the tracer has to visit.
            DOMStringType::JSString(js_string) => unsafe { js_string.trace(tracer) },
            DOMStringType::Rust(_) => {},
            #[cfg(test)]
            DOMStringType::Latin1Vec(_) => {},
        }
    }
}

impl malloc_size_of::MallocSizeOf for DOMStringType {
    /// Reports the heap size of the Rust-owned storage. JS strings are reported as zero
    /// because they are managed by the JS engine.
    fn size_of(&self, ops: &mut MallocSizeOfOps) -> usize {
        match self {
            // Managed by JS Engine
            DOMStringType::JSString(_) => 0,
            DOMStringType::Rust(string) => malloc_size_of::MallocSizeOf::size_of(string, ops),
            #[cfg(test)]
            DOMStringType::Latin1Vec(bytes) => malloc_size_of::MallocSizeOf::size_of(bytes, ops),
        }
    }
}

impl std::fmt::Debug for DOMStringType {
    /// Formats as a `DOMString` struct. The Rust and (test-only) Latin1 variants show
    /// their contents; a JS-backed string is printed without any field.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut debug_struct = f.debug_struct("DOMString");
        match self {
            DOMStringType::Rust(string) => {
                debug_struct.field("rust_string", string);
            },
            DOMStringType::JSString(_) => {},
            #[cfg(test)]
            DOMStringType::Latin1Vec(bytes) => {
                debug_struct.field("latin1_string", bytes);
            },
        }
        debug_struct.finish()
    }
}

/// A DOMString.
///
/// This type corresponds to the [`DOMString`] type in WebIDL.
///
/// [`DOMString`]: https://webidl.spec.whatwg.org/#idl-DOMString
///
/// Conceptually, a DOMString has the same value space as a JavaScript String,
/// i.e., an array of 16-bit *code units* representing UTF-16, potentially with
/// unpaired surrogates present (also sometimes called WTF-16).
///
/// However, Rust `String`s are guaranteed to be valid UTF-8, and as such have
/// a *smaller value space* than WTF-16 (i.e., some JavaScript String values
/// can not be represented as a Rust `String`). This introduces the question of
/// what to do with values being passed from JavaScript to Rust that contain
/// unpaired surrogates.
///
/// The hypothesis is that it does not matter much how exactly those values are
/// transformed, because passing unpaired surrogates into the DOM is very rare.
/// Instead Servo will replace the unpaired surrogate by a U+FFFD replacement
/// character.
///
/// Currently, the lack of crash reports about this issue provides some
/// evidence to support the hypothesis. This evidence will hopefully be used to
/// convince other browser vendors that it would be safe to replace unpaired
/// surrogates at the boundary between JavaScript and native code. (This would
/// unify the `DOMString` and `USVString` types, both in the WebIDL standard
/// and in Servo.)
///
/// This string class will keep either the reference to the mozjs object alive
/// or will have an internal Rust string.
/// We currently default to doing most of the string operations on the Rust side.
/// You should use `str()` to get the Rust string (represented by a `StringView`
/// which you can deref to a `&str`). You should assume that this conversion is
/// expensive. For now, you should assume that all the functions incur this
/// conversion cost.
#[repr(transparent)]
#[derive(Debug, Default, MallocSizeOf, JSTraceable)]
pub struct DOMString(RefCell<DOMStringType>);

impl Clone for DOMString {
    fn clone(&self) -> Self {
        self.ensure_rust_string().clone().into()
    }
}

/// Errors that can occur while creating a [`DOMString`].
pub enum DOMStringErrorType {
    /// Converting the JS value to a JS string failed (see `DOMString::from_js_string`).
    JSConversionError,
}

impl DOMString {
    /// Creates a new `DOMString`.
    pub fn new() -> DOMString {
        Default::default()
    }

    /// Creates the string from js. If the string can be encoded in latin1, just take the reference
    /// to the JSString. Otherwise do the conversion to utf8 now.
    pub fn from_js_string(
        cx: SafeJSContext,
        value: js::gc::HandleValue,
    ) -> Result<DOMString, DOMStringErrorType> {
        let string_ptr = unsafe { js::rust::ToString(*cx, value) };
        if string_ptr.is_null() {
            debug!("ToString failed");
            Err(DOMStringErrorType::JSConversionError)
        } else {
            let latin1 = unsafe { js::jsapi::JS_DeprecatedStringHasLatin1Chars(string_ptr) };
            let inner = if latin1 {
                let h = RootedTraceableBox::from_box(Heap::boxed(string_ptr));
                DOMStringType::JSString(h)
            } else {
                // We need to convert the string anyway as it is not just latin1
                DOMStringType::Rust(unsafe {
                    jsstr_to_string(*cx, ptr::NonNull::new(string_ptr).unwrap())
                })
            };
            Ok(DOMString(RefCell::new(inner)))
        }
    }

    /// Transforms the internal storage of this [`DOMString`] into a Rust string if it is not
    /// yet one. This will make a copy of the underlying string data.
    fn ensure_rust_string(&self) -> RefMut<'_, String> {
        let inner = self.0.borrow_mut();
        RefMut::map(inner, |inner| inner.ensure_rust_string())
    }

    /// Logs the current state of the string without modifying it.
    #[expect(unused)]
    fn debug_js(&self) {
        let inner = self.0.borrow();
        match &*inner {
            DOMStringType::Rust(s) => info!("Rust String ({})", s),
            DOMStringType::JSString(rooted_traceable_box) => {
                let runtime = Runtime::get().expect("JS runtime has shut down");
                let js_string = ptr::NonNull::new(rooted_traceable_box.get()).unwrap();
                // Convert into a temporary only; the stored JS string stays untouched.
                let s = unsafe { jsstr_to_string(runtime.as_ptr(), js_string) };
                info!("JSString ({})", s);
            },
            #[cfg(test)]
            DOMStringType::Latin1Vec(items) => info!("Latin1 string"),
        }
    }

    /// Returns a view of the underlying Rust string, converting the internal storage
    /// into a Rust string first if necessary.
    pub fn str(&self) -> StringView<'_> {
        // The shared borrow taken for this check ends with the statement, so the
        // conversion below can borrow mutably.
        let needs_conversion = !matches!(*self.0.borrow(), DOMStringType::Rust(..));
        if needs_conversion {
            drop(self.ensure_rust_string());
        }

        StringView(Ref::map(self.0.borrow(), |inner| match inner {
            DOMStringType::Rust(string) => string.as_str(),
            _ => unreachable!("Guaranteed by condition above"),
        }))
    }

    /// Return the [`EncodedBytes`] of this [`DOMString`]. This returns the original encoded
    /// bytes of the string without doing any conversions.
    pub fn encoded_bytes(&self) -> EncodedBytes<'_> {
        let inner = self.0.borrow();
        match &*inner {
            DOMStringType::Rust(..) => {
                EncodedBytes::Utf8(Ref::map(inner, |inner| inner.as_raw_bytes()))
            },
            _ => EncodedBytes::Latin1(Ref::map(inner, |inner| inner.as_raw_bytes())),
        }
    }

    /// Replaces the contents with an empty Rust string.
    pub fn clear(&mut self) {
        let mut inner = self.0.borrow_mut();
        *inner = DOMStringType::Rust(String::new());
    }

    /// Returns whether the string contains no data. No conversion is performed.
    pub fn is_empty(&self) -> bool {
        let encoded = self.encoded_bytes();
        encoded.is_empty()
    }

    /// The length of this string in UTF-8 code units, each one being one byte in size.
    ///
    /// Note: This is different than the number of Unicode characters (or code points). A
    /// character may require multiple UTF-8 code units.
    pub fn len(&self) -> usize {
        let encoded = self.encoded_bytes();
        encoded.len()
    }

    /// The length of this string in UTF-8 code units, each one being one byte in size.
    /// This is the same value as [`DOMString::len`], wrapped in a `Utf8CodeUnitLength`
    /// for code that mixes different kinds of offsets.
    ///
    /// Note: This is different than the number of Unicode characters (or code points). A
    /// character may require multiple UTF-8 code units.
    pub fn len_utf8(&self) -> Utf8CodeUnitLength {
        let length = self.len();
        Utf8CodeUnitLength(length)
    }

    /// The length of this string in UTF-16 code units, each one being two bytes in size.
    ///
    /// Note: This is different than the number of Unicode characters (or code points). A
    /// character may require multiple UTF-16 code units.
    pub fn len_utf16(&self) -> Utf16CodeUnitLength {
        let code_units = self.str().encode_utf16().count();
        Utf16CodeUnitLength(code_units)
    }

    /// Converts all ASCII uppercase letters to lowercase in place. The storage is
    /// converted to a Rust string first if necessary.
    pub fn make_ascii_lowercase(&mut self) {
        let mut inner = self.0.borrow_mut();
        let string = inner.ensure_rust_string();
        string.make_ascii_lowercase();
    }

    /// Appends `string_to_push` to the end of this string. The storage is converted to a
    /// Rust string first if necessary.
    pub fn push_str(&mut self, string_to_push: &str) {
        let mut inner = self.0.borrow_mut();
        let string = inner.ensure_rust_string();
        string.push_str(string_to_push);
    }

    /// Removes all ASCII whitespace (as defined by `char::is_ascii_whitespace`) from both
    /// ends of the string, in place. An empty string is left untouched and not converted.
    pub fn strip_leading_and_trailing_ascii_whitespace(&mut self) {
        if self.is_empty() {
            return;
        }

        let is_whitespace = |character: char| character.is_ascii_whitespace();
        let mut inner = self.0.borrow_mut();
        let string = inner.ensure_rust_string();

        // Cut the tail first so that the front removal moves as few bytes as possible.
        let length_without_tail = string.trim_end_matches(is_whitespace).len();
        string.truncate(length_without_tail);

        let leading_whitespace_len = string.len() - string.trim_start_matches(is_whitespace).len();
        string.replace_range(..leading_whitespace_len, "");
    }

    /// Returns whether this string has the syntax of a floating-point number (optional
    /// `-`, digits with an optional fraction or a bare fraction, optional exponent) and
    /// additionally parses successfully via
    /// [`DOMString::parse_floating_point_number`].
    pub fn is_valid_floating_point_number_string(&self) -> bool {
        static RE: LazyLock<Regex> = LazyLock::new(|| {
            Regex::new(r"^-?(?:\d+\.\d+|\d+|\.\d+)(?:(e|E)(\+|\-)?\d+)?$").unwrap()
        });

        // The mutable borrow ends with this statement, before the string is borrowed
        // again for parsing.
        let has_valid_syntax = RE.is_match(&self.ensure_rust_string());
        if !has_valid_syntax {
            return false;
        }
        self.parse_floating_point_number().is_some()
    }

    /// Parses the string into a `T` using `T`'s [`FromStr`] implementation.
    pub fn parse<T: FromStr>(&self) -> Result<T, <T as FromStr>::Err> {
        let view = self.str();
        T::from_str(&view)
    }

    /// Parses this string with the free function `parse_floating_point_number`.
    ///
    /// <https://html.spec.whatwg.org/multipage/#rules-for-parsing-floating-point-number-values>
    pub fn parse_floating_point_number(&self) -> Option<f64> {
        let view = self.str();
        parse_floating_point_number(&view)
    }

    /// Replaces the string with the textual form of the number it parses to. A string
    /// that does not parse as a floating-point number is left unchanged.
    ///
    /// <https://html.spec.whatwg.org/multipage/#best-representation-of-the-number-as-a-floating-point-number>
    pub fn set_best_representation_of_the_floating_point_number(&mut self) {
        let Some(value) = self.parse_floating_point_number() else {
            return;
        };

        // [tc39] Step 2: If x is either +0 or -0, return "0".
        let normalized = if value.is_zero() { 0.0_f64 } else { value };
        let mut inner = self.0.borrow_mut();
        *inner = DOMStringType::Rust(normalized.to_string());
    }

    /// Returns a lowercased copy of the string (see `str::to_lowercase`).
    pub fn to_lowercase(&self) -> String {
        let view = self.str();
        view.to_lowercase()
    }

    /// Returns an uppercased copy of the string (see `str::to_uppercase`).
    pub fn to_uppercase(&self) -> String {
        let view = self.str();
        view.to_uppercase()
    }

    /// Removes every line feed and carriage return from the string, in place.
    pub fn strip_newlines(&mut self) {
        // > To strip newlines from a string, remove any U+000A LF and U+000D CR code
        // > points from the string.
        let mut inner = self.0.borrow_mut();
        inner
            .ensure_rust_string()
            .retain(|character| !matches!(character, '\r' | '\n'));
    }

    /// Normalize newlines according to <https://infra.spec.whatwg.org/#normalize-newlines>.
    pub fn normalize_newlines(&mut self) {
        // > To normalize newlines in a string, replace every U+000D CR U+000A LF code point
        // > pair with a single U+000A LF code point, and then replace every remaining
        // > U+000D CR code point with a U+000A LF code point.
        let mut inner = self.0.borrow_mut();
        let string = inner.ensure_rust_string();
        let without_crlf_pairs = string.replace("\r\n", "\n");
        *string = without_crlf_pairs.replace('\r', "\n");
    }

    /// Consumes this string and returns a new `DOMString` in which every occurrence of
    /// `needle` has been replaced by `replace_char` (see `str::replace`).
    pub fn replace(self, needle: &str, replace_char: &str) -> DOMString {
        // `str::replace` already allocates the resulting string, so there is no need to
        // copy the source into a temporary `String` beforehand.
        let replaced = self.str().replace(needle, replace_char);
        DOMString(RefCell::new(DOMStringType::Rust(replaced)))
    }

    /// Returns whether the string starts with the character `c`.
    ///
    /// `Pattern` is not yet stable in Rust, hence there are separate methods for `char`
    /// and `str` (see [`DOMString::starts_with_str`]).
    pub fn starts_with(&self, c: char) -> bool {
        if c.is_ascii() {
            // As this is an ASCII character, it is guaranteed to be a single byte, no matter
            // if the underlying encoding is UTF-8 or Latin1.
            let expected = c as u8;
            self.encoded_bytes().bytes().first() == Some(&expected)
        } else {
            self.str().starts_with(c)
        }
    }

    /// Returns whether the string starts with `needle`.
    pub fn starts_with_str(&self, needle: &str) -> bool {
        let view = self.str();
        view.starts_with(needle)
    }

    /// Returns whether `needle` occurs anywhere in the string.
    pub fn contains(&self, needle: &str) -> bool {
        let view = self.str();
        view.contains(needle)
    }

    pub fn to_ascii_lowercase(&self) -> String {
        let conversion = match self.encoded_bytes() {
            EncodedBytes::Latin1(bytes) => {
                if bytes.iter().all(|c| *c <= ASCII_END) {
                    // We are just simple ascii
                    Some(unsafe {
                        String::from_utf8_unchecked(
                            bytes
                                .iter()
                                .map(|c| {
                                    if *c >= ASCII_CAPITAL_A && *c <= ASCII_CAPITAL_Z {
                                        c + 32
                                    } else {
                                        *c
                                    }
                                })
                                .collect(),
                        )
                    })
                } else {
                    None
                }
            },
            EncodedBytes::Utf8(bytes) => unsafe {
                // Save because we know it was a utf8 string
                Some(str::from_utf8_unchecked(&bytes).to_ascii_lowercase())
            },
        };
        // We otherwise would double borrow the refcell
        if let Some(conversion) = conversion {
            conversion
        } else {
            self.str().to_ascii_lowercase()
        }
    }

    /// Returns whether the string contains any of the given characters.
    ///
    /// `latin1_characters` is consulted when the string is stored as Latin1 and
    /// `utf8_characters` when it is stored as UTF-8; callers pass the same set of
    /// characters in both forms.
    fn contains_space_characters(
        &self,
        latin1_characters: &'static [u8],
        utf8_characters: &'static [char],
    ) -> bool {
        match self.encoded_bytes() {
            EncodedBytes::Latin1(bytes) => bytes
                .iter()
                .any(|byte| latin1_characters.contains(byte)),
            EncodedBytes::Utf8(bytes) => {
                // SAFETY: We know it was a UTF-8 string.
                let string = unsafe { str::from_utf8_unchecked(&bytes) };
                string.contains(utf8_characters)
            },
        }
    }

    /// Returns whether the string contains a tab, line feed or carriage return.
    ///
    /// <https://infra.spec.whatwg.org/#ascii-tab-or-newline>
    pub fn contains_tab_or_newline(&self) -> bool {
        const TAB_OR_NEWLINE_BYTES: &[u8] = &[ASCII_TAB, ASCII_NEWLINE, ASCII_CR];
        const TAB_OR_NEWLINE_CHARS: &[char] = &['\t', '\n', '\r'];

        self.contains_space_characters(TAB_OR_NEWLINE_BYTES, TAB_OR_NEWLINE_CHARS)
    }

    /// Returns whether the string contains a tab, line feed, form feed, carriage return
    /// or space.
    ///
    /// <https://infra.spec.whatwg.org/#ascii-whitespace>
    pub fn contains_html_space_characters(&self) -> bool {
        const SPACE_BYTES: &[u8] = &[
            ASCII_TAB,
            ASCII_NEWLINE,
            ASCII_FORMFEED,
            ASCII_CR,
            ASCII_SPACE,
        ];

        self.contains_space_characters(SPACE_BYTES, HTML_SPACE_CHARACTERS)
    }

    /// This returns the string in utf8 bytes, i.e., `[u8]` encoded with utf8.
    pub fn as_bytes(&self) -> BytesView<'_> {
        // BytesView will just give the raw bytes on dereference.
        // If we are ascii this is the same for latin1 and utf8.
        // Otherwise we convert to rust.
        if !self.is_ascii() {
            drop(self.ensure_rust_string());
        }
        BytesView(self.0.borrow())
    }

    /// Tests whether every byte of the string is an ASCII lowercase letter (`a`-`z`).
    /// Digits, punctuation and any other characters make this return `false`; an empty
    /// string yields `true`.
    pub fn is_ascii_lowercase(&self) -> bool {
        let in_lowercase_range =
            |byte: u8| (ASCII_LOWERCASE_A..=ASCII_LOWERCASE_Z).contains(&byte);

        match self.encoded_bytes() {
            EncodedBytes::Latin1(bytes) => bytes.iter().copied().all(in_lowercase_range),
            EncodedBytes::Utf8(bytes) => bytes
                .iter()
                .all(|byte| byte.to_u8().is_some_and(in_lowercase_range)),
        }
    }

    /// Returns whether the string consists only of ASCII characters. No conversion is
    /// performed; the stored bytes are inspected directly.
    pub fn is_ascii(&self) -> bool {
        let encoded = self.encoded_bytes();
        encoded.bytes().is_ascii()
    }

    /// Returns true if the slice only contains bytes that are safe to use in cookie strings.
    /// <https://www.ietf.org/archive/id/draft-ietf-httpbis-rfc6265bis-15.html#section-5.6-6>
    /// Not using ServoCookie::is_valid_name_or_value to prevent dependency on the net crate.
    pub fn is_valid_for_cookie(&self) -> bool {
        match self.encoded_bytes() {
            EncodedBytes::Latin1(items) | EncodedBytes::Utf8(items) => !items
                .iter()
                .any(|c| *c == 0x7f || (*c <= 0x1f && *c != 0x09)),
        }
    }

    /// Call the callback with a `&str` reference of the string stored in this [`DOMString`].
    ///
    /// UTF-8 data and all-ASCII Latin1 data are handed to the callback as stored. Any
    /// other Latin1 data is converted to a Rust string first.
    fn with_str_reference<Result>(&self, callback: fn(&str) -> Result) -> Result {
        {
            let encoded = self.encoded_bytes();
            let is_valid_utf8 = match &encoded {
                EncodedBytes::Utf8(_) => true,
                // If the Latin1 string is all ASCII bytes, then it is safe to interpret it
                // as UTF-8.
                EncodedBytes::Latin1(latin1_bytes) => latin1_bytes.is_ascii(),
            };
            if is_valid_utf8 {
                // SAFETY: These are either the bytes of a UTF-8 string already or only
                // ASCII bytes, so they can be interpreted as UTF-8.
                return callback(unsafe { str::from_utf8_unchecked(encoded.bytes()) });
            }
        }

        callback(self.str().deref())
    }
}

/// Parses `input` as a floating-point number, ignoring surrounding whitespace.
///
/// A valid number is the same as what Rust considers to be valid, except that a leading
/// `+`, a trailing `.`, NaN and infinite values are rejected. Those checks are applied
/// to the same trimmed text that is parsed, so `" +1"` and `"1. "` are rejected exactly
/// like `"+1"` and `"1."`.
///
/// Returns `None` if the input is not such a number.
///
/// <https://html.spec.whatwg.org/multipage/#rules-for-parsing-floating-point-number-values>
pub fn parse_floating_point_number(input: &str) -> Option<f64> {
    // Steps 15-16 are telling us things about IEEE rounding modes
    // for floating-point significands; this code assumes the Rust
    // compiler already matches them in any cases where
    // that actually matters. They are not
    // related to f64::round(), which is for rounding to integers.
    let trimmed = input.trim();
    if trimmed.starts_with('+') || trimmed.ends_with('.') {
        return None;
    }

    trimmed
        .parse::<f64>()
        .ok()
        .filter(|value| value.is_finite())
}

/// A borrowed view of the bytes of a [`DOMString`], created by `DOMString::as_bytes`.
/// It keeps the string's internal `RefCell` immutably borrowed while it is alive.
pub struct BytesView<'a>(Ref<'a, DOMStringType>);

impl Deref for BytesView<'_> {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        // This does the correct thing by the construction of BytesView in `DOMString::as_bytes`.
        self.0.as_raw_bytes()
    }
}

impl Ord for DOMString {
    /// Orders two strings by comparing their Rust string representations. Both strings
    /// are converted to Rust strings if necessary.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let ours = self.str();
        let theirs = other.str();
        ours.cmp(&theirs)
    }
}

impl PartialOrd for DOMString {
    /// Orders two strings by comparing their Rust string representations. Both strings
    /// are converted to Rust strings if necessary.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let ours = self.str();
        let theirs = other.str();
        ours.partial_cmp(&theirs)
    }
}

impl Extend<char> for DOMString {
    /// Appends every character produced by `iter`. The storage is converted to a Rust
    /// string first if necessary.
    fn extend<T: IntoIterator<Item = char>>(&mut self, iter: T) {
        let mut inner = self.0.borrow_mut();
        let string = inner.ensure_rust_string();
        string.extend(iter)
    }
}

impl ToJSValConvertible for DOMString {
    /// Writes this string into `rval` as a JS value. A JS-backed string hands back the
    /// stored `JSString`; a Rust string goes through `String`'s own conversion.
    unsafe fn to_jsval(&self, cx: *mut JSContext, mut rval: MutableHandleValue) {
        let inner = self.0.borrow();
        match &*inner {
            DOMStringType::Rust(string) => unsafe {
                string.to_jsval(cx, rval);
            },
            DOMStringType::JSString(js_string) => unsafe {
                rval.set(StringValue(&*js_string.get()));
            },
            #[cfg(test)]
            DOMStringType::Latin1Vec(latin1_bytes) => {
                // Every Latin1 byte needs at most two bytes in UTF-8.
                let mut utf8_bytes = vec![0; latin1_bytes.len() * 2];
                let real_size = encoding_rs::mem::convert_latin1_to_utf8(
                    latin1_bytes.as_slice(),
                    utf8_bytes.as_mut_slice(),
                );
                utf8_bytes.truncate(real_size);

                String::from_utf8(utf8_bytes)
                    .expect("Error in constructin test string")
                    .to_jsval(cx, rval);
            },
        }
    }
}

impl std::hash::Hash for DOMString {
    /// Hashes the Rust string representation, converting the storage if necessary.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        let view = self.str();
        std::hash::Hash::hash(&*view, state);
    }
}

impl std::fmt::Display for DOMString {
    /// Formats the string exactly like the `str` it represents.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let view = self.str();
        <str as fmt::Display>::fmt(&view, f)
    }
}

impl std::cmp::PartialEq<str> for DOMString {
    fn eq(&self, other: &str) -> bool {
        if other.is_ascii() {
            *other.as_bytes() == *self.encoded_bytes().bytes()
        } else {
            self.str().deref() == other
        }
    }
}

impl std::cmp::PartialEq<&str> for DOMString {
    fn eq(&self, other: &&str) -> bool {
        self.eq(*other)
    }
}

impl std::cmp::PartialEq<String> for DOMString {
    fn eq(&self, other: &String) -> bool {
        self.eq(other.as_str())
    }
}

impl std::cmp::PartialEq<DOMString> for String {
    /// Mirror of `DOMString == String`.
    fn eq(&self, other: &DOMString) -> bool {
        <DOMString as PartialEq<String>>::eq(other, self)
    }
}

impl std::cmp::PartialEq<DOMString> for str {
    /// Mirror of `DOMString == str`.
    fn eq(&self, other: &DOMString) -> bool {
        <DOMString as PartialEq<str>>::eq(other, self)
    }
}

impl std::cmp::PartialEq for DOMString {
    /// Compares two strings.
    ///
    /// The stored bytes are compared directly when both sides use the same encoding, or
    /// when the UTF-8 side of a mixed pair is pure ASCII. Otherwise both sides are
    /// converted to Rust strings and those are compared.
    fn eq(&self, other: &DOMString) -> bool {
        let byte_comparison = {
            let ours = self.encoded_bytes();
            let theirs = other.encoded_bytes();
            let bytes_are_comparable = match (&ours, &theirs) {
                (EncodedBytes::Latin1(_), EncodedBytes::Latin1(_)) |
                (EncodedBytes::Utf8(_), EncodedBytes::Utf8(_)) => true,
                (EncodedBytes::Latin1(_), EncodedBytes::Utf8(utf8_bytes)) |
                (EncodedBytes::Utf8(utf8_bytes), EncodedBytes::Latin1(_)) => utf8_bytes.is_ascii(),
            };
            if bytes_are_comparable {
                Some(ours.bytes() == theirs.bytes())
            } else {
                None
            }
        };

        match byte_comparison {
            Some(is_equal) => is_equal,
            // The borrows of the encoded bytes have ended here, so the strings can be
            // converted.
            None => *self.str() == *other.str(),
        }
    }
}

// Equality on `DOMString` compares string contents (see the `PartialEq` impl).
impl std::cmp::Eq for DOMString {}

impl From<std::string::String> for DOMString {
    /// Wraps an owned Rust string without copying it.
    fn from(string: String) -> Self {
        let storage = DOMStringType::Rust(string);
        DOMString(RefCell::new(storage))
    }
}

impl From<&str> for DOMString {
    fn from(string: &str) -> Self {
        String::from(string).into()
    }
}

impl From<DOMString> for LocalName {
    /// Builds a `LocalName` from the string contents.
    fn from(dom_string: DOMString) -> LocalName {
        let to_local_name: fn(&str) -> LocalName = |string: &str| LocalName::from(string);
        dom_string.with_str_reference(to_local_name)
    }
}

impl From<&DOMString> for LocalName {
    /// Builds a `LocalName` from the string contents without consuming the string.
    fn from(dom_string: &DOMString) -> LocalName {
        let to_local_name: fn(&str) -> LocalName = |string: &str| LocalName::from(string);
        dom_string.with_str_reference(to_local_name)
    }
}

impl From<DOMString> for Namespace {
    /// Builds a `Namespace` from the string contents.
    fn from(dom_string: DOMString) -> Namespace {
        let to_namespace: fn(&str) -> Namespace = |string: &str| Namespace::from(string);
        dom_string.with_str_reference(to_namespace)
    }
}

impl From<DOMString> for Atom {
    /// Builds an `Atom` from the string contents.
    fn from(dom_string: DOMString) -> Atom {
        let to_atom: fn(&str) -> Atom = |string: &str| Atom::from(string);
        dom_string.with_str_reference(to_atom)
    }
}

impl From<DOMString> for String {
    /// Turns the `DOMString` into an owned Rust `String`.
    ///
    /// The `DOMString` is consumed, so a string that is already stored as a Rust string
    /// is moved out instead of being copied. Other storage kinds are converted exactly as
    /// `DOMString::str` would convert them.
    fn from(val: DOMString) -> Self {
        let mut storage = val.0.into_inner();
        // `ensure_rust_string` leaves `storage` in the `Rust` variant; take the string
        // out of it and let the (now empty) storage be dropped.
        std::mem::take(storage.ensure_rust_string())
    }
}

impl From<DOMString> for Vec<u8> {
    /// Returns the UTF-8 bytes of the string as an owned vector.
    fn from(value: DOMString) -> Self {
        let view = value.str();
        Vec::from(view.as_bytes())
    }
}

impl From<Cow<'_, str>> for DOMString {
    /// Takes ownership of the string data, copying it only if it was borrowed.
    fn from(value: Cow<'_, str>) -> Self {
        let owned: String = value.into_owned();
        DOMString(RefCell::new(DOMStringType::Rust(owned)))
    }
}

/// Implementation detail of [`match_domstring_ascii!`]: expands the list of arms into
/// a chain of `if`/`else` byte comparisons against `$input.bytes()`, ending in a
/// `match` on `$input` for the final pattern arm.
#[macro_export]
macro_rules! match_domstring_ascii_inner {
    // A string literal arm: compare the literal's bytes with the input's bytes and
    // recurse on the remaining arms if they differ.
    ($variant: expr, $input: expr, $ascii_literal: literal => $then: expr, $($rest:tt)*) => {
        if {
            debug_assert!(($ascii_literal).is_ascii());
            $ascii_literal.as_bytes()
        } == $input.bytes() {
          $then
        } else {
            $crate::match_domstring_ascii_inner!($variant, $input, $($rest)*)
        }

    };
    // The final arm: an arbitrary pattern (typically `_`) matched against the input.
    ($variant: expr, $input: expr, $p: pat => $then: expr,) => {
        match $input {
            $p => $then
        }
    }
}

/// Use this to match a `DOMString` against `&str` literals efficiently, without
/// converting the string.
/// You are only allowed to match ascii strings otherwise this macro will
/// lead to wrong results. The last arm must be a pattern (such as `_`) and every arm,
/// including the last one, must be followed by a comma.
/// ```ignore
/// let s = DOMString::from("test");
/// let value = match_domstring_ascii!(s,
/// "test1" => 1,
/// "test2" => 2,
/// "test" => 3,
/// _ => 4,
/// );
/// assert_eq!(value, 3);
/// ```
#[macro_export]
macro_rules! match_domstring_ascii {
    ($input:expr, $($tail:tt)*) => {
        {
            use $crate::domstring::EncodedBytes;

            // Borrow the stored bytes once; ASCII literals compare equal byte for byte
            // in both encodings.
            let encoded_bytes = $input.encoded_bytes();
            match encoded_bytes {
                EncodedBytes::Latin1(_) => {
                    $crate::match_domstring_ascii_inner!(EncodedBytes::Latin1, encoded_bytes, $($tail)*)
                }
                EncodedBytes::Utf8(_) => {
                    $crate::match_domstring_ascii_inner!(EncodedBytes::Utf8, encoded_bytes, $($tail)*)
                }

            }
        }
    };
}

#[cfg(test)]
mod tests {
    use super::*;

    // PILCROW SIGN (¶, U+00B6) as a single Latin-1 byte.
    const LATIN1_PILLCROW: u8 = 0xB6;
    // The same character in its two-byte UTF-8 encoding.
    const UTF8_PILLCROW: [u8; 2] = [0xC2, 0xB6];
    // SUPERSCRIPT TWO (², U+00B2) as a single Latin-1 byte.
    const LATIN1_POWER2: u8 = 0xB2;

    // Builds a `DOMString` whose backing store is the given Latin-1 byte vector.
    fn from_latin1(l1vec: Vec<u8>) -> DOMString {
        let backing = DOMStringType::Latin1Vec(l1vec);
        DOMString(RefCell::new(backing))
    }

    // Exercises the basic string helpers on a `DOMString` built from a `&str`.
    #[test]
    fn string_functions() {
        const MIXED_CASE: &str = "AbBcC❤&%$#";
        // Nine ASCII bytes plus the three-byte UTF-8 encoding of '❤'.
        const BYTE_LEN: usize = 12;

        let original = DOMString::from(MIXED_CASE);
        let duplicate = original.clone();

        // Lowercasing yields a new value; the source must still equal its clone.
        assert_eq!(original.to_ascii_lowercase(), "abbcc❤&%$#");
        assert_eq!(original, duplicate);

        assert_eq!(original.len(), BYTE_LEN);
        assert_eq!(duplicate.len(), BYTE_LEN);
        assert!(original.starts_with('A'));

        let empty = DOMString::from("");
        assert!(empty.is_empty());
    }

    // Exercises the basic string helpers on Latin-1 backed strings.
    #[test]
    fn string_functions_latin1() {
        // "AbBcC&%$#" followed by SUPERSCRIPT TWO as a single Latin-1 byte.
        let mixed_with_power2 = || {
            from_latin1(vec![
                b'A',
                b'b',
                b'B',
                b'c',
                b'C',
                b'&',
                b'%',
                b'$',
                b'#',
                LATIN1_POWER2,
            ])
        };

        let with_power2 = mixed_with_power2();
        assert_eq!(with_power2.to_ascii_lowercase(), "abbcc&%$#²");

        let ascii_only = from_latin1(vec![b'A', b'b', b'B', b'c', b'C']);
        assert_eq!(ascii_only.to_ascii_lowercase(), "abbcc");

        let measured = mixed_with_power2();
        // The expected length counts '²' as two bytes on top of the nine ASCII ones.
        assert_eq!(measured.len(), 11);
        assert!(measured.starts_with('A'));

        let empty = from_latin1(Vec::new());
        assert!(empty.is_empty());
    }

    // Checks that `len()` of a Latin-1 backed string equals the byte length of
    // the same text as a Rust `String`, for every byte in 0xA0..=0xFF, both
    // before and after `ensure_rust_string()` has been called.
    #[test]
    fn test_length() {
        // One row per 16-byte slice of the range, paired with the same
        // characters written out as UTF-8 text.
        let rows: [(std::ops::RangeInclusive<u8>, &str); 6] = [
            (0xA0..=0xAF, "\u{00A0}¡¢£¤¥¦§¨©ª«¬\u{00AD}®¯"),
            (0xB0..=0xBF, "°±²³´µ¶·¸¹º»¼½¾¿"),
            (0xC0..=0xCF, "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ"),
            (0xD0..=0xDF, "ÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß"),
            (0xE0..=0xEF, "àáâãäåæçèéêëìíîï"),
            (0xF0..=0xFF, "ðñòóôõö÷øùúûüýþÿ"),
        ];
        let strings = rows.map(|(latin1, utf8)| (from_latin1(latin1.collect()), utf8));

        // Lengths while the strings are still Latin-1 backed.
        for (dom_string, utf8) in &strings {
            assert_eq!(dom_string.len(), utf8.len());
        }

        for (dom_string, _) in &strings {
            dom_string.ensure_rust_string();
        }

        // Lengths must be unchanged after the conversion.
        for (dom_string, utf8) in &strings {
            assert_eq!(dom_string.len(), utf8.len());
        }
    }

    // An ASCII-only Latin-1 string must read back unchanged after conversion.
    #[test]
    fn test_convert() {
        let converted = from_latin1(b"abc%$".to_vec());
        converted.ensure_rust_string();
        assert_eq!(&*converted.str(), "abc%$");
    }

    // A Latin-1 backed string must compare equal to a `String`-backed
    // `DOMString` and to a plain `String` with the same contents.
    #[test]
    fn partial_eq() {
        let expected = String::from("abc%$");
        let latin1 = from_latin1(vec![b'a', b'b', b'c', b'%', b'$']);
        let from_string = DOMString::from(expected.clone());

        assert_eq!(latin1, from_string);
        assert_eq!(latin1, expected);
    }

    // `encoded_bytes()` of a Latin-1 backed string must return the `Latin1`
    // variant holding the original bytes.
    #[test]
    fn encoded_latin1_bytes() {
        let original_latin1_bytes = vec![b'a', b'b', b'c', b'%', b'$', LATIN1_POWER2];
        let dom_string = from_latin1(original_latin1_bytes.clone());

        let EncodedBytes::Latin1(string_latin1_bytes) = dom_string.encoded_bytes() else {
            unreachable!("Expected Latin1 encoded bytes");
        };
        assert_eq!(*original_latin1_bytes, *string_latin1_bytes);
    }

    // The view returned by `str()` must expose a Latin-1 backed string as the
    // equivalent UTF-8 text, both per character and per byte.
    #[test]
    fn testing_stringview() {
        let s = from_latin1(vec![b'a', b'b', b'c', b'%', b'$', LATIN1_POWER2]);

        let chars: Vec<char> = s.str().chars().collect();
        assert_eq!(chars, vec!['a', 'b', 'c', '%', '$', '²']);

        let expected = String::from("abc%$²");
        assert_eq!(s.str().as_bytes(), expected.as_bytes());
    }

    // Strings with the same contents must hash identically regardless of how
    // they are represented internally (Latin-1 bytes, converted Latin-1, or a
    // Rust string). The interior mutability is only used for the conversion
    // which is forced by `Hash`, so it is safe to have it here.
    #[test]
    fn test_hash() {
        use std::hash::{DefaultHasher, Hash, Hasher};

        let digest = |value: &DOMString| -> u64 {
            let mut state = DefaultHasher::new();
            value.hash(&mut state);
            state.finish()
        };

        let latin1_bytes = vec![b'a', b'b', b'c', b'%', b'$', LATIN1_POWER2];
        let latin1 = from_latin1(latin1_bytes.clone());
        let converted = from_latin1(latin1_bytes);
        converted.ensure_rust_string();
        let utf8 = DOMString::from("abc%$²");

        let latin1_hash = digest(&latin1);
        let converted_hash = digest(&converted);
        let utf8_hash = digest(&utf8);

        assert_eq!(latin1_hash, utf8_hash);
        assert_eq!(latin1_hash, converted_hash);
    }

    // Checks that `match_domstring_ascii!` runs the statement of the arm that
    // matches. Every arm that must not be selected panics, so picking a wrong
    // arm (including an unexpected fall-through to the default arm) fails the
    // test instead of passing silently.
    #[test]
    fn test_match_executing() {
        // Latin1 input, first literal matches.
        {
            let s = from_latin1(vec![b'a', b'b', b'c']);
            match_domstring_ascii!(s,
                "abc" => (),
                "bcd" => panic!("matched the wrong literal"),
                _ => panic!("fell through to the default arm"),
            );
        }

        // Latin1 input containing ASCII punctuation.
        {
            let s = from_latin1(vec![b'a', b'b', b'c', b'/']);
            match_domstring_ascii!(s,
                "abc/" => (),
                "bcd" => panic!("matched the wrong literal"),
                _ => panic!("fell through to the default arm"),
            );
        }

        // Latin1 input, second literal matches.
        {
            let s = from_latin1(vec![b'a', b'b', b'c', b'%', b'$']);
            match_domstring_ascii!(s,
                "bcd" => panic!("matched the wrong literal"),
                "abc%$" => (),
                _ => panic!("fell through to the default arm"),
            );
        }

        // Rust-string input, no literal matches: a literal that is only a
        // prefix of the input must not be selected.
        {
            let s = DOMString::from("abcde");
            match_domstring_ascii!(s,
                "abc" => panic!("matched a prefix of the input"),
                "bcd" => panic!("matched the wrong literal"),
                _ => (),
            );
        }

        // Rust-string input, second literal matches.
        {
            let s = DOMString::from("abc%$");
            match_domstring_ascii!(s,
                "bcd" => panic!("matched the wrong literal"),
                "abc%$" => (),
                _ => panic!("fell through to the default arm"),
            );
        }

        // Latin1 input, no literal matches: a literal that merely starts with
        // the input must not be selected.
        {
            let s = from_latin1(vec![b'a', b'b', b'c']);
            match_domstring_ascii!(s,
                "abcdd" => panic!("matched a literal longer than the input"),
                "bcd" => panic!("matched the wrong literal"),
                _ => (),
            );
        }
    }

    // Checks that `match_domstring_ascii!` evaluates to the expression of the
    // arm that matches. In every case only the expected arm yields `true`.
    #[test]
    fn test_match_returning_result() {
        // Latin1 input, first literal matches.
        {
            let s = from_latin1(vec![b'a', b'b', b'c']);
            let res = match_domstring_ascii!(s,
                "abc" => true,
                "bcd" => false,
                _ => false,
            );
            assert!(res);
        }
        // Latin1 input containing ASCII punctuation.
        {
            let s = from_latin1(vec![b'a', b'b', b'c', b'/']);
            let res = match_domstring_ascii!(s,
                "abc/" => true,
                "bcd" => false,
                _ => false,
            );
            assert!(res);
        }
        // Latin1 input, second literal matches.
        {
            let s = from_latin1(vec![b'a', b'b', b'c', b'%', b'$']);
            let res = match_domstring_ascii!(s,
                "bcd" => false,
                "abc%$" => true,
                _ => false,
            );
            assert!(res);
        }

        // Rust-string input, only the default arm applies.
        {
            let s = DOMString::from("abcde");
            let res = match_domstring_ascii!(s,
                "abc" => false,
                "bcd" => false,
                _ => true,
            );
            assert!(res);
        }
        // Rust-string input, second literal matches.
        {
            let s = DOMString::from("abc%$");
            let res = match_domstring_ascii!(s,
                "bcd" => false,
                "abc%$" => true,
                _ => false,
            );
            assert!(res);
        }
        // Latin1 input, only the default arm applies.
        {
            let s = from_latin1(vec![b'a', b'b', b'c']);
            let res = match_domstring_ascii!(s,
                "abcdd" => false,
                "bcd" => false,
                _ => true,
            );
            assert!(res);
        }
    }

    // A non-ASCII literal trips the `debug_assert!` inside
    // `match_domstring_ascii_inner!`. That assertion is compiled out when debug
    // assertions are disabled, in which case nothing would panic, so this test
    // is only built when they are enabled.
    #[test]
    #[cfg(debug_assertions)]
    #[should_panic]
    fn test_match_panic() {
        let s = DOMString::from("abcd");
        let _res = match_domstring_ascii!(s,
            "❤" => true,
            _ => false,
        );
    }

    // Same as the single-literal panic test, but the non-ASCII literal comes
    // after an ASCII one, so the `debug_assert!` is reached in a later step of
    // the macro's `if`/`else` chain. Only built when debug assertions are
    // enabled, because the assertion is compiled out otherwise.
    #[test]
    #[cfg(debug_assertions)]
    #[should_panic]
    fn test_match_panic2() {
        let s = DOMString::from("abcd");
        let _res = match_domstring_ascii!(s,
            "abc" => false,
            "❤" => true,
            _ => false,
        );
    }

    // Leading and trailing ASCII whitespace must be removed for both the
    // Latin-1 backed and the Rust-string backed representation.
    #[test]
    fn test_strip_whitespace() {
        let mut latin1 = from_latin1(vec![
            ASCII_SPACE,
            ASCII_SPACE,
            ASCII_SPACE,
            ASCII_NEWLINE,
            ASCII_SPACE,
            b'a',
            b'b',
            b'c',
            b'%',
            b'$',
            LATIN1_POWER2,
            ASCII_SPACE,
        ]);
        latin1.strip_leading_and_trailing_ascii_whitespace();
        latin1.ensure_rust_string();
        assert_eq!(&*latin1.str(), "abc%$²");

        let mut utf8 = DOMString::from("   \n  abc%$ ");
        utf8.strip_leading_and_trailing_ascii_whitespace();
        utf8.ensure_rust_string();
        assert_eq!(&*utf8.str(), "abc%$");
    }

    // Every ASCII whitespace character must be detected, before and after the
    // string is converted to a Rust string.
    // https://infra.spec.whatwg.org/#ascii-whitespace
    #[test]
    fn contains_html_space_characters() {
        let whitespace_bytes = [
            ASCII_TAB,
            ASCII_NEWLINE,
            ASCII_FORMFEED,
            ASCII_CR,
            ASCII_SPACE,
        ];
        for whitespace in whitespace_bytes {
            let s = from_latin1(vec![b'a', b'a', b'a', whitespace, b'a', b'a']);
            assert!(s.contains_html_space_characters());
            s.ensure_rust_string();
            assert!(s.contains_html_space_characters());
        }

        // A string without any whitespace must not be reported.
        let plain = from_latin1(vec![b'a', b'a', b'a', b'a', b'a']);
        assert!(!plain.contains_html_space_characters());
        plain.ensure_rust_string();
        assert!(!plain.contains_html_space_characters());
    }

    // Equal contents must produce equal atoms regardless of the backing
    // representation, and different contents must produce different atoms.
    #[test]
    fn atom() {
        let latin1 = from_latin1(vec![b'a', b'a', b'a', ASCII_SPACE, b'a', b'a']);
        let latin1_atom = Atom::from(latin1);

        let utf8 = DOMString::from("aaa aa");
        let utf8_atom = Atom::from(utf8);
        assert_eq!(latin1_atom, utf8_atom);

        let other = from_latin1(vec![b'a', b'a', b'a', LATIN1_POWER2, b'a', b'a']);
        let other_atom = Atom::from(other);
        assert_ne!(latin1_atom, other_atom);
    }

    // Equal contents must produce equal namespaces regardless of the backing
    // representation, and different contents must produce different ones.
    #[test]
    fn namespace() {
        let latin1 = from_latin1(vec![b'a', b'a', b'a', ASCII_SPACE, b'a', b'a']);
        let latin1_namespace = Namespace::from(latin1);

        let utf8 = DOMString::from("aaa aa");
        let utf8_namespace = Namespace::from(utf8);
        assert_eq!(latin1_namespace, utf8_namespace);

        let other = from_latin1(vec![b'a', b'a', b'a', LATIN1_POWER2, b'a', b'a']);
        let other_namespace = Namespace::from(other);
        assert_ne!(latin1_namespace, other_namespace);
    }

    // Equal contents must produce equal local names regardless of the backing
    // representation, and different contents must produce different ones.
    #[test]
    fn localname() {
        let latin1 = from_latin1(vec![b'a', b'a', b'a', ASCII_SPACE, b'a', b'a']);
        let latin1_name = LocalName::from(latin1);

        let utf8 = DOMString::from("aaa aa");
        let utf8_name = LocalName::from(utf8);
        assert_eq!(latin1_name, utf8_name);

        let other = from_latin1(vec![b'a', b'a', b'a', LATIN1_POWER2, b'a', b'a']);
        let other_name = LocalName::from(other);
        assert_ne!(latin1_name, other_name);
    }

    // Table-driven check of `is_ascii_lowercase()` for both representations.
    #[test]
    fn is_ascii_lowercase() {
        let latin1_cases: [(Vec<u8>, bool); 4] = [
            (vec![b'a', b'a', b'a', ASCII_SPACE, b'a', b'a'], false),
            (vec![b'a', b'a', b'a', LATIN1_PILLCROW, b'a', b'a'], false),
            (vec![b'a', b'a', b'a', b'a', b'z'], true),
            // '`' is the byte directly below 'a'.
            (vec![b'`', b'a', b'a', b'a', b'z'], false),
        ];
        for (bytes, expected) in latin1_cases {
            let s = from_latin1(bytes);
            assert_eq!(s.is_ascii_lowercase(), expected);
        }

        for (text, expected) in [("`aaaz", false), ("aaaz", true)] {
            let s = DOMString::from(text);
            assert_eq!(s.is_ascii_lowercase(), expected);
        }
    }

    // `as_bytes()` must expose the string as UTF-8 bytes for both the Latin-1
    // backed and the Rust-string backed representation.
    #[test]
    fn test_as_bytes() {
        // A non-ASCII Latin-1 byte comes back as its two-byte UTF-8 encoding.
        let s = from_latin1(vec![b'a', b'a', b'a', LATIN1_PILLCROW, b'a', b'a']);
        assert_eq!(
            *s.as_bytes(),
            [
                ASCII_LOWERCASE_A,
                ASCII_LOWERCASE_A,
                ASCII_LOWERCASE_A,
                UTF8_PILLCROW[0],
                UTF8_PILLCROW[1],
                ASCII_LOWERCASE_A,
                ASCII_LOWERCASE_A
            ]
        );

        // Pure ASCII Latin-1 bytes are passed through unchanged.
        let s = from_latin1(vec![b'a', b'a', b'a', b'a', b'z']);
        assert_eq!(
            *s.as_bytes(),
            [
                ASCII_LOWERCASE_A,
                ASCII_LOWERCASE_A,
                ASCII_LOWERCASE_A,
                ASCII_LOWERCASE_A,
                ASCII_LOWERCASE_Z
            ]
        );

        // Strings created from a Rust `String` expose that string's own bytes.
        for text in ["abc%$²", "AbBcC❤&%$#"] {
            let s = DOMString::from(text.to_owned());
            assert_eq!(&*s.as_bytes(), text.as_bytes());
        }
    }
}