servo/components/script_bindings/str.rs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */

//! The `ByteString` struct.
use std::borrow::{Borrow, Cow, ToOwned};
use std::default::Default;
use std::hash::{Hash, Hasher};
use std::marker::PhantomData;
use std::ops::{Deref, DerefMut};
use std::str::{CharIndices, Chars, EncodeUtf16, FromStr};
use std::sync::LazyLock;
use std::{fmt, ops, slice, str};

use cssparser::CowRcStr;
use html5ever::{LocalName, Namespace};
use js::rust::wrappers::ToJSON;
use js::rust::{HandleObject, HandleValue};
use num_traits::Zero;
use regex::Regex;
use style::str::HTML_SPACE_CHARACTERS;
use stylo_atoms::Atom;

use crate::error::Error;
use crate::script_runtime::JSContext as SafeJSContext;

/// Encapsulates the IDL `ByteString` type.
#[derive(Clone, Debug, Default, Eq, JSTraceable, MallocSizeOf, PartialEq)]
pub struct ByteString(Vec<u8>);

impl ByteString {
    /// Creates a new `ByteString`.
    pub fn new(value: Vec<u8>) -> ByteString {
        ByteString(value)
    }

    /// Returns `self` as a string, if it encodes valid UTF-8, and `None`
    /// otherwise.
    pub fn as_str(&self) -> Option<&str> {
        str::from_utf8(&self.0).ok()
    }

    /// Returns the length.
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Checks if the ByteString is empty.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Returns `self` with A–Z replaced by a–z.
    pub fn to_lower(&self) -> ByteString {
        ByteString::new(self.0.to_ascii_lowercase())
    }
}

impl From<ByteString> for Vec<u8> {
    fn from(byte_string: ByteString) -> Vec<u8> {
        byte_string.0
    }
}

impl Hash for ByteString {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.0.hash(state);
    }
}

impl FromStr for ByteString {
    type Err = ();
    fn from_str(s: &str) -> Result<ByteString, ()> {
        Ok(ByteString::new(s.to_owned().into_bytes()))
    }
}

impl ops::Deref for ByteString {
    type Target = [u8];
    fn deref(&self) -> &[u8] {
        &self.0
    }
}

/// A string that is constructed from a UCS-2 buffer by replacing invalid code
/// points with the replacement character.
#[derive(Clone, Debug, Default, Eq, Hash, MallocSizeOf, Ord, PartialEq, PartialOrd)]
pub struct USVString(pub String);

impl Borrow<str> for USVString {
    #[inline]
    fn borrow(&self) -> &str {
        &self.0
    }
}

impl Deref for USVString {
    type Target = str;

    #[inline]
    fn deref(&self) -> &str {
        &self.0
    }
}

impl DerefMut for USVString {
    #[inline]
    fn deref_mut(&mut self) -> &mut str {
        &mut self.0
    }
}

impl AsRef<str> for USVString {
    fn as_ref(&self) -> &str {
        &self.0
    }
}

impl fmt::Display for USVString {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Display::fmt(&**self, f)
    }
}

impl PartialEq<str> for USVString {
    fn eq(&self, other: &str) -> bool {
        &**self == other
    }
}

impl<'a> PartialEq<&'a str> for USVString {
    fn eq(&self, other: &&'a str) -> bool {
        &**self == *other
    }
}

impl From<String> for USVString {
    fn from(contents: String) -> USVString {
        USVString(contents)
    }
}

/// Returns whether `s` is a `token`, as defined by
/// [RFC 2616](http://tools.ietf.org/html/rfc2616#page-17).
pub fn is_token(s: &[u8]) -> bool {
    if s.is_empty() {
        return false; // A token must be at least a single character
    }
    s.iter().all(|&x| {
        // http://tools.ietf.org/html/rfc2616#section-2.2
        match x {
            0..=31 | 127 => false, // CTLs
            40 | 41 | 60 | 62 | 64 | 44 | 59 | 58 | 92 | 34 | 47 | 91 | 93 | 63 | 61 | 123 |
            125 | 32 => false, // separators
            x if x > 127 => false, // non-CHARs
            _ => true,
        }
    })
}

/// A DOMString.
///
/// This type corresponds to the [`DOMString`] type in WebIDL.
///
/// [`DOMString`]: https://webidl.spec.whatwg.org/#idl-DOMString
///
/// Conceptually, a DOMString has the same value space as a JavaScript String,
/// i.e., an array of 16-bit *code units* representing UTF-16, potentially with
/// unpaired surrogates present (also sometimes called WTF-16).
///
/// Currently, this type stores a Rust `String`, in order to avoid issues when
/// integrating with the rest of the Rust ecosystem and even the rest of the
/// browser itself.
///
/// However, Rust `String`s are guaranteed to be valid UTF-8, and as such have
/// a *smaller value space* than WTF-16 (i.e., some JavaScript String values
/// can not be represented as a Rust `String`). This introduces the question of
/// what to do with values being passed from JavaScript to Rust that contain
/// unpaired surrogates.
///
/// The hypothesis is that it does not matter much how exactly those values are
/// transformed, because  passing unpaired surrogates into the DOM is very rare.
/// Instead Servo withh replace the unpaired surrogate by a U+FFFD replacement
/// character.
///
/// Currently, the lack of crash reports about this issue provides some
/// evidence to support the hypothesis. This evidence will hopefully be used to
/// convince other browser vendors that it would be safe to replace unpaired
/// surrogates at the boundary between JavaScript and native code. (This would
/// unify the `DOMString` and `USVString` types, both in the WebIDL standard
/// and in Servo.)
///
/// This type is currently `!Send`, in order to help with an independent
/// experiment to store `JSString`s rather than Rust `String`s.
#[derive(Clone, Debug, Eq, Hash, MallocSizeOf, Ord, PartialEq, PartialOrd)]
pub struct DOMString(String, PhantomData<*const ()>);

impl DOMString {
    /// Creates a new `DOMString`.
    pub fn new() -> DOMString {
        DOMString(String::new(), PhantomData)
    }

    /// Creates a new `DOMString` from a `String`.
    pub fn from_string(s: String) -> DOMString {
        DOMString(s, PhantomData)
    }

    /// Get the internal `&str` value of this [`DOMString`].
    pub fn str(&self) -> &str {
        &self.0
    }

    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Appends a given string slice onto the end of this String.
    pub fn push_str(&mut self, string: &str) {
        self.0.push_str(string)
    }

    /// Clears this `DOMString`, removing all contents.
    pub fn clear(&mut self) {
        self.0.clear()
    }

    /// Shortens this String to the specified length.
    pub fn truncate(&mut self, new_len: usize) {
        self.0.truncate(new_len);
    }

    /// Removes newline characters according to <https://infra.spec.whatwg.org/#strip-newlines>.
    pub fn strip_newlines(&mut self) {
        // > To strip newlines from a string, remove any U+000A LF and U+000D CR code
        // > points from the string.
        self.0.retain(|c| c != '\r' && c != '\n');
    }

    /// Normalize newlines according to <https://infra.spec.whatwg.org/#normalize-newlines>.
    pub fn normalize_newlines(&mut self) {
        // > To normalize newlines in a string, replace every U+000D CR U+000A LF code point
        // > pair with a single U+000A LF code point, and then replace every remaining
        // > U+000D CR code point with a U+000A LF code point.
        self.0 = self.0.replace("\r\n", "\n").replace("\r", "\n")
    }

    /// Removes leading and trailing ASCII whitespaces according to
    /// <https://infra.spec.whatwg.org/#strip-leading-and-trailing-ascii-whitespace>.
    pub fn strip_leading_and_trailing_ascii_whitespace(&mut self) {
        if self.0.is_empty() {
            return;
        }

        let trailing_whitespace_len = self
            .0
            .trim_end_matches(|ref c| char::is_ascii_whitespace(c))
            .len();
        self.0.truncate(trailing_whitespace_len);
        if self.0.is_empty() {
            return;
        }

        let first_non_whitespace = self.0.find(|ref c| !char::is_ascii_whitespace(c)).unwrap();
        self.0.replace_range(0..first_non_whitespace, "");
    }

    /// <https://html.spec.whatwg.org/multipage/#valid-floating-point-number>
    pub fn is_valid_floating_point_number_string(&self) -> bool {
        static RE: LazyLock<Regex> = LazyLock::new(|| {
            Regex::new(r"^-?(?:\d+\.\d+|\d+|\.\d+)(?:(e|E)(\+|\-)?\d+)?$").unwrap()
        });

        RE.is_match(&self.0) && self.parse_floating_point_number().is_some()
    }

    /// <https://html.spec.whatwg.org/multipage/#rules-for-parsing-floating-point-number-values>
    pub fn parse_floating_point_number(&self) -> Option<f64> {
        // Steps 15-16 are telling us things about IEEE rounding modes
        // for floating-point significands; this code assumes the Rust
        // compiler already matches them in any cases where
        // that actually matters. They are not
        // related to f64::round(), which is for rounding to integers.
        let input = &self.0;
        if let Ok(val) = input.trim().parse::<f64>() {
            if !(
                // A valid number is the same as what rust considers to be valid,
                // except for +1., NaN, and Infinity.
                val.is_infinite() || val.is_nan() || input.ends_with('.') || input.starts_with('+')
            ) {
                return Some(val);
            }
        }
        None
    }

    /// Applies the same processing as `parse_floating_point_number` with some additional handling
    /// according to ECMA's string conversion steps.
    ///
    /// Used for specific elements when handling floating point values, namely the `number` and
    /// `range` inputs, as well as `meter` and `progress` elements.
    ///
    /// <https://html.spec.whatwg.org/multipage/#best-representation-of-the-number-as-a-floating-point-number>
    /// <https://tc39.es/ecma262/#sec-numeric-types-number-tostring>
    pub fn set_best_representation_of_the_floating_point_number(&mut self) {
        if let Some(val) = self.parse_floating_point_number() {
            // [tc39] Step 2: If x is either +0 or -0, return "0".
            let parsed_value = if val.is_zero() { 0.0_f64 } else { val };

            self.0 = parsed_value.to_string()
        }
    }

    // What follows are the functions inherited from std::string
    pub fn make_ascii_lowercase(&mut self) {
        self.0.make_ascii_lowercase();
    }

    pub fn to_ascii_lowercase(&self) -> String {
        self.0.to_ascii_lowercase()
    }

    pub fn len(&self) -> usize {
        self.0.len()
    }

    pub fn chars(&self) -> Chars<'_> {
        self.0.chars()
    }

    pub fn parse<T: FromStr>(&self) -> Result<T, <T as FromStr>::Err> {
        self.0.parse::<T>()
    }

    pub fn contains(&self, needle: &str) -> bool {
        self.0.contains(needle)
    }

    pub fn to_lowercase(&self) -> String {
        self.0.to_lowercase()
    }

    pub fn as_bytes(&self) -> &[u8] {
        self.0.as_bytes()
    }

    pub fn to_uppercase(&self) -> String {
        self.0.to_uppercase()
    }

    pub fn encode_utf16(&self) -> EncodeUtf16<'_> {
        self.0.encode_utf16()
    }

    pub fn find(&self, c: char) -> Option<usize> {
        self.0.find(c)
    }

    pub fn starts_with(&self, c: char) -> bool {
        self.0.starts_with(c)
    }

    pub fn starts_with_str(&self, needle: &str) -> bool {
        self.0.starts_with(needle)
    }

    pub fn contains_html_space_characters(&self) -> bool {
        self.0.contains(HTML_SPACE_CHARACTERS)
    }

    pub fn split_html_space_characters(&self) -> impl Iterator<Item = &str> {
        self.0
            .split(HTML_SPACE_CHARACTERS)
            .filter(|s| !s.is_empty())
    }

    pub fn char_indices(&self) -> CharIndices<'_> {
        self.0.char_indices()
    }

    pub fn strip_prefix(&self, pattern: &str) -> Option<&str> {
        self.0.strip_prefix(pattern)
    }

    pub fn split(&self, c: char) -> impl Iterator<Item = &str> {
        self.0.split(c)
    }
}

/// Because this converts to a DOMString it becomes UTF-8 encoded which is closer to
/// the spec definition of <https://infra.spec.whatwg.org/#serialize-a-javascript-value-to-json-bytes>
/// but we generally do not operate on anything that is truly a WTF-16 string.
///
/// <https://infra.spec.whatwg.org/#serialize-a-javascript-value-to-a-json-string>
pub fn serialize_jsval_to_json_utf8(
    cx: SafeJSContext,
    data: HandleValue,
) -> Result<DOMString, Error> {
    #[repr(C)]
    struct ToJSONCallbackData {
        string: Option<String>,
    }

    let mut out_str = ToJSONCallbackData { string: None };

    #[allow(unsafe_code)]
    unsafe extern "C" fn write_callback(
        string: *const u16,
        len: u32,
        data: *mut std::ffi::c_void,
    ) -> bool {
        let data = data as *mut ToJSONCallbackData;
        let string_chars = unsafe { slice::from_raw_parts(string, len as usize) };
        unsafe { &mut *data }
            .string
            .get_or_insert_with(Default::default)
            .push_str(&String::from_utf16_lossy(string_chars));
        true
    }

    // 1. Let result be ? Call(%JSON.stringify%, undefined, « value »).
    unsafe {
        let stringify_result = ToJSON(
            *cx,
            data,
            HandleObject::null(),
            HandleValue::null(),
            Some(write_callback),
            &mut out_str as *mut ToJSONCallbackData as *mut _,
        );
        // Note: ToJSON returns false when a JS error is thrown, so we need to return
        // JSFailed to propagate the raised exception
        if !stringify_result {
            return Err(Error::JSFailed);
        }
    }

    // 2. If result is undefined, then throw a TypeError.
    // Note: ToJSON will not call the callback if the data cannot be serialized.
    // 3. Assert: result is a string.
    // 4. Return result.
    out_str
        .string
        .map(Into::into)
        .ok_or_else(|| Error::Type("unable to serialize JSON".to_owned()))
}

impl Borrow<str> for DOMString {
    #[inline]
    fn borrow(&self) -> &str {
        &self.0
    }
}

impl Default for DOMString {
    fn default() -> Self {
        DOMString(String::new(), PhantomData)
    }
}

impl fmt::Display for DOMString {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Display::fmt(self.str(), f)
    }
}

impl PartialEq<str> for DOMString {
    fn eq(&self, other: &str) -> bool {
        self.str() == other
    }
}

impl PartialEq<DOMString> for str {
    fn eq(&self, other: &DOMString) -> bool {
        self == other.str()
    }
}

impl<'a> PartialEq<&'a str> for DOMString {
    fn eq(&self, other: &&'a str) -> bool {
        self.str() == *other
    }
}

impl PartialEq<DOMString> for String {
    fn eq(&self, other: &DOMString) -> bool {
        *other.0 == *self
    }
}

impl PartialEq<String> for DOMString {
    fn eq(&self, other: &String) -> bool {
        self.0 == *other
    }
}

impl From<String> for DOMString {
    fn from(contents: String) -> DOMString {
        DOMString(contents, PhantomData)
    }
}

impl From<&str> for DOMString {
    fn from(contents: &str) -> DOMString {
        DOMString::from(String::from(contents))
    }
}

impl<'a> From<Cow<'a, str>> for DOMString {
    fn from(contents: Cow<'a, str>) -> DOMString {
        match contents {
            Cow::Owned(s) => DOMString::from(s),
            Cow::Borrowed(s) => DOMString::from(s),
        }
    }
}

impl From<DOMString> for LocalName {
    fn from(contents: DOMString) -> LocalName {
        LocalName::from(contents.0)
    }
}

impl From<DOMString> for Namespace {
    fn from(contents: DOMString) -> Namespace {
        Namespace::from(contents.0)
    }
}

impl From<DOMString> for Atom {
    fn from(contents: DOMString) -> Atom {
        Atom::from(contents.0)
    }
}

impl From<DOMString> for String {
    fn from(contents: DOMString) -> String {
        contents.0
    }
}

impl From<DOMString> for Vec<u8> {
    fn from(contents: DOMString) -> Vec<u8> {
        contents.0.into()
    }
}

impl<'a> From<DOMString> for Cow<'a, str> {
    fn from(contents: DOMString) -> Cow<'a, str> {
        contents.0.into()
    }
}

impl<'a> From<DOMString> for CowRcStr<'a> {
    fn from(contents: DOMString) -> CowRcStr<'a> {
        contents.0.into()
    }
}

impl Extend<char> for DOMString {
    fn extend<I>(&mut self, iterable: I)
    where
        I: IntoIterator<Item = char>,
    {
        self.0.extend(iterable)
    }
}