diff --git a/Cargo.lock b/Cargo.lock index ad9f60b9e61..89ce01cd929 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1302,6 +1302,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "chrono" version = "0.4.42" @@ -7647,6 +7658,7 @@ dependencies = [ "canvas_traits", "cbc", "chacha20poly1305", + "chardetng", "chrono", "cipher", "compositing_traits", diff --git a/Cargo.toml b/Cargo.toml index a43e4fd64b3..525236696d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ canvas_traits = { path = "components/shared/canvas" } cbc = "0.1.2" cfg-if = "1.0.4" chacha20poly1305 = "0.10" +chardetng = "0.1" chrono = { version = "0.4", features = ["serde"] } cipher = { version = "0.4.4", features = ["alloc"] } compositing_traits = { path = "components/shared/compositing" } diff --git a/components/script/Cargo.toml b/components/script/Cargo.toml index 1c9caea2687..77a459318d0 100644 --- a/components/script/Cargo.toml +++ b/components/script/Cargo.toml @@ -52,6 +52,7 @@ brotli = { workspace = true } canvas_traits = { workspace = true } cbc = { workspace = true } chacha20poly1305 = { workspace = true } +chardetng = { workspace = true } chrono = { workspace = true } cipher = { workspace = true } compositing_traits = { workspace = true } diff --git a/components/script/dom/servoparser/encoding.rs b/components/script/dom/servoparser/encoding.rs index 9aae32a9c5a..87b07a03047 100644 --- a/components/script/dom/servoparser/encoding.rs +++ b/components/script/dom/servoparser/encoding.rs @@ -57,16 +57,23 @@ impl DetectingState { /// more bytes are required. /// /// [determine the character encoding]: https://html.spec.whatwg.org/multipage/#determining-the-character-encoding - fn buffer(&mut self, data: &[u8]) -> Option<&'static Encoding> { + fn buffer( + &mut self, + data: &[u8], + document: &Document, + is_at_end_of_file: AtEndOfFile, + ) -> Option<&'static Encoding> { self.buffered_bytes.extend_from_slice(data); let can_wait_longer = self.start_timestamp.elapsed() < Self::MAX_TIME_TO_BUFFER; - self.determine_the_character_encoding(can_wait_longer) + self.determine_the_character_encoding(document, can_wait_longer, is_at_end_of_file) } /// fn determine_the_character_encoding( &mut self, + document: &Document, potentially_wait_for_more_data: bool, + is_at_end_of_file: AtEndOfFile, ) -> Option<&'static Encoding> { // Step 1. If the result of BOM sniffing is an encoding, return that encoding with confidence certain. if !self.attempted_bom_sniffing && self.buffered_bytes.len() > 2 { @@ -132,7 +139,19 @@ impl DetectingState { // Step 8. The user agent may attempt to autodetect the character encoding from applying frequency analysis // or other algorithms to the data stream. - // NOTE: We don't. + let mut encoding_detector = chardetng::EncodingDetector::new(); + encoding_detector.feed(&self.buffered_bytes, is_at_end_of_file == AtEndOfFile::Yes); + let url = document.url(); + let tld = url + .as_url() + .domain() + .and_then(|domain| domain.rsplit('.').next()) + .map(|tld| tld.as_bytes()); + let (guessed_encoding, is_probably_right) = encoding_detector.guess_assess(tld, true); + if is_probably_right { + log::debug!("chardetng determined that the document encoding is {guessed_encoding:?}"); + return Some(guessed_encoding); + } // Step 9. Otherwise, return an implementation-defined or user-specified default character encoding, // with the confidence tentative. @@ -142,8 +161,8 @@ impl DetectingState { Some(UTF_8) } - fn finish(&mut self) -> &'static Encoding { - self.determine_the_character_encoding(false) + fn finish(&mut self, document: &Document) -> &'static Encoding { + self.determine_the_character_encoding(document, false, AtEndOfFile::Yes) .expect("Should always return character encoding when we're not allowed to wait") } } @@ -169,7 +188,7 @@ impl NetworkDecoderState { pub(super) fn push(&mut self, chunk: &[u8], document: &Document) -> Option { match self { Self::Detecting(encoding_detector) => { - if let Some(encoding) = encoding_detector.buffer(chunk) { + if let Some(encoding) = encoding_detector.buffer(chunk, document, AtEndOfFile::No) { document.set_encoding(encoding); let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes); *self = Self::Decoding(DecodingState { @@ -198,7 +217,7 @@ impl NetworkDecoderState { pub(super) fn finish(&mut self, document: &Document) -> StrTendril { match self { Self::Detecting(encoding_detector) => { - let encoding = encoding_detector.finish(); + let encoding = encoding_detector.finish(document); document.set_encoding(encoding); let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes); let mut decoder = LossyDecoder::new_encoding_rs(encoding, NetworkSink::default()); @@ -777,3 +796,9 @@ pub fn get_xml_encoding(input: &[u8]) -> Option<&'static Encoding> { Some(encoding) } } + +#[derive(PartialEq)] +enum AtEndOfFile { + Yes, + No, +} diff --git a/tests/wpt/include.ini b/tests/wpt/include.ini index 107e2f6b160..6146ca98c52 100644 --- a/tests/wpt/include.ini +++ b/tests/wpt/include.ini @@ -142,6 +142,8 @@ skip: true skip: false [encoding] skip: false +[encoding-detection] + skip: false [eventsource] skip: false [fetch] diff --git a/tests/wpt/meta/encoding-detection/utf-8.html.ini b/tests/wpt/meta/encoding-detection/utf-8.html.ini new file mode 100644 index 00000000000..18a5834dfeb --- /dev/null +++ b/tests/wpt/meta/encoding-detection/utf-8.html.ini @@ -0,0 +1,3 @@ +[utf-8.html] + [Check detection result] + expected: FAIL diff --git a/tests/wpt/meta/encoding/utf-32-from-win1252.html.ini b/tests/wpt/meta/encoding/utf-32-from-win1252.html.ini index 17ea7fe2af2..72d67ca09e0 100644 --- a/tests/wpt/meta/encoding/utf-32-from-win1252.html.ini +++ b/tests/wpt/meta/encoding/utf-32-from-win1252.html.ini @@ -1,9 +1,9 @@ [utf-32-from-win1252.html] - [Expect resources/utf-32-big-endian-bom.html to parse as windows-1252] - expected: FAIL - [Expect resources/utf-32-big-endian-nobom.html to parse as windows-1252] expected: FAIL [Expect resources/utf-32-little-endian-nobom.html to parse as windows-1252] expected: FAIL + + [Expect resources/utf-32-big-endian-bom.xml to parse as UTF-8] + expected: FAIL diff --git a/tests/wpt/meta/encoding/utf-32.html.ini b/tests/wpt/meta/encoding/utf-32.html.ini new file mode 100644 index 00000000000..1b22f130979 --- /dev/null +++ b/tests/wpt/meta/encoding/utf-32.html.ini @@ -0,0 +1,6 @@ +[utf-32.html] + [Expect resources/utf-32-big-endian-bom.html to parse as UTF-8] + expected: FAIL + + [Expect resources/utf-32-big-endian-bom.xml to parse as UTF-8] + expected: FAIL diff --git a/tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta-utf-8.html.ini b/tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta-utf-8.html.ini new file mode 100644 index 00000000000..a43a09af869 --- /dev/null +++ b/tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta-utf-8.html.ini @@ -0,0 +1,6 @@ +[inheritance-bogus-meta-utf-8.html] + [Child with bogus ] + expected: FAIL + + [Child with bogus Content-Type charset] + expected: FAIL diff --git a/tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta.html.ini b/tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta.html.ini index 4b120812974..baa2f1e7510 100644 --- a/tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta.html.ini +++ b/tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta.html.ini @@ -1,7 +1,4 @@ [inheritance-bogus-meta.html] - [Cross-origin child with bogus ] - expected: FAIL - [Child with bogus ] expected: FAIL