script: Use chardetng to guess encoding when all else fails (#41435)

[`chardetng`](https://github.com/hsivonen/chardetng) is the library used
by Gecko to guess encodings.

This makes https://intsys.co.jp/game/panepon/p01/index.html load with
the correct encoding. Notably, that site uses Shift_JIS but has no
encoding declaration of any kind.

Part of https://github.com/servo/servo/issues/6414

---------

Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
This commit is contained in:
Simon Wülker
2025-12-21 09:53:42 +01:00
committed by GitHub
parent 6964956110
commit a58d9727f9
10 changed files with 66 additions and 13 deletions

12
Cargo.lock generated
View File

@@ -1302,6 +1302,17 @@ dependencies = [
"zeroize",
]
[[package]]
name = "chardetng"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
dependencies = [
"cfg-if",
"encoding_rs",
"memchr",
]
[[package]]
name = "chrono"
version = "0.4.42"
@@ -7647,6 +7658,7 @@ dependencies = [
"canvas_traits",
"cbc",
"chacha20poly1305",
"chardetng",
"chrono",
"cipher",
"compositing_traits",

View File

@@ -54,6 +54,7 @@ canvas_traits = { path = "components/shared/canvas" }
cbc = "0.1.2"
cfg-if = "1.0.4"
chacha20poly1305 = "0.10"
chardetng = "0.1"
chrono = { version = "0.4", features = ["serde"] }
cipher = { version = "0.4.4", features = ["alloc"] }
compositing_traits = { path = "components/shared/compositing" }

View File

@@ -52,6 +52,7 @@ brotli = { workspace = true }
canvas_traits = { workspace = true }
cbc = { workspace = true }
chacha20poly1305 = { workspace = true }
chardetng = { workspace = true }
chrono = { workspace = true }
cipher = { workspace = true }
compositing_traits = { workspace = true }

View File

@@ -57,16 +57,23 @@ impl DetectingState {
/// more bytes are required.
///
/// [determine the character encoding]: https://html.spec.whatwg.org/multipage/#determining-the-character-encoding
fn buffer(&mut self, data: &[u8]) -> Option<&'static Encoding> {
fn buffer(
&mut self,
data: &[u8],
document: &Document,
is_at_end_of_file: AtEndOfFile,
) -> Option<&'static Encoding> {
self.buffered_bytes.extend_from_slice(data);
let can_wait_longer = self.start_timestamp.elapsed() < Self::MAX_TIME_TO_BUFFER;
self.determine_the_character_encoding(can_wait_longer)
self.determine_the_character_encoding(document, can_wait_longer, is_at_end_of_file)
}
/// <https://html.spec.whatwg.org/multipage/#determining-the-character-encoding>
fn determine_the_character_encoding(
&mut self,
document: &Document,
potentially_wait_for_more_data: bool,
is_at_end_of_file: AtEndOfFile,
) -> Option<&'static Encoding> {
// Step 1. If the result of BOM sniffing is an encoding, return that encoding with confidence certain.
if !self.attempted_bom_sniffing && self.buffered_bytes.len() > 2 {
@@ -132,7 +139,19 @@ impl DetectingState {
// Step 8. The user agent may attempt to autodetect the character encoding from applying frequency analysis
// or other algorithms to the data stream.
// NOTE: We use chardetng for this and only accept its guess when it reports the guess is probably right.
let mut encoding_detector = chardetng::EncodingDetector::new();
encoding_detector.feed(&self.buffered_bytes, is_at_end_of_file == AtEndOfFile::Yes);
let url = document.url();
let tld = url
.as_url()
.domain()
.and_then(|domain| domain.rsplit('.').next())
.map(|tld| tld.as_bytes());
let (guessed_encoding, is_probably_right) = encoding_detector.guess_assess(tld, true);
if is_probably_right {
log::debug!("chardetng determined that the document encoding is {guessed_encoding:?}");
return Some(guessed_encoding);
}
// Step 9. Otherwise, return an implementation-defined or user-specified default character encoding,
// with the confidence tentative.
@@ -142,8 +161,8 @@ impl DetectingState {
Some(UTF_8)
}
fn finish(&mut self) -> &'static Encoding {
self.determine_the_character_encoding(false)
fn finish(&mut self, document: &Document) -> &'static Encoding {
self.determine_the_character_encoding(document, false, AtEndOfFile::Yes)
.expect("Should always return character encoding when we're not allowed to wait")
}
}
@@ -169,7 +188,7 @@ impl NetworkDecoderState {
pub(super) fn push(&mut self, chunk: &[u8], document: &Document) -> Option<StrTendril> {
match self {
Self::Detecting(encoding_detector) => {
if let Some(encoding) = encoding_detector.buffer(chunk) {
if let Some(encoding) = encoding_detector.buffer(chunk, document, AtEndOfFile::No) {
document.set_encoding(encoding);
let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes);
*self = Self::Decoding(DecodingState {
@@ -198,7 +217,7 @@ impl NetworkDecoderState {
pub(super) fn finish(&mut self, document: &Document) -> StrTendril {
match self {
Self::Detecting(encoding_detector) => {
let encoding = encoding_detector.finish();
let encoding = encoding_detector.finish(document);
document.set_encoding(encoding);
let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes);
let mut decoder = LossyDecoder::new_encoding_rs(encoding, NetworkSink::default());
@@ -777,3 +796,9 @@ pub fn get_xml_encoding(input: &[u8]) -> Option<&'static Encoding> {
Some(encoding)
}
}
/// Whether the bytes buffered so far make up the complete input stream.
///
/// The value is compared against [`AtEndOfFile::Yes`] and the result is passed as the
/// `last` argument of `chardetng::EncodingDetector::feed`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum AtEndOfFile {
    /// No further bytes will arrive; used when the decoder is being finished.
    Yes,
    /// More bytes may still arrive; used while chunks are being pushed.
    No,
}

View File

@@ -142,6 +142,8 @@ skip: true
skip: false
[encoding]
skip: false
[encoding-detection]
skip: false
[eventsource]
skip: false
[fetch]

View File

@@ -0,0 +1,3 @@
[utf-8.html]
[Check detection result]
expected: FAIL

View File

@@ -1,9 +1,9 @@
[utf-32-from-win1252.html]
[Expect resources/utf-32-big-endian-bom.html to parse as windows-1252]
expected: FAIL
[Expect resources/utf-32-big-endian-nobom.html to parse as windows-1252]
expected: FAIL
[Expect resources/utf-32-little-endian-nobom.html to parse as windows-1252]
expected: FAIL
[Expect resources/utf-32-big-endian-bom.xml to parse as UTF-8]
expected: FAIL

View File

@@ -0,0 +1,6 @@
[utf-32.html]
[Expect resources/utf-32-big-endian-bom.html to parse as UTF-8]
expected: FAIL
[Expect resources/utf-32-big-endian-bom.xml to parse as UTF-8]
expected: FAIL

View File

@@ -0,0 +1,6 @@
[inheritance-bogus-meta-utf-8.html]
[Child with bogus <meta charset>]
expected: FAIL
[Child with bogus Content-Type charset]
expected: FAIL

View File

@@ -1,7 +1,4 @@
[inheritance-bogus-meta.html]
[Cross-origin child with bogus <meta charset>]
expected: FAIL
[Child with bogus <meta charset>]
expected: FAIL