mirror of
https://github.com/servo/servo
synced 2026-04-25 17:15:48 +02:00
script: Use chardetng to guess encoding when all else fails (#41435)
[`chardetng`](https://github.com/hsivonen/chardetng) is the library used by Gecko to guess encodings. This makes https://intsys.co.jp/game/panepon/p01/index.html load with the correct encoding. Notably, that site uses Shift_JIS but has no encoding declaration of any kind. Part of https://github.com/servo/servo/issues/6414 --------- Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
This commit is contained in:
12
Cargo.lock
generated
12
Cargo.lock
generated
@@ -1302,6 +1302,17 @@ dependencies = [
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chardetng"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.42"
|
||||
@@ -7647,6 +7658,7 @@ dependencies = [
|
||||
"canvas_traits",
|
||||
"cbc",
|
||||
"chacha20poly1305",
|
||||
"chardetng",
|
||||
"chrono",
|
||||
"cipher",
|
||||
"compositing_traits",
|
||||
|
||||
@@ -54,6 +54,7 @@ canvas_traits = { path = "components/shared/canvas" }
|
||||
cbc = "0.1.2"
|
||||
cfg-if = "1.0.4"
|
||||
chacha20poly1305 = "0.10"
|
||||
chardetng = "0.1"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
cipher = { version = "0.4.4", features = ["alloc"] }
|
||||
compositing_traits = { path = "components/shared/compositing" }
|
||||
|
||||
@@ -52,6 +52,7 @@ brotli = { workspace = true }
|
||||
canvas_traits = { workspace = true }
|
||||
cbc = { workspace = true }
|
||||
chacha20poly1305 = { workspace = true }
|
||||
chardetng = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
cipher = { workspace = true }
|
||||
compositing_traits = { workspace = true }
|
||||
|
||||
@@ -57,16 +57,23 @@ impl DetectingState {
|
||||
/// more bytes are required.
|
||||
///
|
||||
/// [determine the character encoding]: https://html.spec.whatwg.org/multipage/#determining-the-character-encoding
|
||||
fn buffer(&mut self, data: &[u8]) -> Option<&'static Encoding> {
|
||||
fn buffer(
|
||||
&mut self,
|
||||
data: &[u8],
|
||||
document: &Document,
|
||||
is_at_end_of_file: AtEndOfFile,
|
||||
) -> Option<&'static Encoding> {
|
||||
self.buffered_bytes.extend_from_slice(data);
|
||||
let can_wait_longer = self.start_timestamp.elapsed() < Self::MAX_TIME_TO_BUFFER;
|
||||
self.determine_the_character_encoding(can_wait_longer)
|
||||
self.determine_the_character_encoding(document, can_wait_longer, is_at_end_of_file)
|
||||
}
|
||||
|
||||
/// <https://html.spec.whatwg.org/multipage/#determining-the-character-encoding>
|
||||
fn determine_the_character_encoding(
|
||||
&mut self,
|
||||
document: &Document,
|
||||
potentially_wait_for_more_data: bool,
|
||||
is_at_end_of_file: AtEndOfFile,
|
||||
) -> Option<&'static Encoding> {
|
||||
// Step 1. If the result of BOM sniffing is an encoding, return that encoding with confidence certain.
|
||||
if !self.attempted_bom_sniffing && self.buffered_bytes.len() > 2 {
|
||||
@@ -132,7 +139,19 @@ impl DetectingState {
|
||||
|
||||
// Step 8. The user agent may attempt to autodetect the character encoding from applying frequency analysis
|
||||
// or other algorithms to the data stream.
|
||||
// NOTE: We don't.
|
||||
let mut encoding_detector = chardetng::EncodingDetector::new();
|
||||
encoding_detector.feed(&self.buffered_bytes, is_at_end_of_file == AtEndOfFile::Yes);
|
||||
let url = document.url();
|
||||
let tld = url
|
||||
.as_url()
|
||||
.domain()
|
||||
.and_then(|domain| domain.rsplit('.').next())
|
||||
.map(|tld| tld.as_bytes());
|
||||
let (guessed_encoding, is_probably_right) = encoding_detector.guess_assess(tld, true);
|
||||
if is_probably_right {
|
||||
log::debug!("chardetng determined that the document encoding is {guessed_encoding:?}");
|
||||
return Some(guessed_encoding);
|
||||
}
|
||||
|
||||
// Step 9. Otherwise, return an implementation-defined or user-specified default character encoding,
|
||||
// with the confidence tentative.
|
||||
@@ -142,8 +161,8 @@ impl DetectingState {
|
||||
Some(UTF_8)
|
||||
}
|
||||
|
||||
fn finish(&mut self) -> &'static Encoding {
|
||||
self.determine_the_character_encoding(false)
|
||||
fn finish(&mut self, document: &Document) -> &'static Encoding {
|
||||
self.determine_the_character_encoding(document, false, AtEndOfFile::Yes)
|
||||
.expect("Should always return character encoding when we're not allowed to wait")
|
||||
}
|
||||
}
|
||||
@@ -169,7 +188,7 @@ impl NetworkDecoderState {
|
||||
pub(super) fn push(&mut self, chunk: &[u8], document: &Document) -> Option<StrTendril> {
|
||||
match self {
|
||||
Self::Detecting(encoding_detector) => {
|
||||
if let Some(encoding) = encoding_detector.buffer(chunk) {
|
||||
if let Some(encoding) = encoding_detector.buffer(chunk, document, AtEndOfFile::No) {
|
||||
document.set_encoding(encoding);
|
||||
let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes);
|
||||
*self = Self::Decoding(DecodingState {
|
||||
@@ -198,7 +217,7 @@ impl NetworkDecoderState {
|
||||
pub(super) fn finish(&mut self, document: &Document) -> StrTendril {
|
||||
match self {
|
||||
Self::Detecting(encoding_detector) => {
|
||||
let encoding = encoding_detector.finish();
|
||||
let encoding = encoding_detector.finish(document);
|
||||
document.set_encoding(encoding);
|
||||
let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes);
|
||||
let mut decoder = LossyDecoder::new_encoding_rs(encoding, NetworkSink::default());
|
||||
@@ -777,3 +796,9 @@ pub fn get_xml_encoding(input: &[u8]) -> Option<&'static Encoding> {
|
||||
Some(encoding)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum AtEndOfFile {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
2
tests/wpt/include.ini
vendored
2
tests/wpt/include.ini
vendored
@@ -142,6 +142,8 @@ skip: true
|
||||
skip: false
|
||||
[encoding]
|
||||
skip: false
|
||||
[encoding-detection]
|
||||
skip: false
|
||||
[eventsource]
|
||||
skip: false
|
||||
[fetch]
|
||||
|
||||
3
tests/wpt/meta/encoding-detection/utf-8.html.ini
vendored
Normal file
3
tests/wpt/meta/encoding-detection/utf-8.html.ini
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
[utf-8.html]
|
||||
[Check detection result]
|
||||
expected: FAIL
|
||||
@@ -1,9 +1,9 @@
|
||||
[utf-32-from-win1252.html]
|
||||
[Expect resources/utf-32-big-endian-bom.html to parse as windows-1252]
|
||||
expected: FAIL
|
||||
|
||||
[Expect resources/utf-32-big-endian-nobom.html to parse as windows-1252]
|
||||
expected: FAIL
|
||||
|
||||
[Expect resources/utf-32-little-endian-nobom.html to parse as windows-1252]
|
||||
expected: FAIL
|
||||
|
||||
[Expect resources/utf-32-big-endian-bom.xml to parse as UTF-8]
|
||||
expected: FAIL
|
||||
|
||||
6
tests/wpt/meta/encoding/utf-32.html.ini
vendored
Normal file
6
tests/wpt/meta/encoding/utf-32.html.ini
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
[utf-32.html]
|
||||
[Expect resources/utf-32-big-endian-bom.html to parse as UTF-8]
|
||||
expected: FAIL
|
||||
|
||||
[Expect resources/utf-32-big-endian-bom.xml to parse as UTF-8]
|
||||
expected: FAIL
|
||||
6
tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta-utf-8.html.ini
vendored
Normal file
6
tests/wpt/meta/html/syntax/charset/inheritance-bogus-meta-utf-8.html.ini
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
[inheritance-bogus-meta-utf-8.html]
|
||||
[Child with bogus <meta charset>]
|
||||
expected: FAIL
|
||||
|
||||
[Child with bogus Content-Type charset]
|
||||
expected: FAIL
|
||||
@@ -1,7 +1,4 @@
|
||||
[inheritance-bogus-meta.html]
|
||||
[Cross-origin child with bogus <meta charset>]
|
||||
expected: FAIL
|
||||
|
||||
[Child with bogus <meta charset>]
|
||||
expected: FAIL
|
||||
|
||||
|
||||
Reference in New Issue
Block a user