Files
ladybird/Libraries/LibWeb/Fetch/Infrastructure/HTTP/Bodies.h
Andreas Kling 37bdcc3488 LibWeb: Support MIME type sniffing for streaming HTTP responses
Previously, when loading a document, we would try to sniff the MIME
type by reading from the response body's source. However, for streaming
HTTP responses, the body source is Empty (the data comes through the
stream instead), so we had no bytes to sniff.

This caused pages like hypr.land (which sends no Content-Type header)
to be misidentified as plain text instead of HTML, since the MIME
sniffing algorithm would receive zero bytes and fall back to the
default type.

The fix captures the first bytes of the response body during fetch,
storing them on the Body object. These bytes are the "resource header"
defined by the MIME Sniffing spec - up to 1445 bytes, which is enough
to identify any MIME type the spec can detect.

Since bytes may arrive asynchronously during streaming, we use a
callback mechanism: if bytes aren't ready yet when load_document()
needs them, it registers a callback that fires once enough bytes have
been captured (or the stream ends).

The flow is:
1. FetchedDataReceiver receives network bytes, buffers them
2. When Body is created, buffered bytes are flushed to Body's sniff
   buffer, and subsequent bytes are appended as they arrive
3. Before calling load_document(), Navigable waits for sniff bytes
4. load_document() passes the bytes to MimeSniff::Resource::sniff()
2026-01-24 15:21:26 +01:00

103 lines
4.5 KiB
C++

/*
* Copyright (c) 2022-2023, Linus Groh <linusg@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/ByteBuffer.h>
#include <AK/Forward.h>
#include <AK/NonnullRefPtr.h>
#include <AK/Optional.h>
#include <AK/Variant.h>
#include <LibGC/Ptr.h>
#include <LibGC/Root.h>
#include <LibWeb/Export.h>
#include <LibWeb/Fetch/Infrastructure/Task.h>
#include <LibWeb/FileAPI/Blob.h>
#include <LibWeb/Streams/ReadableStream.h>
#include <LibWeb/WebIDL/Promise.h>
namespace Web::Fetch::Infrastructure {
// https://fetch.spec.whatwg.org/#concept-body
// Garbage-collected representation of a Fetch "body": a ReadableStream together with an
// optional source and an optional length, plus non-standard storage for the bytes that
// MIME type sniffing needs.
class WEB_API Body final : public JS::Cell {
    GC_CELL(Body, JS::Cell);
    GC_DECLARE_ALLOCATOR(Body);

public:
    // The kinds of source this class can hold. The spec also lists FormData as a possible
    // source; it has no alternative in this variant.
    // NOTE(review): Empty presumably plays the role of the spec's null source - confirm.
    using SourceType = Variant<Empty, ByteBuffer, GC::Root<FileAPI::Blob>>;

    // Callback aliases named after the algorithm parameters in the Fetch spec:
    //   processBody      - an algorithm accepting a byte sequence.
    //   processBodyError - an algorithm optionally accepting an exception.
    //   processBodyChunk - an algorithm accepting a byte sequence.
    //   processEndOfBody - an algorithm accepting no arguments.
    using ProcessBodyCallback = GC::Ref<GC::Function<void(ByteBuffer)>>;
    using ProcessBodyErrorCallback = GC::Ref<GC::Function<void(JS::Value)>>;
    using ProcessBodyChunkCallback = GC::Ref<GC::Function<void(ByteBuffer)>>;
    using ProcessEndOfBodyCallback = GC::Ref<GC::Function<void()>>;

    // Non-standard: callback type used to hand over the sniffable bytes
    // (see wait_for_sniff_bytes() below).
    using SniffBytesCallback = GC::Ref<GC::Function<void(ReadonlyBytes)>>;

    // Factory functions: one taking only a stream, one additionally taking a source and a length.
    [[nodiscard]] static GC::Ref<Body> create(JS::VM&, GC::Ref<Streams::ReadableStream>);
    [[nodiscard]] static GC::Ref<Body> create(JS::VM&, GC::Ref<Streams::ReadableStream>, SourceType, Optional<u64>);

    // Accessors for the spec-defined stream, source and length.
    [[nodiscard]] GC::Ref<Streams::ReadableStream> stream() const { return *m_stream; }
    void set_stream(GC::Ref<Streams::ReadableStream> value) { m_stream = value; }
    [[nodiscard]] SourceType const& source() const { return m_source; }
    [[nodiscard]] Optional<u64> const& length() const { return m_length; }

    // https://mimesniff.spec.whatwg.org/#reading-the-resource-header
    // Non-standard plumbing for obtaining the "resource header", i.e. the byte sequence that
    // MIME type sniffing inspects. The spec obtains it by reading
    // "until [...] 1445 or more bytes have been read" or until the end of the resource.
    // Bodies with a ByteBuffer/Blob source have these bytes available immediately; for
    // streaming bodies they are captured during fetch and delivered through the callback.
    Optional<ReadonlyBytes> sniff_bytes_if_available() const;
    void wait_for_sniff_bytes(SniffBytesCallback on_ready);

    // Used by FetchedDataReceiver to supply sniff bytes while a streaming fetch is in progress.
    void append_sniff_bytes(ReadonlyBytes bytes);
    void set_sniff_bytes_complete();

    // Cloning and reading operations; their definitions live outside this header.
    [[nodiscard]] GC::Ref<Body> clone(JS::Realm&);
    void fully_read(JS::Realm&, ProcessBodyCallback process_body, ProcessBodyErrorCallback process_body_error, TaskDestination) const;
    void incrementally_read(ProcessBodyChunkCallback process_body_chunk, ProcessEndOfBodyCallback process_end_of_body, ProcessBodyErrorCallback process_body_error, TaskDestination);
    void incrementally_read_loop(Streams::ReadableStreamDefaultReader& reader, TaskDestination, ProcessBodyChunkCallback process_body_chunk, ProcessEndOfBodyCallback process_end_of_body, ProcessBodyErrorCallback process_body_error);

    // GC tracing hook overridden from JS::Cell.
    virtual void visit_edges(JS::Cell::Visitor&) override;

private:
    // Constructors are private; instances are obtained through the create() factories.
    explicit Body(GC::Ref<Streams::ReadableStream>);
    Body(GC::Ref<Streams::ReadableStream>, SourceType, Optional<u64>);

    // https://fetch.spec.whatwg.org/#concept-body-stream
    // The body's stream: a ReadableStream object.
    GC::Ref<Streams::ReadableStream> m_stream;

    // https://fetch.spec.whatwg.org/#concept-body-source
    // The body's source. Per spec: null, a byte sequence, a Blob object, or a FormData object;
    // initially null.
    SourceType m_source;

    // https://fetch.spec.whatwg.org/#concept-body-total-bytes
    // The body's length. Per spec: null or an integer; initially null.
    Optional<u64> m_length;

    // https://mimesniff.spec.whatwg.org/#reading-the-resource-header
    // Non-standard: the captured "resource header" bytes used for MIME type sniffing,
    // a flag that starts out false (set_sniff_bytes_complete() is its setter by name), and
    // a nullable slot holding a function of the same signature as SniffBytesCallback.
    ByteBuffer m_sniff_bytes;
    bool m_sniff_bytes_complete { false };
    GC::Ptr<GC::Function<void(ReadonlyBytes)>> m_sniff_bytes_callback;
};
// https://fetch.spec.whatwg.org/#body-with-type
// The spec's "body with type" tuple: a body paired with a type, where the type is either
// a header value or null.
struct BodyWithType {
    // The body half of the tuple.
    GC::Ref<Body> body;
    // The type half of the tuple (a header value or null per the spec).
    Optional<ByteString> type;
};
// Returns a Body for the given byte sequence; only the declaration is visible in this header.
// NOTE(review): presumably implements the Fetch spec's "byte sequence as a body" concept -
// confirm against the out-of-line definition.
WEB_API GC::Ref<Body> byte_sequence_as_body(JS::Realm&, ReadonlyBytes);
}