Files
ladybird/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
Aliaksandr Kalenik b6ffd51d1c LibWeb: Pause tokenizer at a CR right before the insertion point
HTML newline normalization collapses CRLF into a single LF, so
next_code_point() needs one code point of lookahead at a CR to decide
whether the CR stands alone or is the first half of a CRLF pair. When
the tokenizer is paused at the insertion point and the next code point
to consume is a CR sitting one position before it, that lookahead has
not been written yet.

Previously the tokenizer consumed the CR and emitted it as LF, so a
subsequent document.write() that began with LF surfaced as a second
LF instead of being absorbed into the original CRLF pair.

Stop one code point earlier in this case and wait for the next write
to arrive. This makes four html5lib write_single WPT tests pass.
2026-04-27 21:44:56 +02:00

233 lines
10 KiB
C++

/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Queue.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <LibGC/Cell.h>
#include <LibGC/Ptr.h>
#include <LibWeb/Export.h>
#include <LibWeb/Forward.h>
#include <LibWeb/HTML/Parser/Entities.h>
#include <LibWeb/HTML/Parser/HTMLToken.h>
namespace Web::HTML {
// X-macro listing every tokenizer state, one __ENUMERATE_TOKENIZER_STATE(name) entry per state.
//
// To use it, define __ENUMERATE_TOKENIZER_STATE(state) to the desired expansion, invoke
// ENUMERATE_TOKENIZER_STATES, then #undef the helper again. HTMLTokenizer does this twice:
// once to generate the enumerators of HTMLTokenizer::State and once to generate the
// `case` labels of HTMLTokenizer::state_name().
//
// The order of the entries determines the underlying values of the State enumerators.
// NOTE(review): the names appear to mirror the tokenizer states of the HTML Standard's
// tokenization section - confirm against the spec before adding or renaming entries.
#define ENUMERATE_TOKENIZER_STATES \
__ENUMERATE_TOKENIZER_STATE(Data) \
__ENUMERATE_TOKENIZER_STATE(RCDATA) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXT) \
__ENUMERATE_TOKENIZER_STATE(ScriptData) \
__ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \
__ENUMERATE_TOKENIZER_STATE(TagOpen) \
__ENUMERATE_TOKENIZER_STATE(EndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(TagName) \
__ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \
__ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \
__ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \
__ENUMERATE_TOKENIZER_STATE(AttributeName) \
__ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \
__ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \
__ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \
__ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \
__ENUMERATE_TOKENIZER_STATE(BogusComment) \
__ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \
__ENUMERATE_TOKENIZER_STATE(CommentStart) \
__ENUMERATE_TOKENIZER_STATE(CommentStartDash) \
__ENUMERATE_TOKENIZER_STATE(Comment) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \
__ENUMERATE_TOKENIZER_STATE(CommentEndDash) \
__ENUMERATE_TOKENIZER_STATE(CommentEnd) \
__ENUMERATE_TOKENIZER_STATE(CommentEndBang) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPE) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \
__ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \
__ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \
__ENUMERATE_TOKENIZER_STATE(CDATASection) \
__ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \
__ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \
__ENUMERATE_TOKENIZER_STATE(CharacterReference) \
__ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \
__ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \
__ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \
__ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
class WEB_API HTMLTokenizer {
public:
explicit HTMLTokenizer();
explicit HTMLTokenizer(StringView input, ByteString const& encoding);
enum class State {
#define __ENUMERATE_TOKENIZER_STATE(state) state,
ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
};
enum class StopAtInsertionPoint {
No,
Yes,
};
Optional<HTMLToken> next_token(StopAtInsertionPoint = StopAtInsertionPoint::No);
void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; }
void switch_to(Badge<HTMLParser>, State new_state);
void switch_to(State new_state)
{
m_state = new_state;
}
void set_blocked(bool b) { m_blocked = b; }
bool is_blocked() const { return m_blocked; }
auto const& source() const { return m_source; }
String unparsed_input() const;
void insert_input_at_insertion_point(StringView input);
void insert_eof();
bool is_eof_inserted();
bool is_insertion_point_defined() const { return m_insertion_point.has_value(); }
bool is_insertion_point_reached() { return m_insertion_point.has_value() && m_current_offset >= *m_insertion_point; }
void undefine_insertion_point() { m_insertion_point = {}; }
void store_old_insertion_point() { m_old_insertion_points.append(m_insertion_point); }
void restore_old_insertion_point() { m_insertion_point = m_old_insertion_points.take_last(); }
void update_insertion_point() { m_insertion_point = m_current_offset; }
// This permanently cuts off the tokenizer input stream.
void abort() { m_aborted = true; }
void parser_did_run(Badge<HTMLParser>);
void visit_edges(GC::Cell::Visitor&);
private:
void skip(size_t count);
Optional<u32> next_code_point(StopAtInsertionPoint);
Optional<u32> peek_code_point(ssize_t offset, StopAtInsertionPoint) const;
enum class ConsumeNextResult {
Consumed,
NotConsumed,
RanOutOfCharacters,
};
[[nodiscard]] ConsumeNextResult consume_next_if_match(StringView, StopAtInsertionPoint, CaseSensitivity = CaseSensitivity::CaseSensitive);
bool should_pause_before_next_input_character(StopAtInsertionPoint) const;
void create_new_token(HTMLToken::Type);
bool current_end_tag_token_is_appropriate() const;
String consume_current_builder();
static char const* state_name(State state)
{
switch (state) {
#define __ENUMERATE_TOKENIZER_STATE(state) \
case State::state: \
return #state;
ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
};
VERIFY_NOT_REACHED();
}
void will_emit(HTMLToken&);
void will_switch_to(State);
void will_reconsume_in(State);
bool consumed_as_part_of_an_attribute() const;
void restore_to(ssize_t new_iterator);
HTMLToken::Position nth_last_position(size_t n = 0);
GC::Ptr<HTMLParser> m_parser;
State m_state { State::Data };
State m_return_state { State::Data };
Vector<u32> m_temporary_buffer;
String m_source;
Vector<u32> m_decoded_input;
Optional<ssize_t> m_insertion_point;
// Spec algorithms have an "old insertion point" local; reentrant script execution can nest those locals.
Vector<Optional<ssize_t>> m_old_insertion_points;
ssize_t m_current_offset { 0 };
ssize_t m_prev_offset { 0 };
HTMLToken m_current_token;
StringBuilder m_current_builder;
NamedCharacterReferenceMatcher m_named_character_reference_matcher;
Optional<FlyString> m_last_emitted_start_tag_name;
bool m_explicit_eof_inserted { false };
bool m_has_emitted_eof { false };
Queue<HTMLToken> m_queued_tokens;
u32 m_character_reference_code { 0 };
bool m_blocked { false };
bool m_aborted { false };
Vector<HTMLToken::Position> m_source_positions;
};
}