Files
ladybird/Meta/Utils/CSSGrammar/Parser/tokenizer.py
Sam Atkins 39c997b102 Meta: Use consume_specific in Python CSS Grammar tokenizer
A stealth merge conflict meant consume_specific_string() and
consume_specific_char() got combined into a single method and no longer
exist.
2026-04-29 13:23:53 +01:00

171 lines
6.2 KiB
Python

from math import inf
from Utils.CSSGrammar.Parser.component_values import Keyword
from Utils.CSSGrammar.Parser.component_values import NumericTypeRangeRestriction
from Utils.CSSGrammar.Parser.component_values import Type
from Utils.CSSGrammar.Parser.component_values import is_dimension_percentage_mix_type
from Utils.CSSGrammar.Parser.component_values import is_dimension_type
from Utils.CSSGrammar.Parser.component_values import is_numeric_type
from Utils.CSSGrammar.Parser.token import Token
from Utils.CSSGrammar.Parser.token import TokenType
from Utils.lexer import Lexer
def is_identifier_character(ch: str) -> bool:
    """Return True if ``ch`` is ASCII and is either alphanumeric or a hyphen ("-")."""
    if not ch.isascii():
        return False
    return ch == "-" or ch.isalnum()
class Tokenizer:
def __init__(self, input: str) -> None:
self.lexer = Lexer(input)
@classmethod
def tokenize(cls, input: str) -> list[Token]:
return cls(input).tokenize_impl()
def tokenize_impl(self) -> list[Token]:
tokens = []
while True:
self.discard_whitespace()
if self.lexer.is_eof():
tokens.append(Token.create(TokenType.END_OF_FILE))
return tokens
tokens.append(self.consume_a_token())
def discard_whitespace(self) -> None:
self.lexer.ignore_while(lambda ch: ch.isspace() and ch.isascii())
def consume_an_identifier(self) -> str:
return self.lexer.consume_while(is_identifier_character)
def consume_a_token(self) -> Token:
match self.lexer.peek():
case "|":
self.lexer.consume()
return Token.create(TokenType.SINGLE_BAR)
case "<":
return self.consume_a_non_terminal_token()
if is_identifier_character(self.lexer.peek()):
return self.consume_a_keyword_token()
raise SyntaxError("CSSGrammar::Tokenizer: Unexpected character")
def consume_custom_ident_blacklist(self) -> list[str]:
# NB: This notation isn't yet included in the spec but we use it internally and the CSSWG has resolved to add it in
# https://github.com/w3c/csswg-drafts/issues/11924
self.discard_whitespace()
if not self.lexer.consume_specific("!["):
return []
blacklist = []
while True:
self.discard_whitespace()
ident = self.consume_an_identifier()
if not ident:
raise SyntaxError("Expected identifier in custom-ident blacklist")
blacklist.append(ident)
self.discard_whitespace()
if self.lexer.consume_specific("]"):
return blacklist
if not self.lexer.consume_specific(","):
raise SyntaxError("Expected ',' in custom-ident blacklist")
# https://drafts.csswg.org/css-values-4/#css-bracketed-range-notation
def consume_bracketed_range_notation(self, type_name: str) -> NumericTypeRangeRestriction:
self.discard_whitespace()
# If no range is indicated, either by using the bracketed range notation or in the property description, then
# [-∞,∞] is assumed.
if not self.lexer.consume_specific("["):
return NumericTypeRangeRestriction(-inf, inf)
self.discard_whitespace()
minimum = self.consume_bracketed_range_bound(type_name)
self.discard_whitespace()
if not self.lexer.consume_specific(","):
raise SyntaxError("Expected ',' in bracketed range notation")
self.discard_whitespace()
maximum = self.consume_bracketed_range_bound(type_name)
self.discard_whitespace()
if not self.lexer.consume_specific("]"):
raise SyntaxError("Expected ']' to close bracketed range notation")
return NumericTypeRangeRestriction(minimum, maximum)
def consume_bracketed_range_bound(self, type_name: str) -> float:
# Values of -∞ or ∞ must be written without units, even if the value type uses units.
if self.lexer.consume_specific("-∞"):
return -inf
if self.lexer.consume_specific(""):
return inf
# FIXME: Do we need to allow non-integer values?
bound_value = self.consume_decimal_integer()
if bound_value != 0 and (is_dimension_percentage_mix_type(type_name) or type_name == "length"):
raise SyntaxError("Types with units not resolvable at parse time only support zero and infinite bounds")
# FIXME: Validate and store the unit, for now we drop it and assume it was the relevant canonical unit.
unit = self.lexer.consume_while(is_identifier_character)
if unit and not is_dimension_type(type_name) and not is_dimension_percentage_mix_type(type_name):
raise SyntaxError("Unexpected unit for unitless bound value")
if not unit and bound_value != 0 and is_dimension_type(type_name):
raise SyntaxError("Expected unit for non-zero, non-infinite bound value")
return float(bound_value)
def consume_decimal_integer(self) -> int:
sign = 1
if self.lexer.consume_specific("-"):
sign = -1
digits = self.lexer.consume_while(lambda ch: ch.isdigit())
if not digits:
raise SyntaxError("Expected decimal integer")
return sign * int(digits)
def consume_a_non_terminal_token(self) -> Token:
assert self.lexer.consume_specific("<")
name = self.consume_an_identifier()
if not name:
raise SyntaxError("CSSGrammar::Tokenizer: Expected a type name")
custom_ident_blacklist = None
if name == "custom-ident":
custom_ident_blacklist = self.consume_custom_ident_blacklist()
numeric_type_accepted_range = None
if is_numeric_type(name) or is_dimension_percentage_mix_type(name):
numeric_type_accepted_range = self.consume_bracketed_range_notation(name)
if not self.lexer.consume_specific(">"):
raise SyntaxError("CSSGrammar::Tokenizer: Expected '>'")
return Token.create_component_value(Type(name, custom_ident_blacklist, numeric_type_accepted_range))
def consume_a_keyword_token(self) -> Token:
value = self.consume_an_identifier()
assert value
return Token.create_component_value(Keyword(value))