Files
ladybird/Meta/Utils/webidl_parser.py
Shannon Booth 6fc06afc81 Meta: Add a Python WebIDL parser
Add a simple python3 WebIDL parser for the subset of IDL we need for
code generation.

For validation, the parser structure is intentionally very close to the
existing C++ parser, so the control flow and parsing mechanics stay easy
to compare while we continue porting IDL-related generators.

This parser currently focuses on the constructs needed by
GenerateWindowOrWorkerInterfaces, but is meant to serve as a base for
future IDL generator ports as well.
2026-04-29 12:24:00 +01:00

563 lines
18 KiB
Python

# Copyright (c) 2026-present, the Ladybird developers.
#
# SPDX-License-Identifier: BSD-2-Clause
from dataclasses import dataclass
from dataclasses import field
from pathlib import Path
from typing import Dict
from typing import List
from typing import NoReturn
from typing import Optional

from Utils.lexer import Lexer
@dataclass
class Constant:
    """A ``const`` member of an interface, stored as raw declaration text."""

    # Statement text as produced by the interface-body scan: leading extended
    # attributes removed, surrounding whitespace stripped, no trailing ';'.
    declaration: str
@dataclass
class SpecialOperation:
    """A special operation (currently only ``getter``) found in an interface body."""

    # Normalized type of the operation's first parameter, e.g. "DOMString"
    # or "unsigned long".
    identifier_type: str
    # Full statement text of the operation, without leading extended attributes.
    declaration: str
@dataclass
class Interface:
    """An interface, callback interface, or namespace parsed from an IDL file.

    The ``*_name`` / ``*_class`` attributes at the bottom are derived values;
    they are empty until ``finalize()`` has been called.
    """

    name: str
    path: Path
    # Extended attribute name -> raw value text ("" when no value was given).
    extended_attributes: Dict[str, str] = field(default_factory=dict)
    is_namespace: bool = False
    is_callback_interface: bool = False
    # Name following ':' in the declaration, or "" when there is none.
    parent_name: str = ""
    constants: List[Constant] = field(default_factory=list)
    # Getter whose first parameter type is "DOMString", if any.
    named_property_getter: Optional[SpecialOperation] = None
    # Getter whose first parameter type is "unsigned long", if any.
    indexed_property_getter: Optional[SpecialOperation] = None
    # Derived by finalize():
    implemented_name: str = ""
    namespaced_name: str = ""
    constructor_class: str = ""
    prototype_class: str = ""
    namespace_class: str = ""

    def supports_named_properties(self) -> bool:
        """Return True when a named property getter was recorded."""
        return self.named_property_getter is not None

    def finalize(self) -> None:
        """Fill in the derived name attributes from ``name`` and the extended attributes."""
        attributes = self.extended_attributes

        # [LegacyNamespace=X] prefixes the name with "X."; a missing or empty
        # value leaves the plain name.
        legacy_namespace = attributes.get("LegacyNamespace")
        self.namespaced_name = f"{legacy_namespace}.{self.name}" if legacy_namespace else self.name

        # [ImplementedAs=Y] overrides the name used for the generated
        # constructor/prototype classes; the namespace class always uses `name`.
        self.implemented_name = attributes.get("ImplementedAs", self.name)
        self.constructor_class = f"{self.implemented_name}Constructor"
        self.prototype_class = f"{self.implemented_name}Prototype"
        self.namespace_class = f"{self.name}Namespace"
@dataclass
class Module:
    """The result of parsing a single IDL file."""

    path: Path
    # The one interface, callback interface, or namespace declared in the
    # file; None until the parser has found it.
    interface: Optional[Interface] = None
@dataclass
class NestingState:
    """Per-delimiter nesting depth used while scanning IDL text character by character."""

    parentheses_depth: int = 0
    bracket_depth: int = 0
    brace_depth: int = 0
    angle_depth: int = 0

    # Delimiter -> name of the depth counter it affects. These are plain class
    # attributes (no annotation), so the dataclass does not treat them as fields.
    _OPENING = {
        "(": "parentheses_depth",
        "[": "bracket_depth",
        "{": "brace_depth",
        "<": "angle_depth",
    }
    _CLOSING = {
        ")": "parentheses_depth",
        "]": "bracket_depth",
        "}": "brace_depth",
        ">": "angle_depth",
    }

    def is_at_top_level(self) -> bool:
        """Return True when no delimiter of any kind is currently open."""
        depths = (self.parentheses_depth, self.bracket_depth, self.brace_depth, self.angle_depth)
        return all(depth == 0 for depth in depths)

    def update_for_character(self, character: str) -> None:
        """Adjust the matching depth counter for ``character``; other characters are ignored."""
        if character in self._OPENING:
            counter = self._OPENING[character]
            setattr(self, counter, getattr(self, counter) + 1)
        elif character in self._CLOSING:
            counter = self._CLOSING[character]
            # An unmatched closer never drives a counter below zero.
            setattr(self, counter, max(getattr(self, counter) - 1, 0))
class ParseError(RuntimeError):
    """Raised when IDL text cannot be parsed."""
def parse_module(path: Path, contents: str) -> Module:
    """Parse the IDL text ``contents`` and return its module.

    ``path`` is recorded on the result and used as the prefix of error messages.
    """
    parser = Parser(path, contents)
    return parser.parse()
class Parser:
    """Parser for the subset of WebIDL needed by the code generators.

    A file must yield exactly one interface, callback interface, or namespace.
    Other top-level declarations (dictionaries, enums, typedefs, callbacks,
    mixins, partials, includes statements) are only recognized so that they
    can be skipped.

    The control flow is intentionally kept close to the existing C++ parser so
    the two stay easy to compare; keep branch order intact when editing.
    """

    def __init__(self, path: Path, contents: str) -> None:
        """Create a parser over ``contents``; ``path`` is used for results and error messages."""
        self.path = path
        self.contents = contents
        self.lexer = Lexer(contents)

    def parse(self) -> Module:
        """Parse the whole file and return the resulting module.

        Raises:
            ParseError: on malformed input, when several different
                interface-like declarations are present, or when none is found.
        """
        module = Module(path=self.path)
        self.consume_whitespace()
        while not self.lexer.is_eof():
            extended_attributes: Dict[str, str] = {}
            if self.lexer.consume_specific("["):
                extended_attributes = self.parse_extended_attributes()

            # Longer keywords must be tested before their prefixes
            # ("partial interface mixin" before "partial interface",
            # "callback interface" before "callback", ...).
            if self.next_is_keyword("dictionary") or self.next_is_keyword("partial dictionary"):
                self.skip_braced_declaration()
            elif self.next_is_keyword("enum"):
                self.skip_braced_declaration()
            elif self.next_is_keyword("typedef"):
                self.consume_keyword("typedef")
                self.consume_statement_text()
            elif self.next_is_keyword("partial interface mixin"):
                self.skip_braced_declaration()
            elif self.next_is_keyword("partial interface"):
                self.skip_braced_declaration()
            elif self.next_is_keyword("interface mixin"):
                self.skip_braced_declaration()
            elif self.next_is_keyword("partial namespace"):
                self.skip_braced_declaration()
            elif self.next_is_keyword("callback interface"):
                module.interface = self.set_or_check_module_interface(
                    module.interface,
                    self.parse_interface(extended_attributes, is_callback_interface=True),
                )
            elif self.next_is_keyword("callback"):
                self.consume_keyword("callback")
                self.consume_statement_text()
            elif self.next_is_keyword("namespace"):
                module.interface = self.set_or_check_module_interface(
                    module.interface,
                    self.parse_interface(extended_attributes, is_namespace=True),
                )
            elif self.next_is_keyword("interface"):
                module.interface = self.set_or_check_module_interface(
                    module.interface,
                    self.parse_interface(extended_attributes),
                )
            else:
                self.parse_includes_statement()
            self.consume_whitespace()

        if module.interface is None:
            self.raise_parse_error("did not find an interface, callback interface, or namespace")
        return module

    def set_or_check_module_interface(
        self, existing_interface: Optional[Interface], candidate_interface: Interface
    ) -> Interface:
        """Return the interface the module should keep after parsing ``candidate_interface``.

        The first declaration wins. A later non-namespace declaration with the
        same name is tolerated and ignored; anything else is a parse error.
        """
        if existing_interface is None:
            return candidate_interface
        if not existing_interface.is_namespace and existing_interface.name == candidate_interface.name:
            return existing_interface
        self.raise_parse_error(
            "encountered multiple interface, callback interface, or namespace declarations in one file"
        )

    def parse_includes_statement(self) -> None:
        """Consume ``<identifier> includes <identifier>;`` and discard it.

        This is the fallback for anything that is not a known declaration, so
        its error message covers both possibilities.
        """
        self.parse_identifier_ending_with_space()
        self.consume_whitespace()
        if not self.next_is_keyword("includes"):
            self.raise_parse_error("expected a declaration or an includes statement")
        self.consume_keyword("includes")
        self.consume_whitespace()
        self.parse_identifier_ending_with_space_or(";")
        self.consume_whitespace()
        self.assert_specific(";")

    def parse_interface(
        self,
        extended_attributes: Dict[str, str],
        is_namespace: bool = False,
        is_callback_interface: bool = False,
    ) -> Interface:
        """Parse an interface, callback interface, or namespace declaration.

        The lexer must be positioned at the introducing keyword. The returned
        interface has already been finalized. Namespace bodies are consumed
        but their members are not inspected.
        """
        if is_callback_interface:
            self.consume_keyword("callback")
            self.consume_whitespace()
            self.consume_keyword("interface")
        elif is_namespace:
            self.consume_keyword("namespace")
        else:
            self.consume_keyword("interface")
        self.consume_whitespace()

        interface = Interface(
            name=self.parse_identifier_ending_with_space_or(":", "{"),
            path=self.path,
            extended_attributes=extended_attributes,
            is_namespace=is_namespace,
            is_callback_interface=is_callback_interface,
        )
        self.consume_whitespace()

        # Optional ": Parent" inheritance clause (not applicable to namespaces).
        if not is_namespace and self.lexer.consume_specific(":"):
            self.consume_whitespace()
            interface.parent_name = self.parse_identifier_ending_with_space_or("{")
            self.consume_whitespace()

        body_text = self.consume_braced_block()
        self.consume_whitespace()
        self.assert_specific(";")

        if not is_namespace:
            self.parse_interface_body(interface, body_text)
        interface.finalize()
        return interface

    def parse_interface_body(self, interface: Interface, body_text: str) -> None:
        """Record the constants and special getters found in ``body_text`` on ``interface``.

        All other members are ignored. A getter whose first parameter type is
        neither ``DOMString`` nor ``unsigned long`` is a parse error.
        """
        for statement in split_top_level_statements(remove_line_comments(body_text)):
            if not statement:
                continue
            stripped_statement = strip_leading_extended_attributes(statement).strip()
            if not stripped_statement:
                continue

            if stripped_statement.startswith("const "):
                interface.constants.append(Constant(stripped_statement))
                continue

            if stripped_statement.startswith("getter "):
                identifier_type = parse_special_operation_identifier_type(stripped_statement)
                special_operation = SpecialOperation(identifier_type=identifier_type, declaration=stripped_statement)
                if identifier_type == "DOMString":
                    interface.named_property_getter = special_operation
                elif identifier_type == "unsigned long":
                    interface.indexed_property_getter = special_operation
                else:
                    self.raise_parse_error(
                        f"named/indexed property getter must use DOMString or unsigned long, got '{identifier_type}'"
                    )

    def parse_extended_attributes(self) -> Dict[str, str]:
        """Parse an extended attribute list whose opening ``[`` was already consumed.

        Returns a mapping of attribute name to its raw, stripped value text
        ("" for attributes without ``=``). Trailing whitespace after the
        closing ``]`` is consumed as well.

        Raises:
            ParseError: if the input ends before the closing ``]``.
        """
        extended_attributes: Dict[str, str] = {}
        while True:
            self.consume_whitespace()
            if self.lexer.consume_specific("]"):
                break
            # Without this check nothing below makes progress at end of input
            # and the loop would never terminate on an unterminated list.
            if self.lexer.is_eof():
                self.raise_parse_error("unterminated extended attribute list")

            name = self.parse_identifier_ending_with_space_or("]", "=", ",")
            # Allow whitespace between the name and a following '=' or ','
            # so "[Foo = Bar]" is read as Foo -> "Bar".
            self.consume_whitespace()
            value = ""
            if self.lexer.consume_specific("="):
                value = self.consume_extended_attribute_value().strip()
            extended_attributes[name] = value
            self.lexer.consume_specific(",")
        self.consume_whitespace()
        return extended_attributes

    def consume_extended_attribute_value(self) -> str:
        """Consume and return the text of one extended attribute value.

        The value ends at the first ``,`` or ``]`` outside parentheses, or at
        end of input; the terminator itself is left unconsumed.
        """
        start = self.lexer.tell()
        parentheses_depth = 0
        while not self.lexer.is_eof():
            character = self.lexer.peek()
            if character == "(":
                parentheses_depth += 1
            elif character == ")":
                # Unbalanced ')' is tolerated rather than driving the depth negative.
                if parentheses_depth > 0:
                    parentheses_depth -= 1
            elif parentheses_depth == 0 and character in (",", "]"):
                break
            self.lexer.consume()
        return self.contents[start : self.lexer.tell()]

    def skip_braced_declaration(self) -> None:
        """Skip a declaration of the form ``... { ... };`` without interpreting it."""
        while not self.lexer.is_eof() and self.lexer.peek() != "{":
            self.lexer.consume()
        if self.lexer.is_eof():
            self.raise_parse_error("expected '{' while skipping declaration")
        self.consume_braced_block()
        self.consume_whitespace()
        self.assert_specific(";")

    def consume_braced_block(self) -> str:
        """Consume a ``{ ... }`` block and return the text between the outer braces.

        Braces inside quoted text or ``//`` comments do not count towards nesting.

        Raises:
            ParseError: if the next character is not ``{`` or the block is never closed.
        """
        self.assert_specific("{")
        start = self.lexer.tell()
        brace_depth = 1
        active_quote = ""
        while not self.lexer.is_eof():
            character = self.lexer.consume()
            if active_quote:
                if character == active_quote:
                    active_quote = ""
                continue
            if character == "/" and self.lexer.peek() == "/":
                self._skip_rest_of_line()
                continue
            if character in ('"', "'"):
                active_quote = character
                continue
            if character == "{":
                brace_depth += 1
            elif character == "}":
                brace_depth -= 1
                if brace_depth == 0:
                    # Exclude the closing brace that was just consumed.
                    end = self.lexer.tell() - 1
                    return self.contents[start:end]
        self.raise_parse_error("unterminated declaration body")

    def consume_statement_text(self) -> str:
        """Consume up to and including the next top-level ``;`` and return the text before it.

        A ``;`` inside quotes, ``//`` comments, or any open bracket pair does
        not terminate the statement.

        Raises:
            ParseError: if the input ends before a terminating ``;``.
        """
        start = self.lexer.tell()
        nesting_state = NestingState()
        active_quote = ""
        while not self.lexer.is_eof():
            character = self.lexer.consume()
            if active_quote:
                if character == active_quote:
                    active_quote = ""
                continue
            if character == "/" and self.lexer.peek() == "/":
                self._skip_rest_of_line()
                continue
            if character in ('"', "'"):
                active_quote = character
                continue
            if character == ";" and nesting_state.is_at_top_level():
                # Exclude the ';' that was just consumed.
                end = self.lexer.tell() - 1
                return self.contents[start:end]
            nesting_state.update_for_character(character)
        self.raise_parse_error("unterminated statement")

    def parse_identifier_ending_with_space(self) -> str:
        """Consume an identifier terminated by whitespace."""
        return self.parse_identifier_ending_with_space_or()

    def parse_identifier_ending_with_space_or(self, *terminators: str) -> str:
        """Consume an identifier terminated by whitespace or any of ``terminators``.

        The terminator is not consumed. Leading underscores are removed from
        the returned identifier.
        """
        identifier = self.lexer.consume_until(lambda character: character.isspace() or character in terminators)
        return identifier.lstrip("_")

    def consume_whitespace(self) -> None:
        """Skip any run of whitespace and ``//`` line comments."""
        consumed = True
        while consumed:
            consumed = False
            if self.lexer.consume_while(str.isspace):
                consumed = True
            if self.lexer.consume_specific("//"):
                self._skip_rest_of_line()
                consumed = True

    def _skip_rest_of_line(self) -> None:
        """Discard input up to the next newline, and the newline itself when present."""
        self.lexer.ignore_until("\n")
        if self.lexer.peek() == "\n":
            self.lexer.ignore()

    def next_is_keyword(self, keyword: str) -> bool:
        """Return True when ``keyword`` is next and is not merely a prefix of a longer word.

        The keyword must be followed by end of input, whitespace, or one of
        the characters ``{ ( [ ; :``.
        """
        if not self.lexer.next_is(keyword):
            return False
        end = self.lexer.tell() + len(keyword)
        if end >= len(self.contents):
            return True
        trailing_character = self.contents[end]
        return trailing_character.isspace() or trailing_character in "{([;:"

    def consume_keyword(self, keyword: str) -> None:
        """Consume ``keyword`` or raise a parse error when it is not next."""
        if not self.next_is_keyword(keyword):
            self.raise_parse_error(f"expected '{keyword}'")
        self.lexer.ignore(len(keyword))

    def assert_specific(self, expected_character: str) -> None:
        """Consume ``expected_character`` or raise a parse error when it is not next."""
        if not self.lexer.consume_specific(expected_character):
            self.raise_parse_error(f"expected '{expected_character}'")

    def raise_parse_error(self, message: str) -> NoReturn:
        """Raise ``ParseError`` with ``path:line:column`` of the current lexer position.

        Line and column are 1-based. This method never returns.
        """
        consumed_text = self.contents[: self.lexer.tell()]
        line_number = consumed_text.count("\n") + 1
        # rfind() yields -1 when there is no newline, making the column simply
        # the number of consumed characters plus one.
        column_number = len(consumed_text) - consumed_text.rfind("\n")
        raise ParseError(f"{self.path}:{line_number}:{column_number}: error: {message}")
def remove_line_comments(text: str) -> str:
    """Return ``text`` with ``//`` line comments removed.

    A ``//`` inside single- or double-quoted text is not a comment and is
    kept; quote state carries across lines, matching how the rest of this
    module scans quoted text. Lines are split with ``str.splitlines`` and
    re-joined with ``"\\n"``, so a trailing newline is not preserved.
    """
    lines = []
    active_quote = ""
    for line in text.splitlines():
        comment_index = len(line)
        for index, character in enumerate(line):
            if active_quote:
                if character == active_quote:
                    active_quote = ""
            elif character == "/" and line.startswith("//", index):
                comment_index = index
                break
            elif character in ('"', "'"):
                active_quote = character
        lines.append(line[:comment_index])
    return "\n".join(lines)
def split_top_level_statements(text: str) -> List[str]:
    """Split ``text`` at every ``;`` that is outside quotes and bracket pairs.

    Each piece is whitespace-stripped. Pieces that ended in ``;`` are always
    included (even when empty); text after the last ``;`` is included only
    when it is non-empty.
    """
    pieces: List[str] = []
    piece_start = 0
    depth = NestingState()
    quote = ""
    for position, char in enumerate(text):
        if quote:
            # Inside quoted text only the matching quote character matters.
            if char == quote:
                quote = ""
        elif char == '"' or char == "'":
            quote = char
        elif char == ";" and depth.is_at_top_level():
            pieces.append(text[piece_start:position].strip())
            piece_start = position + 1
        else:
            depth.update_for_character(char)

    remainder = text[piece_start:].strip()
    if remainder:
        pieces.append(remainder)
    return pieces
def strip_leading_extended_attributes(text: str) -> str:
    """Return ``text`` without leading whitespace and leading ``[...]`` groups.

    Several consecutive groups are all removed; brackets nested inside a
    group are matched properly.
    """
    rest = text.lstrip()
    while rest[:1] == "[":
        group_end = find_matching_closer(rest, 0, "[", "]")
        rest = rest[group_end + 1 :].lstrip()
    return rest
def parse_special_operation_identifier_type(statement: str) -> str:
    """Return the normalized type of the first parameter of a special operation.

    The parameter list is the final parenthesized group of ``statement``. In
    its first parameter, everything before the last top-level whitespace is
    taken as the type (the rest being the parameter name).

    Raises:
        ParseError: if there is no trailing parameter list or the first
            parameter has no top-level whitespace to split on.
    """
    arguments = extract_final_parenthesized_group(statement)
    first_argument = split_top_level_items(arguments, ",")[0].strip()
    first_argument = strip_leading_extended_attributes(first_argument)

    split_at = find_last_top_level_whitespace(first_argument)
    if split_at != -1:
        return normalize_whitespace(first_argument[:split_at])
    raise ParseError(f"could not determine getter identifier type from '{statement}'")
def extract_final_parenthesized_group(text: str) -> str:
    """Return the contents of the parenthesized group that ends ``text``.

    Trailing whitespace is ignored; the surrounding parentheses are not
    part of the result.

    Raises:
        ParseError: if ``text`` does not end with ``)`` or the group has no
            matching ``(``.
    """
    trimmed = text.rstrip()
    if trimmed[-1:] != ")":
        raise ParseError(f"expected a trailing parameter list in '{text}'")
    last = len(trimmed) - 1
    first = find_matching_opener(trimmed, last, "(", ")")
    return trimmed[first + 1 : last]
def split_top_level_items(text: str, delimiter: str) -> List[str]:
    """Split ``text`` at every ``delimiter`` that is outside quotes and bracket pairs.

    Items are returned unstripped, and the final item is always included,
    so the result is never empty.
    """
    pieces: List[str] = []
    piece_start = 0
    depth = NestingState()
    quote = ""
    for position, char in enumerate(text):
        if quote:
            # Inside quoted text only the matching quote character matters.
            if char == quote:
                quote = ""
        elif char == '"' or char == "'":
            quote = char
        elif char == delimiter and depth.is_at_top_level():
            pieces.append(text[piece_start:position])
            piece_start = position + 1
        else:
            depth.update_for_character(char)
    pieces.append(text[piece_start:])
    return pieces
def find_last_top_level_whitespace(text: str) -> int:
    """Return the index of the last whitespace character outside all bracket pairs, or -1."""
    found_at = -1
    depth = NestingState()
    for position, char in enumerate(text):
        # The nesting test uses the state *before* this character is applied.
        if depth.is_at_top_level() and char.isspace():
            found_at = position
        depth.update_for_character(char)
    return found_at
def find_matching_closer(text: str, start_index: int, opener: str, closer: str) -> int:
    """Scan forward from ``start_index`` and return the index of the balancing ``closer``.

    Raises:
        ParseError: if the end of ``text`` is reached without the nesting
            returning to zero on a ``closer``.
    """
    balance = 0
    position = start_index
    text_length = len(text)
    while position < text_length:
        current = text[position]
        if current == opener:
            balance += 1
        elif current == closer:
            balance -= 1
            if balance == 0:
                return position
        position += 1
    raise ParseError(f"unterminated '{opener}' in '{text}'")
def find_matching_opener(text: str, start_index: int, opener: str, closer: str) -> int:
    """Scan backward from ``start_index`` and return the index of the balancing ``opener``.

    Raises:
        ParseError: if the start of ``text`` is reached without the nesting
            returning to zero on an ``opener``.
    """
    balance = 0
    position = start_index
    while position >= 0:
        current = text[position]
        if current == closer:
            balance += 1
        elif current == opener:
            balance -= 1
            if balance == 0:
                return position
        position -= 1
    raise ParseError(f"unterminated '{opener}' in '{text}'")
def normalize_whitespace(text: str) -> str:
    """Collapse every whitespace run in ``text`` to one space and trim both ends."""
    words = text.split()
    return " ".join(words)