From feced062a9119d6f8279cf9d002f9a93926a8ca4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20W=C3=BClker?= Date: Wed, 8 Jan 2025 15:54:59 +0100 Subject: [PATCH] Replace manual code generation with procedural macro (#567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Replace manual code generation with procedural macros This makes LSPs like rust-analyzer pick up the rules.rs file, making it much easier to edit. Signed-off-by: Simon Wülker * Format rules.rs Signed-off-by: Simon Wülker --------- Signed-off-by: Simon Wülker --- Cargo.toml | 7 +- html5ever/Cargo.toml | 7 +- html5ever/build.rs | 39 --- html5ever/src/tree_builder/mod.rs | 6 +- html5ever/src/tree_builder/rules.rs | 16 +- match_token/Cargo.toml | 12 + .../match_token.rs => match_token/src/lib.rs | 244 ++++++------------ 7 files changed, 105 insertions(+), 226 deletions(-) delete mode 100644 html5ever/build.rs create mode 100644 match_token/Cargo.toml rename html5ever/macros/match_token.rs => match_token/src/lib.rs (63%) diff --git a/Cargo.toml b/Cargo.toml index c857b21f..eed06f4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,11 @@ members = [ "markup5ever", "html5ever", "rcdom", - "xml5ever" + "xml5ever", + "match_token" ] + +[workspace.dependencies] +match_token = { path = "match_token" } + resolver = "2" diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml index 12463fb0..75cdd454 100644 --- a/html5ever/Cargo.toml +++ b/html5ever/Cargo.toml @@ -6,7 +6,6 @@ license = "MIT OR Apache-2.0" repository = "https://github.com/servo/html5ever" description = "High-performance browser-grade HTML5 parser" documentation = "https://docs.rs/html5ever" -build = "build.rs" categories = [ "parser-implementations", "web-programming" ] keywords = ["html", "html5", "parser", "parsing"] edition = "2021" @@ -19,16 +18,12 @@ trace_tokenizer = [] log = "0.4" mac = "0.1" markup5ever = { version = "0.14", path = "../markup5ever" } +match_token = { workspace = true } 
[dev-dependencies] criterion = "0.5" typed-arena = "2.0.2" -[build-dependencies] -quote = "1" -syn = { version = "2", features = ["extra-traits", "full", "fold"] } -proc-macro2 = "1" - [[bench]] name = "html5ever" harness = false diff --git a/html5ever/build.rs b/html5ever/build.rs deleted file mode 100644 index 327c707e..00000000 --- a/html5ever/build.rs +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2014-2017 The html5ever Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use std::env; -use std::path::Path; -use std::thread::Builder; - -#[path = "macros/match_token.rs"] -mod match_token; - -fn main() { - let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); - - let input = Path::new(&manifest_dir).join("src/tree_builder/rules.rs"); - let output = Path::new(&env::var("OUT_DIR").unwrap()).join("rules.rs"); - println!("cargo:rerun-if-changed={}", input.display()); - - #[cfg(target_os = "haiku")] - let stack_size = 16; - - #[cfg(not(target_os = "haiku"))] - let stack_size = 128; - - // We have stack overflows on Servo's CI. - let handle = Builder::new() - .stack_size(stack_size * 1024 * 1024) - .spawn(move || { - match_token::expand(&input, &output); - }) - .unwrap(); - - handle.join().unwrap(); -} diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index 736f4d9f..88afa725 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -43,13 +43,9 @@ pub use self::PushFlag::*; mod tag_sets; mod data; +mod rules; mod types; -#[allow(warnings)] -mod autogenerated { - include!(concat!(env!("OUT_DIR"), "/rules.rs")); -} - /// Tree builder options, with an impl for Default. 
#[derive(Copy, Clone)] pub struct TreeBuilderOpts { diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs index 9bce3570..eec6faf7 100644 --- a/html5ever/src/tree_builder/rules.rs +++ b/html5ever/src/tree_builder/rules.rs @@ -10,7 +10,7 @@ // The tree builder rules, as a single, enormous nested match expression. use crate::interface::Quirks; -use crate::tokenizer::states::{Plaintext, Rawtext, Rcdata, ScriptData}; +use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData}; use crate::tokenizer::TagKind::{EndTag, StartTag}; use crate::tree_builder::tag_sets::*; use crate::tree_builder::types::*; @@ -19,12 +19,11 @@ use crate::tree_builder::{ TreeSink, }; use crate::QualName; -use markup5ever::{expanded_name, local_name, namespace_prefix, namespace_url, ns}; +use markup5ever::{expanded_name, local_name, namespace_url, ns}; use std::borrow::Cow::Borrowed; -use std::borrow::ToOwned; - use crate::tendril::SliceExt; +use match_token::match_token; fn any_not_whitespace(x: &StrTendril) -> bool { // FIXME: this might be much faster as a byte scan @@ -421,12 +420,9 @@ where } } - match to_close { - Some(name) => { - self.generate_implied_end_except(name.clone()); - self.expect_to_close(name); - } - None => (), + if let Some(name) = to_close { + self.generate_implied_end_except(name.clone()); + self.expect_to_close(name); } self.close_p_element_in_button_scope(); diff --git a/match_token/Cargo.toml b/match_token/Cargo.toml new file mode 100644 index 00000000..f5208577 --- /dev/null +++ b/match_token/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "match_token" +version = "0.1.0" +edition = "2021" + +[dependencies] +syn = "2" +quote = "1" +proc-macro2 = "1" + +[lib] +proc-macro = true \ No newline at end of file diff --git a/html5ever/macros/match_token.rs b/match_token/src/lib.rs similarity index 63% rename from html5ever/macros/match_token.rs rename to match_token/src/lib.rs index 2eb64fee..16f4de99 100644 --- 
a/html5ever/macros/match_token.rs +++ b/match_token/src/lib.rs @@ -1,139 +1,83 @@ -// Copyright 2014-2017 The html5ever Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -/*! - -Implements the `match_token!()` macro for use by the HTML tree builder -in `src/tree_builder/rules.rs`. - - -## Example - -```rust -match_token!(token { - CommentToken(text) => 1, - - tag @ => 2, - - => 3, - -
</body> </html> </br> => else, - - tag @ </_> => 4, - - token => 5, -}) -``` - - -## Syntax - -Because of the simplistic parser, the macro invocation must -start with exactly `match_token!(token {` (with whitespace as specified) -and end with exactly `})`. - -The left-hand side of each match arm is an optional `name @` binding, followed by - - - an ordinary Rust pattern that starts with an identifier or an underscore, or - - - a sequence of HTML tag names as identifiers, each inside "<...>" or "</...>" - to match an open or close tag respectively, or - - - a "wildcard tag" "<_>" or "</_>" to match all open tags or all close tags - respectively. - -The right-hand side is either an expression or the keyword `else`. - -Note that this syntax does not support guards or pattern alternation like -`Foo | Bar`. This is not a fundamental limitation; it's done for implementation -simplicity. - - -## Semantics - -Ordinary Rust patterns match as usual. If present, the `name @` binding has -the usual meaning. - -A sequence of named tags matches any of those tags. A single sequence can -contain both open and close tags. If present, the `name @` binding binds (by -move) the `Tag` struct, not the outer `Token`. That is, a match arm like - -```rust -tag @ <html> <head> => ... -``` - -expands to something like - -```rust -TagToken(tag @ Tag { name: local_name!("html"), kind: StartTag }) -| TagToken(tag @ Tag { name: local_name!("head"), kind: StartTag }) => ... -``` - -A wildcard tag matches any tag of the appropriate kind, *unless* it was -previously matched with an `else` right-hand side (more on this below). - -The expansion of this macro reorders code somewhat, to satisfy various -restrictions arising from moves. However it provides the semantics of in-order -matching, by enforcing the following restrictions on its input: - - - The last pattern must be a variable or the wildcard "_". In other words - it must match everything. 
- - - Otherwise, ordinary Rust patterns and specific-tag patterns cannot appear - after wildcard tag patterns. - - - No tag name may appear more than once. - - - A wildcard tag pattern may not occur in the same arm as any other tag. - "<_> => ..." and "<_> => ..." are both forbidden. - - - The right-hand side "else" may only appear with specific-tag patterns. - It means that these specific tags should be handled by the last, - catch-all case arm, rather than by any wildcard tag arm. This situation - is common in the HTML5 syntax. -*/ +extern crate proc_macro; use quote::quote; -use syn::{braced, parse_quote, Token}; +use syn::{braced, Token}; -use proc_macro2::TokenStream; -use quote::ToTokens; use std::collections::HashSet; -use std::fs::File; -use std::io::{Read, Write}; -use std::path::Path; use syn::ext::IdentExt; -use syn::fold::Fold; use syn::parse::{Parse, ParseStream, Result}; -pub fn expand(from: &Path, to: &Path) { - let mut source = String::new(); - File::open(from) - .unwrap() - .read_to_string(&mut source) - .unwrap(); - let ast = syn::parse_file(&source).expect("Parsing rules.rs module"); - let mut m = MatchTokenParser {}; - let ast = m.fold_file(ast); - let code = ast - .into_token_stream() - .to_string() - .replace("{ ", "{\n") - .replace(" }", "\n}"); - File::create(to) - .unwrap() - .write_all(code.as_bytes()) - .unwrap(); +/// Implements the `match_token!()` macro for use by the HTML tree builder +/// in `src/tree_builder/rules.rs`. +/// +/// ## Example +/// +/// ```rust,ignore +/// match_token!(token { +/// CommentToken(text) => 1, +/// tag @ => 2, +/// => 3, +///
</body> </html> </br> => else, +/// tag @ </_> => 4, +/// token => 5, +/// }) +/// ``` +/// +/// ## Syntax +/// Because of the simplistic parser, the macro invocation must +/// start with exactly `match_token!(token {` (with whitespace as specified) +/// and end with exactly `})`. +/// The left-hand side of each match arm is an optional `name @` binding, followed by +/// - an ordinary Rust pattern that starts with an identifier or an underscore, or +/// - a sequence of HTML tag names as identifiers, each inside "<...>" or "</...>" +/// to match an open or close tag respectively, or +/// - a "wildcard tag" "<_>" or "</_>" to match all open tags or all close tags +/// respectively. +/// +/// The right-hand side is either an expression or the keyword `else`. +/// Note that this syntax does not support guards or pattern alternation like +/// `Foo | Bar`. This is not a fundamental limitation; it's done for implementation +/// simplicity. +/// ## Semantics +/// Ordinary Rust patterns match as usual. If present, the `name @` binding has +/// the usual meaning. +/// A sequence of named tags matches any of those tags. A single sequence can +/// contain both open and close tags. If present, the `name @` binding binds (by +/// move) the `Tag` struct, not the outer `Token`. That is, a match arm like +/// ```rust,ignore +/// tag @ <html> <head> => ... +/// ``` +/// expands to something like +/// ```rust,ignore +/// TagToken(tag @ Tag { name: local_name!("html"), kind: StartTag }) +/// | TagToken(tag @ Tag { name: local_name!("head"), kind: StartTag }) => ... +/// ``` +/// A wildcard tag matches any tag of the appropriate kind, *unless* it was +/// previously matched with an `else` right-hand side (more on this below). +/// The expansion of this macro reorders code somewhat, to satisfy various +/// restrictions arising from moves. However it provides the semantics of in-order +/// matching, by enforcing the following restrictions on its input: +/// - The last pattern must be a variable or the wildcard "_". 
In other words +/// it must match everything. +/// - Otherwise, ordinary Rust patterns and specific-tag patterns cannot appear +/// after wildcard tag patterns. +/// - No tag name may appear more than once. +/// - A wildcard tag pattern may not occur in the same arm as any other tag. +/// "<_> => ..." and "<_> => ..." are both forbidden. +/// - The right-hand side "else" may only appear with specific-tag patterns. +/// It means that these specific tags should be handled by the last, +/// catch-all case arm, rather than by any wildcard tag arm. This situation +/// is common in the HTML5 syntax. +#[proc_macro] +pub fn match_token(input: proc_macro::TokenStream) -> proc_macro::TokenStream { + let input = proc_macro2::TokenStream::from(input); + + let match_token = syn::parse2::(input).expect("Parsing match_token! input failed"); + let output = expand_match_token_macro(match_token); + + proc_macro::TokenStream::from(output) } -struct MatchTokenParser {} - struct MatchToken { ident: syn::Ident, arms: Vec, @@ -163,7 +107,7 @@ enum TagKind { // Option is None if wildcard #[derive(PartialEq, Eq, Hash, Clone)] -pub struct Tag { +struct Tag { kind: TagKind, name: Option, } @@ -250,13 +194,7 @@ impl Parse for MatchToken { } } -pub fn expand_match_token(body: &TokenStream) -> syn::Expr { - let match_token = syn::parse2::(body.clone()); - let ast = expand_match_token_macro(match_token.unwrap()); - syn::parse2(ast).unwrap() -} - -fn expand_match_token_macro(match_token: MatchToken) -> TokenStream { +fn expand_match_token_macro(match_token: MatchToken) -> proc_macro2::TokenStream { let mut arms = match_token.arms; let to_be_matched = match_token.ident; // Handle the last arm specially at the end. @@ -267,11 +205,11 @@ fn expand_match_token_macro(match_token: MatchToken) -> TokenStream { // Case arms for wildcard matching. We collect these and // emit them later. 
- let mut wildcards_patterns: Vec = Vec::new(); + let mut wildcards_patterns: Vec = Vec::new(); let mut wildcards_expressions: Vec = Vec::new(); // Tags excluded (by an 'else' RHS) from wildcard matching. - let mut wild_excluded_patterns: Vec = Vec::new(); + let mut wild_excluded_patterns: Vec = Vec::new(); let mut arms_code = Vec::new(); @@ -290,7 +228,7 @@ fn expand_match_token_macro(match_token: MatchToken) -> TokenStream { // ordinary pattern => expression (Lhs::Pattern(pat), Rhs::Expression(expr)) => { if !wildcards_patterns.is_empty() { - panic!("ordinary patterns may not appear after wildcard tags {pat:?} {expr:?}"); + panic!("ordinary patterns may not appear after wildcard tags"); } arms_code.push(quote!(#binding #pat => #expr,)) }, @@ -304,7 +242,8 @@ fn expand_match_token_macro(match_token: MatchToken) -> TokenStream { if tag.name.is_none() { panic!("'else' may not appear with a wildcard tag"); } - wild_excluded_patterns.push(make_tag_pattern(&TokenStream::new(), tag)); + wild_excluded_patterns + .push(make_tag_pattern(&proc_macro2::TokenStream::new(), tag)); } }, @@ -413,32 +352,7 @@ fn expand_match_token_macro(match_token: MatchToken) -> TokenStream { } } -impl Fold for MatchTokenParser { - fn fold_stmt(&mut self, stmt: syn::Stmt) -> syn::Stmt { - if let syn::Stmt::Item(syn::Item::Macro(syn::ItemMacro { ref mac, .. })) = stmt { - if mac.path == parse_quote!(match_token) { - return syn::fold::fold_stmt( - self, - syn::Stmt::Expr(expand_match_token(&mac.tokens), None), - ); - } - } - - syn::fold::fold_stmt(self, stmt) - } - - fn fold_expr(&mut self, expr: syn::Expr) -> syn::Expr { - if let syn::Expr::Macro(syn::ExprMacro { ref mac, .. 
}) = expr { - if mac.path == parse_quote!(match_token) { - return syn::fold::fold_expr(self, expand_match_token(&mac.tokens)); - } - } - - syn::fold::fold_expr(self, expr) - } -} - -fn make_tag_pattern(binding: &TokenStream, tag: Tag) -> TokenStream { +fn make_tag_pattern(binding: &proc_macro2::TokenStream, tag: Tag) -> proc_macro2::TokenStream { let kind = match tag.kind { TagKind::StartTag => quote!(crate::tokenizer::StartTag), TagKind::EndTag => quote!(crate::tokenizer::EndTag),