diff --git a/Cargo.lock b/Cargo.lock index 4fab55a6444f..76043ec0a439 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3724,6 +3724,7 @@ dependencies = [ "tree-sitter-python", "tree-sitter-ruby", "yeast-macros", + "yeast-schema", ] [[package]] @@ -3735,6 +3736,15 @@ dependencies = [ "syn", ] +[[package]] +name = "yeast-schema" +version = "0.1.0" +dependencies = [ + "serde", + "serde_json", + "serde_yaml", +] + [[package]] name = "yoke" version = "0.8.0" diff --git a/Cargo.toml b/Cargo.toml index 62eb2e7e920c..9c15b486062b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "shared/tree-sitter-extractor", "shared/yeast", "shared/yeast-macros", + "shared/yeast-schema", "ruby/extractor", "unified/extractor", "unified/extractor/tree-sitter-swift", diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl b/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl index 11842460638f..7fbdfc4bbd4b 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl @@ -403,6 +403,13 @@ _NORMAL_DEPENDENCIES = { "syn": Label("@vendor_ts__syn-2.0.106//:syn"), }, }, + "shared/yeast-schema": { + _COMMON_CONDITION: { + "serde": Label("@vendor_ts__serde-1.0.228//:serde"), + "serde_json": Label("@vendor_ts__serde_json-1.0.145//:serde_json"), + "serde_yaml": Label("@vendor_ts__serde_yaml-0.9.34-deprecated//:serde_yaml"), + }, + }, "unified/extractor": { _COMMON_CONDITION: { "clap": Label("@vendor_ts__clap-4.5.48//:clap"), @@ -456,6 +463,10 @@ _NORMAL_ALIASES = { _COMMON_CONDITION: { }, }, + "shared/yeast-schema": { + _COMMON_CONDITION: { + }, + }, "unified/extractor": { _COMMON_CONDITION: { }, @@ -488,6 +499,8 @@ _NORMAL_DEV_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -513,6 +526,8 @@ _NORMAL_DEV_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, 
"unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -536,6 +551,8 @@ _PROC_MACRO_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -559,6 +576,8 @@ _PROC_MACRO_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -582,6 +601,8 @@ _PROC_MACRO_DEV_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -607,6 +628,8 @@ _PROC_MACRO_DEV_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -630,6 +653,8 @@ _BUILD_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -657,6 +682,8 @@ _BUILD_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -682,6 +709,8 @@ _BUILD_PROC_MACRO_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -705,6 +734,8 @@ _BUILD_PROC_MACRO_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { diff --git a/shared/yeast-macros/src/lib.rs b/shared/yeast-macros/src/lib.rs index 7db97f9fb709..0df96c91d26b 100644 --- a/shared/yeast-macros/src/lib.rs +++ b/shared/yeast-macros/src/lib.rs @@ -113,3 +113,43 @@ pub fn rule(input: TokenStream) -> TokenStream { Err(err) => err.to_compile_error().into(), } } + +/// Bundle a list of YEAST rewrite rules with input/output node-types +/// schema paths. Returns a `Vec`; substitutable for +/// `vec![rule!(...), ...]`. 
+/// +/// Each comma-separated item in the bracketed list may be: +/// +/// 1. A **bare rule body** `(query) => (template)` — the `rule!(...)` +/// wrapper is implicit. +/// 2. An explicit `rule!(...)` invocation, possibly chained as +/// `rule!(...).repeated()` or path-prefixed as `yeast::rule!(...)`. +/// 3. Any other expression returning a `Rule` (helper-function calls, +/// conditionals). +/// +/// ```ignore +/// let translation_rules: Vec = yeast::rules! { +/// input: "tree-sitter-swift/node-types.yml", +/// output: "ast_types.yml", +/// [ +/// (source_file (_)* @cs) => (top_level body: {..cs}), +/// (simple_identifier) @id => (name_expr identifier: (identifier #{id})), +/// rule!((integer_literal) @lit => (int_literal #{lit})).repeated(), +/// helper_fn(), +/// ] +/// }; +/// ``` +/// +/// Paths are resolved relative to the consuming crate's `CARGO_MANIFEST_DIR` +/// (the same convention `include_str!` uses for relative paths). The +/// resolved paths are also emitted as `include_str!` references so the +/// consuming crate gets invalidated when a schema YAML changes, prepping +/// the ground for compile-time type-checking against those schemas. +#[proc_macro] +pub fn rules(input: TokenStream) -> TokenStream { + let input2: TokenStream2 = input.into(); + match parse::parse_rules_top(input2) { + Ok(output) => output.into(), + Err(err) => err.to_compile_error().into(), + } +} diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index 2ab6236fdac9..01c0b574b1cf 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -617,6 +617,76 @@ fn extract_captures_inner( } } +/// A rule's return-type annotation, when the body is a Rust block. Written +/// between `=>` and the block body using the schema's own vocabulary: +/// +/// ```text +/// => kind { … } // single node of that kind +/// => kind? 
{ … } // Option (0 or 1) +/// => kind* { … } // Vec (0+) +/// ``` +/// +/// Template bodies (`=> (kind …)`) never carry an annotation — the +/// output kind is the template root. The shorthand `=> kind` (no +/// body) also carries no annotation. See `parse_rule_top` for dispatch. +#[derive(Clone, Debug)] +struct ReturnAnnotation { + kind: Ident, + multiplicity: AnnotationMultiplicity, +} + +#[derive(Clone, Copy, Debug, PartialEq)] +enum AnnotationMultiplicity { + Single, + Optional, + Repeated, +} + +/// Peek at the token stream to decide whether the transform following +/// `=>` is a **new** annotation form (`kind [? | *] { … }`). If so, +/// consume the annotation and return it, leaving the `{ … }` body in +/// the stream for the caller to parse. Otherwise leave the stream +/// untouched and return `None`. +/// +/// The lookahead distinguishes: +/// `kind {` → annotation (single) +/// `kind? {` → annotation (optional) +/// `kind* {` → annotation (repeated) +/// `kind` → shorthand form (no `{` follows) — NOT an annotation +/// anything else → template or bare block — NOT an annotation +fn try_consume_return_annotation(tokens: &mut Tokens) -> Result> { + // Must start with an identifier (the kind name). + let mut lookahead = tokens.clone(); + let Some(TokenTree::Ident(_)) = lookahead.next() else { + return Ok(None); + }; + // Then optionally `?` or `*`, then a `{` group. + let after_suffix = match lookahead.peek() { + Some(TokenTree::Punct(p)) if p.as_char() == '?' || p.as_char() == '*' => { + lookahead.next(); + lookahead.peek() + } + other => other, + }; + if !matches!(after_suffix, Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Brace) { + return Ok(None); + } + // Commit: consume the ident + suffix from the real stream. + let kind = expect_ident(tokens, "expected output-kind name in annotation")?; + let multiplicity = match tokens.peek() { + Some(TokenTree::Punct(p)) if p.as_char() == '?' 
=> { + tokens.next(); + AnnotationMultiplicity::Optional + } + Some(TokenTree::Punct(p)) if p.as_char() == '*' => { + tokens.next(); + AnnotationMultiplicity::Repeated + } + _ => AnnotationMultiplicity::Single, + }; + Ok(Some(ReturnAnnotation { kind, multiplicity })) +} + /// Parse `rule!( query => transform )`. pub fn parse_rule_top(input: TokenStream) -> Result { let mut tokens = input.into_iter().peekable(); @@ -688,8 +758,52 @@ pub fn parse_rule_top(input: TokenStream) -> Result { }) .collect(); - // Parse transform: either shorthand `=> kind_name` or full `=> (template ...)` - let transform_body = if peek_is_field(&mut tokens) && { + // Parse transform: the token(s) after `=>` fall into one of three + // shapes, dispatched in order: + // + // 1. `kind [? | *] { rust_body }` — annotated Rust body (NEW). + // Static-analysis-ready: the annotation declares the output + // kind and multiplicity in the schema's own vocabulary. + // 2. `kind` alone — shorthand: emit `(kind field: {@cap})…` from + // the query's captures. + // 3. anything else — full template form (`(kind …)` or bare + // `{ … }` splice via `parse_direct_list`). + let annotation = try_consume_return_annotation(&mut tokens)?; + + let transform_body = if let Some(annotation) = annotation { + // Annotation form: `=> kind [? | *] { rust_body }`. + let body_group = expect_group(&mut tokens, Delimiter::Brace)?; + if let Some(tok) = tokens.next() { + return Err(syn::Error::new_spanned( + tok, + "unexpected token after annotated rule body", + )); + } + let body = body_group.stream(); + // The annotation is not yet consumed by codegen — it will drive + // typed handles once the schema-driven codegen lands. For now, + // emit a self-documenting reference to the annotated kind and + // preserve today's `Vec` closure return so behavior + // is unchanged. 
+ let kind_str = annotation.kind.to_string(); + let mult_str = match annotation.multiplicity { + AnnotationMultiplicity::Single => "single", + AnnotationMultiplicity::Optional => "optional", + AnnotationMultiplicity::Repeated => "repeated", + }; + let _ = (kind_str, mult_str); // silence unused warnings until wired + + // For now, adapt the user's typed return value to the framework's + // `Vec` closure result. This uses `IntoFieldIds`, which + // already accepts a bare `Id`, an iterable of ids, or `Option` + // — matching the three annotation multiplicities. + quote! { + let __value = { #body }; + let mut __ids: Vec = Vec::new(); + yeast::IntoFieldIds::extend_into(__value, &mut __ids); + __ids + } + } else if peek_is_field(&mut tokens) && { // Shorthand form: bare identifier = output node kind. // Auto-generate template from captures. let mut lookahead = tokens.clone(); @@ -749,6 +863,26 @@ pub fn parse_rule_top(input: TokenStream) -> Result { vec![__id] } } else { + // Reject bare `{ ... }` transforms — they used to be accepted + // as either a Rust body producing a `Vec` or a template + // consisting of a single `{cap}` splice. Both patterns lost + // static-analysis information (no visible output kind), so we + // now require rules with block bodies to use the annotation + // form `=> kind [? | *] { ... }`. Templates must start with a + // parenthesized node (e.g. `(if_expr ...)`). + if let Some(TokenTree::Group(g)) = tokens.peek() { + if g.delimiter() == Delimiter::Brace { + let span = g.span(); + return Err(syn::Error::new( + span, + "bare `{...}` rule bodies are no longer accepted; \ + use the annotation form `=> kind [? | *] { ... 
}` \ + (where the kind names the output node's schema kind, \ + optionally suffixed with `?` or `*` for multiplicity)", + )); + } + } + // Full template form let transform_items = parse_direct_list(&mut tokens, &ctx_ident)?; @@ -897,6 +1031,189 @@ fn expect_repetition(tokens: &mut Tokens) -> Result { } } +// --------------------------------------------------------------------------- +// rules! parsing — bundle a list of rules with input/output schema paths. +// +// The macro accepts both bare rule bodies (`(query) => (template)`) and +// explicit `rule!(...)` invocations. The schema paths are recorded but +// not yet consumed; a later change layers compile-time type-checking on +// top, using these paths to load the input/output schemas. +// --------------------------------------------------------------------------- + +/// Parse `rules! { input: "path", output: "path", [ items, ... ] }`. +/// +/// Each item in the bracketed list can be: +/// * a **bare rule body** `(query) => (template)` — wrapped implicitly +/// in `yeast::rule! { ... }` for codegen; +/// * an explicit `rule!(...)` (or `rule!(...).repeated()`, +/// `yeast::rule!(...)`, etc.) — passed through verbatim; +/// * any other expression returning a `Rule` (helper-function calls, +/// conditionals) — passed through verbatim. +/// +/// Returns a `Vec` containing the items in order. The expansion +/// also emits `include_str!` references to the resolved schema paths so +/// Cargo treats them as inputs to the consuming crate; this validates +/// path existence at compile time and prepares the ground for later +/// schema-aware checks. 
+pub fn parse_rules_top(input: TokenStream) -> Result { + let mut tokens = input.into_iter().peekable(); + + let input_path = parse_named_string_arg(&mut tokens, "input")?; + expect_punct(&mut tokens, ',', "expected `,` after input path")?; + let output_path = parse_named_string_arg(&mut tokens, "output")?; + expect_punct(&mut tokens, ',', "expected `,` after output path")?; + + // Resolve paths relative to the consuming crate's CARGO_MANIFEST_DIR + // so callers can write paths like "tree-sitter-swift/node-types.yml" + // alongside their other workspace-relative includes (e.g. include_str!). + let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").map_err(|_| { + syn::Error::new( + Span::call_site(), + "rules!: CARGO_MANIFEST_DIR is not set; cannot resolve schema paths", + ) + })?; + let resolve_path = |raw: &str| -> std::path::PathBuf { + let p = std::path::PathBuf::from(raw); + if p.is_absolute() { + p + } else { + std::path::PathBuf::from(&manifest_dir).join(p) + } + }; + let input_abs = resolve_path(&input_path.value); + let output_abs = resolve_path(&output_path.value); + + let list = expect_group(&mut tokens, Delimiter::Bracket)?; + if let Some(tok) = tokens.next() { + return Err(syn::Error::new_spanned( + tok, + "unexpected token after `rules!` list", + )); + } + + let items = split_top_level_commas(list.stream()); + let emitted_items: Vec = items + .into_iter() + .map(|item| { + // Bare rule body — wrap in `yeast::rule! { ... }` so the + // existing rule-construction macro handles codegen. Other + // items pass through unchanged. + if has_top_level_arrow(&item) { + quote! { yeast::rule! { #item } } + } else { + item + } + }) + .collect(); + + // Emit `include_str!` references to both schema files so Cargo + // treats them as inputs to the consuming crate's compilation. The + // `const _` bindings are unused; rustc/LLVM drop them after the + // file-input dependency edge is recorded. 
Absolute paths are used + // because `include_str!` resolves relative paths against the source + // file, while `rules!`'s own paths are relative to + // `CARGO_MANIFEST_DIR`. + let input_abs_str = input_abs.to_string_lossy().into_owned(); + let output_abs_str = output_abs.to_string_lossy().into_owned(); + let input_lit = proc_macro2::Literal::string(&input_abs_str); + let output_lit = proc_macro2::Literal::string(&output_abs_str); + + Ok(quote! { + { + const _: &::core::primitive::str = ::core::include_str!(#input_lit); + const _: &::core::primitive::str = ::core::include_str!(#output_lit); + vec![ #(#emitted_items),* ] + } + }) +} + +/// True iff `item` contains a `=>` operator at the top level (not nested +/// inside any group). Used to detect bare rule bodies inside `rules!`. +fn has_top_level_arrow(item: &TokenStream) -> bool { + let toks: Vec = item.clone().into_iter().collect(); + find_top_level_arrow(&toks).is_some() +} + +/// Find the index of the first token of a top-level `=>` operator (the +/// `=`), ignoring `=>` inside any group. Returns `None` if not present. +fn find_top_level_arrow(toks: &[TokenTree]) -> Option { + let mut i = 0; + while i + 1 < toks.len() { + if let (TokenTree::Punct(p1), TokenTree::Punct(p2)) = (&toks[i], &toks[i + 1]) { + if p1.as_char() == '=' + && p1.spacing() == proc_macro2::Spacing::Joint + && p2.as_char() == '>' + { + return Some(i); + } + } + i += 1; + } + None +} + +/// A string literal argument named `expected_name` parsed from `name: "value"`. 
+struct NamedString { + value: String, + #[allow(dead_code)] + span: Span, +} + +fn parse_named_string_arg(tokens: &mut Tokens, expected_name: &str) -> Result { + let name = expect_ident(tokens, &format!("expected `{expected_name}:` argument"))?; + if name != expected_name { + return Err(syn::Error::new_spanned( + name, + format!("expected `{expected_name}:` argument"), + )); + } + expect_punct( + tokens, + ':', + &format!("expected `:` after `{expected_name}`"), + )?; + let lit = expect_literal(tokens)?; + let span = lit.span(); + let value = string_literal_value(&lit).ok_or_else(|| { + syn::Error::new( + span, + format!("`{expected_name}` must be a string literal path"), + ) + })?; + Ok(NamedString { value, span }) +} + +/// Read a literal as a plain Rust string, respecting Rust's own escape +/// rules (via `syn::LitStr`). Falls back to `None` if the literal +/// isn't a string. +fn string_literal_value(lit: &Literal) -> Option { + let tokens = TokenStream::from(TokenTree::Literal(lit.clone())); + syn::parse2::(tokens).ok().map(|s| s.value()) +} + +/// Split a token stream into top-level comma-separated items. Commas inside +/// any group token (parens, brackets, braces) are ignored so that things +/// like `rule!(a, b)` aren't accidentally split. +fn split_top_level_commas(stream: TokenStream) -> Vec { + let mut items = Vec::new(); + let mut current: Vec = Vec::new(); + for tt in stream { + if let TokenTree::Punct(p) = &tt { + if p.as_char() == ',' && p.spacing() == proc_macro2::Spacing::Alone { + if !current.is_empty() { + items.push(current.drain(..).collect()); + } + continue; + } + } + current.push(tt); + } + if !current.is_empty() { + items.push(current.into_iter().collect()); + } + items +} + fn maybe_wrap_capture(tokens: &mut Tokens, base: TokenStream) -> Result { if peek_is_at(tokens) { let name = consume_capture_marker(tokens)?; @@ -970,3 +1287,33 @@ fn maybe_wrap_list_capture(tokens: &mut Tokens, elem: TokenStream) -> Result` is present. 
+ let toks = quote! { (a) => (b) }; + assert!(has_top_level_arrow(&toks)); + // `rule!((a) => (b))`: the `=>` is INSIDE the macro group, so + // it's not at top level. Must NOT be detected as a bare body. + let toks = quote! { rule!((a) => (b)) }; + assert!(!has_top_level_arrow(&toks)); + // Helper call: no `=>` anywhere. + let toks = quote! { make_rule() }; + assert!(!has_top_level_arrow(&toks)); + // Match expressions inside a block: `=>` is inside braces. + let toks = quote! { { match x { 1 => 2, _ => 3 } } }; + assert!(!has_top_level_arrow(&toks)); + // Bare shorthand form: top-level `=>` followed by a bare ident. + let toks = quote! { (a) => kind }; + assert!(has_top_level_arrow(&toks)); + } +} diff --git a/shared/yeast-schema/BUILD.bazel b/shared/yeast-schema/BUILD.bazel new file mode 100644 index 000000000000..85f008a1aa67 --- /dev/null +++ b/shared/yeast-schema/BUILD.bazel @@ -0,0 +1,12 @@ +load("@rules_rust//rust:defs.bzl", "rust_library") +load("//misc/bazel/3rdparty/tree_sitter_extractors_deps:defs.bzl", "aliases", "all_crate_deps") + +exports_files(["Cargo.toml"]) + +rust_library( + name = "yeast-schema", + srcs = glob(["src/**/*.rs"]), + aliases = aliases(), + visibility = ["//visibility:public"], + deps = all_crate_deps(), +) diff --git a/shared/yeast-schema/Cargo.toml b/shared/yeast-schema/Cargo.toml new file mode 100644 index 000000000000..4cf534d4f0ce --- /dev/null +++ b/shared/yeast-schema/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "yeast-schema" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +serde_yaml = "0.9" diff --git a/shared/yeast-schema/src/lib.rs b/shared/yeast-schema/src/lib.rs new file mode 100644 index 000000000000..8e15571c3558 --- /dev/null +++ b/shared/yeast-schema/src/lib.rs @@ -0,0 +1,33 @@ +//! Schema definitions and YAML/JSON node-types loaders for YEAST. +//! +//! This crate carries the parts of the YEAST framework that don't need +//! 
`tree-sitter`: the [`schema::Schema`] type and its associated +//! [`schema::NodeType`] / [`schema::FieldCardinality`] helpers, plus the +//! YAML and JSON conversion helpers in [`node_types_yaml`]. +//! +//! It exists so that both the runtime crate (`yeast`) and the +//! compile-time `rules!` proc macro (`yeast-macros`) can build against a +//! single source of truth without dragging tree-sitter (a heavy C-backed +//! dep) into the proc-macro toolchain. +//! +//! Tree-sitter-aware adapters — building a `Schema` from a +//! `tree_sitter::Language`, or loading a YAML schema on top of one — +//! live in `yeast::schema` and `yeast::node_types_yaml` respectively. + +pub mod node_types_yaml; +pub mod schema; + +/// Field IDs are stable `u16`s, matching tree-sitter's representation so a +/// schema built from a tree-sitter language can preserve the language's +/// existing IDs. +pub type FieldId = u16; + +/// Kind IDs are stable `u16`s. Like `FieldId`, this matches tree-sitter's +/// representation. +pub type KindId = u16; + +/// Sentinel field id used to mean "the implicit unfielded slot" (what the +/// tree-sitter docs call `children` and what YEAST surfaces in queries as +/// the bare `child:` field). Reserved to avoid clashing with real field +/// IDs allocated by `Schema::register_field`. +pub const CHILD_FIELD: u16 = u16::MAX; diff --git a/shared/yeast-schema/src/node_types_yaml.rs b/shared/yeast-schema/src/node_types_yaml.rs new file mode 100644 index 000000000000..5f6a3906f7cb --- /dev/null +++ b/shared/yeast-schema/src/node_types_yaml.rs @@ -0,0 +1,762 @@ +/// Converts a YAML node-types file to the tree-sitter `node-types.json` format. +/// +/// # YAML format +/// +/// ```yaml +/// supertypes: +/// _expression: +/// - assignment +/// - binary +/// +/// named: +/// assignment: +/// left: _lhs +/// right: _expression +/// identifier: +/// +/// unnamed: +/// - "+" +/// - "end" +/// ``` +/// +/// See the crate-level docs for the full format specification. 
/// Parsed field name: base name plus multiplicity markers.
struct FieldSpec {
    /// Base field name; `None` stands for the `$children` pseudo-field.
    name: Option<String>,
    /// Whether the field may hold more than one child (`*` or `+`).
    multiple: bool,
    /// Whether at least one child is required (no suffix, or `+`).
    required: bool,
}

/// Decode a raw YAML field key such as `left`, `args*` or `$children+`.
///
/// The final character selects the cardinality: `?` is optional single,
/// `*` is optional multiple, `+` is required multiple, and anything else
/// is required single. The four spellings of `$children` map to a `None`
/// name; for every other key all trailing `?`/`*`/`+` characters are
/// removed to form the name.
fn parse_field_name(raw: &str) -> FieldSpec {
    let (multiple, required) = match raw.chars().last() {
        Some('?') => (false, false),
        Some('*') => (true, false),
        Some('+') => (true, true),
        _ => (false, true),
    };

    let name = match raw {
        "$children" | "$children?" | "$children*" | "$children+" => None,
        _ => Some(raw.trim_end_matches(['?', '*', '+']).to_string()),
    };

    FieldSpec {
        name,
        multiple,
        required,
    }
}
+fn resolve_type_ref_pair( + type_ref: &TypeRef, + named_types: &BTreeSet, + unnamed_types: &BTreeSet, +) -> (String, bool) { + match type_ref { + TypeRef::Explicit { unnamed } => (unnamed.clone(), false), + TypeRef::Name(name) => { + let is_named = named_types.contains(name); + let is_unnamed = unnamed_types.contains(name); + if is_named && is_unnamed { + (name.clone(), true) + } else if is_unnamed { + (name.clone(), false) + } else { + (name.clone(), true) + } + } + } +} + +/// Resolve a TypeRef to a {type, named} JSON record, given the sets of known named +/// and unnamed types. +fn resolve_type_ref( + type_ref: &TypeRef, + named_types: &BTreeSet, + unnamed_types: &BTreeSet, +) -> serde_json::Value { + let (kind, named) = resolve_type_ref_pair(type_ref, named_types, unnamed_types); + json!({"type": kind, "named": named}) +} + +/// Convert YAML string to node-types JSON string. +pub fn convert(yaml_input: &str) -> Result { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + + // Build the sets of known named and unnamed types for resolution. + let mut named_types = BTreeSet::new(); + for name in yaml.supertypes.keys() { + named_types.insert(name.clone()); + } + for name in yaml.named.keys() { + named_types.insert(name.clone()); + } + let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); + + let mut output = Vec::new(); + + // 1. Supertypes + for (name, members) in &yaml.supertypes { + let subtypes: Vec<_> = members + .iter() + .map(|m| resolve_type_ref(m, &named_types, &unnamed_types)) + .collect(); + output.push(json!({ + "type": name, + "named": true, + "subtypes": subtypes, + })); + } + + // 2. 
Named nodes + for (name, fields_opt) in &yaml.named { + let fields_map = match fields_opt { + None => { + // Leaf token: no fields, no children, no subtypes + output.push(json!({ + "type": name, + "named": true, + "fields": {}, + })); + continue; + } + Some(m) if m.is_empty() => { + output.push(json!({ + "type": name, + "named": true, + "fields": {}, + })); + continue; + } + Some(m) => m, + }; + + let mut json_fields = serde_json::Map::new(); + let mut json_children: Option = None; + + for (raw_field_name, type_refs) in fields_map { + let spec = parse_field_name(raw_field_name); + let types: Vec<_> = type_refs + .clone() + .into_vec() + .iter() + .map(|t| resolve_type_ref(t, &named_types, &unnamed_types)) + .collect(); + + // Cloning to make the borrow checker happy + let field_info = json!({ + "multiple": spec.multiple, + "required": spec.required, + "types": types, + }); + + if spec.name.is_none() { + // $children + json_children = Some(field_info); + } else { + json_fields.insert(spec.name.unwrap(), field_info); + } + } + + let mut entry = json!({ + "type": name, + "named": true, + "fields": json_fields, + }); + + if let Some(children) = json_children { + entry + .as_object_mut() + .unwrap() + .insert("children".to_string(), children); + } + + output.push(entry); + } + + // 3. Unnamed tokens + for name in &yaml.unnamed { + output.push(json!({ + "type": name, + "named": false, + })); + } + + serde_json::to_string_pretty(&output).map_err(|e| format!("Failed to serialize JSON: {e}")) +} + +/// Apply YAML node-type definitions to a mutable Schema. +/// Registers all types, fields, and allowed types from the YAML into the +/// schema. Public so callers can layer YAML node-types onto a Schema that +/// already has fields/kinds preregistered from another source (e.g. a +/// tree-sitter language). 
+pub fn extend_schema_from_yaml( + schema: &mut crate::schema::Schema, + yaml_input: &str, +) -> Result<(), String> { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + apply_yaml_to_schema(&yaml, schema); + Ok(()) +} + +fn apply_yaml_to_schema( + yaml: &YamlNodeTypes, + schema: &mut crate::schema::Schema, +) { + // Register all supertypes as node kinds + for name in yaml.supertypes.keys() { + schema.register_kind(name); + } + + // Register named node kinds and their fields + for (name, fields_opt) in &yaml.named { + schema.register_kind(name); + if let Some(fields) = fields_opt { + for raw_field_name in fields.keys() { + let spec = parse_field_name(raw_field_name); + if let Some(field_name) = &spec.name { + schema.register_field(field_name); + } + } + } + } + + // Register unnamed tokens as node kinds + for name in &yaml.unnamed { + schema.register_unnamed_kind(name); + } + + let mut named_types = BTreeSet::new(); + for name in yaml.supertypes.keys() { + named_types.insert(name.clone()); + } + for name in yaml.named.keys() { + named_types.insert(name.clone()); + } + let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); + + for (supertype, members) in &yaml.supertypes { + let node_types = members + .iter() + .map(|m| { + let (kind, named) = resolve_type_ref_pair(m, &named_types, &unnamed_types); + crate::schema::NodeType { kind, named } + }) + .collect(); + schema.set_supertype_members(supertype, node_types); + } + + // Register allowed field child types for type checking. 
+ for (parent_kind, fields_opt) in &yaml.named { + let Some(fields) = fields_opt else { + continue; + }; + + for (raw_field_name, type_refs) in fields { + let spec = parse_field_name(raw_field_name); + let field_id = match &spec.name { + Some(name) => schema.register_field(name), + None => CHILD_FIELD, + }; + + let mut node_types = type_refs + .clone() + .into_vec() + .into_iter() + .map(|type_ref| { + let (kind, named) = resolve_type_ref_pair(&type_ref, &named_types, &unnamed_types); + crate::schema::NodeType { kind, named } + }) + .collect::>(); + node_types.sort_by(|a, b| a.kind.cmp(&b.kind).then(a.named.cmp(&b.named))); + node_types.dedup_by(|a, b| a.kind == b.kind && a.named == b.named); + schema.set_field_types(parent_kind, field_id, node_types); + schema.set_field_cardinality( + parent_kind, + field_id, + crate::schema::FieldCardinality { + multiple: spec.multiple, + required: spec.required, + }, + ); + } + } +} + +pub fn schema_from_yaml(yaml_input: &str) -> Result { + let mut schema = crate::schema::Schema::new(); + extend_schema_from_yaml(&mut schema, yaml_input)?; + Ok(schema) +} + +// --------------------------------------------------------------------------- +// JSON → YAML conversion +// --------------------------------------------------------------------------- + +/// JSON node-types structures (mirrors tree-sitter's format). +#[derive(Deserialize)] +struct JsonNodeInfo { + #[serde(rename = "type")] + kind: String, + named: bool, + #[serde(default)] + fields: BTreeMap, + children: Option, + #[serde(default)] + subtypes: Vec, +} + +#[derive(Deserialize)] +struct JsonNodeType { + #[serde(rename = "type")] + kind: String, + named: bool, +} + +#[derive(Deserialize)] +struct JsonFieldInfo { + multiple: bool, + required: bool, + types: Vec, +} + +/// Convert a tree-sitter node-types.json string to the YAML format. 
+pub fn convert_from_json(json_input: &str) -> Result { + let nodes: Vec = + serde_json::from_str(json_input).map_err(|e| format!("Failed to parse JSON: {e}"))?; + + // Collect all named and unnamed types for disambiguation decisions. + let mut all_named: BTreeSet = BTreeSet::new(); + let mut all_unnamed: BTreeSet = BTreeSet::new(); + for node in &nodes { + if node.named { + all_named.insert(node.kind.clone()); + } else { + all_unnamed.insert(node.kind.clone()); + } + } + + let mut supertypes: BTreeMap> = BTreeMap::new(); + let mut named: BTreeMap>> = BTreeMap::new(); + let mut unnamed: Vec = Vec::new(); + + for node in nodes { + if !node.named { + unnamed.push(node.kind); + continue; + } + + if !node.subtypes.is_empty() { + supertypes.insert(node.kind, node.subtypes); + continue; + } + + if node.fields.is_empty() && node.children.is_none() { + // Leaf token + named.insert(node.kind, None); + } else { + let mut fields = BTreeMap::new(); + for (name, info) in node.fields { + fields.insert(name, info); + } + if let Some(children) = node.children { + fields.insert("$children".to_string(), children); + } + named.insert(node.kind, Some(fields)); + } + } + + // Now emit YAML + let mut out = String::new(); + + // Supertypes + if !supertypes.is_empty() { + writeln!(out, "supertypes:").unwrap(); + for (name, members) in &supertypes { + writeln!(out, " {name}:").unwrap(); + for member in members { + let ref_str = format_type_ref(&member.kind, member.named, &all_named, &all_unnamed); + writeln!(out, " - {ref_str}").unwrap(); + } + } + writeln!(out).unwrap(); + } + + // Named + if !named.is_empty() { + writeln!(out, "named:").unwrap(); + for (name, fields_opt) in &named { + match fields_opt { + None => { + writeln!(out, " {name}:").unwrap(); + } + Some(fields) => { + writeln!(out, " {name}:").unwrap(); + for (field_name, info) in fields { + let suffix = field_suffix(info.multiple, info.required); + let yaml_name = if field_name == "$children" { + format!("$children{suffix}") + 
} else { + format!("{field_name}{suffix}") + }; + + let type_refs: Vec = info + .types + .iter() + .map(|t| format_type_ref(&t.kind, t.named, &all_named, &all_unnamed)) + .collect(); + + if type_refs.len() == 1 { + writeln!(out, " {yaml_name}: {}", type_refs[0]).unwrap(); + } else { + let list = type_refs + .iter() + .map(|s| s.as_str()) + .collect::>() + .join(", "); + writeln!(out, " {yaml_name}: [{list}]").unwrap(); + } + } + } + } + } + writeln!(out).unwrap(); + } + + // Unnamed + if !unnamed.is_empty() { + writeln!(out, "unnamed:").unwrap(); + for name in &unnamed { + writeln!(out, " - {}", force_quote(name)).unwrap(); + } + } + + Ok(out) +} + +fn field_suffix(multiple: bool, required: bool) -> &'static str { + match (multiple, required) { + (false, true) => "", + (false, false) => "?", + (true, true) => "+", + (true, false) => "*", + } +} + +/// Format a type reference for YAML output. Uses the disambiguation rule: +/// plain string if unambiguous, `{unnamed: name}` if the name exists as both +/// named and unnamed and we need the unnamed interpretation. +fn format_type_ref( + kind: &str, + named: bool, + all_named: &BTreeSet, + _all_unnamed: &BTreeSet, +) -> String { + if named { + quote_yaml(kind) + } else { + let is_also_named = all_named.contains(kind); + if is_also_named { + format!("{{unnamed: {}}}", force_quote(kind)) + } else { + force_quote(kind) + } + } +} + +/// Always wrap in double quotes. Used for unnamed node references so they're +/// visually distinct from named ones — YAML treats both forms as equivalent strings. +fn force_quote(s: &str) -> String { + format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) +} + +/// Quote a YAML string value if it contains special characters or could be +/// misinterpreted. +fn quote_yaml(s: &str) -> String { + let needs_quoting = s.is_empty() + || s.contains(|c: char| { + matches!( + c, + ':' | '{' + | '}' + | '[' + | ']' + | ',' + | '&' + | '*' + | '#' + | '?' 
+ | '|' + | '-' + | '<' + | '>' + | '=' + | '!' + | '%' + | '@' + | '`' + | '"' + | '\'' + ) + }) + || s.starts_with(' ') + || s.ends_with(' ') + || s == "true" + || s == "false" + || s == "null" + || s == "yes" + || s == "no" + || s.parse::().is_ok(); + + if needs_quoting { + format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) + } else { + s.to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_conversion() { + let yaml = r#" +supertypes: + _expression: + - assignment + - binary + +named: + assignment: + left: _lhs + right: _expression + binary: + left: [_expression, _simple_numeric] + operator: ["!=", "+"] + right: _expression + argument_list: + $children*: [_expression, block_argument] + identifier: + +unnamed: + - "!=" + - "+" + - "end" +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + + // Check supertype + let expr = &result[0]; + assert_eq!(expr["type"], "_expression"); + assert_eq!(expr["named"], true); + assert_eq!(expr["subtypes"].as_array().unwrap().len(), 2); + + // Check assignment + let assign = result.iter().find(|n| n["type"] == "assignment").unwrap(); + assert_eq!(assign["fields"]["left"]["required"], true); + assert_eq!(assign["fields"]["left"]["multiple"], false); + assert_eq!(assign["fields"]["left"]["types"][0]["type"], "_lhs"); + assert_eq!(assign["fields"]["left"]["types"][0]["named"], true); + + // Check binary.operator — "!=" and "+" should resolve to unnamed + let binary = result.iter().find(|n| n["type"] == "binary").unwrap(); + let op_types = binary["fields"]["operator"]["types"].as_array().unwrap(); + assert_eq!(op_types[0]["type"], "!="); + assert_eq!(op_types[0]["named"], false); + assert_eq!(op_types[1]["type"], "+"); + assert_eq!(op_types[1]["named"], false); + + // Check argument_list has children, not a field + let arg_list = result + .iter() + .find(|n| n["type"] == "argument_list") + .unwrap(); + 
assert!(arg_list.get("children").is_some()); + assert_eq!(arg_list["children"]["multiple"], true); + assert_eq!(arg_list["children"]["required"], false); + + // Check identifier is a leaf + let ident = result.iter().find(|n| n["type"] == "identifier").unwrap(); + assert_eq!(ident["fields"].as_object().unwrap().len(), 0); + + // Check unnamed tokens + let end = result.iter().find(|n| n["type"] == "end").unwrap(); + assert_eq!(end["named"], false); + } + + #[test] + fn test_explicit_unnamed_disambiguation() { + let yaml = r#" +named: + foo: + field: [{unnamed: bar}] + +unnamed: + - bar +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + let foo = result.iter().find(|n| n["type"] == "foo").unwrap(); + assert_eq!(foo["fields"]["field"]["types"][0]["named"], false); + } + + #[test] + fn test_field_suffixes() { + let yaml = r#" +named: + test_node: + required_single: foo + optional_single?: foo + required_multiple+: foo + optional_multiple*: foo +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + let node = result.iter().find(|n| n["type"] == "test_node").unwrap(); + let fields = node["fields"].as_object().unwrap(); + + assert_eq!(fields["required_single"]["required"], true); + assert_eq!(fields["required_single"]["multiple"], false); + + assert_eq!(fields["optional_single"]["required"], false); + assert_eq!(fields["optional_single"]["multiple"], false); + + assert_eq!(fields["required_multiple"]["required"], true); + assert_eq!(fields["required_multiple"]["multiple"], true); + + assert_eq!(fields["optional_multiple"]["required"], false); + assert_eq!(fields["optional_multiple"]["multiple"], true); + } + + #[test] + fn test_json_to_yaml() { + let json = r#"[ + {"type": "_expression", "named": true, "subtypes": [ + {"type": "assignment", "named": true}, + {"type": "identifier", "named": true} + ]}, + {"type": "assignment", "named": true, "fields": { + 
"left": {"multiple": false, "required": true, "types": [ + {"type": "_expression", "named": true} + ]}, + "right": {"multiple": false, "required": false, "types": [ + {"type": "_expression", "named": true} + ]} + }, "children": { + "multiple": true, "required": false, "types": [ + {"type": "identifier", "named": true} + ] + }}, + {"type": "identifier", "named": true, "fields": {}}, + {"type": "=", "named": false}, + {"type": "end", "named": false} + ]"#; + + let yaml = convert_from_json(json).unwrap(); + + // Verify key structures are present + assert!(yaml.contains("supertypes:")); + assert!(yaml.contains("_expression:")); + assert!(yaml.contains("named:")); + assert!(yaml.contains("assignment:")); + assert!(yaml.contains("left:")); + assert!(yaml.contains("right?:")); + assert!(yaml.contains("$children*:")); + assert!(yaml.contains("identifier:")); + assert!(yaml.contains("unnamed:")); + assert!(yaml.contains("\"=\"")); + assert!(yaml.contains("end")); + } + + #[test] + fn test_round_trip() { + let yaml_input = r#" +supertypes: + _expression: + - assignment + - identifier + +named: + assignment: + left: _expression + right?: _expression + $children*: identifier + identifier: + +unnamed: + - "=" + - end +"#; + + // YAML → JSON → YAML + let json = convert(yaml_input).unwrap(); + let yaml_output = convert_from_json(&json).unwrap(); + // YAML → JSON again (should be identical) + let json2 = convert(&yaml_output).unwrap(); + + let v1: serde_json::Value = serde_json::from_str(&json).unwrap(); + let v2: serde_json::Value = serde_json::from_str(&json2).unwrap(); + assert_eq!(v1, v2); + } +} diff --git a/shared/yeast-schema/src/schema.rs b/shared/yeast-schema/src/schema.rs new file mode 100644 index 000000000000..4acd14377a4d --- /dev/null +++ b/shared/yeast-schema/src/schema.rs @@ -0,0 +1,340 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::{FieldId, KindId, CHILD_FIELD}; + +#[derive(Clone, Debug)] +pub struct NodeType { + pub kind: String, + pub named: 
bool, +} + +/// Multiplicity/optionality of a field declaration. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct FieldCardinality { + /// Whether the field may hold more than one child. + pub multiple: bool, + /// Whether at least one child must be present. + pub required: bool, +} + +/// A schema defining node kinds and field names for the output AST. +/// Built from a node-types.yml file, independent of any tree-sitter grammar. +/// +/// # Memory management +/// +/// `register_field`/`register_kind`/`register_unnamed_kind` (and their +/// `_with_id` siblings) use `Box::leak` to obtain `&'static str` names. This +/// is intentional: the `&'static str` names appear pervasively in `Node`, +/// `AstCursor`, query patterns, and the extractor's TRAP output, where +/// adding a lifetime would propagate widely. +/// +/// The leak is bounded by the number of distinct kind/field names registered. +/// Schemas are expected to be constructed once per process (e.g. at extractor +/// startup) and reused. Repeated construction in long-running processes will +/// leak memory unboundedly and should be avoided. 
+#[derive(Clone)] +pub struct Schema { + field_ids: BTreeMap, + field_names: BTreeMap, + next_field_id: FieldId, + kind_ids: BTreeMap, + unnamed_kind_ids: BTreeMap, + kind_names: BTreeMap, + next_kind_id: KindId, + field_types: BTreeMap<(String, FieldId), Vec>, + field_cardinalities: BTreeMap<(String, FieldId), FieldCardinality>, + supertypes: BTreeMap>, +} + +impl Default for Schema { + fn default() -> Self { + Self::new() + } +} + +impl Schema { + pub fn new() -> Self { + Self { + field_ids: BTreeMap::new(), + field_names: BTreeMap::new(), + next_field_id: 1, // 0 is reserved + kind_ids: BTreeMap::new(), + unnamed_kind_ids: BTreeMap::new(), + kind_names: BTreeMap::new(), + next_kind_id: 1, // 0 is reserved + field_types: BTreeMap::new(), + field_cardinalities: BTreeMap::new(), + supertypes: BTreeMap::new(), + } + } + + /// Register a field name, returning its ID. + /// If already registered, returns the existing ID. + pub fn register_field(&mut self, name: &str) -> FieldId { + if name == "child" { + return CHILD_FIELD; + } + if let Some(&id) = self.field_ids.get(name) { + return id; + } + let id = self.next_field_id; + assert!(id < CHILD_FIELD, "too many fields"); + self.next_field_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.field_ids.insert(name.to_string(), id); + self.field_names.insert(id, leaked); + id + } + + /// Register a field name with a specific ID, e.g. when importing IDs + /// from an external source like a tree-sitter language. If the name is + /// already registered (with any ID), nothing is changed and the + /// existing ID is returned. 
+ pub fn register_field_with_id(&mut self, name: &str, id: FieldId) -> FieldId { + if name == "child" { + return CHILD_FIELD; + } + if let Some(&existing) = self.field_ids.get(name) { + return existing; + } + assert!(id < CHILD_FIELD, "too many fields"); + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.field_ids.insert(name.to_string(), id); + self.field_names.insert(id, leaked); + if id >= self.next_field_id { + self.next_field_id = id + 1; + } + id + } + + /// Register a named node kind name, returning its ID. + /// If already registered, returns the existing ID. + pub fn register_kind(&mut self, name: &str) -> KindId { + if let Some(&id) = self.kind_ids.get(name) { + return id; + } + let id = self.next_kind_id; + self.next_kind_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + id + } + + /// Register a named node kind with a specific ID, e.g. when importing + /// IDs from a tree-sitter language. If the name is already registered, + /// nothing is changed and the existing ID is returned. + pub fn register_kind_with_id(&mut self, name: &str, id: KindId) -> KindId { + if let Some(&existing) = self.kind_ids.get(name) { + return existing; + } + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + if id >= self.next_kind_id { + self.next_kind_id = id + 1; + } + id + } + + /// Register an unnamed token kind (e.g. `"="`, `"end"`), returning its ID. + /// If already registered, returns the existing ID. 
+ pub fn register_unnamed_kind(&mut self, name: &str) -> KindId { + if let Some(&id) = self.unnamed_kind_ids.get(name) { + return id; + } + let id = self.next_kind_id; + self.next_kind_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.unnamed_kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + id + } + + /// Register an unnamed token kind with a specific ID. If the name is + /// already registered as unnamed, nothing is changed and the existing + /// ID is returned. + pub fn register_unnamed_kind_with_id(&mut self, name: &str, id: KindId) -> KindId { + if let Some(&existing) = self.unnamed_kind_ids.get(name) { + return existing; + } + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.unnamed_kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + if id >= self.next_kind_id { + self.next_kind_id = id + 1; + } + id + } + + /// Track a name for a kind ID without registering it as named or + /// unnamed. Useful when importing tree-sitter ID tables that may + /// contain duplicate IDs across the named/unnamed split. + pub fn record_kind_name(&mut self, id: KindId, name: &'static str) { + self.kind_names.entry(id).or_insert(name); + if id >= self.next_kind_id { + self.next_kind_id = id + 1; + } + } + + pub fn field_id_for_name(&self, name: &str) -> Option { + if name == "child" { + return Some(CHILD_FIELD); + } + self.field_ids.get(name).copied() + } + + pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> { + if id == CHILD_FIELD { + return Some("child"); + } + self.field_names.get(&id).copied() + } + + pub fn id_for_node_kind(&self, kind: &str) -> Option { + self.kind_ids.get(kind).copied() + } + + pub fn id_for_unnamed_node_kind(&self, kind: &str) -> Option { + self.unnamed_kind_ids.get(kind).copied() + } + + /// Has `kind` been registered as a named kind (concrete node or + /// supertype)? 
+ pub fn has_named_kind(&self, kind: &str) -> bool { + self.id_for_node_kind(kind).is_some() + } + + /// Has `kind` been registered as an unnamed token kind? + pub fn has_unnamed_kind(&self, kind: &str) -> bool { + self.id_for_unnamed_node_kind(kind).is_some() + } + + /// Is `field_name` declared as a field on `parent_kind`? + /// `field_name == None` checks for the implicit unfielded slot + /// (`$children`/`CHILD_FIELD`). + pub fn has_field(&self, parent_kind: &str, field_name: Option<&str>) -> bool { + let field_id = match field_name { + Some(name) => match self.field_id_for_name(name) { + Some(id) => id, + None => return false, + }, + None => CHILD_FIELD, + }; + self.field_types(parent_kind, field_id).is_some() + } + + pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> { + self.kind_names.get(&id).copied() + } + + pub fn set_field_types( + &mut self, + parent_kind: &str, + field_id: FieldId, + node_types: Vec, + ) { + self.field_types + .insert((parent_kind.to_string(), field_id), node_types); + } + + pub fn field_types( + &self, + parent_kind: &str, + field_id: FieldId, + ) -> Option<&Vec> { + self.field_types + .get(&(parent_kind.to_string(), field_id)) + } + + pub fn set_field_cardinality( + &mut self, + parent_kind: &str, + field_id: FieldId, + cardinality: FieldCardinality, + ) { + self.field_cardinalities + .insert((parent_kind.to_string(), field_id), cardinality); + } + + /// Returns the declared cardinality for a field, if known. + pub fn field_cardinality( + &self, + parent_kind: &str, + field_id: FieldId, + ) -> Option { + self.field_cardinalities + .get(&(parent_kind.to_string(), field_id)) + .copied() + } + + /// Returns an iterator over all `(field_id, field_name)` pairs that are + /// declared as required (`required: true`) for the given `parent_kind`. 
+ pub fn required_fields_for_kind<'a>( + &'a self, + parent_kind: &'a str, + ) -> impl Iterator)> + 'a { + self.field_cardinalities + .iter() + .filter(move |((kind, _), card)| kind == parent_kind && card.required) + .map(move |((_, field_id), _)| { + let name = self.field_name_for_id(*field_id); + (*field_id, name) + }) + } + + pub fn set_supertype_members(&mut self, supertype: &str, node_types: Vec) { + self.supertypes.insert(supertype.to_string(), node_types); + } + + /// Returns the declared members of a supertype, if known. + pub fn supertype_members(&self, supertype: &str) -> Option<&Vec> { + self.supertypes.get(supertype) + } + + /// Is `kind` a known supertype (an abstract grouping)? + pub fn is_supertype(&self, kind: &str) -> bool { + self.supertypes.contains_key(kind) + } + + fn allows_node( + &self, + node_type: &NodeType, + node_kind: &str, + node_named: bool, + active: &mut BTreeSet, + ) -> bool { + if node_type.kind == node_kind && node_type.named == node_named { + return true; + } + + if !node_type.named { + return false; + } + + let Some(members) = self.supertypes.get(&node_type.kind) else { + return false; + }; + + if !active.insert(node_type.kind.clone()) { + return false; + } + + let matched = members + .iter() + .any(|member| self.allows_node(member, node_kind, node_named, active)); + active.remove(&node_type.kind); + matched + } + + pub fn node_matches_types( + &self, + node_kind: &str, + node_named: bool, + node_types: &[NodeType], + ) -> bool { + node_types.iter().any(|node_type| { + self.allows_node(node_type, node_kind, node_named, &mut BTreeSet::new()) + }) + } +} diff --git a/shared/yeast/BUILD.bazel b/shared/yeast/BUILD.bazel index fe0b01bb87bd..5217f20ec67d 100644 --- a/shared/yeast/BUILD.bazel +++ b/shared/yeast/BUILD.bazel @@ -14,5 +14,7 @@ rust_library( "//shared/yeast-macros", ], visibility = ["//visibility:public"], - deps = all_crate_deps(), + deps = all_crate_deps() + [ + "//shared/yeast-schema", + ], ) diff --git 
a/shared/yeast/Cargo.toml b/shared/yeast/Cargo.toml index 166887c324cf..518a0d1cefc2 100644 --- a/shared/yeast/Cargo.toml +++ b/shared/yeast/Cargo.toml @@ -10,6 +10,7 @@ serde_json = "1.0.108" serde_yaml = "0.9" tree-sitter = ">= 0.23.0" yeast-macros = { path = "../yeast-macros" } +yeast-schema = { path = "../yeast-schema" } tree-sitter-ruby = "0.23" tree-sitter-python = "0.23" diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index 8aa050592f6b..90edb510c1a1 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -312,13 +312,15 @@ already conforms to the output schema. For rules that need the raw (input-schema) capture — typically to read its source text or to translate it explicitly with mutable context state between calls — use `@@name` instead. The body sees the original -input-schema `Id`: +input-schema `Id`. Because these rules always have a Rust block body, +they use the annotation form (see [the `rule!` macro +section](#the-rule-macro) for the full grammar): ```rust yeast::rule!( (assignment left: (_) @@raw_lhs right: (_) @rhs) => - { + call { // raw_lhs is untranslated: read its original source text. let text = ctx.ast.source_text(raw_lhs); // rhs is already translated by the auto-translate prefix. @@ -372,26 +374,79 @@ automatically: single captures bind as `Id`, repeated captures (after ## The `rule!` macro -`rule!` combines a query and a transform into a single declaration: +`rule!` combines a query and a transform into a single declaration. +There are three transform forms, each suited to a different level of +rule complexity: ```rust -// Full template form +// 1. Template form — a tree literal describing the output. yeast::rule!( (query_pattern field: (_) @capture) => (output_template field: {capture}) ) -// Shorthand form — captures become fields on the output node +// 2. Shorthand form — captures become fields on a bare output kind. yeast::rule!( (query_pattern field: (_) @capture) => output_kind ) + +// 3. 
Annotation form — a Rust block body preceded by the output kind. +yeast::rule!( + (query_pattern child: (_)+ @@children) + => + output_kind* { + // arbitrary Rust; must evaluate to a value compatible with the + // declared multiplicity (see below). + let mut result = Vec::new(); + for child in children { + result.extend(ctx.translate(child)?); + } + result + } +) ``` The shorthand `=> kind` form auto-generates the template, mapping each capture name to a field of the same name on the output node. +### Annotation form + +Rules that need imperative logic — mutating [`BuildCtx`] state per +iteration, computing intermediate values, or looping over captures — +use the annotation form. It has three shapes distinguished by a suffix +on the output-kind identifier: + +| Syntax | Body must evaluate to | Meaning | +|---------------------|-------------------------------------|--------------------------------| +| `=> kind { ... }` | a single node id of `kind` | Emit exactly one node. | +| `=> kind? { ... }` | an `Option` of a node id of `kind` | Emit 0 or 1 nodes (`None`/`Some`). | +| `=> kind* { ... }` | an iterable of node ids of `kind` | Emit 0+ nodes; flattens into the enclosing splice slot. | + +The suffix mirrors the `?` / `*` markers used elsewhere in the schema +DSL (see [`ast_types.yml`](../../../unified/extractor/ast_types.yml)): +bare identifier = required single, `?` = optional single, `*` = +repeated. + +The annotation names the schema kind of the output, giving the macro +enough information for future static analysis (e.g. computing the +static output type of translated captures at their consumer sites). + +**Bare `=> { ... }` block bodies are rejected** — every Rust-block body +must carry an annotation, so the output kind is always visible without +having to inspect the block's expression. + +### Choosing between the forms + +Prefer the simplest form that fits: + +- If the whole transform is a tree literal, use the **template form**. 
+- If the transform is a template whose root matches a query capture + 1:1, use the **shorthand form**. +- If the transform needs Rust logic (loops, `let` bindings, calls to + `ctx.translate`, etc.), use the **annotation form**. + ## Integration with the extractor A YEAST desugaring pass is configured with a [`DesugaringConfig`], which @@ -437,3 +492,44 @@ For the dbscheme/QL code generator, set `Language::desugar` to a `DesugaringConfig` carrying the same YAML; the generator converts it to JSON for downstream code generation. The `phases` field of the config is unused at code-generation time. + +## The `rules!` macro + +The [`rules!`] macro bundles a list of rewrite rules with the input and +output node-types schema paths. It's a drop-in replacement for the +hand-written `vec![rule!(...), rule!(...), ...]` form and accepts a +slightly looser syntax: bare rule bodies don't need an explicit +`rule!(...)` wrapper. + +```rust +let translation_rules: Vec = yeast::rules! { + input: "tree-sitter-swift/node-types.yml", + output: "ast_types.yml", + [ + (simple_identifier) @name + => + (name_expr identifier: (identifier #{name})), + + (integer_literal) @lit + => + (int_literal #{lit}), + ] +}; +``` + +Each comma-separated item in the bracketed list may be: + +- A **bare rule body** `(query) => (template)` — no `rule!(...)` wrapper. +- An explicit `rule!(...)` invocation, with optional postfix calls such + as `rule!(...).repeated()`. +- Any other expression returning a `Rule` (helper functions, etc.). + +Schema paths are resolved relative to the consuming crate's +`CARGO_MANIFEST_DIR` (the same convention `include_str!` uses for +relative paths). The resolved paths are emitted as `include_str!` +references in the expansion so the consuming crate's incremental cache +invalidates when a schema YAML changes — laying the groundwork for +schema-aware compile-time checks on the rule bodies. + +The `Vec` produced by `rules!` flows into `add_phase` exactly as +before. 
\ No newline at end of file diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index fdfe4dd0fb01..4363d3124dca 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -15,7 +15,7 @@ pub mod schema; pub mod tree_builder; mod visitor; -pub use yeast_macros::{query, rule, tree, trees}; +pub use yeast_macros::{query, rule, rules, tree, trees}; use captures::Captures; use query::QueryNode; @@ -43,8 +43,13 @@ impl From for usize { } /// Field and Kind ids are provided by tree-sitter -type FieldId = u16; -type KindId = u16; +type FieldId = yeast_schema::FieldId; +type KindId = yeast_schema::KindId; + +/// Sentinel field id used to mean "the implicit unfielded slot". +/// Re-exported from `yeast-schema` so the runtime and the schema share a +/// single value. +pub use yeast_schema::CHILD_FIELD; /// Trait for values that can be appended to a field's id list inside a /// `tree!`/`trees!`/`rule!` template (in `{expr}` placeholders). @@ -148,8 +153,6 @@ impl YeastSourceRange for &T { } } -pub const CHILD_FIELD: u16 = u16::MAX; - #[derive(Debug)] pub struct AstCursor<'a> { ast: &'a Ast, @@ -295,7 +298,7 @@ impl std::fmt::Debug for Ast { impl Ast { /// Construct an AST from a TS tree pub fn from_tree(language: tree_sitter::Language, tree: &tree_sitter::Tree) -> Self { - let schema = schema::Schema::from_language(&language); + let schema = schema::from_language(&language); Self::from_tree_with_schema(schema, tree, &language) } @@ -1220,7 +1223,7 @@ impl DesugaringConfig { pub fn build_schema(&self, language: &tree_sitter::Language) -> Result { match self.output_node_types_yaml { Some(yaml) => node_types_yaml::schema_from_yaml_with_language(yaml, language), - None => Ok(schema::Schema::from_language(language)), + None => Ok(schema::from_language(language)), } } } @@ -1234,7 +1237,7 @@ pub struct Runner<'a, C = ()> { impl<'a, C> Runner<'a, C> { /// Create a runner using the input grammar's schema for output. 
pub fn new(language: tree_sitter::Language, phases: &'a [Phase]) -> Self { - let schema = schema::Schema::from_language(&language); + let schema = schema::from_language(&language); Self { language, schema, diff --git a/shared/yeast/src/node_types_yaml.rs b/shared/yeast/src/node_types_yaml.rs index f4d9f2a1c427..7beb4bb25bed 100644 --- a/shared/yeast/src/node_types_yaml.rs +++ b/shared/yeast/src/node_types_yaml.rs @@ -1,767 +1,22 @@ -/// Converts a YAML node-types file to the tree-sitter `node-types.json` format. -/// -/// # YAML format -/// -/// ```yaml -/// supertypes: -/// _expression: -/// - assignment -/// - binary -/// -/// named: -/// assignment: -/// left: _lhs -/// right: _expression -/// identifier: -/// -/// unnamed: -/// - "+" -/// - "end" -/// ``` -/// -/// See the crate-level docs for the full format specification. -use std::collections::{BTreeMap, BTreeSet}; -use std::fmt::Write; - -use crate::CHILD_FIELD; -use serde::Deserialize; -use serde_json::json; - -/// Top-level YAML structure. -#[derive(Deserialize, Default)] -struct YamlNodeTypes { - #[serde(default)] - supertypes: BTreeMap>, - #[serde(default)] - named: BTreeMap>>, - #[serde(default)] - unnamed: Vec, -} - -/// A reference to a node type. Can be: -/// - a plain string (resolved by looking up named vs unnamed) -/// - a map `{unnamed: "name"}` to force unnamed interpretation -#[derive(Deserialize, Debug, Clone)] -#[serde(untagged)] -enum TypeRef { - Name(String), - Explicit { unnamed: String }, -} - -/// A field value: either a single type ref or a list of them. -#[derive(Deserialize, Debug, Clone)] -#[serde(untagged)] -enum TypeRefOrList { - Single(TypeRef), - List(Vec), -} - -impl TypeRefOrList { - fn into_vec(self) -> Vec { - match self { - TypeRefOrList::Single(t) => vec![t], - TypeRefOrList::List(v) => v, - } - } -} - -/// Parsed field name: base name + multiplicity markers. 
-struct FieldSpec { - name: Option, // None for $children - multiple: bool, - required: bool, -} - -fn parse_field_name(raw: &str) -> FieldSpec { - let is_children = - raw == "$children" || raw == "$children?" || raw == "$children*" || raw == "$children+"; - - let suffix = raw.chars().last().filter(|c| matches!(c, '?' | '*' | '+')); - - let (multiple, required) = match suffix { - Some('?') => (false, false), - Some('*') => (true, false), - Some('+') => (true, true), - _ => (false, true), // bare field name = required, single - }; - - let name = if is_children { - None - } else { - let base = raw.trim_end_matches(['?', '*', '+']); - Some(base.to_string()) - }; - - FieldSpec { - name, - multiple, - required, - } -} - -/// Resolve a TypeRef to a (type, named) pair, given the sets of known named -/// and unnamed types. -fn resolve_type_ref_pair( - type_ref: &TypeRef, - named_types: &BTreeSet, - unnamed_types: &BTreeSet, -) -> (String, bool) { - match type_ref { - TypeRef::Explicit { unnamed } => (unnamed.clone(), false), - TypeRef::Name(name) => { - let is_named = named_types.contains(name); - let is_unnamed = unnamed_types.contains(name); - if is_named && is_unnamed { - (name.clone(), true) - } else if is_unnamed { - (name.clone(), false) - } else { - (name.clone(), true) - } - } - } -} - -/// Resolve a TypeRef to a {type, named} JSON record, given the sets of known named -/// and unnamed types. -fn resolve_type_ref( - type_ref: &TypeRef, - named_types: &BTreeSet, - unnamed_types: &BTreeSet, -) -> serde_json::Value { - let (kind, named) = resolve_type_ref_pair(type_ref, named_types, unnamed_types); - json!({"type": kind, "named": named}) -} - -/// Convert YAML string to node-types JSON string. -pub fn convert(yaml_input: &str) -> Result { - let yaml: YamlNodeTypes = - serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; - - // Build the sets of known named and unnamed types for resolution. 
- let mut named_types = BTreeSet::new(); - for name in yaml.supertypes.keys() { - named_types.insert(name.clone()); - } - for name in yaml.named.keys() { - named_types.insert(name.clone()); - } - let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); - - let mut output = Vec::new(); - - // 1. Supertypes - for (name, members) in &yaml.supertypes { - let subtypes: Vec<_> = members - .iter() - .map(|m| resolve_type_ref(m, &named_types, &unnamed_types)) - .collect(); - output.push(json!({ - "type": name, - "named": true, - "subtypes": subtypes, - })); - } - - // 2. Named nodes - for (name, fields_opt) in &yaml.named { - let fields_map = match fields_opt { - None => { - // Leaf token: no fields, no children, no subtypes - output.push(json!({ - "type": name, - "named": true, - "fields": {}, - })); - continue; - } - Some(m) if m.is_empty() => { - output.push(json!({ - "type": name, - "named": true, - "fields": {}, - })); - continue; - } - Some(m) => m, - }; - - let mut json_fields = serde_json::Map::new(); - let mut json_children: Option = None; - - for (raw_field_name, type_refs) in fields_map { - let spec = parse_field_name(raw_field_name); - let types: Vec<_> = type_refs - .clone() - .into_vec() - .iter() - .map(|t| resolve_type_ref(t, &named_types, &unnamed_types)) - .collect(); - - // Cloning to make the borrow checker happy - let field_info = json!({ - "multiple": spec.multiple, - "required": spec.required, - "types": types, - }); - - if spec.name.is_none() { - // $children - json_children = Some(field_info); - } else { - json_fields.insert(spec.name.unwrap(), field_info); - } - } - - let mut entry = json!({ - "type": name, - "named": true, - "fields": json_fields, - }); - - if let Some(children) = json_children { - entry - .as_object_mut() - .unwrap() - .insert("children".to_string(), children); - } - - output.push(entry); - } - - // 3. 
Unnamed tokens - for name in &yaml.unnamed { - output.push(json!({ - "type": name, - "named": false, - })); - } - - serde_json::to_string_pretty(&output).map_err(|e| format!("Failed to serialize JSON: {e}")) -} - -/// Apply YAML node-type definitions to a mutable Schema. -/// Registers all types, fields, and allowed types from the YAML into the schema. -fn apply_yaml_to_schema(yaml: &YamlNodeTypes, schema: &mut crate::schema::Schema) { - // Register all supertypes as node kinds - for name in yaml.supertypes.keys() { - schema.register_kind(name); - } - - // Register named node kinds and their fields - for (name, fields_opt) in &yaml.named { - schema.register_kind(name); - if let Some(fields) = fields_opt { - for raw_field_name in fields.keys() { - let spec = parse_field_name(raw_field_name); - if let Some(field_name) = &spec.name { - schema.register_field(field_name); - } - } - } - } - - // Register unnamed tokens as node kinds - for name in &yaml.unnamed { - schema.register_unnamed_kind(name); - } - - let mut named_types = BTreeSet::new(); - for name in yaml.supertypes.keys() { - named_types.insert(name.clone()); - } - for name in yaml.named.keys() { - named_types.insert(name.clone()); - } - let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); - - for (supertype, members) in &yaml.supertypes { - let node_types = members - .iter() - .map(|m| { - let (kind, named) = resolve_type_ref_pair(m, &named_types, &unnamed_types); - crate::schema::NodeType { kind, named } - }) - .collect(); - schema.set_supertype_members(supertype, node_types); - } - - // Register allowed field child types for type checking. 
- for (parent_kind, fields_opt) in &yaml.named { - let Some(fields) = fields_opt else { - continue; - }; - - for (raw_field_name, type_refs) in fields { - let spec = parse_field_name(raw_field_name); - let field_id = match &spec.name { - Some(name) => schema.register_field(name), - None => CHILD_FIELD, - }; - - let mut node_types = type_refs - .clone() - .into_vec() - .into_iter() - .map(|type_ref| { - let (kind, named) = - resolve_type_ref_pair(&type_ref, &named_types, &unnamed_types); - crate::schema::NodeType { kind, named } - }) - .collect::>(); - node_types.sort_by(|a, b| a.kind.cmp(&b.kind).then(a.named.cmp(&b.named))); - node_types.dedup_by(|a, b| a.kind == b.kind && a.named == b.named); - schema.set_field_types(parent_kind, field_id, node_types); - schema.set_field_cardinality( - parent_kind, - field_id, - crate::schema::FieldCardinality { - multiple: spec.multiple, - required: spec.required, - }, - ); - } - } -} - -pub fn schema_from_yaml(yaml_input: &str) -> Result { - let yaml: YamlNodeTypes = - serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; - - let mut schema = crate::schema::Schema::new(); - apply_yaml_to_schema(&yaml, &mut schema); - - Ok(schema) -} - -/// Build a Schema from a YAML string, extending a tree-sitter Language. -/// The Schema inherits all field/kind names from the Language, plus any -/// additional ones defined in the YAML. +//! YAML/JSON node-types loaders for YEAST. +//! +//! The pure YAML/JSON conversion routines live in [`yeast_schema::node_types_yaml`]. +//! This module re-exports them and adds the tree-sitter-aware adapter +//! [`schema_from_yaml_with_language`]. + +pub use yeast_schema::node_types_yaml::{ + convert, convert_from_json, extend_schema_from_yaml, schema_from_yaml, +}; + +/// Build a Schema from a YAML string, layered on top of a tree-sitter +/// `Language`. 
The Schema inherits all field/kind names from the language +/// (preserving the language's IDs), plus any additional ones defined in +/// the YAML. pub fn schema_from_yaml_with_language( yaml_input: &str, language: &tree_sitter::Language, ) -> Result { - let yaml: YamlNodeTypes = - serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; - - let mut schema = crate::schema::Schema::from_language(language); - apply_yaml_to_schema(&yaml, &mut schema); - + let mut schema = crate::schema::from_language(language); + extend_schema_from_yaml(&mut schema, yaml_input)?; Ok(schema) } - -// --------------------------------------------------------------------------- -// JSON → YAML conversion -// --------------------------------------------------------------------------- - -/// JSON node-types structures (mirrors tree-sitter's format). -#[derive(Deserialize)] -struct JsonNodeInfo { - #[serde(rename = "type")] - kind: String, - named: bool, - #[serde(default)] - fields: BTreeMap, - children: Option, - #[serde(default)] - subtypes: Vec, -} - -#[derive(Deserialize)] -struct JsonNodeType { - #[serde(rename = "type")] - kind: String, - named: bool, -} - -#[derive(Deserialize)] -struct JsonFieldInfo { - multiple: bool, - required: bool, - types: Vec, -} - -/// Convert a tree-sitter node-types.json string to the YAML format. -pub fn convert_from_json(json_input: &str) -> Result { - let nodes: Vec = - serde_json::from_str(json_input).map_err(|e| format!("Failed to parse JSON: {e}"))?; - - // Collect all named and unnamed types for disambiguation decisions. 
- let mut all_named: BTreeSet = BTreeSet::new(); - let mut all_unnamed: BTreeSet = BTreeSet::new(); - for node in &nodes { - if node.named { - all_named.insert(node.kind.clone()); - } else { - all_unnamed.insert(node.kind.clone()); - } - } - - let mut supertypes: BTreeMap> = BTreeMap::new(); - let mut named: BTreeMap>> = BTreeMap::new(); - let mut unnamed: Vec = Vec::new(); - - for node in nodes { - if !node.named { - unnamed.push(node.kind); - continue; - } - - if !node.subtypes.is_empty() { - supertypes.insert(node.kind, node.subtypes); - continue; - } - - if node.fields.is_empty() && node.children.is_none() { - // Leaf token - named.insert(node.kind, None); - } else { - let mut fields = BTreeMap::new(); - for (name, info) in node.fields { - fields.insert(name, info); - } - if let Some(children) = node.children { - fields.insert("$children".to_string(), children); - } - named.insert(node.kind, Some(fields)); - } - } - - // Now emit YAML - let mut out = String::new(); - - // Supertypes - if !supertypes.is_empty() { - writeln!(out, "supertypes:").unwrap(); - for (name, members) in &supertypes { - writeln!(out, " {name}:").unwrap(); - for member in members { - let ref_str = format_type_ref(&member.kind, member.named, &all_named, &all_unnamed); - writeln!(out, " - {ref_str}").unwrap(); - } - } - writeln!(out).unwrap(); - } - - // Named - if !named.is_empty() { - writeln!(out, "named:").unwrap(); - for (name, fields_opt) in &named { - match fields_opt { - None => { - writeln!(out, " {name}:").unwrap(); - } - Some(fields) => { - writeln!(out, " {name}:").unwrap(); - for (field_name, info) in fields { - let suffix = field_suffix(info.multiple, info.required); - let yaml_name = if field_name == "$children" { - format!("$children{suffix}") - } else { - format!("{field_name}{suffix}") - }; - - let type_refs: Vec = info - .types - .iter() - .map(|t| format_type_ref(&t.kind, t.named, &all_named, &all_unnamed)) - .collect(); - - if type_refs.len() == 1 { - writeln!(out, " 
{yaml_name}: {}", type_refs[0]).unwrap(); - } else { - let list = type_refs - .iter() - .map(|s| s.as_str()) - .collect::>() - .join(", "); - writeln!(out, " {yaml_name}: [{list}]").unwrap(); - } - } - } - } - } - writeln!(out).unwrap(); - } - - // Unnamed - if !unnamed.is_empty() { - writeln!(out, "unnamed:").unwrap(); - for name in &unnamed { - writeln!(out, " - {}", force_quote(name)).unwrap(); - } - } - - Ok(out) -} - -fn field_suffix(multiple: bool, required: bool) -> &'static str { - match (multiple, required) { - (false, true) => "", - (false, false) => "?", - (true, true) => "+", - (true, false) => "*", - } -} - -/// Format a type reference for YAML output. Uses the disambiguation rule: -/// plain string if unambiguous, `{unnamed: name}` if the name exists as both -/// named and unnamed and we need the unnamed interpretation. -fn format_type_ref( - kind: &str, - named: bool, - all_named: &BTreeSet, - _all_unnamed: &BTreeSet, -) -> String { - if named { - quote_yaml(kind) - } else { - let is_also_named = all_named.contains(kind); - if is_also_named { - format!("{{unnamed: {}}}", force_quote(kind)) - } else { - force_quote(kind) - } - } -} - -/// Always wrap in double quotes. Used for unnamed node references so they're -/// visually distinct from named ones — YAML treats both forms as equivalent strings. -fn force_quote(s: &str) -> String { - format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) -} - -/// Quote a YAML string value if it contains special characters or could be -/// misinterpreted. -fn quote_yaml(s: &str) -> String { - let needs_quoting = s.is_empty() - || s.contains(|c: char| { - matches!( - c, - ':' | '{' - | '}' - | '[' - | ']' - | ',' - | '&' - | '*' - | '#' - | '?' - | '|' - | '-' - | '<' - | '>' - | '=' - | '!' 
- | '%' - | '@' - | '`' - | '"' - | '\'' - ) - }) - || s.starts_with(' ') - || s.ends_with(' ') - || s == "true" - || s == "false" - || s == "null" - || s == "yes" - || s == "no" - || s.parse::().is_ok(); - - if needs_quoting { - format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) - } else { - s.to_string() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_basic_conversion() { - let yaml = r#" -supertypes: - _expression: - - assignment - - binary - -named: - assignment: - left: _lhs - right: _expression - binary: - left: [_expression, _simple_numeric] - operator: ["!=", "+"] - right: _expression - argument_list: - $children*: [_expression, block_argument] - identifier: - -unnamed: - - "!=" - - "+" - - "end" -"#; - - let json_str = convert(yaml).unwrap(); - let result: Vec = serde_json::from_str(&json_str).unwrap(); - - // Check supertype - let expr = &result[0]; - assert_eq!(expr["type"], "_expression"); - assert_eq!(expr["named"], true); - assert_eq!(expr["subtypes"].as_array().unwrap().len(), 2); - - // Check assignment - let assign = result.iter().find(|n| n["type"] == "assignment").unwrap(); - assert_eq!(assign["fields"]["left"]["required"], true); - assert_eq!(assign["fields"]["left"]["multiple"], false); - assert_eq!(assign["fields"]["left"]["types"][0]["type"], "_lhs"); - assert_eq!(assign["fields"]["left"]["types"][0]["named"], true); - - // Check binary.operator — "!=" and "+" should resolve to unnamed - let binary = result.iter().find(|n| n["type"] == "binary").unwrap(); - let op_types = binary["fields"]["operator"]["types"].as_array().unwrap(); - assert_eq!(op_types[0]["type"], "!="); - assert_eq!(op_types[0]["named"], false); - assert_eq!(op_types[1]["type"], "+"); - assert_eq!(op_types[1]["named"], false); - - // Check argument_list has children, not a field - let arg_list = result - .iter() - .find(|n| n["type"] == "argument_list") - .unwrap(); - assert!(arg_list.get("children").is_some()); - 
assert_eq!(arg_list["children"]["multiple"], true); - assert_eq!(arg_list["children"]["required"], false); - - // Check identifier is a leaf - let ident = result.iter().find(|n| n["type"] == "identifier").unwrap(); - assert_eq!(ident["fields"].as_object().unwrap().len(), 0); - - // Check unnamed tokens - let end = result.iter().find(|n| n["type"] == "end").unwrap(); - assert_eq!(end["named"], false); - } - - #[test] - fn test_explicit_unnamed_disambiguation() { - let yaml = r#" -named: - foo: - field: [{unnamed: bar}] - -unnamed: - - bar -"#; - - let json_str = convert(yaml).unwrap(); - let result: Vec = serde_json::from_str(&json_str).unwrap(); - let foo = result.iter().find(|n| n["type"] == "foo").unwrap(); - assert_eq!(foo["fields"]["field"]["types"][0]["named"], false); - } - - #[test] - fn test_field_suffixes() { - let yaml = r#" -named: - test_node: - required_single: foo - optional_single?: foo - required_multiple+: foo - optional_multiple*: foo -"#; - - let json_str = convert(yaml).unwrap(); - let result: Vec = serde_json::from_str(&json_str).unwrap(); - let node = result.iter().find(|n| n["type"] == "test_node").unwrap(); - let fields = node["fields"].as_object().unwrap(); - - assert_eq!(fields["required_single"]["required"], true); - assert_eq!(fields["required_single"]["multiple"], false); - - assert_eq!(fields["optional_single"]["required"], false); - assert_eq!(fields["optional_single"]["multiple"], false); - - assert_eq!(fields["required_multiple"]["required"], true); - assert_eq!(fields["required_multiple"]["multiple"], true); - - assert_eq!(fields["optional_multiple"]["required"], false); - assert_eq!(fields["optional_multiple"]["multiple"], true); - } - - #[test] - fn test_json_to_yaml() { - let json = r#"[ - {"type": "_expression", "named": true, "subtypes": [ - {"type": "assignment", "named": true}, - {"type": "identifier", "named": true} - ]}, - {"type": "assignment", "named": true, "fields": { - "left": {"multiple": false, "required": true, 
"types": [ - {"type": "_expression", "named": true} - ]}, - "right": {"multiple": false, "required": false, "types": [ - {"type": "_expression", "named": true} - ]} - }, "children": { - "multiple": true, "required": false, "types": [ - {"type": "identifier", "named": true} - ] - }}, - {"type": "identifier", "named": true, "fields": {}}, - {"type": "=", "named": false}, - {"type": "end", "named": false} - ]"#; - - let yaml = convert_from_json(json).unwrap(); - - // Verify key structures are present - assert!(yaml.contains("supertypes:")); - assert!(yaml.contains("_expression:")); - assert!(yaml.contains("named:")); - assert!(yaml.contains("assignment:")); - assert!(yaml.contains("left:")); - assert!(yaml.contains("right?:")); - assert!(yaml.contains("$children*:")); - assert!(yaml.contains("identifier:")); - assert!(yaml.contains("unnamed:")); - assert!(yaml.contains("\"=\"")); - assert!(yaml.contains("end")); - } - - #[test] - fn test_round_trip() { - let yaml_input = r#" -supertypes: - _expression: - - assignment - - identifier - -named: - assignment: - left: _expression - right?: _expression - $children*: identifier - identifier: - -unnamed: - - "=" - - end -"#; - - // YAML → JSON → YAML - let json = convert(yaml_input).unwrap(); - let yaml_output = convert_from_json(&json).unwrap(); - // YAML → JSON again (should be identical) - let json2 = convert(&yaml_output).unwrap(); - - let v1: serde_json::Value = serde_json::from_str(&json).unwrap(); - let v2: serde_json::Value = serde_json::from_str(&json2).unwrap(); - assert_eq!(v1, v2); - } -} diff --git a/shared/yeast/src/query.rs b/shared/yeast/src/query.rs index bcf0f7facab1..3e61ac60b2be 100644 --- a/shared/yeast/src/query.rs +++ b/shared/yeast/src/query.rs @@ -66,7 +66,17 @@ impl QueryNode { pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result { match self { - QueryNode::Any { .. 
} => Ok(true), + QueryNode::Any { match_unnamed } => { + if *match_unnamed { + Ok(true) + } else { + // `(_)` only matches named nodes, matching tree-sitter + // semantics. Bare `_` (with `match_unnamed = true`) + // matches any node. + let n = ast.get_node(node).unwrap(); + Ok(n.is_named()) + } + } QueryNode::Node { kind, children } => { let node = ast.get_node(node).unwrap(); let target_kind = ast diff --git a/shared/yeast/src/schema.rs b/shared/yeast/src/schema.rs index da13bb8b6b70..daa8ad98eb5b 100644 --- a/shared/yeast/src/schema.rs +++ b/shared/yeast/src/schema.rs @@ -1,285 +1,54 @@ -use std::collections::{BTreeMap, BTreeSet}; - -use crate::{FieldId, KindId, CHILD_FIELD}; - -#[derive(Clone, Debug)] -pub struct NodeType { - pub kind: String, - pub named: bool, -} - -/// Multiplicity/optionality of a field declaration. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub struct FieldCardinality { - /// Whether the field may hold more than one child. - pub multiple: bool, - /// Whether at least one child must be present. - pub required: bool, -} - -/// A schema defining node kinds and field names for the output AST. -/// Built from a node-types.yml file, independent of any tree-sitter grammar. -/// -/// # Memory management -/// -/// `register_field`/`register_kind`/`register_unnamed_kind` use `Box::leak` -/// to obtain `&'static str` names. This is intentional: the `&'static str` -/// names appear pervasively in `Node`, `AstCursor`, query patterns, and the -/// extractor's TRAP output, where adding a lifetime would propagate widely. -/// -/// The leak is bounded by the number of distinct kind/field names registered. -/// Schemas are expected to be constructed once per process (e.g. at extractor -/// startup) and reused. Repeated construction in long-running processes will -/// leak memory unboundedly and should be avoided. 
-#[derive(Clone)] -pub struct Schema { - field_ids: BTreeMap, - field_names: BTreeMap, - next_field_id: FieldId, - kind_ids: BTreeMap, - unnamed_kind_ids: BTreeMap, - kind_names: BTreeMap, - next_kind_id: KindId, - field_types: BTreeMap<(String, FieldId), Vec>, - field_cardinalities: BTreeMap<(String, FieldId), FieldCardinality>, - supertypes: BTreeMap>, -} - -impl Default for Schema { - fn default() -> Self { - Self::new() - } -} - -impl Schema { - pub fn new() -> Self { - Self { - field_ids: BTreeMap::new(), - field_names: BTreeMap::new(), - next_field_id: 1, // 0 is reserved - kind_ids: BTreeMap::new(), - unnamed_kind_ids: BTreeMap::new(), - kind_names: BTreeMap::new(), - next_kind_id: 1, // 0 is reserved - field_types: BTreeMap::new(), - field_cardinalities: BTreeMap::new(), - supertypes: BTreeMap::new(), - } - } - - /// Create a schema from a tree-sitter language, importing all its - /// known field and kind names. - pub fn from_language(language: &tree_sitter::Language) -> Self { - let mut schema = Self::new(); - // Import all field names, preserving tree-sitter's IDs - for id in 1..=language.field_count() as u16 { - if let Some(name) = language.field_name_for_id(id) { - schema.field_ids.insert(name.to_string(), id); - schema.field_names.insert(id, name); - if id >= schema.next_field_id { - schema.next_field_id = id + 1; - } +//! YEAST schema types. +//! +//! The schema struct itself lives in the [`yeast_schema`] crate (so it can +//! be shared with the `yeast-macros` proc-macro crate without dragging +//! tree-sitter into proc-macro compiles). This module re-exports its +//! public API and supplies the one tree-sitter-aware adapter the runtime +//! needs: [`from_language`]. + +pub use yeast_schema::schema::{FieldCardinality, NodeType, Schema}; + +/// Build a [`Schema`] from a tree-sitter language, importing all its +/// known field and kind names so the resulting schema's IDs line up with +/// the language's own IDs (i.e. `field_name_for_id` agrees). 
+pub fn from_language(language: &tree_sitter::Language) -> Schema { + let mut schema = Schema::new(); + + // Import all field names, preserving tree-sitter's IDs. + for id in 1..=language.field_count() as u16 { + if let Some(name) = language.field_name_for_id(id) { + schema.register_field_with_id(name, id); + } + } + + // Import all node kind names, preserving tree-sitter's IDs. + // Track named and unnamed variants separately. For both, prefer the + // canonical ID returned by `id_for_node_kind`, since some languages + // have multiple IDs for the same name (e.g. the reserved error token + // at ID 0 may share a name with a real token). + for id in 0..language.node_kind_count() as u16 { + if let Some(name) = language.node_kind_for_id(id) { + if name.is_empty() { + continue; } - } - // Import all node kind names, preserving tree-sitter's IDs. - // Track named and unnamed variants separately. For both named and - // unnamed kinds, use the canonical ID from id_for_node_kind, since - // some languages have multiple IDs for the same name (e.g., the - // reserved error token at ID 0 may share a name with a real token). 
- for id in 0..language.node_kind_count() as u16 { - if let Some(name) = language.node_kind_for_id(id) { - if !name.is_empty() { - let is_named = language.node_kind_is_named(id); - if is_named { - let canonical_id = language.id_for_node_kind(name, true); - if canonical_id != 0 && !schema.kind_ids.contains_key(name) { - schema.kind_ids.insert(name.to_string(), canonical_id); - schema.kind_names.insert(canonical_id, name); - } - } else { - let canonical_id = language.id_for_node_kind(name, false); - if canonical_id != 0 && !schema.unnamed_kind_ids.contains_key(name) { - schema - .unnamed_kind_ids - .insert(name.to_string(), canonical_id); - schema.kind_names.insert(canonical_id, name); - } - } - // Always track the name for any ID we encounter - schema.kind_names.entry(id).or_insert(name); - if id >= schema.next_kind_id { - schema.next_kind_id = id + 1; - } + let is_named = language.node_kind_is_named(id); + if is_named { + let canonical_id = language.id_for_node_kind(name, true); + if canonical_id != 0 && schema.id_for_node_kind(name).is_none() { + schema.register_kind_with_id(name, canonical_id); + } + } else { + let canonical_id = language.id_for_node_kind(name, false); + if canonical_id != 0 && schema.id_for_unnamed_node_kind(name).is_none() { + schema.register_unnamed_kind_with_id(name, canonical_id); } } + // Always track the name for any ID we encounter (so + // `node_kind_for_id` works for the literal `id` we saw, even + // when it isn't the canonical one). + schema.record_kind_name(id, name); } - schema - } - - /// Register a field name, returning its ID. - /// If already registered, returns the existing ID. 
- pub fn register_field(&mut self, name: &str) -> FieldId { - if name == "child" { - return CHILD_FIELD; - } - if let Some(&id) = self.field_ids.get(name) { - return id; - } - let id = self.next_field_id; - assert!(id < CHILD_FIELD, "too many fields"); - self.next_field_id += 1; - let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); - self.field_ids.insert(name.to_string(), id); - self.field_names.insert(id, leaked); - id - } - - /// Register a named node kind name, returning its ID. - /// If already registered, returns the existing ID. - pub fn register_kind(&mut self, name: &str) -> KindId { - if let Some(&id) = self.kind_ids.get(name) { - return id; - } - let id = self.next_kind_id; - self.next_kind_id += 1; - let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); - self.kind_ids.insert(name.to_string(), id); - self.kind_names.insert(id, leaked); - id } - /// Register an unnamed token kind (e.g. `"="`, `"end"`), returning its ID. - /// If already registered, returns the existing ID. 
- pub fn register_unnamed_kind(&mut self, name: &str) -> KindId { - if let Some(&id) = self.unnamed_kind_ids.get(name) { - return id; - } - let id = self.next_kind_id; - self.next_kind_id += 1; - let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); - self.unnamed_kind_ids.insert(name.to_string(), id); - self.kind_names.insert(id, leaked); - id - } - - pub fn field_id_for_name(&self, name: &str) -> Option { - if name == "child" { - return Some(CHILD_FIELD); - } - self.field_ids.get(name).copied() - } - - pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> { - if id == CHILD_FIELD { - return Some("child"); - } - self.field_names.get(&id).copied() - } - - pub fn id_for_node_kind(&self, kind: &str) -> Option { - self.kind_ids.get(kind).copied() - } - - pub fn id_for_unnamed_node_kind(&self, kind: &str) -> Option { - self.unnamed_kind_ids.get(kind).copied() - } - - pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> { - self.kind_names.get(&id).copied() - } - - pub fn set_field_types( - &mut self, - parent_kind: &str, - field_id: FieldId, - node_types: Vec, - ) { - self.field_types - .insert((parent_kind.to_string(), field_id), node_types); - } - - pub fn field_types(&self, parent_kind: &str, field_id: FieldId) -> Option<&Vec> { - self.field_types.get(&(parent_kind.to_string(), field_id)) - } - - pub fn set_field_cardinality( - &mut self, - parent_kind: &str, - field_id: FieldId, - cardinality: FieldCardinality, - ) { - self.field_cardinalities - .insert((parent_kind.to_string(), field_id), cardinality); - } - - /// Returns the declared cardinality for a field, if known. - pub fn field_cardinality( - &self, - parent_kind: &str, - field_id: FieldId, - ) -> Option { - self.field_cardinalities - .get(&(parent_kind.to_string(), field_id)) - .copied() - } - - /// Returns an iterator over all `(field_id, field_name)` pairs that are - /// declared as required (`required: true`) for the given `parent_kind`. 
- pub fn required_fields_for_kind<'a>( - &'a self, - parent_kind: &'a str, - ) -> impl Iterator)> + 'a { - self.field_cardinalities - .iter() - .filter(move |((kind, _), card)| kind == parent_kind && card.required) - .map(move |((_, field_id), _)| { - let name = self.field_name_for_id(*field_id); - (*field_id, name) - }) - } - - pub fn set_supertype_members(&mut self, supertype: &str, node_types: Vec) { - self.supertypes.insert(supertype.to_string(), node_types); - } - - fn allows_node( - &self, - node_type: &NodeType, - node_kind: &str, - node_named: bool, - active: &mut BTreeSet, - ) -> bool { - if node_type.kind == node_kind && node_type.named == node_named { - return true; - } - - if !node_type.named { - return false; - } - - let Some(members) = self.supertypes.get(&node_type.kind) else { - return false; - }; - - if !active.insert(node_type.kind.clone()) { - return false; - } - - let matched = members - .iter() - .any(|member| self.allows_node(member, node_kind, node_named, active)); - active.remove(&node_type.kind); - matched - } - - pub fn node_matches_types( - &self, - node_kind: &str, - node_named: bool, - node_types: &[NodeType], - ) -> bool { - node_types.iter().any(|node_type| { - self.allows_node(node_type, node_kind, node_named, &mut BTreeSet::new()) - }) - } + schema } diff --git a/shared/yeast/tests/input-types.yml b/shared/yeast/tests/input-types.yml new file mode 100644 index 000000000000..6bc184ec6470 --- /dev/null +++ b/shared/yeast/tests/input-types.yml @@ -0,0 +1,40 @@ +# Test input schema for yeast rules! macro tests. Covers a small subset of +# tree-sitter-ruby kinds used by the test rules. Kept deliberately small so +# the macro's compile-time loader can be exercised over a known surface. 
+ +named: + program: + $children*: [assignment, call, identifier, for] + + assignment: + left: [identifier, left_assignment_list] + right: [identifier, integer, call] + + left_assignment_list: + $children*: identifier + + for: + pattern: [identifier, left_assignment_list] + value: in + body: do + + in: + $children: [identifier, call] + + do: + $children*: [identifier, assignment, call] + + call: + receiver: [identifier, call] + method: identifier + + identifier: + integer: + +unnamed: + - "=" + - "," + - "for" + - "in" + - "do" + - "end" diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index 57a9e17dbd4c..3a24709dd9ff 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -989,7 +989,7 @@ fn test_one_shot_recurses_into_returned_capture() { yeast::rule!( (assignment left: (_) @left right: (_) @right) => - {left} + identifier { left } ), yeast::rule!((identifier) => (identifier "ID")), yeast::rule!((integer) => (integer "INT")), @@ -1084,7 +1084,7 @@ fn test_raw_capture_marker() { yeast::rule!( (assignment left: (_) @@raw_lhs right: (_) @rhs) => - { + call { let text = ctx.ast.source_text(raw_lhs); tree!((call method: (identifier #{text.as_str()}) @@ -1139,7 +1139,7 @@ fn test_raw_capture_marker_explicit_translate() { yeast::rule!( (assignment left: (_) @@raw_lhs right: (_) @rhs) => - { + call { let translated_lhs = ctx.translate(raw_lhs)?; tree!((call method: {translated_lhs} @@ -1322,3 +1322,233 @@ fn test_hash_brace_uses_capture_location_for_leaf() { assert_eq!(bar.start_byte(), 4); assert_eq!(bar.end_byte(), 7); } + +// ---- `rules!` macro tests (compile-time type-checking) ---- + +/// `rules!` should accept well-typed rules using the bare-rule-body +/// syntax (no inner `rule!` invocations) and produce a `Vec` that +/// behaves identically to a plain `vec![rule!(...)]` list. +#[test] +fn test_rules_macro_accepts_bare_rule_body() { + let rules: Vec = yeast::rules! 
{ + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + (assignment + left: (_) @left + right: (_) @right + ) + => + (assignment + left: {right} + right: {left} + ), + ] + }; + + let dump = run_and_dump("x = 1", rules); + assert_dump_eq( + &dump, + r#" + program + assignment + left: integer "1" + right: identifier "x" + "#, + ); +} + +/// The bare-rule-body shorthand `=> output_kind` should also be accepted. +#[test] +fn test_rules_macro_accepts_bare_shorthand_form() { + let rules: Vec = yeast::rules! { + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + (assignment + left: (_) @method + right: (_) @receiver + ) + => call, + ] + }; + + let dump = run_and_dump("x = 1", rules); + assert_dump_eq( + &dump, + r#" + program + call + method: identifier "x" + receiver: integer "1" + "#, + ); +} + +/// Backwards-compat: explicit `rule!(...)` invocations inside `rules!` +/// should still type-check and behave the same as the bare form. +#[test] +fn test_rules_macro_accepts_explicit_rule_macro() { + let rules: Vec = yeast::rules! { + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + rule!( + (assignment + left: (_) @left + right: (_) @right + ) + => + (assignment + left: {right} + right: {left} + ) + ), + ] + }; + assert_eq!(rules.len(), 1); +} + +/// `rules!` should pass through items that aren't bare rule bodies or +/// `rule!(...)` calls (e.g. helper-function calls returning a `Rule`), +/// without type-checking them. Bare and explicit rules in the same list +/// still get checked. +#[test] +fn test_rules_macro_allows_non_rule_items() { + fn extra() -> yeast::Rule { + rule!((identifier) => (identifier "extra")) + } + let rules: Vec = yeast::rules! { + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + (integer) => (integer "checked"), + extra(), + ] + }; + assert_eq!(rules.len(), 2); +} + +/// `rules!` should accept lists that mix bare-rule and explicit-rule items. 
+#[test] +fn test_rules_macro_mixes_bare_and_explicit_forms() { + let rules: Vec = yeast::rules! { + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + (integer) => (integer "I"), + rule!((identifier) => (identifier "S")), + ] + }; + assert_eq!(rules.len(), 2); +} + +// ---- Rule-body return-type annotation tests ---- +// +// The annotation form `=> kind [? | *] { rust_body }` is the future +// interface for Rust-bodied rules: the schema-vocabulary annotation +// declares the rule's output kind for static analysis. Today's codegen +// does NOT yet consume the annotation (it just adapts the returned +// value to `Vec` via `IntoFieldIds`); these tests only exercise +// the parser + the runtime-equivalence property. + +/// Annotation form with `*` (repeated): the rule body returns a +/// `Vec` and the annotation says the outputs are `assignment`s. +#[test] +fn test_rule_annotation_repeated() { + // Behaviourally equivalent to a two-node splice template. + let r: Rule = rule!( + (assignment left: (_) @l right: (_) @r) + => + assignment* { + let a1 = tree!((assignment left: {l} right: {r})); + let a2 = tree!((assignment left: {r} right: {l})); + vec![a1, a2] + } + ); + let ast = run_and_ast("x = 1", vec![r]); + // Just verify the run completes without a schema error; two + // top-level `assignment` nodes should appear as siblings. + let mut count = 0usize; + for id in ast.reachable_node_ids() { + if let Some(n) = ast.get_node(id) { + if n.kind_name() == "assignment" { + count += 1; + } + } + } + assert!( + count >= 2, + "expected at least two assignment nodes, got {count}" + ); +} + +/// Annotation form with `?` (optional): the rule body returns +/// `Option`. This uses `None` so the rule effectively deletes the +/// node. +#[test] +fn test_rule_annotation_optional_none() { + // Delete every `integer` (returning None yields no output nodes). + let r: Rule = rule!( + (integer) @lit + => + integer? 
{ + let _ = lit; + None:: + } + ); + let ast = run_and_ast("42", vec![r]); + // No integer node should survive. + for id in ast.reachable_node_ids() { + if let Some(n) = ast.get_node(id) { + assert_ne!(n.kind_name(), "integer", "integer should have been deleted"); + } + } +} + +/// Annotation form (single): the rule body returns a bare `Id`. +#[test] +fn test_rule_annotation_single() { + // Identity on assignment nodes, expressed with the annotation form. + let r: Rule = rule!( + (assignment left: (_) @l right: (_) @r) + => + assignment { + tree!((assignment left: {l} right: {r})) + } + ); + let ast = run_and_ast("x = 1", vec![r]); + let mut has_assignment = false; + for id in ast.reachable_node_ids() { + if let Some(n) = ast.get_node(id) { + if n.kind_name() == "assignment" { + has_assignment = true; + } + } + } + assert!(has_assignment, "expected an assignment node"); +} + +/// The shorthand `=> kind` form (no body, no annotation) must still be +/// distinguished from the annotation form and continue to work. +#[test] +fn test_shorthand_still_works_alongside_annotation_syntax() { + let r: Rule = rule!( + (assignment left: (_) @method right: (_) @receiver) + => + call + ); + let ast = run_and_ast("x = 1", vec![r]); + let mut has_call = false; + for id in ast.reachable_node_ids() { + if let Some(n) = ast.get_node(id) { + if n.kind_name() == "call" { + has_call = true; + } + } + } + assert!( + has_call, + "shorthand form should still produce a `call` node" + ); +} diff --git a/unified/extractor/src/languages/swift/swift.rs b/unified/extractor/src/languages/swift/swift.rs index 5689d930bff3..af9158433816 100644 --- a/unified/extractor/src/languages/swift/swift.rs +++ b/unified/extractor/src/languages/swift/swift.rs @@ -104,8 +104,8 @@ fn translation_rules() -> Vec> { ) ), // Declarations may be wrapped in local/global wrapper nodes. 
- rule!((global_declaration _ @inner) => {inner}), - rule!((local_declaration _ @inner) => {inner}), + rule!((global_declaration _ @inner) => stmt { inner }), + rule!((local_declaration _ @inner) => stmt { inner }), // ---- Literals ---- rule!((integer_literal) => (int_literal)), rule!((hex_literal) => (int_literal)), @@ -198,7 +198,7 @@ fn translation_rules() -> Vec> { type: _? @ty computed_value: (computed_property accessor: _+ @@accessors)) => - {{ + accessor_declaration* { ctx.property_name = Some(tree!((identifier #{pattern}))); ctx.property_type = ty; @@ -210,7 +210,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(acc)?); } result - }} + } ), // Computed property: shorthand getter (no explicit get/set, just // statements) → a single accessor_declaration with kind "get". @@ -249,7 +249,7 @@ fn translation_rules() -> Vec> { value: _? @val observers: (willset_didset_block willset: _? @@ws didset: _? @@ds)) => - {{ + member* { let var_decl = tree!( (variable_declaration modifier: {ctx.binding_modifier} @@ -271,7 +271,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(obs)?); } result - }} + } ), // property_binding with any pattern name (identifier or // destructuring). Reads outer modifiers / chained tag from `ctx`. @@ -305,7 +305,7 @@ fn translation_rules() -> Vec> { declarator: _* @@decls (modifiers)* @mods) => - {{ + member* { let binding_text = ctx.ast.source_text(binding_kind); ctx.binding_modifier = Some(ctx.literal("modifier", &binding_text)); ctx.outer_modifiers = mods; @@ -316,7 +316,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(decl)?); } result - }} + } ), // ---- Enums ---- // enum_type_parameter → parameter (with optional name as pattern). 
@@ -376,7 +376,7 @@ fn translation_rules() -> Vec> { rule!( (enum_entry case: _+ @@cases (modifiers)* @mods) => - {{ + member* { ctx.outer_modifiers = mods; let mut result = Vec::new(); @@ -385,7 +385,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(case)?); } result - }} + } ), // Plain assignment: `x = expr` rule!( @@ -400,9 +400,9 @@ fn translation_rules() -> Vec> { (compound_assign_expr target: {target} operator: (infix_operator #{op}) value: {value}) ), // Unwrap `type` wrapper node - rule!((type name: @inner) => {inner}), + rule!((type name: @inner) => type_expr { inner }), // `directly_assignable_expression` is just a wrapper; unwrap it - rule!((directly_assignable_expression expr: @inner) => {inner}), + rule!((directly_assignable_expression expr: @inner) => expr { inner }), // Pattern with bound_identifier → name_pattern rule!((pattern bound_identifier: @name) => (name_pattern identifier: (identifier #{name}))), // Pattern with 'let' or 'var' binding: extract the inner pattern @@ -410,7 +410,7 @@ fn translation_rules() -> Vec> { rule!( (pattern kind: (binding_pattern binding: _? pattern: @pattern)) => - {pattern} + pattern { pattern } ), // case T.foo(x,y) pattern rule!( @@ -463,10 +463,10 @@ fn translation_rules() -> Vec> { rule!( (function_parameter parameter: @@p default_value: _? @def) => - {{ + parameter* { ctx.default_value = def; ctx.translate(p)? - }} + } ), // Parameter with external name and type rule!( @@ -689,7 +689,7 @@ fn translation_rules() -> Vec> { element: (pattern_element pattern: (name_pattern identifier: (identifier #{name}))))) ), // If-condition — unwrap (pass through the inner expression/pattern) - rule!((if_condition kind: @inner) => {inner}), + rule!((if_condition kind: @inner) => expr_or_pattern { inner }), // ---- Loops ---- // For-in loop with optional where-clause guard. rule!( @@ -722,7 +722,7 @@ fn translation_rules() -> Vec> { body: (block stmt: {body})) ), // Labeled statement (e.g. `outer: for ...`). 
Strip the trailing ':' from the label token. - rule!((labeled_statement label: (statement_label) @lbl statement: @stmt) => { + rule!((labeled_statement label: (statement_label) @lbl statement: @stmt) => labeled_stmt { let text = ctx.ast.source_text(lbl); let name = &text[..text.len() - 1]; tree!((labeled_stmt label: (identifier #{name}) stmt: {stmt})) @@ -744,7 +744,7 @@ fn translation_rules() -> Vec> { rule!((dictionary_literal_item key: @k value: @v) => (key_value_pair key: {k} value: {v})), // ---- Optionals and errors ---- // Optional chaining — unwrap the marker - rule!((optional_chain_marker expr: @inner) => {inner}), + rule!((optional_chain_marker expr: @inner) => expr { inner }), // try/try?/try! expr → unary_expr with operator "try", "try?" or "try!" rule!((try_expression (try_operator) @op expr: @inner) => (unary_expr operator: (prefix_operator #{op}) operand: {inner})), rule!((try_expression operator: (try_operator) @op expr: @inner) => (unary_expr operator: (prefix_operator #{op}) operand: {inner})), @@ -800,7 +800,7 @@ fn translation_rules() -> Vec> { rule!( (identifier part: _+ @parts) => - {member_chain(&mut ctx, parts)} + expr { member_chain(&mut ctx, parts) } ), // Scoped import declaration (for example `import struct Foo.Bar`): // flatten the identifier parts into a member_access_expr and bind the @@ -831,7 +831,7 @@ fn translation_rules() -> Vec> { // Super expression → super_expr rule!((super_expression) => (super_expr)), // Modifiers — unwrap to individual modifier children - rule!((modifiers _* @mods) => {mods}), + rule!((modifiers _* @mods) => modifier* { mods }), rule!((attribute) @m => (modifier #{m})), rule!((visibility_modifier) @m => (modifier #{m})), rule!((function_modifier) @m => (modifier #{m})), @@ -843,7 +843,7 @@ fn translation_rules() -> Vec> { rule!((inheritance_modifier) @m => (modifier #{m})), rule!((property_behavior_modifier) @m => (modifier #{m})), // Type annotations — unwrap - rule!((type_annotation type: @inner) => 
{inner}), + rule!((type_annotation type: @inner) => type_expr { inner }), // user_type is split into simple_user_type parts. // Keep a conservative textual fallback to avoid dropping type information. rule!((user_type) @ty => (named_type_expr name: (identifier #{ty}))), @@ -1018,7 +1018,7 @@ fn translation_rules() -> Vec> { type: _? @ty (modifiers)* @mods) => - {{ + accessor_declaration* { ctx.property_name = Some(tree!((identifier #{name}))); ctx.property_type = ty; ctx.outer_modifiers = mods; @@ -1029,7 +1029,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(acc)?); } result - }} + } ), // getter_specifier / setter_specifier → bodyless accessor_declaration // getter_specifier / setter_specifier → bodyless @@ -1056,7 +1056,7 @@ fn translation_rules() -> Vec> { modifier: {chained_modifier(&mut ctx)}) ), // protocol_property_requirements wrapper — should be consumed by above; fallback - rule!((protocol_property_requirements accessor: _* @accs) => {accs}), + rule!((protocol_property_requirements accessor: _* @accs) => accessor_declaration* { accs }), // Computed getter → accessor_declaration (body optional). // Reads property name/type from the outer property_binding rule // and binding/outer modifiers + chained tag from the outer @@ -1116,7 +1116,7 @@ fn translation_rules() -> Vec> { // willset/didset block — spread to children (only reachable as a // fallback; the outer property_binding manual rule normally // captures the willset/didset clauses directly). - rule!((willset_didset_block _* @clauses) => {clauses}), + rule!((willset_didset_block _* @clauses) => accessor_declaration* { clauses }), // willset clause → accessor_declaration (body optional). 
Reads // `ctx.property_name` set by the outer property_binding rule and // binding/outer modifiers + chained tag from the outer @@ -1147,16 +1147,18 @@ fn translation_rules() -> Vec> { // Preprocessor conditionals — unsupported rule!((diagnostic) => (unsupported_node)), // ---- Fallbacks ---- + // Bare `_` (rather than `(_)`) so this matches both named nodes + // and unnamed tokens. Any unnamed token that escapes the + // input-schema-specific rules (e.g. captured operators in + // `additive_expression op: @op`) has its auto-translated value + // replaced with an `unsupported_node` whose source range is + // inherited from the original token, so `#{op}` still reads the + // original text. rule!( - (_) + _ => (unsupported_node) ), - rule!( - _ @node - => - {node} - ), ] } diff --git a/unified/extractor/tests/rules_macro_smoke.rs b/unified/extractor/tests/rules_macro_smoke.rs new file mode 100644 index 000000000000..cde8ae3ca4ab --- /dev/null +++ b/unified/extractor/tests/rules_macro_smoke.rs @@ -0,0 +1,25 @@ +/// Smoke test: load a few real Swift translation rules through the new +/// `yeast::rules!` macro using the bare-rule-body syntax, and confirm the +/// input + output schemas accept them. Compiles only — any type-checking +/// error surfaces as a compile-time error. +#[test] +fn rules_macro_compiles_against_real_swift_schemas() { + let _rules: Vec = yeast::rules! { + input: "tree-sitter-swift/node-types.yml", + output: "ast_types.yml", + [ + (simple_identifier) @name + => + (name_expr + identifier: (identifier #{name})), + + (integer_literal) @lit + => + (int_literal #{lit}), + + (line_string_literal) @lit + => + (string_literal #{lit}), + ] + }; +}