Skip to content

Commit 8a2a48d

Browse files
asgerf and Copilot committed
Unified extractor: add AST schema, swift translation rules, and corpus framework
Add ast_types.yml defining the unified output AST schema with supertypes (expr, stmt, condition, pattern) and named nodes (top_level, binary_expr, name_expr, etc.). Rewrite swift translation rules to map from tree-sitter Swift grammar to the unified AST, using one-shot phase rules. Update the generator to use the output AST schema for dbscheme/QL generation, and normalize the extraction table prefix to 'unified'. Improve the corpus test framework to include raw tree-sitter parse output, type-error checking against the output schema, and better failure reporting. Regenerate Ast.qll, unified.dbscheme, and update BasicTest accordingly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 5d0cb9e commit 8a2a48d

10 files changed

Lines changed: 442 additions & 220 deletions

File tree

unified/extractor/ast_types.yml

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
supertypes:
2+
expr:
3+
- binary_expr
4+
- unary_expr
5+
- name_expr
6+
- lambda_expr
7+
- unsupported_node
8+
stmt:
9+
- empty_stmt
10+
- if_stmt
11+
- variable_declaration_stmt
12+
- guard_if_stmt
13+
- unsupported_node
14+
condition:
15+
- expr_condition
16+
- let_pattern_condition
17+
- unsupported_node
18+
pattern:
19+
- var_pattern
20+
- apply_pattern
21+
- ignore_pattern
22+
- unsupported_node
23+
named:
24+
# Top-level is the root node, currently containing a list of expressions
25+
top_level:
26+
body*: expr
27+
28+
# Application of a binary operator, such as `a + b`
29+
binary_expr:
30+
left: expr
31+
operator: operator
32+
right: expr
33+
34+
# Application of a unary operator, such as `!x`
35+
unary_expr:
36+
operand: expr
37+
operator: operator
38+
39+
# An identifier used in the context of an expression
40+
name_expr:
41+
identifier: identifier
42+
43+
lambda_expr:
44+
parameter*: parameter
45+
body: [expr, stmt]
46+
47+
# A parameter
48+
parameter:
49+
pattern: pattern
50+
51+
empty_stmt:
52+
53+
if_stmt:
54+
condition: condition
55+
then?: stmt
56+
else?: stmt
57+
58+
variable_declaration_stmt:
59+
variable_declarator+: variable_declarator
60+
61+
# A variable declaration, or assignment to a pattern.
62+
# The initializer ('value') is optional (though omitting it is typically only possible in combination with a simple variable pattern).
63+
variable_declarator:
64+
pattern: pattern
65+
value?: expr
66+
67+
# Evaluate 'condition', and if false, execute 'else' which must break from the enclosing block scope (return, break, etc).
68+
# Any variables bound by 'condition' will be in scope for the remainder of the enclosing block scope
69+
# (which differs from how if_stmt works).
70+
guard_if_stmt:
71+
condition: condition
72+
else: stmt
73+
74+
# Evaluates the given condition and interprets it as a boolean (by language conventions)
75+
expr_condition:
76+
expr: expr
77+
78+
# Evaluate 'value' and match its result against 'pattern', and return true if it matches.
79+
# Variables bound by the pattern will be in scope within the 'true' branch controlled by this condition.
80+
let_pattern_condition:
81+
pattern: pattern
82+
value: expr
83+
84+
# A pattern matching anything, binding its value to the given variable
85+
var_pattern:
86+
identifier: identifier
87+
88+
# A pattern matching anything, binding no variables, usually using the syntax "_"
89+
ignore_pattern:
90+
91+
# A pattern such as `Some(x)` where `Some` is the constructor and `x` is an argument
92+
apply_pattern:
93+
constructor: expr
94+
argument*: expr
95+
96+
# A simple unqualified identifier token
97+
identifier:
98+
99+
# A node that we don't yet translate
100+
unsupported_node:
101+
102+
operator:

unified/extractor/src/extractor.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,17 @@ pub struct Options {
2323
pub fn run(options: Options) -> std::io::Result<()> {
2424
codeql_extractor::extractor::set_tracing_level("unified");
2525

26+
// The generated dbscheme/QL library uses the unified_* relation namespace.
27+
// Keep per-language specs for parser/rules/file globs, but normalize the
28+
// extraction table prefix so emitted TRAP relations match the dbscheme.
29+
let mut languages = languages::all_language_specs();
30+
for lang in &mut languages {
31+
lang.prefix = "unified";
32+
}
33+
2634
let extractor = simple::Extractor {
2735
prefix: "unified".to_string(),
28-
languages: languages::all_language_specs(),
36+
languages,
2937
trap_dir: options.output_dir,
3038
trap_compression: trap::Compression::from_env("CODEQL_EXTRACTOR_UNIFIED_OPTION_TRAP_COMPRESSION"),
3139
source_archive_dir: options.source_archive_dir,

unified/extractor/src/generator.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ use std::path::PathBuf;
33

44
use codeql_extractor::generator::{generate, language::Language};
55

6+
use crate::languages;
7+
68
#[derive(Args)]
79
pub struct Options {
810
/// Path of the generated dbscheme file
@@ -17,10 +19,16 @@ pub struct Options {
1719
pub fn run(options: Options) -> std::io::Result<()> {
1820
codeql_extractor::extractor::set_tracing_level("unified");
1921

22+
// The QL-visible schema is the unified output AST, not the per-language
23+
// input grammars. Pass it via `desugar.output_node_types_yaml` so the
24+
// generator converts the YAML to JSON node-types.
25+
let desugar = yeast::DesugaringConfig::new()
26+
.with_output_node_types_yaml(languages::OUTPUT_AST_SCHEMA);
27+
2028
let languages = vec![Language {
21-
name: "Swift".to_owned(),
22-
node_types: tree_sitter_swift::NODE_TYPES,
23-
desugar: None,
29+
name: "Unified".to_owned(),
30+
node_types: "", // unused: generator picks up output_node_types_yaml above
31+
desugar: Some(desugar),
2432
}];
2533

2634
generate(languages, options.dbscheme, options.library, "run unified/scripts/create-extractor-pack.sh")

unified/extractor/src/languages/mod.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ use codeql_extractor::extractor::simple;
33
#[path = "swift/swift.rs"]
44
mod swift;
55

6+
/// Shared YEAST output AST schema for all languages.
7+
pub(crate) const OUTPUT_AST_SCHEMA: &str = include_str!("../../ast_types.yml");
8+
69
pub fn all_language_specs() -> Vec<simple::LanguageSpec> {
7-
vec![swift::language_spec()]
10+
vec![swift::language_spec(OUTPUT_AST_SCHEMA)]
811
}

unified/extractor/src/languages/swift/swift.rs

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,59 @@
11
use codeql_extractor::extractor::simple;
22
use yeast::{rule, DesugaringConfig, PhaseKind};
33

4-
fn desugaring_rules() -> Vec<yeast::Rule> {
4+
fn translation_rules() -> Vec<yeast::Rule> {
55
vec![
66
rule!(
7-
(additive_expression)
7+
(source_file (_)* @children)
88
=>
9-
(simple_identifier "blah")
9+
(top_level
10+
body: {..children}
11+
)
12+
),
13+
rule!(
14+
(additive_expression
15+
lhs: (_) @left
16+
op: _ @operator
17+
rhs: (_) @right)
18+
=>
19+
(binary_expr
20+
left: {left}
21+
operator: (operator #{operator})
22+
right: {right})
23+
),
24+
rule!(
25+
(multiplicative_expression
26+
lhs: (_) @left
27+
op: _ @operator
28+
rhs: (_) @right)
29+
=>
30+
(binary_expr
31+
left: {left}
32+
operator: (operator #{operator})
33+
right: {right})
34+
),
35+
rule!(
36+
(simple_identifier)
37+
=>
38+
name_expr
39+
),
40+
rule!(
41+
(_)
42+
=>
43+
(unsupported_node)
44+
),
45+
rule!(
46+
_ @node
47+
=>
48+
{node}
1049
),
1150
]
1251
}
1352

14-
pub fn language_spec() -> simple::LanguageSpec {
15-
let desugar = DesugaringConfig::new().add_phase("desugar", PhaseKind::Repeating, desugaring_rules());
53+
pub fn language_spec(desugared_ast_schema: &'static str) -> simple::LanguageSpec {
54+
let desugar = DesugaringConfig::new()
55+
.add_phase("translate", PhaseKind::OneShot, translation_rules())
56+
.with_output_node_types_yaml(desugared_ast_schema);
1657
simple::LanguageSpec {
1758
prefix: "swift",
1859
ts_language: tree_sitter_swift::LANGUAGE.into(),

0 commit comments

Comments
 (0)