Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract and recover structure of extras #48

Merged
merged 13 commits into from
Sep 18, 2024
Merged
2 changes: 0 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ build:
#
.PHONY: setup
setup:
test -f tree-sitter-version \
|| cp tree-sitter-version.default tree-sitter-version
./scripts/check-prerequisites
./scripts/install-tree-sitter-cli
./scripts/install-tree-sitter-lib
Expand Down
1 change: 1 addition & 0 deletions scripts/download-tree-sitter
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ while [[ $# -gt 0 ]]; do
shift
done

./scripts/update-version-symlinks
version=$(cat tree-sitter-version)

mkdir -p downloads
Expand Down
4 changes: 4 additions & 0 deletions scripts/update-version-symlinks
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,13 @@
#
set -eu

if [[ ! -e tree-sitter-version ]]; then
cp tree-sitter-version.default tree-sitter-version
fi
version=$(cat tree-sitter-version)

echo "Updating symlinks 'downloads/tree-sitter' and 'tree-sitter'"
mkdir -p downloads

(
cd downloads
Expand Down
4 changes: 2 additions & 2 deletions src/bindings/lib/Tree_sitter_output.atd
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ type node = {
}

type position = {
row: int;
column: int;
row: int; (* 0-based *)
column: int; (* 0-based *)
}
5 changes: 3 additions & 2 deletions src/gen/lib/CST_grammar.ml
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,9 @@ type grammar = {
(* rules, grouped and sorted in dependency order. *)

extras: string list;
(* node names that don't belong to any rule and can occur anywhere,
such as comments. *)
(* rules names for constructs that can occur anywhere independently from
the grammar, such as comments. Other extras such as string literals
and patterns were removed because we don't need them. *)
}
[@@deriving show {with_path = false}]

Expand Down
25 changes: 14 additions & 11 deletions src/gen/lib/CST_grammar_conv.ml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
open CST_grammar

(*
Traverse the grammar starting from the entrypoint, return the set of
visited rule names.
Traverse the grammar starting from all the entrypoints, which
are the main entrypoint (first rule in the tree-sitter grammar)
and the extras. Return the set of visited rule names.
*)
let detect_used ~entrypoint rules =
let detect_used ~entrypoints rules =
let rule_tbl = Hashtbl.create 100 in
List.iter (fun (name, x) -> Hashtbl.add rule_tbl name x) rules;
let get_rule name =
Expand Down Expand Up @@ -48,7 +49,7 @@ let detect_used ~entrypoint rules =
| None -> ()
| Some x -> scan x
in
visit entrypoint;
List.iter visit entrypoints;
was_visited

let name_of_body opt_rule_name body =
Expand Down Expand Up @@ -241,14 +242,16 @@ let tsort_rules rules =
) group
) sorted

(*
Extras can be rule names, strings or patterns.
Here we only keep rule names. We need them to identify tree nodes that
are extras and should be handled independently from the grammar.
*)
let filter_extras bodies =
List.filter_map (fun (x : Tree_sitter_t.rule_body) ->
match x with
| SYMBOL name -> Some name
| STRING name ->
(* Results in tree-sitter parse error at the moment.
Presumably not super useful. *)
Some name
| STRING _ -> None
| _ -> None
) bodies

Expand All @@ -259,7 +262,8 @@ let of_tree_sitter (x : Tree_sitter_t.grammar) : t =
| (name, _) :: _ -> name
| _ -> "program"
in
let is_used = detect_used ~entrypoint x.rules in
let extras = filter_extras x.extras in
let is_used = detect_used ~entrypoints:(entrypoint :: extras) x.rules in
let grammar_rules = translate_rules x.rules in
let all_rules =
make_external_rules x.externals @ grammar_rules
Expand All @@ -270,12 +274,11 @@ let of_tree_sitter (x : Tree_sitter_t.grammar) : t =
body;
is_rec = true; (* set correctly by tsort below *)
is_inlined_rule = is_inlined_rule;
is_inlined_type = false
is_inlined_type = false;
}
)
in
let sorted_rules = tsort_rules all_rules in
let extras = filter_extras x.extras in
{
name = x.name;
entrypoint;
Expand Down
2 changes: 2 additions & 0 deletions src/gen/lib/Codegen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,14 @@ let () =
~parse_source_file:%s.parse_source_file
~parse_input_tree:%s.parse_input_tree
~dump_tree:%s.dump_tree
~dump_extras:%s.dump_extras
"
lib_module_name
lang
parse_module_name
parse_module_name
boilerplate_module_name
boilerplate_module_name

let ocaml ?out_dir ~lang grammar =
let cst_module_name = "CST" in
Expand Down
50 changes: 43 additions & 7 deletions src/gen/lib/Codegen_CST.ml
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,20 @@ module Fmt = struct
let type_app param type_name =
label param (atom type_name)

let product l =
(* About the need for parentheses:
- For polymorphic variants, the presence of parentheses changes nothing,
a standalone tuple is always created.
- For classic variants though, the presence parentheses forces the
creation of a tuple which is detachable from the variant constructor.
Without parentheses though, we save the allocation of a block. *)
let product ~paren l =
match l with
| [x] -> x
| l -> E.List (("(", "*", ")", Style.left_sep_paren_list), l)
| l ->
if paren then
E.List (("(", "*", ")", Style.left_sep_paren_list), l)
else
E.List (("", "*", "", Style.left_sep_paren_list), l)

let classic_variant l =
let cases =
Expand Down Expand Up @@ -110,7 +120,7 @@ module Fmt = struct
let top_sequence l =
E.List (("", "", "", Style.vert_seq), l)

let typedef pos (name, inlined, rhs) =
let typedef pos (name, inlined, opt_rhs) =
let is_first = (pos = 0) in
let type_ =
if is_first then
Expand All @@ -122,7 +132,11 @@ module Fmt = struct
if inlined then " (* inlined *)"
else ""
in
let code = def (sprintf "%s %s%s =" type_ name comment) rhs in
let code =
match opt_rhs with
| Some rhs -> def (sprintf "%s %s%s =" type_ name comment) rhs
| None -> atom (sprintf "%s %s%s" type_ name comment)
in
if is_first then code
else
top_sequence [
Expand Down Expand Up @@ -206,7 +220,7 @@ let rec format_body ?def_name body : E.t =
| Optional body ->
Fmt.type_app (format_body body) "option"
| Seq body_list ->
Fmt.product (format_seq body_list)
Fmt.product ~paren:true (format_seq body_list)

and format_choice l =
List.map (fun (name, body) ->
Expand All @@ -219,7 +233,29 @@ and format_seq l =
let format_rule (rule : rule) =
(trans rule.name,
rule.is_inlined_type,
format_body ~def_name:rule.name rule.body)
Some (format_body ~def_name:rule.name rule.body))

let format_extra_def extras =
let rhs =
match extras with
| [] -> None
| extras ->
let cases =
extras
|> List.map (fun rule_name ->
let constructor =
Codegen_util.translate_ident_uppercase rule_name in
(constructor, Some (Fmt.product ~paren:false [
format_body (Symbol "Loc.t");
format_body (Symbol rule_name);
]))
)
|> Fmt.poly_variant
in
Some cases
in
[[("extra", false, rhs)];
[("extras", false, Some (Fmt.atom "extra list"))]]

(*
1. Identify names that are used at most once, becoming candidates
Expand All @@ -242,7 +278,7 @@ let format_types grammar =
Fmt.recursive_typedefs x;
Fmt.atom ""
]
) semi_formatted_defs
) (semi_formatted_defs @ format_extra_def grammar.extras)
|> Fmt.top_sequence

let generate grammar =
Expand Down
59 changes: 54 additions & 5 deletions src/gen/lib/Codegen_boilerplate.ml
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ let gen_rule_mapper_binding ~cst_module_name (rule : rule) =
Block (gen_mapper_body_multi env);
]

let gen ~cst_module_name grammar =
let gen ~cst_module_name ~is_extra grammar =
List.filter_map (fun rule_group ->
let is_rec =
match rule_group with
Expand All @@ -182,7 +182,7 @@ let gen ~cst_module_name grammar =
in
let bindings =
List.filter_map (fun rule ->
if rule.is_inlined_type then
if rule.is_inlined_type && not (is_extra rule.name) then
None
else
Some (gen_rule_mapper_binding ~cst_module_name rule)
Expand All @@ -204,14 +204,63 @@ let generate_dumper grammar =

let dump_tree root =
map_%s () root
|> Tree_sitter_run.Raw_tree.to_string
|> print_string
|> Tree_sitter_run.Raw_tree.to_channel stdout
"
(trans grammar.entrypoint)

let generate_map_extra grammar =
let cases =
grammar.extras
|> List.map (fun rule_name ->
let constructor =
Codegen_util.translate_ident_uppercase rule_name in
sprintf " | `%s (_loc, x) -> (%S, %S, map_%s env x)\n"
constructor rule_name (trans rule_name) (trans rule_name)
)
|> String.concat ""
in
sprintf "\

let map_extra (env : env) (x : CST.extra) =
match x with
%s"
cases

let generate_extra_dumper grammar =
match grammar.extras with
| [] ->
"\
let dump_extras (extras : CST.extras) = ()
"
| _ ->
sprintf "\
%s
let dump_extras (extras : CST.extras) =
List.iter (fun extra ->
let ts_rule_name, ocaml_type_name, raw_tree = map_extra () extra in
let details =
if ocaml_type_name <> ts_rule_name then
Printf.sprintf \" (OCaml type '%%s')\" ocaml_type_name
else
\"\"
in
Printf.printf \"%%s%%s:\\n\" ts_rule_name details;
Tree_sitter_run.Raw_tree.to_channel stdout raw_tree
) extras
"
(generate_map_extra grammar)

let make_is_extra grammar =
let tbl = Hashtbl.create 100 in
List.iter (fun rule_name -> Hashtbl.replace tbl rule_name ()) grammar.extras;
fun rule_name ->
Hashtbl.mem tbl rule_name

let generate ~cst_module_name grammar =
let inline_grammar = Nice_typedefs.rearrange_rules grammar in
let tree = gen ~cst_module_name inline_grammar in
let is_extra = make_is_extra grammar in
let tree = gen ~cst_module_name ~is_extra inline_grammar in
make_header grammar
^ Indent.to_string tree
^ generate_dumper grammar
^ generate_extra_dumper grammar
Loading
Loading