semgrep · mjambon · Sep 18, 2024 · Sep 14, 2022 · Sep 14, 2022 · Sep 15, 2022
diff --git a/Makefile b/Makefile
@@ -18,8 +18,6 @@ build:
 #
 .PHONY: setup
 setup:
-	test -f tree-sitter-version \
-	|| cp tree-sitter-version.default tree-sitter-version
 	./scripts/check-prerequisites
 	./scripts/install-tree-sitter-cli
 	./scripts/install-tree-sitter-lib

diff --git a/scripts/download-tree-sitter b/scripts/download-tree-sitter
@@ -41,6 +41,7 @@ while [[ $# -gt 0 ]]; do
   shift
 done
 
+./scripts/update-version-symlinks
 version=$(cat tree-sitter-version)
 
 mkdir -p downloads

diff --git a/scripts/update-version-symlinks b/scripts/update-version-symlinks
@@ -15,9 +15,13 @@
 #
 set -eu
 
+if [[ ! -e tree-sitter-version ]]; then
+  cp tree-sitter-version.default tree-sitter-version
+fi
 version=$(cat tree-sitter-version)
 
 echo "Updating symlinks 'downloads/tree-sitter' and 'tree-sitter'"
+mkdir -p downloads
 
 (
   cd downloads

diff --git a/src/bindings/lib/Tree_sitter_output.atd b/src/bindings/lib/Tree_sitter_output.atd
@@ -42,6 +42,6 @@ type node = {
 }
 
 type position = {
-  row: int;
-  column: int;
+  row: int; (* 0-based *)
+  column: int; (* 0-based *)
 }
diff --git a/src/gen/lib/CST_grammar.ml b/src/gen/lib/CST_grammar.ml
@@ -163,8 +163,9 @@ type grammar = {
   (* rules, grouped and sorted in dependency order. *)
 
   extras: string list;
-  (* node names that don't belong to any rule and can occur anywhere,
-     such as comments. *)
+  (* rules names for constructs that can occur anywhere independently from
+     the grammar, such as comments. Other extras such as string literals
+     and patterns were removed because we don't need them. *)
 }
 [@@deriving show {with_path = false}]
 

diff --git a/src/gen/lib/CST_grammar_conv.ml b/src/gen/lib/CST_grammar_conv.ml
@@ -5,10 +5,11 @@
 open CST_grammar
 
 (*
-   Traverse the grammar starting from the entrypoint, return the set of
-   visited rule names.
+   Traverse the grammar starting from all the entrypoints, which
+   are the main entrypoint (first rule in the tree-sitter grammar)
+   and the extras. Return the set of visited rule names.
 *)
-let detect_used ~entrypoint rules =
+let detect_used ~entrypoints rules =
   let rule_tbl = Hashtbl.create 100 in
   List.iter (fun (name, x) -> Hashtbl.add rule_tbl name x) rules;
   let get_rule name =
@@ -48,7 +49,7 @@ let detect_used ~entrypoint rules =
     | None -> ()
     | Some x -> scan x
   in
-  visit entrypoint;
+  List.iter visit entrypoints;
   was_visited
 
 let name_of_body opt_rule_name body =
@@ -241,14 +242,16 @@ let tsort_rules rules =
     ) group
   ) sorted
 
+(*
+   Extras can be rule names, strings or patterns.
+   Here we only keep rule names. We need them to identify tree nodes that
+   are extras and should be handled independently from the grammar.
+*)
 let filter_extras bodies =
   List.filter_map (fun (x : Tree_sitter_t.rule_body) ->
     match x with
     | SYMBOL name -> Some name
-    | STRING name ->
-        (* Results in tree-sitter parse error at the moment.
-           Presumably not super useful. *)
-        Some name
+    | STRING _ -> None
     | _ -> None
   ) bodies
 
@@ -259,7 +262,8 @@ let of_tree_sitter (x : Tree_sitter_t.grammar) : t =
     | (name, _) :: _ -> name
     | _ -> "program"
   in
-  let is_used = detect_used ~entrypoint x.rules in
+  let extras = filter_extras x.extras in
+  let is_used = detect_used ~entrypoints:(entrypoint :: extras) x.rules in
   let grammar_rules = translate_rules x.rules in
   let all_rules =
     make_external_rules x.externals @ grammar_rules
@@ -270,12 +274,11 @@ let of_tree_sitter (x : Tree_sitter_t.grammar) : t =
         body;
         is_rec = true; (* set correctly by tsort below *)
         is_inlined_rule = is_inlined_rule;
-        is_inlined_type = false
+        is_inlined_type = false;
       }
     )
   in
   let sorted_rules = tsort_rules all_rules in
-  let extras = filter_extras x.extras in
   {
     name = x.name;
     entrypoint;

diff --git a/src/gen/lib/Codegen.ml b/src/gen/lib/Codegen.ml
@@ -50,12 +50,14 @@ let () =
     ~parse_source_file:%s.parse_source_file
     ~parse_input_tree:%s.parse_input_tree
     ~dump_tree:%s.dump_tree
+    ~dump_extras:%s.dump_extras
 "
     lib_module_name
     lang
     parse_module_name
     parse_module_name
     boilerplate_module_name
+    boilerplate_module_name
 
 let ocaml ?out_dir ~lang grammar =
   let cst_module_name = "CST" in

diff --git a/src/gen/lib/Codegen_CST.ml b/src/gen/lib/Codegen_CST.ml
@@ -71,10 +71,20 @@ module Fmt = struct
   let type_app param type_name =
     label param (atom type_name)
 
-  let product l =
+  (* About the need for parentheses:
+     - For polymorphic variants, the presence of parentheses changes nothing,
+       a standalone tuple is always created.
+     - For classic variants though, the presence parentheses forces the
+       creation of a tuple which is detachable from the variant constructor.
+       Without parentheses though, we save the allocation of a block. *)
+  let product ~paren l =
     match l with
     | [x] -> x
-    | l -> E.List (("(", "*", ")", Style.left_sep_paren_list), l)
+    | l ->
+        if paren then
+          E.List (("(", "*", ")", Style.left_sep_paren_list), l)
+        else
+          E.List (("", "*", "", Style.left_sep_paren_list), l)
 
   let classic_variant l =
     let cases =
@@ -110,7 +120,7 @@ module Fmt = struct
   let top_sequence l =
     E.List (("", "", "", Style.vert_seq), l)
 
-  let typedef pos (name, inlined, rhs) =
+  let typedef pos (name, inlined, opt_rhs) =
     let is_first = (pos = 0) in
     let type_ =
       if is_first then
@@ -122,7 +132,11 @@ module Fmt = struct
       if inlined then " (* inlined *)"
       else ""
     in
-    let code = def (sprintf "%s %s%s =" type_ name comment) rhs in
+    let code =
+      match opt_rhs with
+      | Some rhs -> def (sprintf "%s %s%s =" type_ name comment) rhs
+      | None -> atom (sprintf "%s %s%s" type_ name comment)
+    in
     if is_first then code
     else
       top_sequence [
@@ -206,7 +220,7 @@ let rec format_body ?def_name body : E.t =
   | Optional body ->
       Fmt.type_app (format_body body) "option"
   | Seq body_list ->
-      Fmt.product (format_seq body_list)
+      Fmt.product ~paren:true (format_seq body_list)
 
 and format_choice l =
   List.map (fun (name, body) ->
@@ -219,7 +233,29 @@ and format_seq l =
 let format_rule (rule : rule) =
   (trans rule.name,
    rule.is_inlined_type,
-   format_body ~def_name:rule.name rule.body)
+   Some (format_body ~def_name:rule.name rule.body))
+
+let format_extra_def extras =
+  let rhs =
+    match extras with
+    | [] -> None
+    | extras ->
+        let cases =
+          extras
+          |> List.map (fun rule_name ->
+            let constructor =
+              Codegen_util.translate_ident_uppercase rule_name in
+            (constructor, Some (Fmt.product ~paren:false [
+               format_body (Symbol "Loc.t");
+               format_body (Symbol rule_name);
+             ]))
+          )
+          |> Fmt.poly_variant
+        in
+        Some cases
+  in
+  [[("extra", false, rhs)];
+   [("extras", false, Some (Fmt.atom "extra list"))]]
 
 (*
    1. Identify names that are used at most once, becoming candidates
@@ -242,7 +278,7 @@ let format_types grammar =
       Fmt.recursive_typedefs x;
       Fmt.atom ""
     ]
-  ) semi_formatted_defs
+  ) (semi_formatted_defs @ format_extra_def grammar.extras)
   |> Fmt.top_sequence
 
 let generate grammar =

diff --git a/src/gen/lib/Codegen_boilerplate.ml b/src/gen/lib/Codegen_boilerplate.ml
@@ -173,7 +173,7 @@ let gen_rule_mapper_binding ~cst_module_name (rule : rule) =
     Block (gen_mapper_body_multi env);
   ]
 
-let gen ~cst_module_name grammar =
+let gen ~cst_module_name ~is_extra grammar =
   List.filter_map (fun rule_group ->
     let is_rec =
       match rule_group with
@@ -182,7 +182,7 @@ let gen ~cst_module_name grammar =
     in
     let bindings =
       List.filter_map (fun rule ->
-        if rule.is_inlined_type then
+        if rule.is_inlined_type && not (is_extra rule.name) then
           None
         else
           Some (gen_rule_mapper_binding ~cst_module_name rule)
@@ -204,14 +204,63 @@ let generate_dumper grammar =
 
 let dump_tree root =
   map_%s () root
-  |> Tree_sitter_run.Raw_tree.to_string
-  |> print_string
+  |> Tree_sitter_run.Raw_tree.to_channel stdout
 "
     (trans grammar.entrypoint)
 
+let generate_map_extra grammar =
+  let cases =
+    grammar.extras
+    |> List.map (fun rule_name ->
+      let constructor =
+        Codegen_util.translate_ident_uppercase rule_name in
+      sprintf "  | `%s (_loc, x) -> (%S, %S, map_%s env x)\n"
+        constructor rule_name (trans rule_name) (trans rule_name)
+    )
+    |> String.concat ""
+  in
+  sprintf "\
+
+let map_extra (env : env) (x : CST.extra) =
+  match x with
+%s"
+    cases
+
+let generate_extra_dumper grammar =
+  match grammar.extras with
+  | [] ->
+      "\
+let dump_extras (extras : CST.extras) = ()
+"
+  | _ ->
+      sprintf "\
+%s
+let dump_extras (extras : CST.extras) =
+  List.iter (fun extra ->
+    let ts_rule_name, ocaml_type_name, raw_tree = map_extra () extra in
+    let details =
+      if ocaml_type_name <> ts_rule_name then
+        Printf.sprintf \" (OCaml type '%%s')\" ocaml_type_name
+      else
+        \"\"
+    in
+    Printf.printf \"%%s%%s:\\n\" ts_rule_name details;
+    Tree_sitter_run.Raw_tree.to_channel stdout raw_tree
+  ) extras
+"
+        (generate_map_extra grammar)
+
+let make_is_extra grammar =
+  let tbl = Hashtbl.create 100 in
+  List.iter (fun rule_name -> Hashtbl.replace tbl rule_name ()) grammar.extras;
+  fun rule_name ->
+    Hashtbl.mem tbl rule_name
+
 let generate ~cst_module_name grammar =
   let inline_grammar = Nice_typedefs.rearrange_rules grammar in
-  let tree = gen ~cst_module_name inline_grammar in
+  let is_extra = make_is_extra grammar in
+  let tree = gen ~cst_module_name ~is_extra inline_grammar in
   make_header grammar
   ^ Indent.to_string tree
   ^ generate_dumper grammar
+  ^ generate_extra_dumper grammar
-Original file line number
+Diff line change
@@ Expand Up / @@ -41,6 +41,7 @@ while [[ $# -gt 0 ]]; do @@
       shift
     done
+    ./scripts/update-version-symlinks
     version=$(cat tree-sitter-version)
     mkdir -p downloads
@@ Expand Down @@