rewrite name part recognitions

tree-sitter · Jun 1, 2018 · a7b00f4 · a7b00f4
1 parent 2d76e0a
commit a7b00f4
Show file tree

Hide file tree

Showing 6 changed files with 14,242 additions and 16,316 deletions.
diff --git a/agda-lexer.x b/agda-lexer.x
@@ -0,0 +1,320 @@
+{
+{-# OPTIONS_GHC -fno-warn-deprecated-flags   #-}
+{-# OPTIONS_GHC -fno-warn-missing-signatures #-}
+{-# OPTIONS_GHC -fno-warn-tabs               #-}
+{-# OPTIONS_GHC -fno-warn-unused-imports     #-}
+
+{-# LANGUAGE BangPatterns #-}
+
+{-| The lexer is generated by Alex (<http://www.haskell.org/alex>) and is an
+    adaptation of GHC's lexer. The main lexing function 'lexer' is called by
+    the "Agda.Syntax.Parser.Parser" to get the next token from the input.
+-}
+module Agda.Syntax.Parser.Lexer
+    ( -- * The main function
+      lexer
+      -- * Lex states
+    , normal, code
+    , layout, empty_layout, bol, imp_dir
+      -- * Alex generated functions
+    , AlexReturn(..), alexScanUser
+    ) where
+
+import Agda.Syntax.Parser.Alex
+import Agda.Syntax.Parser.Comments
+#ifndef __HADDOCK__
+import {-# SOURCE #-} Agda.Syntax.Parser.Layout
+import {-# SOURCE #-} Agda.Syntax.Parser.LexActions
+#endif
+import Agda.Syntax.Parser.Monad
+import Agda.Syntax.Parser.StringLiterals
+import Agda.Syntax.Parser.Tokens
+import Agda.Syntax.Literal
+
+}
+
+-- Unicode is not handled by the following regular expressions.
+-- Instead, unicode characters are translated to 7-bit ASCII
+-- by Agda.Syntax.Parser.LexActions.foolAlex in a preprocessing pass.
+
+$digit       = 0-9
+$hexdigit    = [ $digit a-f A-F ]
+$alpha       = [ A-Z a-z _ ]
+$op          = [ \- \! \# \$ \% \& \* \+ \/ \< \= \> \^ \| \~ \? \` \[ \] \, \: ]
+$idstart     = [ $digit $alpha $op ]
+$idchar      = [ $idstart ' \\ ]
+$nonalpha    = $idchar # $alpha
+$nonalphanum = $nonalpha # $digit
+$white_notab = $white # \t
+$white_nonl  = $white_notab # \n
+
+@number      = $digit+ | "0x" $hexdigit+
+@integer     = [\-]? @number
+@exponent    = [eE] [\-\+]? @number
+@float       = @integer \. @number @exponent? | @number @exponent
+
+-- A name can't start with \x (to allow \x -> x).
+-- Bug in alex: [ _ op ]+ doesn't seem to work!
+@start = ($idstart # [_]) | \\ [ $nonalpha ]
+@ident = @start $idchar* | [_] $idchar+
+
+@namespace  = (@ident \.)*
+@q_ident    = @namespace @ident
+
+tokens :-
+
+-- White space
+<0,code,bol_,layout_,empty_layout_,imp_dir_>
+    $white_nonl+    ;
+
+<pragma_,fpragma_> $white_notab ;
+
+-- Pragmas
+<0,code,pragma_>   "{-#"               { beginWith pragma  $ symbol SymOpenPragma }
+<fpragma_>         "{-#"               { beginWith fpragma $ symbol SymOpenPragma }
+<pragma_,fpragma_> "#-}"               { endWith $ symbol SymClosePragma }
+<pragma_>   "BUILTIN"                  { keyword KwBUILTIN }
+<pragma_>   "CATCHALL"                 { keyword KwCATCHALL }
+<pragma_>   "COMPILED"                 { keyword KwCOMPILED }
+<pragma_>   "COMPILED_DATA"            { keyword KwCOMPILED_DATA }
+<pragma_>   "COMPILED_DATA_UHC"        { keyword KwCOMPILED_DATA_UHC }
+<pragma_>   "COMPILED_EXPORT"          { keyword KwCOMPILED_EXPORT }
+<pragma_>   "COMPILED_JS"              { keyword KwCOMPILED_JS }
+<pragma_>   "COMPILED_TYPE"            { keyword KwCOMPILED_TYPE }
+<pragma_>   "COMPILED_UHC"             { keyword KwCOMPILED_UHC }
+<pragma_>   "COMPILE"                  { endWith $ beginWith fpragma $ keyword KwCOMPILE }
+<pragma_>   "FOREIGN"                  { endWith $ beginWith fpragma $ keyword KwFOREIGN }
+<pragma_>   "DISPLAY"                  { keyword KwDISPLAY }
+<pragma_>   "ETA"                      { keyword KwETA }
+<pragma_>   "HASKELL"                  { endWith $ beginWith fpragma $ keyword KwHASKELL }
+<pragma_>   "IMPORT"                   { keyword KwIMPORT }
+<pragma_>   "IMPORT_UHC"               { keyword KwIMPORT_UHC }
+<pragma_>   "IMPOSSIBLE"               { keyword KwIMPOSSIBLE }
+<pragma_>   "INJECTIVE"                { keyword KwINJECTIVE }
+<pragma_>   "INLINE"                   { keyword KwINLINE }
+<pragma_>   "NOINLINE"                 { keyword KwNOINLINE }
+<pragma_>   "LINE"                     { keyword KwLINE }
+<pragma_>   "MEASURE"                  { keyword KwMEASURE }
+<pragma_>   "NO_POSITIVITY_CHECK"      { keyword KwNO_POSITIVITY_CHECK }
+<pragma_>   "NO_TERMINATION_CHECK"     { keyword KwNO_TERMINATION_CHECK }
+<pragma_>   "NON_TERMINATING"          { keyword KwNON_TERMINATING }
+<pragma_>   "OPTIONS"                  { keyword KwOPTIONS }
+<pragma_>   "POLARITY"                 { keyword KwPOLARITY }
+<pragma_>   "REWRITE"                  { keyword KwREWRITE }
+<pragma_>   "STATIC"                   { keyword KwSTATIC }
+<pragma_>   "TERMINATING"              { keyword KwTERMINATING }
+<pragma_>   "WARNING_ON_USAGE"         { keyword KwWARNING_ON_USAGE }
+<pragma_>   . # [ $white \" ] +        { withInterval $ TokString } -- we recognise string literals in pragmas
+<fpragma_>  . # [ $white ] +           { withInterval $ TokString }
+
+-- Comments
+    -- We need to rule out pragmas here. Usually longest match would take
+    -- precedence, but in some states pragmas aren't valid but comments are.
+<0,code,bol_,layout_,empty_layout_,imp_dir_>
+    "{-" / { not' (followedBy '#') }    { nestedComment }
+    -- A misplaced end-comment, like in @f {x-} = x-@ gives a parse error.
+    "-}"                                { symbol SymEndComment }
+    @ident "-}"                         { symbol SymEndComment }
+
+-- Dashes followed by a name symbol should be parsed as a name.
+<0,code,bol_,layout_,empty_layout_,imp_dir_>
+   "--" .* / { keepComments .&&. (followedBy '\n' .||. eof) }
+             { withInterval TokComment }
+<0,code,bol_,layout_,empty_layout_,imp_dir_>
+  "--" .* / { followedBy '\n' .||. eof } ;
+
+-- We need to check the offside rule for the first token on each line.  We
+-- should not check the offside rule for the end of file token or an
+-- '\end{code}'
+<0,code,imp_dir_> \n    { begin bol_ }
+<bol_>
+    {
+        \n                  ;
+--      ^ \\ "end{code}"    { end }
+        () / { not' eof }       { offsideRule }
+    }
+
+-- After a layout keyword there is either an open brace (no layout) or the
+-- indentation of the first token decides the column of the layout block.
+<layout_>
+    {   \n      ;
+--      \{      { endWith openBrace }
+        ()      { endWith newLayoutContext }
+    }
+
+-- The only rule for the empty_layout state. Generates a close brace.
+<empty_layout_> ()              { emptyLayout }
+
+-- Keywords
+<0,code> let            { keyword KwLet }
+<0,code> in             { keyword KwIn }
+<0,code> where          { keyword KwWhere }
+<0,code> do             { keyword KwDo }
+<0,code> field          { keyword KwField }
+<0,code> with           { keyword KwWith }
+<0,code> rewrite        { keyword KwRewrite }
+<0,code> postulate      { keyword KwPostulate }
+<0,code> primitive      { keyword KwPrimitive }
+<0,code> open           { keyword KwOpen }
+<0,code> import         { keyword KwImport }
+<0,code> module         { keyword KwModule }
+<0,code> data           { keyword KwData }
+<0,code> codata         { keyword KwCoData }
+<0,code> record         { keyword KwRecord }
+<0,code> constructor    { keyword KwConstructor }
+<0,code> inductive      { keyword KwInductive }
+<0,code> coinductive    { keyword KwCoInductive }
+<0,code> "eta-equality"    { keyword KwEta }
+<0,code> "no-eta-equality" { keyword KwNoEta }
+<0,code> infix          { keyword KwInfix }
+<0,code> infixl         { keyword KwInfixL }
+<0,code> infixr         { keyword KwInfixR }
+<0,code> mutual         { keyword KwMutual }
+<0,code> abstract       { keyword KwAbstract }
+<0,code> private        { keyword KwPrivate }
+<0,code> instance       { keyword KwInstance }
+<0,code> overlap        { keyword KwOverlap }
+<0,code> macro          { keyword KwMacro }
+<0,code> Set            { keyword KwSet }
+<0,code> Prop           { keyword KwProp }
+<0,code> forall         { keyword KwForall }
+<0,code> Set @number    { withInterval' (read . drop 3) TokSetN }
+<0,code> Prop @number   { withInterval' (read . drop 4) TokPropN }
+<0,code> quoteGoal      { keyword KwQuoteGoal }
+<0,code> quoteContext   { keyword KwQuoteContext }
+<0,code> quote          { keyword KwQuote }
+<0,code> quoteTerm      { keyword KwQuoteTerm }
+<0,code> unquote        { keyword KwUnquote }
+<0,code> unquoteDecl    { keyword KwUnquoteDecl }
+<0,code> unquoteDef     { keyword KwUnquoteDef  }
+<0,code> tactic         { keyword KwTactic }
+<0,code> syntax         { keyword KwSyntax }
+<0,code> pattern        { keyword KwPatternSyn }
+
+-- The parser is responsible to put the lexer in the imp_dir_ state when it
+-- expects an import directive keyword. This means that if you run the
+-- tokensParser you will never see these keywords.
+<0,code> using      { keyword KwUsing }
+<0,code> hiding     { keyword KwHiding }
+<0,code> renaming   { keyword KwRenaming }
+<imp_dir_> to       { endWith $ keyword KwTo }
+<0,code> public     { keyword KwPublic }
+
+-- Holes
+<0,code> "{!"           { hole }
+
+-- Special symbols
+<0,code> "..."          { symbol SymEllipsis }
+<0,code> ".."           { symbol SymDotDot }
+<0,code> "."            { symbol SymDot }
+<0,code> ";"            { symbol SymSemi }
+<0,code> ":"            { symbol SymColon }
+<0,code> "="            { symbol SymEqual }
+<0,code> "_"            { symbol SymUnderscore }
+<0,code> "?"            { symbol SymQuestionMark }
+<0,code> "|"            { symbol SymBar }
+<0,code> "(|" /[$white] { symbol SymOpenIdiomBracket }
+<0,code> "|)"           { symbol SymCloseIdiomBracket }
+<0,code> "("            { symbol SymOpenParen }
+<0,code> ")"            { symbol SymCloseParen }
+<0,code> "->"           { symbol SymArrow }
+<0,code> "\"            { symbol SymLambda } -- "
+<0,code> "@"            { symbol SymAs }
+<0,code> "{{" /[^!]             { symbol SymDoubleOpenBrace }
+-- We don't lex '}}' into a SymDoubleCloseBrace. Instead, we lex it as
+-- two SymCloseBrace's. When the parser is looking for a double
+-- closing brace, it will also accept two SymCloseBrace's, after
+-- verifying that they are immediately next to each other.
+-- This trick allows us to keep "record { a = record {}}" working
+-- properly.
+-- <0,code> "}}"                { symbol SymDoubleCloseBrace }
+<0,code> "{"            { symbol SymOpenBrace }     -- you can't use braces for layout
+<0,code> "}"            { symbol SymCloseBrace }
+
+-- Literals
+<0,code> \'             { litChar }
+<0,code,pragma_> \"     { litString }
+<0,code> @integer       { literal LitNat }
+<0,code> @float         { literal LitFloat }
+
+-- Identifiers
+<0,code,imp_dir_> @q_ident      { identifier }
+-- Andreas, 2013-02-21, added identifiers to the 'imp_dir_' state.
+-- This is to fix issue 782: 'toz' should not be lexed as 'to'
+-- (followed by 'z' after leaving imp_dir_).
+-- With identifiers in state imp_dir_, 'toz' should be lexed as
+-- identifier 'toz' in imp_dir_ state, leading to a parse error later.
+
+{
+
+-- | This is the initial state for parsing a regular, non-literate file.
+normal :: LexState
+normal = 0
+
+
+{-| The layout state. Entered when we see a layout keyword ('withLayout') and
+    exited either when seeing an open brace ('openBrace') or at the next token
+    ('newLayoutContext').
+
+    Update: we don't use braces for layout anymore.
+-}
+layout :: LexState
+layout = layout_
+
+
+{-| The state inside a pragma.
+-}
+pragma :: LexState
+pragma = pragma_
+
+-- | The state inside a FOREIGN pragma. This needs to be different so that we don't
+--   lex further strings as pragma keywords.
+fpragma :: LexState
+fpragma = fpragma_
+
+{-| We enter this state from 'newLayoutContext' when the token following a
+    layout keyword is to the left of (or at the same column as) the current
+    layout context. Example:
+
+    > data Empty : Set where
+    > foo : Empty -> Nat
+
+    Here the second line is not part of the @where@ clause since it is has the
+    same indentation as the @data@ definition. What we have to do is insert an
+    empty layout block @{}@ after the @where@. The only thing that can happen
+    in this state is that 'emptyLayout' is executed, generating the closing
+    brace. The open brace is generated when entering by 'newLayoutContext'.
+-}
+empty_layout :: LexState
+empty_layout = empty_layout_
+
+
+-- | This state is entered at the beginning of each line. You can't lex
+--   anything in this state, and to exit you have to check the layout rule.
+--   Done with 'offsideRule'.
+bol :: LexState
+bol = bol_
+
+
+-- | This state can only be entered by the parser. In this state you can only
+--   lex the keywords @using@, @hiding@, @renaming@ and @to@. Moreover they are
+--   only keywords in this particular state. The lexer will never enter this
+--   state by itself, that has to be done in the parser.
+imp_dir :: LexState
+imp_dir = imp_dir_
+
+
+-- | Return the next token. This is the function used by Happy in the parser.
+--
+--   @lexer k = 'lexToken' >>= k@
+lexer :: (Token -> Parser a) -> Parser a
+lexer k = lexToken >>= k
+
+-- | Do not use this function; it sets the 'ParseFlags' to
+-- 'undefined'.
+alexScan :: AlexInput -> Int -> AlexReturn (LexAction Token)
+
+-- | This is the main lexing function generated by Alex.
+alexScanUser :: ([LexState], ParseFlags) -> AlexInput -> Int -> AlexReturn (LexAction Token)
+
+}
diff --git a/corpus/expression/record.txt b/corpus/expression/record.txt
@@ -10,8 +10,8 @@ f = record { fst = 2; snd = 3 }
     (function_clause
         (lhs (atom (qualified_name)))
         (rhs (expr (atom
-            (field_assignment (qualified_name) (expr (atom (literal (int)))))
-            (field_assignment (qualified_name) (expr (atom (literal (int))))))))))
+            (field_assignment (qualified_name) (expr (atom (literal))))
+            (field_assignment (qualified_name) (expr (atom (literal)))))))))
 
 ==================
 Building records from modules

diff --git a/corpus/infix.txt b/corpus/infix.txt
@@ -9,6 +9,6 @@ infixr 5 _-_ _-_
 ---
 
 (source_file
-    (infix (int) (name))
-    (infix (int) (name))
-    (infix (int) (name) (name)))
+    (infix (integer) (name))
+    (infix (integer) (name))
+    (infix (integer) (name) (name)))