From 6e2603519903c925006d934d48f86fc072d07b4a Mon Sep 17 00:00:00 2001
From: David Holland <dholland@galois.com>
Date: Tue, 14 Jan 2025 21:17:48 -0500
Subject: [PATCH 1/4] Lexer: rearrange comment handling.

Don't have the lexer match //.*; just match the // part, and extend
the same logic that handles dropping the bits between /* and */ to
also handle dropping the bits between // and EOL.

This requires materializing EOL tokens instead of treating them as
whitespace, but that costs ~nothing.

Also fix it so we generate explicit messages when there are unclosed
comments. A // comment that doesn't have an EOL before EOF generates a
warning; a /* comment without a matching */ generates an error. The
position for this error is the comment start.

(The previous behavior was that unclosed comments commented out the
EOF token and that would cause a non-specific "parse error".)

Update the lexer call sites to post the messages. There should not be
four separate lexer call sites, but that's a problem for a future
branch.

Add some tests. This includes a new test_parse_errors area akin to
the existing test_type_errors.
---
 intTests/test2181/test.sh                    |   5 +
 intTests/test2181/test1.saw                  |   2 +
 intTests/test2181/test2.saw                  |   3 +
 intTests/test2181/test3.saw                  |   1 +
 intTests/test_parse_errors/.gitignore        |   3 +
 intTests/test_parse_errors/lexer001.log.good |   2 +
 intTests/test_parse_errors/lexer001.saw      |   1 +
 intTests/test_parse_errors/lexer002.log.good |   5 +
 intTests/test_parse_errors/lexer002.saw      |   3 +
 intTests/test_parse_errors/test.sh           |   1 +
 saw/SAWScript/REPL/Command.hs                |  16 ++-
 src/SAWScript/Import.hs                      |  31 ++++--
 src/SAWScript/Interpreter.hs                 |  14 ++-
 src/SAWScript/Lexer.x                        | 107 +++++++++++++++----
 src/SAWScript/Token.hs                       |   9 +-
 15 files changed, 169 insertions(+), 34 deletions(-)
 create mode 100644 intTests/test2181/test.sh
 create mode 100644 intTests/test2181/test1.saw
 create mode 100644 intTests/test2181/test2.saw
 create mode 100644 intTests/test2181/test3.saw
 create mode 100644 intTests/test_parse_errors/.gitignore
 create mode 100644 intTests/test_parse_errors/lexer001.log.good
 create mode 100644 intTests/test_parse_errors/lexer001.saw
 create mode 100644 intTests/test_parse_errors/lexer002.log.good
 create mode 100644 intTests/test_parse_errors/lexer002.saw
 create mode 100644 intTests/test_parse_errors/test.sh

diff --git a/intTests/test2181/test.sh b/intTests/test2181/test.sh
new file mode 100644
index 000000000..e4d5a3dac
--- /dev/null
+++ b/intTests/test2181/test.sh
@@ -0,0 +1,5 @@
+set -e
+
+$SAW test1.saw
+$SAW test2.saw
+! $SAW test3.saw
diff --git a/intTests/test2181/test1.saw b/intTests/test2181/test1.saw
new file mode 100644
index 000000000..a2cc0ea49
--- /dev/null
+++ b/intTests/test2181/test1.saw
@@ -0,0 +1,2 @@
+/* // */
+
diff --git a/intTests/test2181/test2.saw b/intTests/test2181/test2.saw
new file mode 100644
index 000000000..aa294e3bb
--- /dev/null
+++ b/intTests/test2181/test2.saw
@@ -0,0 +1,3 @@
+// comments do nest
+
+/* a /* b */ c */
diff --git a/intTests/test2181/test3.saw b/intTests/test2181/test3.saw
new file mode 100644
index 000000000..7d5e04660
--- /dev/null
+++ b/intTests/test2181/test3.saw
@@ -0,0 +1 @@
+/* unclosed comment
diff --git a/intTests/test_parse_errors/.gitignore b/intTests/test_parse_errors/.gitignore
new file mode 100644
index 000000000..4619758de
--- /dev/null
+++ b/intTests/test_parse_errors/.gitignore
@@ -0,0 +1,3 @@
+*.rawlog
+*.log
+*.diff
diff --git a/intTests/test_parse_errors/lexer001.log.good b/intTests/test_parse_errors/lexer001.log.good
new file mode 100644
index 000000000..8ca6fbb9a
--- /dev/null
+++ b/intTests/test_parse_errors/lexer001.log.good
@@ -0,0 +1,2 @@
+ Loading file "lexer001.saw"
+ lexer001.saw:1:47-1:47: Missing newline at end of file
diff --git a/intTests/test_parse_errors/lexer001.saw b/intTests/test_parse_errors/lexer001.saw
new file mode 100644
index 000000000..2e05ab5ec
--- /dev/null
+++ b/intTests/test_parse_errors/lexer001.saw
@@ -0,0 +1 @@
+// line comment with no newline at end of file
\ No newline at end of file
diff --git a/intTests/test_parse_errors/lexer002.log.good b/intTests/test_parse_errors/lexer002.log.good
new file mode 100644
index 000000000..c4d5b0e6a
--- /dev/null
+++ b/intTests/test_parse_errors/lexer002.log.good
@@ -0,0 +1,5 @@
+ Loading file "lexer002.saw"
+ Stack trace:
+lexer002.saw:1:1-1:3: Unclosed block comment
+
+FAILED
diff --git a/intTests/test_parse_errors/lexer002.saw b/intTests/test_parse_errors/lexer002.saw
new file mode 100644
index 000000000..bde3ba771
--- /dev/null
+++ b/intTests/test_parse_errors/lexer002.saw
@@ -0,0 +1,3 @@
+/*
+ * Unclosed comment
+ *
diff --git a/intTests/test_parse_errors/test.sh b/intTests/test_parse_errors/test.sh
new file mode 100644
index 000000000..5fcd79d0a
--- /dev/null
+++ b/intTests/test_parse_errors/test.sh
@@ -0,0 +1 @@
+exec ${TEST_SHELL:-bash} ../support/test-and-diff.sh "$@"
diff --git a/saw/SAWScript/REPL/Command.hs b/saw/SAWScript/REPL/Command.hs
index 6605fb37f..91f42373e 100644
--- a/saw/SAWScript/REPL/Command.hs
+++ b/saw/SAWScript/REPL/Command.hs
@@ -165,7 +165,13 @@ typeOfCmd :: String -> REPL ()
 typeOfCmd str
   | null str = do io $ putStrLn $ "[error] :type requires an argument"
   | otherwise =
-  do let tokens = SAWScript.Lexer.lexSAW replFileName (Text.pack str)
+  do let (tokens, optmsg) = SAWScript.Lexer.lexSAW replFileName (Text.pack str)
+     case optmsg of
+       Nothing -> return ()
+       Just (_vrb, pos, msg) -> do
+         -- XXX wrap printing of positions in the message-printing infrastructure
+         let msg' = show pos ++ ": " ++ Text.unpack msg
+         io $ putStrLn msg'
      expr <- case SAWScript.Parser.parseExpression tokens of
        Left err -> fail (show err)
        Right expr -> return expr
@@ -247,7 +253,13 @@ caveats:
      we also hang onto the results and use them to seed the interpreter. -}
 sawScriptCmd :: String -> REPL ()
 sawScriptCmd str = do
-  let tokens = SAWScript.Lexer.lexSAW replFileName (Text.pack str)
+  let (tokens, optmsg) = SAWScript.Lexer.lexSAW replFileName (Text.pack str)
+  case optmsg of
+    Nothing -> return ()
+    Just (_vrb, pos, msg) -> do
+      -- XXX wrap printing of positions in the message-printing infrastructure
+      let msg' = show pos ++ ": " ++ Text.unpack msg
+      io $ putStrLn msg'
   case SAWScript.Parser.parseStmtSemi tokens of
     Left err -> io $ print err
     Right stmt ->
diff --git a/src/SAWScript/Import.hs b/src/SAWScript/Import.hs
index b25fd2627..5e1b800e2 100644
--- a/src/SAWScript/Import.hs
+++ b/src/SAWScript/Import.hs
@@ -12,26 +12,39 @@ module SAWScript.Import
   , findAndLoadFile
   ) where
 
+import qualified Data.Text.IO as TextIO (readFile)
+import qualified Data.Text as Text
+import Control.Exception
+import System.Directory
 
+import SAWScript.Position (Pos)
 import SAWScript.AST
 import SAWScript.Lexer (lexSAW)
 import SAWScript.Options
 import SAWScript.Parser
-
-import qualified Data.Text.IO as TextIO (readFile)
-import Data.Text (Text)
-import System.Directory
-import Control.Exception
+import SAWScript.Token (Token)
 
 loadFile :: Options -> FilePath -> IO [Stmt]
 loadFile opts fname = do
   printOutLn opts Info $ "Loading file " ++ show fname
   ftext <- TextIO.readFile fname
-  either throwIO return (parseFile fname ftext)
+  let (tokens, optmsg) = lexSAW fname ftext
+  case optmsg of
+      Nothing -> return ()
+      Just (vrb, pos, txt) -> do
+          -- XXX: the print functions should take care of printing the position
+          -- (clean this up when we clean out the printing infrastructure)
+          let txt' = show pos ++ ": " ++ Text.unpack txt
+          -- XXX: the print functions should also take care of exiting on an error
+          -- (immediately or later). For now, throw errors and print anything else.
+          case vrb of
+              Error -> throwIO $ userError txt'
+              _ -> printOutLn opts vrb txt'
+  either throwIO return (parseFile tokens)
 
-parseFile :: FilePath -> Text -> Either ParseError [Stmt]
-parseFile fname input =
-  case parseModule (lexSAW fname input) of
+parseFile :: [Token Pos] -> Either ParseError [Stmt]
+parseFile tokens = do
+  case parseModule tokens of
     Left err -> Left err
     Right stmts -> Right stmts
 
diff --git a/src/SAWScript/Interpreter.hs b/src/SAWScript/Interpreter.hs
index f8eabba88..e09427492 100644
--- a/src/SAWScript/Interpreter.hs
+++ b/src/SAWScript/Interpreter.hs
@@ -897,8 +897,18 @@ print_value v = do
 
 readSchema :: Text -> SS.Schema
 readSchema str =
-  case parseSchema (lexSAW "internal" str) of
-    Left err -> error (show err)
+  let croak what msg =
+        error (what ++ " error in builtin " ++ Text.unpack str ++ ": " ++ msg)
+      tokens =
+        let (tokens', optmsg) = lexSAW "internal" str in
+        -- XXX clean this up when we clean out the message printing infrastructure
+        case optmsg of
+            Just (Error, _pos, msg) -> croak "Lexer" $ Text.unpack msg
+            Just (_, _pos, _msg) -> tokens'  -- ignore warnings for now
+            Nothing -> tokens'
+  in
+  case parseSchema tokens of
+    Left err -> croak "Parse" $ show err
     Right schema -> schema
 
 data PrimType
diff --git a/src/SAWScript/Lexer.x b/src/SAWScript/Lexer.x
index b702ca905..bd0ab252a 100644
--- a/src/SAWScript/Lexer.x
+++ b/src/SAWScript/Lexer.x
@@ -13,6 +13,7 @@ module SAWScript.Lexer
   , lexSAW
   ) where
 
+import SAWScript.Options (Verbosity(..))
 import SAWScript.Token
 import SAWScript.Panic (panic)
 import SAWScript.Position
@@ -22,6 +23,7 @@ import Numeric (readInt)
 import Data.Char (ord)
 import qualified Data.Char as Char
 import Data.List
+import Data.Text (Text)
 import qualified Data.Text as Text
 import Data.Text (Text)
 import Data.Word (Word8)
@@ -38,7 +40,8 @@ $unispace       = \x5
 $uniother       = \x6
 $unitick        = \x7
 
-$whitechar = [\ \t\n\r\f\v $unispace]
+$whitechar = [\ \t\r\f\v $unispace]
+$gapchar   = [$whitechar \n]
 $special   = [\(\)\,\;\[\]\`\{\}]
 $digit     = 0-9
 $large     = [A-Z $uniupper]
@@ -51,7 +54,7 @@ $octit     = 0-7
 $hexit     = [0-9 A-F a-f]
 $idfirst   = [$alpha \_]
 $idchar    = [$alpha $digit $unidigit $unitick \' \_]
-$codechar  = [$graphic $whitechar]
+$codechar  = [$graphic $whitechar \n]
 
 @reservedid  = import|and|let|rec|in|do|if|then|else|as|hiding|typedef
              |CryptolSetup|JavaSetup|LLVMSetup|MIRSetup|ProofScript|TopLevel|CrucibleSetup
@@ -75,7 +78,7 @@ $cntrl       = [$large \@\[\\\]\^\_]
              | SUB | ESC | FS | GS | RS | US | SP | DEL
 $charesc     = [abfnrtv\\\"\'\&]
 @escape      = \\ ($charesc | @ascii | @decimal | o @octal | x @hexadecimal)
-@gap         = \\ $whitechar+ \\
+@gap         = \\ $gapchar+ \\
 @string      = $graphic # [\"\\] | " " | @escape | @gap
 @code        = ($codechar # \}) | \} ($codechar # \})
 @ctype       = ($codechar # \|) | \| ($codechar # \})
@@ -83,10 +86,11 @@ $charesc     = [abfnrtv\\\"\'\&]
 
 sawTokens :-
 
-$white+                          ;
-"//".*                           ;
-"/*"                             { plain          TCmntS    }
-"*/"                             { plain          TCmntE    }
+\n                               { plain          TEOL      }
+"//"                             { plain          TCommentL }
+"/*"                             { plain          TCommentS }
+"*/"                             { plain          TCommentE }
+$whitechar+                      ;
 @reservedid                      { plain          TReserved }
 @punct                           { plain          TPunct    }
 @reservedop                      { plain          TOp       }
@@ -252,23 +256,88 @@ scanTokens filename str = go (startPos, str)
             in
             act pos' text : go inp'
 
+-- state for processing comments
+data CommentState = CNone | CBlock Pos !Int | CLine
+
+-- this should cease to be needed once we clean out the message
+-- printing infrastructure
+type OptMsg = Maybe (Verbosity, Pos, Text)
+
 -- postprocess to drop comments (this allows comments to be nested)
-dropComments :: [Token Pos] -> [Token Pos]
-dropComments = go 0
-  where go :: Int -> [Token Pos] -> [Token Pos]
-        go _  []                 = []
-        go !i (TCmntS _ _ : ts)  = go (i+1) ts
-        go !i (TCmntE _ _ : ts)
-         | i > 0                 = go (i-1) ts
-        go !i (t : ts)
-         | i /= 0                = go i ts
-         | True                  = t : go i ts
+dropComments :: [Token Pos] -> ([Token Pos], OptMsg)
+dropComments = go CNone
+  where go :: CommentState -> [Token Pos] -> ([Token Pos], OptMsg)
+        go state ts = case ts of
+          [] ->
+            -- should always see TEOF before we hit the end
+            panic "Lexer" ["dropComments: tokens ran out"]
+          TEOF _ _ : _ : _ ->
+            -- should only see TEOF at the end of the list
+            panic "Lexer" ["dropComments: misplaced EOF in tokens"]
+
+          t : ts' ->
+            let take state' =
+                    let (results, optmsg) = go state' ts' in
+                    (t : results, optmsg)
+                drop state' = go state' ts'
+                finish = ([t], Nothing)
+                finishWith msg = ([t], Just msg)
+            in
+            case state of
+              CNone -> case t of
+                TEOF _ _ ->
+                    -- plain EOF
+                    finish
+                TCommentS pos _ ->
+                    -- open block-comment
+                    drop (CBlock pos 0)
+                TCommentL _ _  ->
+                    -- begin line-comment
+                    drop CLine
+                TEOL _ _ ->
+                    -- ordinary EOL, doesn't need to be seen downstream; drop it
+                    drop CNone
+                _ ->
+                    -- uncommented token, keep it
+                    take CNone
+
+              CBlock startpos depth -> case t of
+                TEOF pos _ ->
+                    -- EOF in /**/-comment; unclosed, which is an error
+                    finishWith (Error, startpos, "Unclosed block comment")
+                TCommentS _ _ ->
+                    -- open nested comment, keep original start position
+                    drop (CBlock startpos (depth + 1))
+                TCommentE _ _
+                 | depth == 0 ->
+                     -- end outer block comment
+                     drop CNone
+                 | otherwise ->
+                     -- end nested block comment
+                     drop (CBlock startpos (depth - 1))
+                _ ->
+                     -- anything else in a block comment, drop it
+                     -- (this includes any TCommentLs that come through)
+                     drop state
+
+              CLine -> case t of
+                TEOF pos _ ->
+                    -- EOF in //-comment; missing a newline
+                    finishWith (Warn, pos, "Missing newline at end of file")
+                TEOL _ _ ->
+                    -- EOL ending a line comment, drop it
+                    -- (EOLs aren't sent downstream)
+                    drop CNone
+                _ ->
+                    -- anything else in a line comment, drop it
+                    -- (this includes any TCommentS/TCommentE that appear)
+                    drop CLine
 
 -- entry point
-lexSAW :: FilePath -> Text -> [Token Pos]
+lexSAW :: FilePath -> Text -> ([Token Pos], OptMsg)
 lexSAW f text = dropComments $ scanTokens f text
 
 -- alternate monadic entry point (XXX: does this have any value?)
-scan :: Monad m => FilePath -> Text -> m [Token Pos]
+scan :: Monad m => FilePath -> Text -> m ([Token Pos], OptMsg)
 scan f = return . lexSAW f
 }
diff --git a/src/SAWScript/Token.hs b/src/SAWScript/Token.hs
index 8a08f5392..c71138338 100644
--- a/src/SAWScript/Token.hs
+++ b/src/SAWScript/Token.hs
@@ -15,6 +15,9 @@ import SAWScript.Position (Positioned(..))
 -- All tokens have tokStr carrying the matched text (even TEOF where
 -- it doesn't entirely make sense) so that tokStr can be applied to
 -- any variant of the type.
+--
+-- The comment and EOL tokens are internal to the lexer and not seen
+-- downstream.
 data Token p = TVar      { tokPos :: p, tokStr :: Text                               }
              | TQVar     { tokPos :: p, tokStr :: Text, tokVars :: ([Text], Text)    }
              | TLit      { tokPos :: p, tokStr :: Text                               }
@@ -25,8 +28,10 @@ data Token p = TVar      { tokPos :: p, tokStr :: Text
              | TReserved { tokPos :: p, tokStr :: Text                               }
              | TOp       { tokPos :: p, tokStr :: Text                               }
              | TNum      { tokPos :: p, tokStr :: Text, tokNum :: Integer            }
-             | TCmntS    { tokPos :: p, tokStr :: Text                               }
-             | TCmntE    { tokPos :: p, tokStr :: Text                               }
+             | TCommentS { tokPos :: p, tokStr :: Text                               }
+             | TCommentE { tokPos :: p, tokStr :: Text                               }
+             | TCommentL { tokPos :: p, tokStr :: Text                               }
+             | TEOL      { tokPos :: p, tokStr :: Text                               }
              | TEOF      { tokPos :: p, tokStr :: Text                               }
              deriving (Show, Functor)
 

From 1b942660e5872d659bb501ef4e486c00702b7e10 Mon Sep 17 00:00:00 2001
From: David Holland <dholland@galois.com>
Date: Mon, 20 Jan 2025 19:39:10 -0500
Subject: [PATCH 2/4] Add some more tests pursuant to #2189

(interactions of comments and Cryptol blocks)

Note: these reflect the current behavior, which as discussed in the
issue is not particularly desirable. They should be updated when/if
the behavior is improved.
---
 intTests/test_parse_errors/lexer003.log.good |  3 +++
 intTests/test_parse_errors/lexer003.saw      | 11 +++++++++++
 intTests/test_parse_errors/lexer004.log.good |  3 +++
 intTests/test_parse_errors/lexer004.saw      | 11 +++++++++++
 intTests/test_parse_errors/lexer005.log.good |  5 +++++
 intTests/test_parse_errors/lexer005.saw      | 11 +++++++++++
 intTests/test_parse_errors/lexer006.log.good |  3 +++
 intTests/test_parse_errors/lexer006.saw      | 15 +++++++++++++++
 8 files changed, 62 insertions(+)
 create mode 100644 intTests/test_parse_errors/lexer003.log.good
 create mode 100644 intTests/test_parse_errors/lexer003.saw
 create mode 100644 intTests/test_parse_errors/lexer004.log.good
 create mode 100644 intTests/test_parse_errors/lexer004.saw
 create mode 100644 intTests/test_parse_errors/lexer005.log.good
 create mode 100644 intTests/test_parse_errors/lexer005.saw
 create mode 100644 intTests/test_parse_errors/lexer006.log.good
 create mode 100644 intTests/test_parse_errors/lexer006.saw

diff --git a/intTests/test_parse_errors/lexer003.log.good b/intTests/test_parse_errors/lexer003.log.good
new file mode 100644
index 000000000..bc84b49cf
--- /dev/null
+++ b/intTests/test_parse_errors/lexer003.log.good
@@ -0,0 +1,3 @@
+ Loading file "lexer003.saw"
+ Parse error at lexer003.saw:9:20-9:22: Unexpected `*/'
+FAILED
diff --git a/intTests/test_parse_errors/lexer003.saw b/intTests/test_parse_errors/lexer003.saw
new file mode 100644
index 000000000..793c5dda6
--- /dev/null
+++ b/intTests/test_parse_errors/lexer003.saw
@@ -0,0 +1,11 @@
+// Currently comment delimiters in Cryptol blocks are not recognized
+// by the saw-script lexer; they're treated as part of the block and
+// handled by the Cryptol lexer downstream. This has some odd
+// consequences. See issue #2189.
+//
+// This currently fails with an unexpected end-of-comment because the
+// open-comment isn't seen.
+
+let x = {{ /* 3 }} */;
+
+
diff --git a/intTests/test_parse_errors/lexer004.log.good b/intTests/test_parse_errors/lexer004.log.good
new file mode 100644
index 000000000..b67019d2b
--- /dev/null
+++ b/intTests/test_parse_errors/lexer004.log.good
@@ -0,0 +1,3 @@
+ Loading file "lexer004.saw"
+ Parse error at lexer004.saw:9:20-9:21: Unexpected `+'
+FAILED
diff --git a/intTests/test_parse_errors/lexer004.saw b/intTests/test_parse_errors/lexer004.saw
new file mode 100644
index 000000000..30d345b64
--- /dev/null
+++ b/intTests/test_parse_errors/lexer004.saw
@@ -0,0 +1,11 @@
+// Currently comment delimiters in Cryptol blocks are not recognized
+// by the saw-script lexer; they're treated as part of the block and
+// handled by the Cryptol lexer downstream. This has some odd
+// consequences. See issue #2189.
+//
+// This currently fails with "unexpected +" because the // isn't
+// seen.
+
+let x = {{ // 3 }} + 4;
+
+
diff --git a/intTests/test_parse_errors/lexer005.log.good b/intTests/test_parse_errors/lexer005.log.good
new file mode 100644
index 000000000..9390d8fdc
--- /dev/null
+++ b/intTests/test_parse_errors/lexer005.log.good
@@ -0,0 +1,5 @@
+ Loading file "lexer005.saw"
+ Stack trace:
+lexer005.saw:9:9-9:11: Unclosed block comment
+
+FAILED
diff --git a/intTests/test_parse_errors/lexer005.saw b/intTests/test_parse_errors/lexer005.saw
new file mode 100644
index 000000000..a2c66ef8e
--- /dev/null
+++ b/intTests/test_parse_errors/lexer005.saw
@@ -0,0 +1,11 @@
+// Currently comment delimiters in Cryptol blocks are not recognized
+// by the saw-script lexer; they're treated as part of the block and
+// handled by the Cryptol lexer downstream. This has some odd
+// consequences. See issue #2189.
+//
+// This currently fails with an unclosed block comment, because the
+// end-comment inside the Cryptol block isn't matched.
+
+let x = /* {{ */ 2 }};
+
+
diff --git a/intTests/test_parse_errors/lexer006.log.good b/intTests/test_parse_errors/lexer006.log.good
new file mode 100644
index 000000000..82efbab57
--- /dev/null
+++ b/intTests/test_parse_errors/lexer006.log.good
@@ -0,0 +1,3 @@
+ Loading file "lexer006.saw"
+ Parse error at lexer006.saw:12:1-12:2: Unexpected `+'
+FAILED
diff --git a/intTests/test_parse_errors/lexer006.saw b/intTests/test_parse_errors/lexer006.saw
new file mode 100644
index 000000000..113e20f3c
--- /dev/null
+++ b/intTests/test_parse_errors/lexer006.saw
@@ -0,0 +1,15 @@
+// Currently comment delimiters in Cryptol blocks are not recognized
+// by the saw-script lexer; they're treated as part of the block and
+// handled by the Cryptol lexer downstream. This has some odd
+// consequences. See issue #2189.
+//
+// This currently fails with "unexpected +" because the whole {{ }}
+// is commented out and the comment only ends after _then_ seeing a
+// newline.
+
+let x = 3 // {{
+}} *
++ 5;
+
+
+

From 90e186710499dc61ae40ce31f66ec5807d3bd28b Mon Sep 17 00:00:00 2001
From: David Holland <dholland@galois.com>
Date: Wed, 22 Jan 2025 16:00:56 -0500
Subject: [PATCH 3/4] lexer: Add a bunch of comments

---
 src/SAWScript/Lexer.x  | 25 +++++++++++++++++++++----
 src/SAWScript/Token.hs | 22 ++++++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/src/SAWScript/Lexer.x b/src/SAWScript/Lexer.x
index bd0ab252a..6275cd888 100644
--- a/src/SAWScript/Lexer.x
+++ b/src/SAWScript/Lexer.x
@@ -256,14 +256,31 @@ scanTokens filename str = go (startPos, str)
             in
             act pos' text : go inp'
 
--- state for processing comments
+-- | State for processing comments.
+--
+-- CNone is the ground state (not in a comment)
+--
+-- CBlock startpos n is when we're within a block comment starting at
+-- startpos, and n is the number of additional nested block comments
+-- that have been opened.
+--
+-- CLine is when we're in a line (//-type) comment.
 data CommentState = CNone | CBlock Pos !Int | CLine
 
--- this should cease to be needed once we clean out the message
--- printing infrastructure
+-- | Type to hold a diagnostic message (error or warning).
+--
+-- This should cease to be needed once we clean out the message
+-- printing infrastructure. It's here only to shorten the type
+-- signatures below, and represents the possibility of having
+-- generated a message.
 type OptMsg = Maybe (Verbosity, Pos, Text)
 
--- postprocess to drop comments (this allows comments to be nested)
+-- | Postprocess to drop comments; this allows block comments to nest.
+--
+-- Also returns a message (error or warning) along with the updated
+-- token list if there's an unclosed comment when we reach EOF. Since
+-- we aren't in a monad here and can't print, dispatching the message
+-- is the caller's responsibility.
 dropComments :: [Token Pos] -> ([Token Pos], OptMsg)
 dropComments = go CNone
   where go :: CommentState -> [Token Pos] -> ([Token Pos], OptMsg)
diff --git a/src/SAWScript/Token.hs b/src/SAWScript/Token.hs
index c71138338..ccb914bac 100644
--- a/src/SAWScript/Token.hs
+++ b/src/SAWScript/Token.hs
@@ -12,12 +12,34 @@ import Data.Text (Text)
 
 import SAWScript.Position (Positioned(..))
 
+-- | Lexer tokens for saw-script.
+--
 -- All tokens have tokStr carrying the matched text (even TEOF where
 -- it doesn't entirely make sense) so that tokStr can be applied to
 -- any variant of the type.
 --
 -- The comment and EOL tokens are internal to the lexer and not seen
 -- downstream.
+--
+-- The tokens are:
+--   TVar       variable/identifier
+--   TQVar      qualified variable/identifier
+--   TLit       string constant
+--   TCode      Cryptol code block
+--   TCType     Cryptol type bllock
+--   TUnknown   Anything else that doesn't fit
+--   TPunct     Punctuation tokens used in the grammar
+--   TReserved  All reserved words
+--   TOp        Punctuation tokens apparently not used in the grammar
+--   TNum       Number
+--   TCommentS  Start of a block comment
+--   TCommentE  End of a block comment
+--   TCommentL  Start of a line comment
+--   EOL        End of line
+--   EOF        End of file/input
+--
+-- FUTURE: many of these could stand to be renamed
+--
 data Token p = TVar      { tokPos :: p, tokStr :: Text                               }
              | TQVar     { tokPos :: p, tokStr :: Text, tokVars :: ([Text], Text)    }
              | TLit      { tokPos :: p, tokStr :: Text                               }

From 1595f041ba57bbe44e352729fdeadfc7d3d2cb75 Mon Sep 17 00:00:00 2001
From: David Holland <dholland@galois.com>
Date: Wed, 22 Jan 2025 16:17:37 -0500
Subject: [PATCH 4/4] Typo in previous

---
 src/SAWScript/Token.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/SAWScript/Token.hs b/src/SAWScript/Token.hs
index ccb914bac..da96034d6 100644
--- a/src/SAWScript/Token.hs
+++ b/src/SAWScript/Token.hs
@@ -35,8 +35,8 @@ import SAWScript.Position (Positioned(..))
 --   TCommentS  Start of a block comment
 --   TCommentE  End of a block comment
 --   TCommentL  Start of a line comment
---   EOL        End of line
---   EOF        End of file/input
+--   TEOL       End of line
+--   TEOF       End of file/input
 --
 -- FUTURE: many of these could stand to be renamed
 --