Multiline strings fixes (#643)

* scanner: allow multiline strings to end with "" or ''
* parser: trim all whitespaces after \ in multiline
pelletier · Oct 28, 2021 · 39f893a · 39f893a
1 parent c871a61
commit 39f893a
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 8 deletions.
diff --git a/parser.go b/parser.go
@@ -570,13 +570,25 @@ func (p *parser) parseMultilineBasicString(b []byte) ([]byte, []byte, []byte, er
 			// When the last non-whitespace character on a line is an unescaped \,
 			// it will be trimmed along with all whitespace (including newlines) up
 			// to the next non-whitespace character or closing delimiter.
-			if token[i+1] == '\n' || (token[i+1] == '\r' && token[i+2] == '\n') {
-				i++ // skip the \
+
+			isLastNonWhitespaceOnLine := false
+			j := 1
+		findEOLLoop:
+			for ; j < len(token)-3-i; j++ {
+				switch token[i+j] {
+				case ' ', '\t':
+					continue
+				case '\n':
+					isLastNonWhitespaceOnLine = true
+				}
+				break findEOLLoop
+			}
+			if isLastNonWhitespaceOnLine {
+				i += j
 				for ; i < len(token)-3; i++ {
 					c := token[i]
 					if !(c == '\n' || c == '\r' || c == ' ' || c == '\t') {
 						i--
-
 						break
 					}
 				}

diff --git a/scanner.go b/scanner.go
@@ -76,8 +76,30 @@ func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) {
 	// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
 	// mll-quotes = 1*2apostrophe
 	for i := 3; i < len(b); {
-		if b[i] == '\'' && scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
-			return b[:i+3], b[i+3:], nil
+		if scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
+			i += 3
+
+			// At this point we have found 3 apostrophes, and i is
+			// the index of the byte after the third one. The scanner
+			// needs to be eager, because up to 2 extra apostrophes
+			// can be accepted at the end of the
+			// string.
+
+			if i >= len(b) || b[i] != '\'' {
+				return b[:i], b[i:], nil
+			}
+			i++
+
+			if i >= len(b) || b[i] != '\'' {
+				return b[:i], b[i:], nil
+			}
+			i++
+
+			if i < len(b) && b[i] == '\'' {
+				return nil, nil, newDecodeError(b[i-3:i+1], "''' not allowed in multiline literal string")
+			}
+
+			return b[:i], b[i:], nil
 		}
 		size := utf8ValidNext(b[i:])
 		if size == 0 {
@@ -201,7 +223,29 @@ loop:
 		switch b[i] {
 		case '"':
 			if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
-				return b[:i+3], escaped, b[i+3:], nil
+				i += 3
+
+				// At this point we have found 3 quotation marks, and i
+				// is the index of the byte after the third one. The
+				// scanner needs to be eager, because up to 2 extra
+				// quotation marks can be accepted at the end of the
+				// string.
+
+				if i >= len(b) || b[i] != '"' {
+					return b[:i], escaped, b[i:], nil
+				}
+				i++
+
+				if i >= len(b) || b[i] != '"' {
+					return b[:i], escaped, b[i:], nil
+				}
+				i++
+
+				if i < len(b) && b[i] == '"' {
+					return nil, escaped, nil, newDecodeError(b[i-3:i+1], `""" not allowed in multiline basic string`)
+				}
+
+				return b[:i], escaped, b[i:], nil
 			}
 		case '\\':
 			if len(b) < i+2 {

diff --git a/toml_testgen_support_test.go b/toml_testgen_support_test.go
@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"testing"
 
+	"github.com/pelletier/go-toml/v2"
 	"github.com/pelletier/go-toml/v2/testsuite"
 	"github.com/stretchr/testify/require"
 )
@@ -37,6 +38,9 @@ func testgenValid(t *testing.T, input string, jsonRef string) {
 
 	err := testsuite.Unmarshal([]byte(input), &doc)
 	if err != nil {
+		if de, ok := err.(*toml.DecodeError); ok {
+			t.Logf("%s\n%s", err, de)
+		}
 		t.Fatalf("failed parsing toml: %s", err)
 	}
 	j, err := testsuite.ValueToTaggedJSON(doc)

diff --git a/toml_testgen_test.go b/toml_testgen_test.go
@@ -1379,14 +1379,12 @@ func TestTOMLTest_Valid_String_Escapes(t *testing.T) {
 }
 
 func TestTOMLTest_Valid_String_MultilineQuotes(t *testing.T) {
-	t.Skip("FIXME")
 	input := "# Make sure that quotes inside multiline strings are allowed, including right\n# after the opening '''/\"\"\" and before the closing '''/\"\"\"\n\nlit_one = ''''one quote''''\nlit_two = '''''two quotes'''''\nlit_one_space = ''' 'one quote' '''\nlit_two_space = ''' ''two quotes'' '''\n\none = \"\"\"\"one quote\"\"\"\"\ntwo = \"\"\"\"\"two quotes\"\"\"\"\"\none_space = \"\"\" \"one quote\" \"\"\"\ntwo_space = \"\"\" \"\"two quotes\"\" \"\"\"\n\nmismatch1 = \"\"\"aaa'''bbb\"\"\"\nmismatch2 = '''aaa\"\"\"bbb'''\n"
 	jsonRef := "{\n  \"lit_one\": {\n    \"type\": \"string\",\n    \"value\": \"'one quote'\"\n  },\n  \"lit_one_space\": {\n    \"type\": \"string\",\n    \"value\": \" 'one quote' \"\n  },\n  \"lit_two\": {\n    \"type\": \"string\",\n    \"value\": \"''two quotes''\"\n  },\n  \"lit_two_space\": {\n    \"type\": \"string\",\n    \"value\": \" ''two quotes'' \"\n  },\n  \"mismatch1\": {\n    \"type\": \"string\",\n    \"value\": \"aaa'''bbb\"\n  },\n  \"mismatch2\": {\n    \"type\": \"string\",\n    \"value\": \"aaa\\\"\\\"\\\"bbb\"\n  },\n  \"one\": {\n    \"type\": \"string\",\n    \"value\": \"\\\"one quote\\\"\"\n  },\n  \"one_space\": {\n    \"type\": \"string\",\n    \"value\": \" \\\"one quote\\\" \"\n  },\n  \"two\": {\n    \"type\": \"string\",\n    \"value\": \"\\\"\\\"two quotes\\\"\\\"\"\n  },\n  \"two_space\": {\n    \"type\": \"string\",\n    \"value\": \" \\\"\\\"two quotes\\\"\\\" \"\n  }\n}\n"
 	testgenValid(t, input, jsonRef)
 }
 
 func TestTOMLTest_Valid_String_Multiline(t *testing.T) {
-	t.Skip("FIXME")
 	input := "# NOTE: this file includes some literal tab characters.\n\nmultiline_empty_one = \"\"\"\"\"\"\nmultiline_empty_two = \"\"\"\n\"\"\"\nmultiline_empty_three = \"\"\"\\\n    \"\"\"\nmultiline_empty_four = \"\"\"\\\n   \\\n   \\  \n   \"\"\"\n\nequivalent_one = \"The quick brown fox jumps over the lazy dog.\"\nequivalent_two = \"\"\"\nThe quick brown \\\n\n\n  fox jumps over \\\n    the lazy dog.\"\"\"\n\nequivalent_three = \"\"\"\\\n       The quick brown \\\n       fox jumps over \\\n       the lazy dog.\\\n       \"\"\"\n\nwhitespace-after-bs = \"\"\"\\\n       The quick brown \\\n       fox jumps over \\   \n       the lazy dog.\\\t\n       \"\"\"\n\nno-space = \"\"\"a\\\n    b\"\"\"\n\nkeep-ws-before = \"\"\"a   \t\\\n   b\"\"\"\n\nescape-bs-1 = \"\"\"a \\\\\nb\"\"\"\n\nescape-bs-2 = \"\"\"a \\\\\\\nb\"\"\"\n\nescape-bs-3 = \"\"\"a \\\\\\\\\n  b\"\"\"\n"
 	jsonRef := "{\n  \"equivalent_one\": {\n    \"type\": \"string\",\n    \"value\": \"The quick brown fox jumps over the lazy dog.\"\n  },\n  \"equivalent_three\": {\n    \"type\": \"string\",\n    \"value\": \"The quick brown fox jumps over the lazy dog.\"\n  },\n  \"equivalent_two\": {\n    \"type\": \"string\",\n    \"value\": \"The quick brown fox jumps over the lazy dog.\"\n  },\n  \"escape-bs-1\": {\n    \"type\": \"string\",\n    \"value\": \"a \\\\\\nb\"\n  },\n  \"escape-bs-2\": {\n    \"type\": \"string\",\n    \"value\": \"a \\\\b\"\n  },\n  \"escape-bs-3\": {\n    \"type\": \"string\",\n    \"value\": \"a \\\\\\\\\\n  b\"\n  },\n  \"keep-ws-before\": {\n    \"type\": \"string\",\n    \"value\": \"a   \\tb\"\n  },\n  \"multiline_empty_four\": {\n    \"type\": \"string\",\n    \"value\": \"\"\n  },\n  \"multiline_empty_one\": {\n    \"type\": \"string\",\n    \"value\": \"\"\n  },\n  \"multiline_empty_three\": {\n    \"type\": \"string\",\n    \"value\": \"\"\n  },\n  \"multiline_empty_two\": {\n    \"type\": \"string\",\n    \"value\": \"\"\n  },\n  \"no-space\": {\n    \"type\": \"string\",\n    \"value\": \"ab\"\n  },\n  \"whitespace-after-bs\": {\n    \"type\": \"string\",\n    \"value\": \"The quick brown fox jumps over the lazy dog.\"\n  }\n}\n"
 	testgenValid(t, input, jsonRef)