Skip to content

Commit

Permalink
Merge pull request #7123 from dolthub/aaron/import-supports-bom
Browse files Browse the repository at this point in the history
dolt table import: json,csv: Support BOM file headers.
  • Loading branch information
reltuk authored Dec 8, 2023
2 parents e899f7a + 04bd70a commit 076f51a
Show file tree
Hide file tree
Showing 12 changed files with 273 additions and 59 deletions.
9 changes: 8 additions & 1 deletion go/libraries/doltcore/table/typed/json/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (

"github.com/bcicen/jstream"
"github.com/dolthub/go-mysql-server/sql"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
Expand Down Expand Up @@ -52,12 +54,17 @@ func OpenJSONReader(vrw types.ValueReadWriter, path string, fs filesys.ReadableF
return NewJSONReader(vrw, r, sch)
}

// NewJSONReader creates a JSONReader that decodes rows from |r| against the
// required schema |sch|.
//
// The bytes of the supplied reader are treated as UTF-8 by default. If there
// is a UTF-8, UTF-16LE or UTF-16BE BOM in the first bytes read, it is
// stripped and the remaining contents of the reader are decoded as that
// encoding.
func NewJSONReader(vrw types.ValueReadWriter, r io.ReadCloser, sch schema.Schema) (*JSONReader, error) {
	if sch == nil {
		return nil, errors.New("schema must be provided to JsonReader")
	}

	// BOMOverride sniffs a leading BOM and switches to the matching decoder;
	// with no BOM present, bytes pass through the fallback UTF-8 decoder.
	textReader := transform.NewReader(r, unicode.BOMOverride(unicode.UTF8.NewDecoder()))

	// Emit values at nesting depth 2, i.e. the row objects under the
	// top-level "rows" array. NOTE(review): the previous comment said
	// "depth level of 1" while passing 2 — confirm against jstream docs.
	decoder := jstream.NewDecoder(textReader, 2)

	return &JSONReader{vrw: vrw, closer: r, sch: sch, jsonStream: decoder}, nil
}
Expand Down
94 changes: 74 additions & 20 deletions go/libraries/doltcore/table/typed/json/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package json

import (
"bytes"
"context"
"io"
"os"
Expand All @@ -24,6 +25,8 @@ import (
"github.com/dolthub/go-mysql-server/sql"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
Expand All @@ -33,25 +36,7 @@ import (
"github.com/dolthub/dolt/go/store/types"
)

func TestReader(t *testing.T) {
testJSON := `{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn"
},
{
"id": 1,
"first name": "brian",
"last name": "hendriks"
}
]
}`

fs := filesys.EmptyInMemFS("/")
require.NoError(t, fs.WriteFile("file.json", []byte(testJSON), os.ModePerm))

func testGoodJSON(t *testing.T, getReader func(types.ValueReadWriter, schema.Schema) (*JSONReader, error)) {
colColl := schema.NewColCollection(
schema.Column{
Name: "id",
Expand Down Expand Up @@ -83,7 +68,7 @@ func TestReader(t *testing.T) {
require.NoError(t, err)

vrw := types.NewMemoryValueStore()
reader, err := OpenJSONReader(vrw, "file.json", fs, sch)
reader, err := getReader(vrw, sch)
require.NoError(t, err)

verifySchema, err := reader.VerifySchema(sch)
Expand All @@ -109,6 +94,75 @@ func TestReader(t *testing.T) {
assert.Equal(t, enginetest.WidenRows(sqlSch.Schema, expectedRows), rows)
}

// TestReader exercises the happy path: a plain UTF-8 JSON document stored on
// an in-memory filesystem and opened through OpenJSONReader.
func TestReader(t *testing.T) {
	testJSON := `{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn"
},
{
"id": 1,
"first name": "brian",
"last name": "hendriks"
}
]
}`

	memFS := filesys.EmptyInMemFS("/")
	err := memFS.WriteFile("file.json", []byte(testJSON), os.ModePerm)
	require.NoError(t, err)

	openFromFS := func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
		return OpenJSONReader(vrw, "file.json", memFS, sch)
	}
	testGoodJSON(t, openFromFS)
}

// TestReaderBOMHandling verifies that NewJSONReader reads documents encoded
// as plain UTF-8 as well as UTF-8/UTF-16LE/UTF-16BE with a leading BOM.
func TestReaderBOMHandling(t *testing.T) {
	testJSON := `{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn"
},
{
"id": 1,
"first name": "brian",
"last name": "hendriks"
}
]
}`
	cases := []struct {
		name    string
		encoder transform.Transformer
	}{
		{"UTF-8", unicode.UTF8.NewEncoder()},
		{"UTF-8 BOM", unicode.UTF8BOM.NewEncoder()},
		{"UTF-16 LE BOM", unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewEncoder()},
		{"UTF-16 BE BOM", unicode.UTF16(unicode.BigEndian, unicode.UseBOM).NewEncoder()},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			// Re-encode the UTF-8 source into the encoding under test.
			encoded := transform.NewReader(bytes.NewBuffer([]byte(testJSON)), tc.encoder)
			testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
				return NewJSONReader(vrw, io.NopCloser(encoded), sch)
			})
		})
	}
}

func TestReaderBadJson(t *testing.T) {
testJSON := ` {
"rows": [
Expand Down
27 changes: 13 additions & 14 deletions go/libraries/doltcore/table/untyped/csv/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ import (
"unicode/utf8"

"github.com/dolthub/go-mysql-server/sql"
textunicode "golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
Expand Down Expand Up @@ -73,6 +75,14 @@ func OpenCSVReader(nbf *types.NomsBinFormat, path string, fs filesys.ReadableFS,
}

// NewCSVReader creates a CSVReader from a given ReadCloser. The CSVFileInfo should describe the csv file being read.
//
// The interpretation of the bytes of the supplied reader is a little murky. If
// there is a UTF8, UTF16LE or UTF16BE BOM as the first bytes read, then the
// BOM is stripped and the remaining contents of the reader are treated as that
// encoding. If we are not in any of those marked encodings, then some of the
// bytes go uninterpreted until we get to the SQL layer. It is currently the
// case that newlines must be encoded as a '0xa' byte and the delimiter must
// match |info.Delim|.
func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo) (*CSVReader, error) {
if len(info.Delim) < 1 {
return nil, fmt.Errorf("delimiter '%s' has invalid length", info.Delim)
Expand All @@ -81,7 +91,9 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo)
return nil, fmt.Errorf("invalid delimiter: %s", string(info.Delim))
}

br := bufio.NewReaderSize(r, ReadBufSize)
textReader := transform.NewReader(r, textunicode.BOMOverride(transform.Nop))

br := bufio.NewReaderSize(textReader, ReadBufSize)
colStrs, err := getColHeaders(br, info)

if err != nil {
Expand All @@ -102,18 +114,6 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo)
}, nil
}

// trimBOM strips a leading UTF-8 byte order mark (0xEF 0xBB 0xBF) from s,
// returning s unchanged when no BOM is present. A UTF-8 BOM at the start of
// a file indicates UTF-8 encoded content.
func trimBOM(s string) string {
	const bom = "\xEF\xBB\xBF"
	if strings.HasPrefix(s, bom) {
		return s[len(bom):]
	}
	return s
}

func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) {
colStrs := info.Columns
if info.HasHeaderLine {
Expand All @@ -124,7 +124,6 @@ func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) {
} else if strings.TrimSpace(line) == "" {
return nil, errors.New("Header line is empty")
}
line = trimBOM(line)
colStrsFromFile, err := csvSplitLine(line, info.Delim, info.EscapeQuotes)

if err != nil {
Expand Down
45 changes: 33 additions & 12 deletions go/libraries/doltcore/table/untyped/csv/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ import (
"strings"
"testing"

"github.com/stretchr/testify/require"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/table"
"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped"
Expand Down Expand Up @@ -67,6 +72,13 @@ func mustRow(r row.Row, err error) row.Row {
return r
}

// mustEncodeBytes encodes |bs| with |enc|'s encoder, failing the test on any
// transform error or if the encoder did not consume the entire input.
func mustEncodeBytes(t *testing.T, bs []byte, enc encoding.Encoding) []byte {
	ret, n, err := transform.Bytes(enc.NewEncoder(), bs)
	require.NoError(t, err)
	// require.Equal takes (expected, actual): the whole source must be consumed.
	require.Equal(t, len(bs), n)
	return ret
}

func TestReader(t *testing.T) {
colNames := []string{"name", "age", "title"}
_, sch := untyped.NewUntypedSchema(colNames...)
Expand All @@ -82,33 +94,42 @@ func TestReader(t *testing.T) {
mustRow(untyped.NewRowFromStrings(types.Format_Default, sch, []string{"Jack Jackson", "27"})),
}

utf8bomBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF8BOM)
require.Equal(t, utf8bomBytes[0:3], []byte{0xEF, 0xBB, 0xBF})
utf16leBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.LittleEndian, unicode.UseBOM))
utf16beBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.BigEndian, unicode.UseBOM))

tests := []struct {
inputStr string
input []byte
expectedRows []row.Row
info *CSVFileInfo
}{
{PersonDB1, goodExpectedRows, NewCSVInfo()},
{PersonDB2, goodExpectedRows, NewCSVInfo()},
{PersonDB3, goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB1), goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB2), goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB3), goodExpectedRows, NewCSVInfo()},

{utf8bomBytes, goodExpectedRows, NewCSVInfo()},
{utf16leBytes, goodExpectedRows, NewCSVInfo()},
{utf16beBytes, goodExpectedRows, NewCSVInfo()},

{PersonDBWithBadRow, badExpectedRows, NewCSVInfo()},
{PersonDBWithBadRow2, badExpectedRows, NewCSVInfo()},
{PersonDBWithBadRow3, badExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow), badExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow2), badExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow3), badExpectedRows, NewCSVInfo()},

{
PersonDBWithoutHeaders,
[]byte(PersonDBWithoutHeaders),
goodExpectedRows,
NewCSVInfo().SetHasHeaderLine(false).SetColumns(colNames),
},
{
PersonDBDifferentHeaders,
[]byte(PersonDBDifferentHeaders),
goodExpectedRows,
NewCSVInfo().SetHasHeaderLine(true).SetColumns(colNames),
},
}

for _, test := range tests {
rows, numBad, err := readTestRows(t, test.inputStr, test.info)
rows, numBad, err := readTestRows(t, test.input, test.info)

if err != nil {
t.Fatal("Unexpected Error:", err)
Expand Down Expand Up @@ -136,11 +157,11 @@ func TestReader(t *testing.T) {
}
}

func readTestRows(t *testing.T, inputStr string, info *CSVFileInfo) ([]row.Row, int, error) {
func readTestRows(t *testing.T, input []byte, info *CSVFileInfo) ([]row.Row, int, error) {
const root = "/"
const path = "/file.csv"

fs := filesys.NewInMemFS(nil, map[string][]byte{path: []byte(inputStr)}, root)
fs := filesys.NewInMemFS(nil, map[string][]byte{path: input}, root)
csvR, err := OpenCSVReader(types.Format_Default, path, fs, info)
defer csvR.Close(context.Background())

Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
id, title, start date, end date, first name, last name
0, "ceo", "", "", "tim", "sehn"
1, "founder", "", "", "aaron", "son"
2, "founder", "", "", "brian", "hendriks"
Binary file not shown.
Binary file not shown.
28 changes: 28 additions & 0 deletions integration-tests/bats/helper/employees-tbl.utf8bom.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn",
"title": "ceo",
"start date": "",
"end date": ""
},
{
"id": 1,
"first name": "aaron",
"last name": "son",
"title": "founder",
"start date": "",
"end date": ""
},
{
"id": 2,
"first name": "brian",
"last name": "hendricks",
"title": "founder",
"start date": "",
"end date": ""
}
]
}
Loading

0 comments on commit 076f51a

Please sign in to comment.