Skip to content

Commit

Permalink
Merge pull request #7123 from dolthub/aaron/import-supports-bom
Browse files Browse the repository at this point in the history
dolt table import: json,csv: Support BOM file headers.
  • Loading branch information
reltuk authored Dec 8, 2023
2 parents e899f7a + 04bd70a commit 076f51a
Show file tree
Hide file tree
Showing 12 changed files with 273 additions and 59 deletions.
9 changes: 8 additions & 1 deletion go/libraries/doltcore/table/typed/json/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (

"github.com/bcicen/jstream"
"github.com/dolthub/go-mysql-server/sql"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
Expand Down Expand Up @@ -52,12 +54,17 @@ func OpenJSONReader(vrw types.ValueReadWriter, path string, fs filesys.ReadableF
return NewJSONReader(vrw, r, sch)
}

// NewJSONReader creates a JSONReader that decodes rows from |r| against the
// required schema |sch|.
//
// The bytes of the supplied reader are treated as UTF-8 by default. If there
// is a UTF-8, UTF-16LE or UTF-16BE BOM in the first bytes read, it is
// stripped and the remaining contents of the reader are decoded as that
// encoding.
func NewJSONReader(vrw types.ValueReadWriter, r io.ReadCloser, sch schema.Schema) (*JSONReader, error) {
	if sch == nil {
		return nil, errors.New("schema must be provided to JsonReader")
	}

	// BOMOverride sniffs a leading BOM and switches to the matching decoder;
	// with no BOM present, bytes pass through the fallback UTF-8 decoder.
	textReader := transform.NewReader(r, unicode.BOMOverride(unicode.UTF8.NewDecoder()))

	// Emit values at nesting depth 2, i.e. the row objects under the
	// top-level "rows" array. NOTE(review): the previous comment said
	// "depth level of 1" while passing 2 — confirm against jstream docs.
	decoder := jstream.NewDecoder(textReader, 2)

	return &JSONReader{vrw: vrw, closer: r, sch: sch, jsonStream: decoder}, nil
}
Expand Down
94 changes: 74 additions & 20 deletions go/libraries/doltcore/table/typed/json/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package json

import (
"bytes"
"context"
"io"
"os"
Expand All @@ -24,6 +25,8 @@ import (
"github.com/dolthub/go-mysql-server/sql"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
Expand All @@ -33,25 +36,7 @@ import (
"github.com/dolthub/dolt/go/store/types"
)

func TestReader(t *testing.T) {
testJSON := `{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn"
},
{
"id": 1,
"first name": "brian",
"last name": "hendriks"
}
]
}`

fs := filesys.EmptyInMemFS("/")
require.NoError(t, fs.WriteFile("file.json", []byte(testJSON), os.ModePerm))

func testGoodJSON(t *testing.T, getReader func(types.ValueReadWriter, schema.Schema) (*JSONReader, error)) {
colColl := schema.NewColCollection(
schema.Column{
Name: "id",
Expand Down Expand Up @@ -83,7 +68,7 @@ func TestReader(t *testing.T) {
require.NoError(t, err)

vrw := types.NewMemoryValueStore()
reader, err := OpenJSONReader(vrw, "file.json", fs, sch)
reader, err := getReader(vrw, sch)
require.NoError(t, err)

verifySchema, err := reader.VerifySchema(sch)
Expand All @@ -109,6 +94,75 @@ func TestReader(t *testing.T) {
assert.Equal(t, enginetest.WidenRows(sqlSch.Schema, expectedRows), rows)
}

// TestReader exercises the happy path: a plain UTF-8 JSON document stored on
// an in-memory filesystem and opened through OpenJSONReader.
func TestReader(t *testing.T) {
	testJSON := `{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn"
},
{
"id": 1,
"first name": "brian",
"last name": "hendriks"
}
]
}`

	memFS := filesys.EmptyInMemFS("/")
	err := memFS.WriteFile("file.json", []byte(testJSON), os.ModePerm)
	require.NoError(t, err)

	openFromFS := func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
		return OpenJSONReader(vrw, "file.json", memFS, sch)
	}
	testGoodJSON(t, openFromFS)
}

// TestReaderBOMHandling verifies that NewJSONReader reads documents encoded
// as plain UTF-8 as well as UTF-8/UTF-16LE/UTF-16BE with a leading BOM.
func TestReaderBOMHandling(t *testing.T) {
	testJSON := `{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn"
},
{
"id": 1,
"first name": "brian",
"last name": "hendriks"
}
]
}`
	cases := []struct {
		name    string
		encoder transform.Transformer
	}{
		{"UTF-8", unicode.UTF8.NewEncoder()},
		{"UTF-8 BOM", unicode.UTF8BOM.NewEncoder()},
		{"UTF-16 LE BOM", unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewEncoder()},
		{"UTF-16 BE BOM", unicode.UTF16(unicode.BigEndian, unicode.UseBOM).NewEncoder()},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			// Re-encode the UTF-8 source into the encoding under test.
			encoded := transform.NewReader(bytes.NewBuffer([]byte(testJSON)), tc.encoder)
			testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
				return NewJSONReader(vrw, io.NopCloser(encoded), sch)
			})
		})
	}
}

func TestReaderBadJson(t *testing.T) {
testJSON := ` {
"rows": [
Expand Down
27 changes: 13 additions & 14 deletions go/libraries/doltcore/table/untyped/csv/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ import (
"unicode/utf8"

"github.com/dolthub/go-mysql-server/sql"
textunicode "golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
Expand Down Expand Up @@ -73,6 +75,14 @@ func OpenCSVReader(nbf *types.NomsBinFormat, path string, fs filesys.ReadableFS,
}

// NewCSVReader creates a CSVReader from a given ReadCloser. The CSVFileInfo should describe the csv file being read.
//
// The interpretation of the bytes of the supplied reader is a little murky. If
// there is a UTF8, UTF16LE or UTF16BE BOM as the first bytes read, then the
// BOM is stripped and the remaining contents of the reader are treated as that
// encoding. If we are not in any of those marked encodings, then some of the
// bytes go uninterpreted until we get to the SQL layer. It is currently the
// case that newlines must be encoded as a '0xa' byte and the delimiter must
// match |info.Delim|.
func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo) (*CSVReader, error) {
if len(info.Delim) < 1 {
return nil, fmt.Errorf("delimiter '%s' has invalid length", info.Delim)
Expand All @@ -81,7 +91,9 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo)
return nil, fmt.Errorf("invalid delimiter: %s", string(info.Delim))
}

br := bufio.NewReaderSize(r, ReadBufSize)
textReader := transform.NewReader(r, textunicode.BOMOverride(transform.Nop))

br := bufio.NewReaderSize(textReader, ReadBufSize)
colStrs, err := getColHeaders(br, info)

if err != nil {
Expand All @@ -102,18 +114,6 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo)
}, nil
}

// trimBOM strips a leading UTF-8 byte order mark (0xEF 0xBB 0xBF) from s,
// returning s unchanged when no BOM is present. A UTF-8 BOM at the start of
// a file indicates UTF-8 encoded content.
func trimBOM(s string) string {
	const bom = "\xEF\xBB\xBF"
	if strings.HasPrefix(s, bom) {
		return s[len(bom):]
	}
	return s
}

func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) {
colStrs := info.Columns
if info.HasHeaderLine {
Expand All @@ -124,7 +124,6 @@ func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) {
} else if strings.TrimSpace(line) == "" {
return nil, errors.New("Header line is empty")
}
line = trimBOM(line)
colStrsFromFile, err := csvSplitLine(line, info.Delim, info.EscapeQuotes)

if err != nil {
Expand Down
45 changes: 33 additions & 12 deletions go/libraries/doltcore/table/untyped/csv/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ import (
"strings"
"testing"

"github.com/stretchr/testify/require"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"

"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/table"
"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped"
Expand Down Expand Up @@ -67,6 +72,13 @@ func mustRow(r row.Row, err error) row.Row {
return r
}

// mustEncodeBytes encodes |bs| with |enc|'s encoder, failing the test on any
// transform error or if the encoder did not consume the entire input.
func mustEncodeBytes(t *testing.T, bs []byte, enc encoding.Encoding) []byte {
	ret, n, err := transform.Bytes(enc.NewEncoder(), bs)
	require.NoError(t, err)
	// require.Equal takes (expected, actual): the whole source must be consumed.
	require.Equal(t, len(bs), n)
	return ret
}

func TestReader(t *testing.T) {
colNames := []string{"name", "age", "title"}
_, sch := untyped.NewUntypedSchema(colNames...)
Expand All @@ -82,33 +94,42 @@ func TestReader(t *testing.T) {
mustRow(untyped.NewRowFromStrings(types.Format_Default, sch, []string{"Jack Jackson", "27"})),
}

utf8bomBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF8BOM)
require.Equal(t, utf8bomBytes[0:3], []byte{0xEF, 0xBB, 0xBF})
utf16leBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.LittleEndian, unicode.UseBOM))
utf16beBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.BigEndian, unicode.UseBOM))

tests := []struct {
inputStr string
input []byte
expectedRows []row.Row
info *CSVFileInfo
}{
{PersonDB1, goodExpectedRows, NewCSVInfo()},
{PersonDB2, goodExpectedRows, NewCSVInfo()},
{PersonDB3, goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB1), goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB2), goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB3), goodExpectedRows, NewCSVInfo()},

{utf8bomBytes, goodExpectedRows, NewCSVInfo()},
{utf16leBytes, goodExpectedRows, NewCSVInfo()},
{utf16beBytes, goodExpectedRows, NewCSVInfo()},

{PersonDBWithBadRow, badExpectedRows, NewCSVInfo()},
{PersonDBWithBadRow2, badExpectedRows, NewCSVInfo()},
{PersonDBWithBadRow3, badExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow), badExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow2), badExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow3), badExpectedRows, NewCSVInfo()},

{
PersonDBWithoutHeaders,
[]byte(PersonDBWithoutHeaders),
goodExpectedRows,
NewCSVInfo().SetHasHeaderLine(false).SetColumns(colNames),
},
{
PersonDBDifferentHeaders,
[]byte(PersonDBDifferentHeaders),
goodExpectedRows,
NewCSVInfo().SetHasHeaderLine(true).SetColumns(colNames),
},
}

for _, test := range tests {
rows, numBad, err := readTestRows(t, test.inputStr, test.info)
rows, numBad, err := readTestRows(t, test.input, test.info)

if err != nil {
t.Fatal("Unexpected Error:", err)
Expand Down Expand Up @@ -136,11 +157,11 @@ func TestReader(t *testing.T) {
}
}

func readTestRows(t *testing.T, inputStr string, info *CSVFileInfo) ([]row.Row, int, error) {
func readTestRows(t *testing.T, input []byte, info *CSVFileInfo) ([]row.Row, int, error) {
const root = "/"
const path = "/file.csv"

fs := filesys.NewInMemFS(nil, map[string][]byte{path: []byte(inputStr)}, root)
fs := filesys.NewInMemFS(nil, map[string][]byte{path: input}, root)
csvR, err := OpenCSVReader(types.Format_Default, path, fs, info)
defer csvR.Close(context.Background())

Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
id, title, start date, end date, first name, last name
0, "ceo", "", "", "tim", "sehn"
1, "founder", "", "", "aaron", "son"
2, "founder", "", "", "brian", "hendriks"
Binary file not shown.
Binary file not shown.
28 changes: 28 additions & 0 deletions integration-tests/bats/helper/employees-tbl.utf8bom.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn",
"title": "ceo",
"start date": "",
"end date": ""
},
{
"id": 1,
"first name": "aaron",
"last name": "son",
"title": "founder",
"start date": "",
"end date": ""
},
{
"id": 2,
"first name": "brian",
"last name": "hendricks",
"title": "founder",
"start date": "",
"end date": ""
}
]
}
Loading

0 comments on commit 076f51a

Please sign in to comment.