diff --git a/go/libraries/doltcore/table/typed/parquet/reader.go b/go/libraries/doltcore/table/typed/parquet/reader.go index 4ba3f0d68e7..e5e2946f0c9 100644 --- a/go/libraries/doltcore/table/typed/parquet/reader.go +++ b/go/libraries/doltcore/table/typed/parquet/reader.go @@ -50,7 +50,10 @@ type ParquetReader struct { fileData map[string][]interface{} // rLevels indicate whether a value in a column is a repeat of a repeated type. // We only include these for repeated fields. - rLevels map[string][]int32 + rLevels map[string][]int32 + // dLevels are used for interpreting null values by indicating the deepest level in + // a nested field that's defined. + dLevels map[string][]int32 columnName []string } @@ -82,6 +85,7 @@ func NewParquetReader(vrw types.ValueReadWriter, fr source.ParquetFile, sche sch // TODO : need to solve for getting single row data in readRow (storing all columns data in memory right now) data := make(map[string][]interface{}) rLevels := make(map[string][]int32) + dLevels := make(map[string][]int32) rowReadCounters := make(map[string]int) var colName []string for _, col := range columns { @@ -96,7 +100,7 @@ func NewParquetReader(vrw types.ValueReadWriter, fr source.ParquetFile, sche sch } return nil, fmt.Errorf("cannot read column: %s Column not found", col.Name) } - colData, rLevel, _, cErr := pr.ReadColumnByPath(resolvedColumnName, num) + colData, rLevel, dLevel, cErr := pr.ReadColumnByPath(resolvedColumnName, num) if cErr != nil { return nil, fmt.Errorf("cannot read column: %s", cErr.Error()) } @@ -104,6 +108,7 @@ func NewParquetReader(vrw types.ValueReadWriter, fr source.ParquetFile, sche sch if isRepeated { rLevels[col.Name] = rLevel } + dLevels[col.Name] = dLevel rowReadCounters[col.Name] = 0 colName = append(colName, col.Name) } @@ -118,6 +123,7 @@ func NewParquetReader(vrw types.ValueReadWriter, fr source.ParquetFile, sche sch rowReadCounters: rowReadCounters, fileData: data, rLevels: rLevels, + dLevels: dLevels, columnName: colName, }, nil } @@ -229,19 +235,37 @@ func (pr *ParquetReader) ReadSqlRow(ctx context.Context) (sql.Row, error) { } var val interface{} rLevels, isRepeated := pr.rLevels[col.Name] - if !isRepeated { - val = readVal() - } else { + dLevels, _ := pr.dLevels[col.Name] + readVals := func() (val interface{}) { var vals []interface{} for { + dLevel := dLevels[rowReadCounter] subVal := readVal() + if subVal == nil { + // dLevels tells us how to interpret this nil value: + // 0 -> the column value is NULL + // 1 -> the column exists but is empty + // 2 -> the column contains an empty value + // 3+ -> the column contains a non-empty value + switch dLevel { + case 0: + return nil + case 1: + return []interface{}{} + } + } vals = append(vals, subVal) // an rLevel of 0 marks the start of a new record. if rowReadCounter >= len(rLevels) || rLevels[rowReadCounter] == 0 { break } } - val = vals + return vals + } + if !isRepeated { + val = readVal() + } else { + val = readVals() } pr.rowReadCounters[col.Name] = rowReadCounter diff --git a/integration-tests/bats/helper/parquet/sequences.parquet b/integration-tests/bats/helper/parquet/sequences.parquet new file mode 100644 index 00000000000..efb895103fd Binary files /dev/null and b/integration-tests/bats/helper/parquet/sequences.parquet differ diff --git a/integration-tests/bats/helper/parquet/sequences.sql b/integration-tests/bats/helper/parquet/sequences.sql new file mode 100644 index 00000000000..b80bac07bab --- /dev/null +++ b/integration-tests/bats/helper/parquet/sequences.sql @@ -0,0 +1 @@ +create table sequences(`pk` int primary key, `name` varchar(20), `embeddings` json); \ No newline at end of file diff --git a/integration-tests/bats/import-create-tables.bats b/integration-tests/bats/import-create-tables.bats index a58eda8a80c..5de9c5205c4 100755 --- a/integration-tests/bats/import-create-tables.bats +++ b/integration-tests/bats/import-create-tables.bats @@ -930,4 +930,23 @@ DELIM [[ "$output" =~ "text" ]] || false [[ "$output" =~ "hello foo" ]] || false [[ "$output" =~ "hello world" ]] || false +} + +@test "import-create-tables: import sequences as JSON arrays" { + # The file strings.parquet uses a different name for the root column than the one generated by `dolt table export`, + # but Dolt should still be able to import it. + run dolt table import -c -s `batshelper parquet/sequences.sql` sequences `batshelper parquet/sequences.parquet` + [ "$status" -eq 0 ] + + dolt sql -r csv -q "select * from sequences;" + run dolt sql -r csv -q "select * from sequences;" + [ "$status" -eq 0 ] + [ "${#lines[@]}" -eq 7 ] + [[ "$output" =~ '1,empty,[]' ]] || false + [[ "$output" =~ "2,single,[1]" ]] || false + [[ "$output" =~ "3,null," ]] || false + [[ "$output" =~ '4,double,"[2,3]"' ]] || false + [[ "$output" =~ '5,contains null,"[4,null]"' ]] || false + [[ "$output" =~ '6,empty,[]' ]] || false + } \ No newline at end of file