Skip to content

Commit

Permalink
Truncate MCVs (#8041)
Browse files Browse the repository at this point in the history
* [no-release-notes] test smaller set of mcv changes

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

* fix tests

* bump

* fix tests

* fix bats

* uncomment

---------

Co-authored-by: max-hoffman <[email protected]>
  • Loading branch information
max-hoffman and max-hoffman authored Jun 21, 2024
1 parent fed318d commit 082a398
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 47 deletions.
2 changes: 1 addition & 1 deletion go/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ require (
github.com/cespare/xxhash v1.1.0
github.com/creasty/defaults v1.6.0
github.com/dolthub/flatbuffers/v23 v23.3.3-dh.2
github.com/dolthub/go-mysql-server v0.18.2-0.20240621090043-94a27aeefd56
github.com/dolthub/go-mysql-server v0.18.2-0.20240621163952-f2914ed9b1a6
github.com/dolthub/gozstd v0.0.0-20240423170813-23a2903bca63
github.com/dolthub/swiss v0.1.0
github.com/goccy/go-json v0.10.2
Expand Down
4 changes: 2 additions & 2 deletions go/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@ github.com/dolthub/fslock v0.0.3 h1:iLMpUIvJKMKm92+N1fmHVdxJP5NdyDK5bK7z7Ba2s2U=
github.com/dolthub/fslock v0.0.3/go.mod h1:QWql+P17oAAMLnL4HGB5tiovtDuAjdDTPbuqx7bYfa0=
github.com/dolthub/go-icu-regex v0.0.0-20230524105445-af7e7991c97e h1:kPsT4a47cw1+y/N5SSCkma7FhAPw7KeGmD6c9PBZW9Y=
github.com/dolthub/go-icu-regex v0.0.0-20230524105445-af7e7991c97e/go.mod h1:KPUcpx070QOfJK1gNe0zx4pA5sicIK1GMikIGLKC168=
github.com/dolthub/go-mysql-server v0.18.2-0.20240621090043-94a27aeefd56 h1:vEdZ8vWHkBh83Q3nhsuoVU8L2oVm5sBB0/zjNu1RASg=
github.com/dolthub/go-mysql-server v0.18.2-0.20240621090043-94a27aeefd56/go.mod h1:XdiHsd2TX3OOhjwY6tPcw1ztT2BdBiP6Wp0m/7OYHn4=
github.com/dolthub/go-mysql-server v0.18.2-0.20240621163952-f2914ed9b1a6 h1:VD4irTQzGU9HgsBnOhw/QIXd5rHoRHFVPBrISzw1WNA=
github.com/dolthub/go-mysql-server v0.18.2-0.20240621163952-f2914ed9b1a6/go.mod h1:XdiHsd2TX3OOhjwY6tPcw1ztT2BdBiP6Wp0m/7OYHn4=
github.com/dolthub/gozstd v0.0.0-20240423170813-23a2903bca63 h1:OAsXLAPL4du6tfbBgK0xXHZkOlos63RdKYS3Sgw/dfI=
github.com/dolthub/gozstd v0.0.0-20240423170813-23a2903bca63/go.mod h1:lV7lUeuDhH5thVGDCKXbatwKy2KW80L4rMT46n+Y2/Q=
github.com/dolthub/ishell v0.0.0-20221214210346-d7db0b066488 h1:0HHu0GWJH0N6a6keStrHhUAK5/o9LVfkh44pvsV4514=
Expand Down
8 changes: 2 additions & 6 deletions go/libraries/doltcore/sqle/enginetest/stats_queries.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,15 @@ var DoltHistogramTests = []queries.ScriptTest{
Query: " SELECT mcv_cnt from information_schema.column_statistics join json_table(histogram, '$.statistic.buckets[*]' COLUMNS(mcv_cnt JSON path '$.mcv_counts')) as dt where table_name = 'xy' and column_name = 'y,z'",
Expected: []sql.Row{
{types.JSONDocument{Val: []interface{}{
float64(1),
float64(4),
float64(1),
}}},
},
},
{
Query: " SELECT mcv from information_schema.column_statistics join json_table(histogram, '$.statistic.buckets[*]' COLUMNS(mcv JSON path '$.mcvs[*]')) as dt where table_name = 'xy' and column_name = 'y,z'",
Expected: []sql.Row{
{types.JSONDocument{Val: []interface{}{
[]interface{}{float64(1), "a"},
[]interface{}{float64(0), "a"},
[]interface{}{float64(2), "a"},
}}},
},
},
Expand Down Expand Up @@ -329,8 +325,8 @@ var DoltStatsIOTests = []queries.ScriptTest{
{
Query: fmt.Sprintf("select %s, %s, %s, %s, %s from dolt_statistics", schema.StatsMcv1ColName, schema.StatsMcv2ColName, schema.StatsMcv3ColName, schema.StatsMcv4ColName, schema.StatsMcvCountsColName),
Expected: []sql.Row{
{"5", "1", "2", "", "1,1,1"},
{"1,a", "0,a", "2,a", "", "1,4,1"},
{"", "", "", "", ""},
{"0,a", "", "", "", "4"},
},
},
},
Expand Down
3 changes: 3 additions & 0 deletions go/libraries/doltcore/sqle/statsnoms/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St
mcvCountsStr := strings.Split(row[schema.StatsMcvCountsTag].(string), ",")
mcvCnts := make([]uint64, numMcvs)
for i, v := range mcvCountsStr {
if v == "" {
continue
}
val, err := strconv.Atoi(v)
if err != nil {
return nil, err
Expand Down
20 changes: 20 additions & 0 deletions go/libraries/doltcore/sqle/statspro/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"fmt"
"io"
"sort"
"strings"
"time"

Expand Down Expand Up @@ -265,6 +266,10 @@ func (u *bucketBuilder) newBucket() {
func (u *bucketBuilder) finalize(ctx context.Context, ns tree.NodeStore) (DoltBucket, error) {
// update MCV in case we've ended on a run of many identical keys
u.updateMcv()

u.mcvs.Sort()
u.mcvs.Truncate(2 * float64(u.count) / float64(u.distinct)) // only keep MCVs that are > twice as common as average

// convert the MCV tuples into SQL rows (most efficient to only do this once)
mcvRows, err := u.mcvs.Values(ctx, u.tupleDesc, ns, u.prefixLen)
if err != nil {
Expand Down Expand Up @@ -365,6 +370,21 @@ func (m mcvHeap) Counts() []uint64 {
return ret
}

// Sort reorders the heap's entries in place, using the heap's own Less
// method as the comparison. The sort is not guaranteed to be stable.
func (m mcvHeap) Sort() {
	sort.Slice(m, func(a, b int) bool {
		return m.Less(a, b)
	})
}

// Truncate discards the leading entries whose count is below cutoff and
// keeps everything from the first entry with cnt >= cutoff onward. If no
// entry reaches the cutoff, the heap is left empty.
//
// The caller is expected to call Sort first (finalize does). This assumes
// Sort leaves the entries in ascending cnt order, so the kept tail is
// exactly the set of entries meeting the cutoff.
// NOTE(review): Less is defined outside this view; confirm the ordering.
func (m *mcvHeap) Truncate(cutoff float64) {
	start := m.Len()
	for i, v := range *m {
		if float64(v.cnt) >= cutoff {
			// Stop at the first qualifying entry. Without this break, start
			// would advance to the last qualifying index and all but the
			// final qualifying entry would be dropped.
			start = i
			break
		}
	}
	*m = (*m)[start:]
}

func (m mcvHeap) Values(ctx context.Context, keyDesc val.TupleDesc, ns tree.NodeStore, prefixLen int) ([]sql.Row, error) {
ret := make([]sql.Row, len(m))
for i, mcv := range m {
Expand Down
45 changes: 29 additions & 16 deletions go/libraries/doltcore/sqle/statspro/update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ func TestBucketBuilder(t *testing.T) {
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 5,
McvVals: []sql.Row{{int64(4)}, {int64(2)}, {int64(3)}},
McvsCnt: []uint64{3, 4, 3},
McvVals: []sql.Row{},
McvsCnt: []uint64{},
BoundVal: sql.Row{int64(5)},
BoundCnt: 2,
}},
Expand All @@ -85,8 +85,8 @@ func TestBucketBuilder(t *testing.T) {
RowCnt: 16,
DistinctCnt: 6,
NullCnt: 3,
McvVals: []sql.Row{{int64(4)}, {int64(2)}, {nil}},
McvsCnt: []uint64{3, 4, 3},
McvVals: []sql.Row{},
McvsCnt: []uint64{},
BoundVal: sql.Row{int64(5)},
BoundCnt: 2,
}},
Expand All @@ -99,8 +99,8 @@ func TestBucketBuilder(t *testing.T) {
RowCnt: 15,
DistinctCnt: 6,
NullCnt: 2,
McvVals: []sql.Row{{int64(3)}, {int64(4)}, {int64(2)}},
McvsCnt: []uint64{3, 3, 4},
McvVals: []sql.Row{},
McvsCnt: []uint64{},
BoundVal: sql.Row{int64(5)},
BoundCnt: 2,
}},
Expand All @@ -113,8 +113,8 @@ func TestBucketBuilder(t *testing.T) {
RowCnt: 22,
DistinctCnt: 7,
BoundCnt: 1,
McvVals: []sql.Row{{int64(2)}, {int64(6)}, {int64(5)}},
McvsCnt: []uint64{4, 4, 4},
McvVals: []sql.Row{},
McvsCnt: []uint64{},
BoundVal: sql.Row{int64(7)},
}},
},
Expand All @@ -125,8 +125,8 @@ func TestBucketBuilder(t *testing.T) {
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 11,
McvVals: []sql.Row{{int64(1), int64(1)}, {int64(4), int64(1)}, {int64(2), int64(3)}},
McvsCnt: []uint64{2, 3, 2},
McvVals: []sql.Row{{int64(4), int64(1)}},
McvsCnt: []uint64{3},
BoundVal: sql.Row{int64(5), int64(2)},
BoundCnt: 1,
}},
Expand All @@ -139,8 +139,8 @@ func TestBucketBuilder(t *testing.T) {
RowCnt: 5,
DistinctCnt: 5,
NullCnt: 3,
McvVals: []sql.Row{{int64(2), int64(2)}, {int64(1), nil}, {int64(1), int64(2)}},
McvsCnt: []uint64{1, 1, 1},
McvVals: []sql.Row{},
McvsCnt: []uint64{},
BoundVal: sql.Row{int64(2), int64(2)},
BoundCnt: 1},
},
Expand All @@ -152,8 +152,8 @@ func TestBucketBuilder(t *testing.T) {
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 9,
McvVals: []sql.Row{{"i"}, {"h"}, {"g"}},
McvsCnt: []uint64{2, 3, 3},
McvVals: []sql.Row{},
McvsCnt: []uint64{},
BoundVal: sql.Row{"i"},
BoundCnt: 2,
}},
Expand All @@ -165,12 +165,25 @@ func TestBucketBuilder(t *testing.T) {
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 12,
McvVals: []sql.Row{{"i", int64(1)}, {"g", int64(2)}, {"h", int64(1)}},
McvsCnt: []uint64{2, 2, 2},
McvVals: []sql.Row{},
McvsCnt: []uint64{},
BoundVal: sql.Row{"i", int64(1)},
BoundCnt: 2,
}},
},
{
name: "mcvs",
keys: []sql.Row{{1}, {2}, {3}, {4}, {5}, {6}, {7}, {7}, {7}, {7}, {8}, {9}, {10}, {10}, {10}, {11}, {12}, {13}, {14}, {15}, {20}, {21}, {22}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: false}),
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 23,
DistinctCnt: 18,
McvVals: []sql.Row{{int64(7)}},
McvsCnt: []uint64{4},
BoundVal: sql.Row{int64(22)},
BoundCnt: 1,
}},
},
}

ctx := context.Background()
Expand Down
24 changes: 2 additions & 22 deletions integration-tests/bats/stats.bats
Original file line number Diff line number Diff line change
Expand Up @@ -208,33 +208,13 @@ teardown() {
cd repo2

dolt sql -q "alter table xy add index y2 (y)"
dolt sql -q "insert into xy values (0,0), (1,0), (2,0), (3,0), (4,0), (5,0)"
dolt sql -q "insert into xy values (0,0), (1,0), (2,0), (3,0), (4,0), (5,0), (6,1), (7,2), (8,3), (9,4)"

# setting variables doesn't hang or error
dolt sql -q "SET @@persist.dolt_stats_auto_refresh_enabled = 1;"
dolt sql -q "SET @@persist.dolt_stats_auto_refresh_threshold = .5"
dolt sql -q "SET @@persist.dolt_stats_auto_refresh_interval = 1;"

# auto refresh can only initialize at server startup
start_sql_server

# need to trigger at least one refresh cycle
sleep 1
dolt sql -q "analyze table xy"

run dolt sql -r csv -q "select mcv1 from dolt_statistics where index_name = 'y2'"
[ "$status" -eq 0 ]
[ "${lines[1]}" = "0" ]

sleep 1

dolt sql -q "update xy set y = 2 where x between 0 and 3"

sleep 1

run dolt sql -r csv -q "select mcv1 as mcv from dolt_statistics where index_name = 'y2' union select mcv2 as mcv from dolt_statistics where index_name = 'y2' order by mcv"
[ "$status" -eq 0 ]
[ "${lines[1]}" = "0" ]
[ "${lines[2]}" = "2" ]
}

@test "stats: multi db" {
Expand Down

0 comments on commit 082a398

Please sign in to comment.