Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[statspro] Bootstrap database statistics once on startup #8036

Merged
merged 12 commits into from
Jun 24, 2024
2 changes: 2 additions & 0 deletions go/libraries/doltcore/env/actions/branch.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ func RenameBranch(ctx context.Context, dbData env.DbData, oldBranch, newBranch s
}
}

// todo: update default branch variable

return DeleteBranch(ctx, dbData, oldBranch, DeleteOptions{Force: true, AllowDeletingCurrentBranch: true}, remoteDbPro, rsc)
}

Expand Down
1 change: 1 addition & 0 deletions go/libraries/doltcore/sqle/dsess/variables.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ const (
DoltClusterAckWritesTimeoutSecs = "dolt_cluster_ack_writes_timeout_secs"

DoltStatsAutoRefreshEnabled = "dolt_stats_auto_refresh_enabled"
DoltStatsBootstrapEnabled = "dolt_stats_bootstrap_enabled"
DoltStatsAutoRefreshThreshold = "dolt_stats_auto_refresh_threshold"
DoltStatsAutoRefreshInterval = "dolt_stats_auto_refresh_interval"
DoltStatsMemoryOnly = "dolt_stats_memory_only"
Expand Down
1 change: 1 addition & 0 deletions go/libraries/doltcore/sqle/enginetest/dolt_engine_tests.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ func RunQueryTestPlans(t *testing.T, harness DoltEnginetestHarness) {
}

defer harness.Close()
sql.SystemVariables.SetGlobal(dsess.DoltStatsBootstrapEnabled, 0)
enginetest.TestQueryPlans(t, harness, queries.PlanTests)
}

Expand Down
5 changes: 2 additions & 3 deletions go/libraries/doltcore/sqle/statsnoms/iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (

"github.com/dolthub/go-mysql-server/sql"
"github.com/dolthub/go-mysql-server/sql/planbuilder"
"github.com/dolthub/go-mysql-server/sql/stats"
"gopkg.in/errgo.v2/errors"

"github.com/dolthub/dolt/go/libraries/doltcore/schema"
Expand Down Expand Up @@ -114,15 +113,15 @@ func (s *statsIter) Next(ctx *sql.Context) (sql.Row, error) {
upperBoundCnt := row[schema.StatsUpperBoundCntTag].(int64)
createdAt := row[schema.StatsCreatedAtTag].(time.Time)

typs := strings.Split(typesStr, ",")
typs := strings.Split(typesStr, "\n")
for i, t := range typs {
typs[i] = strings.TrimSpace(t)
}

qual := sql.NewStatQualifier(dbName, tableName, indexName)
if curQual := qual.String(); !strings.EqualFold(curQual, s.currentQual) {
s.currentQual = curQual
s.currentTypes, err = stats.ParseTypeStrings(typs)
s.currentTypes, err = parseTypeStrings(typs)
if err != nil {
return nil, err
}
Expand Down
21 changes: 17 additions & 4 deletions go/libraries/doltcore/sqle/statsnoms/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"time"

"github.com/dolthub/go-mysql-server/sql"
"github.com/dolthub/go-mysql-server/sql/planbuilder"
"github.com/dolthub/go-mysql-server/sql/stats"

"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
Expand Down Expand Up @@ -68,7 +69,7 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St
upperBoundCnt := row[schema.StatsUpperBoundCntTag].(uint64)
createdAt := row[schema.StatsCreatedAtTag].(time.Time)

typs := strings.Split(typesStr, ",")
typs := strings.Split(typesStr, "\n")
for i, t := range typs {
typs[i] = strings.TrimSpace(t)
}
Expand All @@ -90,7 +91,7 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St

mcvs := make([]sql.Row, numMcvs)
for i, v := range row[schema.StatsMcv1Tag:schema.StatsMcvCountsTag] {
if v != nil {
if v != nil && v != "" {
row, err := iter.ParseRow(v.(string))
if err != nil {
return nil, err
Expand Down Expand Up @@ -136,7 +137,7 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St
}

if currentStat.Statistic.Hist == nil {
currentStat.Statistic.Typs, err = stats.ParseTypeStrings(typs)
currentStat.Statistic.Typs, err = parseTypeStrings(typs)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -180,6 +181,18 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St
return qualToStats, nil
}

func parseTypeStrings(typs []string) ([]sql.Type, error) {
var ret []sql.Type
for _, typ := range typs {
ct, err := planbuilder.ParseColumnTypeString(typ)
if err != nil {
return nil, err
}
ret = append(ret, ct)
}
return ret, nil
}

func loadLowerBound(ctx *sql.Context, qual sql.StatQualifier) (sql.Row, error) {
dSess := dsess.DSessFromSess(ctx.Session)
roots, ok := dSess.GetRoots(ctx, qual.Db())
Expand Down Expand Up @@ -216,7 +229,7 @@ func loadLowerBound(ctx *sql.Context, qual sql.StatQualifier) (sql.Row, error) {
}

firstKey := keyBuilder.Build(buffPool)
var firstRow sql.Row
firstRow := make(sql.Row, keyBuilder.Desc.Count())
for i := 0; i < keyBuilder.Desc.Count(); i++ {
firstRow[i], err = tree.GetField(ctx, prollyMap.KeyDesc(), i, firstKey, prollyMap.NodeStore())
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion go/libraries/doltcore/sqle/statsnoms/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func putIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *stat
sep := ""
for _, t := range dStats.Statistic.Typs {
typesB.WriteString(sep + t.String())
sep = ","
sep = "\n"
}
typesStr := typesB.String()

Expand Down
61 changes: 60 additions & 1 deletion go/libraries/doltcore/sqle/statspro/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,62 @@ import (
"github.com/dolthub/dolt/go/store/prolly/tree"
)

const (
boostrapRowLimit = 2e6
)

func (p *Provider) RefreshTableStats(ctx *sql.Context, table sql.Table, db string) error {
dSess := dsess.DSessFromSess(ctx.Session)
branch, err := dSess.GetBranch()
if err != nil {
return err
}
return p.RefreshTableStatsWithBranch(ctx, table, db, branch)
}

func (p *Provider) BootstrapDatabaseStats(ctx *sql.Context, db string) error {
dSess := dsess.DSessFromSess(ctx.Session)
branches := p.getStatsBranches(ctx)
var rows uint64
for _, branch := range branches {
sqlDb, err := dSess.Provider().Database(ctx, p.branchQualifiedDatabase(db, branch))
if err != nil {
if sql.ErrDatabaseNotFound.Is(err) {
// default branch is not valid
continue
}
return err
}
tables, err := sqlDb.GetTableNames(ctx)
if err != nil {
return err
}
for _, table := range tables {
sqlTable, _, err := GetLatestTable(ctx, table, sqlDb)
if err != nil {
return err
}

if st, ok := sqlTable.(sql.StatisticsTable); ok {
cnt, ok, err := st.RowCount(ctx)
if ok && err == nil {
rows += cnt
}
}
if rows >= boostrapRowLimit {
return fmt.Errorf("stats bootstrap aborted because %s exceeds the default row limit; manually run \"ANALYZE <table>\" or \"call dolt_stats_restart()\" to collect statistics", db)
}

if err := p.RefreshTableStatsWithBranch(ctx, sqlTable, db, branch); err != nil {
return err
}
}
}
return nil
}

func (p *Provider) RefreshTableStatsWithBranch(ctx *sql.Context, table sql.Table, db string, branch string) error {
dSess := dsess.DSessFromSess(ctx.Session)

sqlDb, err := dSess.Provider().Database(ctx, p.branchQualifiedDatabase(db, branch))
if err != nil {
Expand Down Expand Up @@ -143,7 +193,16 @@ func (p *Provider) branchQualifiedDatabase(db, branch string) string {

// GetLatestTable will get the WORKING root table for the current database/branch
func GetLatestTable(ctx *sql.Context, tableName string, sqlDb sql.Database) (sql.Table, *doltdb.Table, error) {
sqlTable, ok, err := sqlDb.(sqle.Database).GetTableInsensitive(ctx, tableName)
var db sqle.Database
switch d := sqlDb.(type) {
case sqle.Database:
db = d
case sqle.ReadReplicaDatabase:
db = d.Database
default:
return nil, nil, fmt.Errorf("expected sqle.Database, found %T", sqlDb)
}
sqlTable, ok, err := db.GetTableInsensitive(ctx, tableName)
if err != nil {
return nil, nil, err
}
Expand Down
10 changes: 8 additions & 2 deletions go/libraries/doltcore/sqle/statspro/configure.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Co
branches := p.getStatsBranches(loadCtx)

var autoEnabled bool
var startupEnabled bool
var intervalSec time.Duration
var thresholdf64 float64
if _, enabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshEnabled); enabled == int8(1) {
Expand All @@ -55,6 +56,8 @@ func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Co

p.pro.InitDatabaseHooks = append(p.pro.InitDatabaseHooks, NewStatsInitDatabaseHook(p, ctxFactory, bThreads))
p.pro.DropDatabaseHooks = append(p.pro.DropDatabaseHooks, NewStatsDropDatabaseHook(p))
} else if _, startupStats, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsBootstrapEnabled); startupStats == int8(1) {
startupEnabled = true
}

eg, ctx := loadCtx.NewErrgroup()
Expand All @@ -69,7 +72,6 @@ func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Co
} else {
err = fmt.Errorf("%w: %v", ErrFailedToLoad, r)
}

return
}
}()
Expand All @@ -84,6 +86,10 @@ func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Co
}
if autoEnabled {
return p.InitAutoRefreshWithParams(ctxFactory, db.Name(), bThreads, intervalSec, thresholdf64, branches)
} else if startupEnabled {
if err := p.BootstrapDatabaseStats(loadCtx, db.Name()); err != nil {
return err
}
}
return nil
})
Expand All @@ -109,7 +115,7 @@ func (p *Provider) getStatsBranches(ctx *sql.Context) []string {
}

if branches == nil {
branches = []string{p.pro.DefaultBranch()}
branches = append(branches, p.pro.DefaultBranch())
}
return branches
}
Expand Down
7 changes: 7 additions & 0 deletions go/libraries/doltcore/sqle/system_variables.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,13 @@ func AddDoltSystemVariables() {
Type: types.NewSystemBoolType(dsess.DoltStatsAutoRefreshEnabled),
Default: int8(0),
},
&sql.MysqlSystemVariable{
Name: dsess.DoltStatsBootstrapEnabled,
Dynamic: true,
Scope: sql.GetMysqlScope(sql.SystemVariableScope_Global),
Type: types.NewSystemBoolType(dsess.DoltStatsBootstrapEnabled),
Default: int8(1),
},
&sql.MysqlSystemVariable{
Name: dsess.DoltStatsMemoryOnly,
Dynamic: true,
Expand Down
59 changes: 52 additions & 7 deletions integration-tests/bats/stats.bats
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ teardown() {
@test "stats: empty initial stats" {
cd repo2

# disable bootstrap, can only make stats with ANALYZE or background thread
dolt sql -q "set @@PERSIST.dolt_stats_bootstrap_enabled = 0;"

dolt sql -q "insert into xy values (0,0), (1,1)"

start_sql_server
Expand Down Expand Up @@ -88,6 +91,16 @@ teardown() {
[ "${lines[1]}" = "8" ]
}

@test "stats: bootrap on engine startup" {
cd repo2

dolt sql -q "set @@PERSIST.dolt_stats_bootstrap_enabled = 1;"
dolt sql -q "insert into xy values (0,0), (1,1)"
run dolt sql -r csv -q "select count(*) from dolt_statistics"
[ "$status" -eq 0 ]
[ "${lines[1]}" = "2" ]
}

@test "stats: deletes refresh" {
cd repo2

Expand Down Expand Up @@ -219,20 +232,15 @@ teardown() {

@test "stats: multi db" {
cd repo1

dolt sql -q "insert into ab values (0,0), (1,1)"

cd ../repo2

dolt sql -q "insert into ab values (0,0), (1,1)"
dolt sql -q "insert into xy values (0,0), (1,1)"

cd ..
start_sql_server
sleep 1
stop_sql_server

run dolt sql -r csv -q "select count(*) from dolt_statistics"
[ "$status" -eq 0 ]
[ "${lines[1]}" = "0" ]

dolt sql -q "SET @@persist.dolt_stats_auto_refresh_enabled = 1;"
dolt sql -q "SET @@persist.dolt_stats_auto_refresh_threshold = 0.5"
Expand Down Expand Up @@ -327,3 +335,40 @@ SQL
[ "${lines[2]}" = "2" ]
}

@test "stats: boostrap abort over 1mm rows" {
cat <<EOF > data.py
import random
import os

rows = 2*1000*1000+1

def main():
f = open("data.csv","w+")
f.write("id,hostname\n")

for i in range(rows):
hostname = random.getrandbits(100)
f.write(f"{i},{hostname}\n")
if i % (500*1000) == 0:
print("row :", i)
f.flush()

f.close()

if __name__ == "__main__":
main()
EOF

mkdir repo3
cd repo3
python3 ../data.py

dolt init
dolt sql -q "create table f (id int primary key, hostname int)"
dolt table import -u --continue f data.csv

run dolt sql -r csv -q "select count(*) from dolt_statistics"
[ "$status" -eq 0 ]
[[ "${lines[0]}" =~ "stats bootstrap aborted" ]] || false
[ "${lines[2]}" = "0" ]
}
Loading