diff --git a/go/cmd/dolt/commands/engine/sqlengine.go b/go/cmd/dolt/commands/engine/sqlengine.go
index 8f702fd98a0..2260cbfeedf 100644
--- a/go/cmd/dolt/commands/engine/sqlengine.go
+++ b/go/cmd/dolt/commands/engine/sqlengine.go
@@ -16,11 +16,6 @@ package engine
 import (
 	"context"
-	"fmt"
-	"os"
-	"strconv"
-	"strings"
-
 	gms "github.com/dolthub/go-mysql-server"
 	"github.com/dolthub/go-mysql-server/eventscheduler"
 	"github.com/dolthub/go-mysql-server/sql"
@@ -31,6 +26,9 @@ import (
 	_ "github.com/dolthub/go-mysql-server/sql/variables"
 	"github.com/dolthub/vitess/go/vt/sqlparser"
 	"github.com/sirupsen/logrus"
+	"os"
+	"strconv"
+	"strings"
 
 	"github.com/dolthub/dolt/go/cmd/dolt/cli"
 	"github.com/dolthub/dolt/go/libraries/doltcore/branch_control"
@@ -43,7 +41,6 @@ import (
 	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
 	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/kvexec"
 	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/mysql_file_handler"
-	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/statsnoms"
 	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro"
 	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/writer"
 	"github.com/dolthub/dolt/go/libraries/utils/config"
@@ -189,7 +186,13 @@ func NewSqlEngine(
 		"authentication_dolt_jwt": NewAuthenticateDoltJWTPlugin(config.JwksConfig),
 	})
 
-	statsPro := statspro.NewProvider(pro, statsnoms.NewNomsStatsFactory(mrEnv.RemoteDialProvider()))
+	var statsPro sql.StatsProvider
+	_, enabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsEnabled)
+	if enabled.(int8) == 1 {
+		statsPro = statspro.NewStatsCoord(pro, sqlEngine.NewDefaultContext, logrus.StandardLogger(), bThreads, mrEnv.GetEnv(mrEnv.GetFirstDatabase()))
+	} else {
+		statsPro = statspro.StatsNoop{}
+	}
 	engine.Analyzer.Catalog.StatsProvider = statsPro
 
 	engine.Analyzer.ExecBuilder = rowexec.NewOverrideBuilder(kvexec.Builder{})
@@ -202,8 +205,11 @@
 
 	// configuring stats depends on sessionBuilder
 	// sessionBuilder needs ref to statsProv
-	if err = statsPro.Configure(ctx, sqlEngine.NewDefaultContext, bThreads, dbs); err != nil {
-		fmt.Fprintln(cli.CliErr, err)
+	if sc, ok := statsPro.(*statspro.StatsCoord); ok {
+		// as before, a stats init failure should not block engine construction
+		if err = sc.Init(ctx, dbs); err != nil {
+			logrus.Warnf("failed to initialize table statistics: %s", err)
+		}
 	}
 
 	// Load MySQL Db information
diff --git a/go/cmd/dolt/commands/sqlserver/server.go b/go/cmd/dolt/commands/sqlserver/server.go
index 33d253a377a..5844926d0f8 100644
--- a/go/cmd/dolt/commands/sqlserver/server.go
+++ b/go/cmd/dolt/commands/sqlserver/server.go
@@ -19,6 +19,7 @@ import (
 	"crypto/tls"
 	"errors"
 	"fmt"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro"
 	"net"
 	"net/http"
 	"os"
@@ -260,23 +261,26 @@ func ConfigureServices(
 	var sqlEngine *engine.SqlEngine
 	InitSqlEngine := &svcs.AnonService{
 		InitF: func(ctx context.Context) (err error) {
-			if statsOn, err := mrEnv.Config().GetString(env.SqlServerGlobalsPrefix + "." + dsess.DoltStatsAutoRefreshEnabled); err != nil {
-				// Auto-stats is off by default for every command except
-				// sql-server. Unless the config specifies a specific
-				// behavior, enable server stats collection.
-				sql.SystemVariables.SetGlobal(dsess.DoltStatsAutoRefreshEnabled, 1)
-			} else if statsOn != "0" {
-				// do not bootstrap if auto-stats enabled
-			} else if _, err := mrEnv.Config().GetString(env.SqlServerGlobalsPrefix + "." + dsess.DoltStatsBootstrapEnabled); err != nil {
-				// If we've disabled stats collection and config does not
-				// specify bootstrap behavior, enable bootstrapping.
-				sql.SystemVariables.SetGlobal(dsess.DoltStatsBootstrapEnabled, 1)
-			}
 			sqlEngine, err = engine.NewSqlEngine(
 				ctx,
 				mrEnv,
 				config,
 			)
+			if err != nil {
+				return err
+			}
+			if sc, ok := sqlEngine.GetUnderlyingEngine().Analyzer.Catalog.StatsProvider.(*statspro.StatsCoord); ok {
+				sqlCtx, err := sqlEngine.NewDefaultContext(ctx)
+				if err != nil {
+					return err
+				}
+				if sc == nil {
+					return fmt.Errorf("unexpected nil stats coord")
+				}
+				if err = sc.Restart(sqlCtx); err != nil {
+					return err
+				}
+			}
 			return err
 		},
 		StopF: func() error {
diff --git a/go/libraries/doltcore/schema/statistic.go b/go/libraries/doltcore/schema/statistic.go
index 1879951e10b..88215a7443a 100644
--- a/go/libraries/doltcore/schema/statistic.go
+++ b/go/libraries/doltcore/schema/statistic.go
@@ -24,12 +24,12 @@ import (
 const StatsVersion int64 = 1
 
 const (
-	StatsQualifierColName = "qualifier"
 	StatsDbColName = "database_name"
 	StatsTableColName = "table_name"
 	StatsIndexColName = "index_name"
-	StatsPositionColName = "position"
+	StatsBranchName = "branch"
 	StatsCommitHashColName = "commit_hash"
+	StatsPrefixLenName = "prefix_len"
 	StatsRowCountColName = "row_count"
 	StatsDistinctCountColName = "distinct_count"
 	StatsNullCountColName = "null_count"
@@ -42,7 +42,7 @@ const (
 	StatsMcv2ColName = "mcv2"
 	StatsMcv3ColName = "mcv3"
 	StatsMcv4ColName = "mcv4"
-	StatsMcvCountsColName = "mcvCounts"
+	StatsMcvCountsColName = "mcv_counts"
 	StatsVersionColName = "version"
 )
 
@@ -52,6 +52,7 @@ const (
 	StatsIndexTag
 	StatsPositionTag
 	StatsVersionTag
+	StatsPrefixLenTag
 	StatsCommitHashTag
 	StatsRowCountTag
 	StatsDistinctCountTag
@@ -71,9 +72,9 @@ const (
 func StatsTableSqlSchema(dbName string) sql.PrimaryKeySchema {
 	return sql.PrimaryKeySchema{
 		Schema: sql.Schema{
-			&sql.Column{Name: StatsDbColName, Type: types.Text, PrimaryKey: true, DatabaseSource: dbName},
-			&sql.Column{Name: StatsTableColName, Type: types.Text, PrimaryKey: true, DatabaseSource: dbName},
-			&sql.Column{Name: StatsIndexColName, Type: types.Text, PrimaryKey: true, DatabaseSource: dbName},
+			&sql.Column{Name: StatsDbColName, Type: types.Text, DatabaseSource: dbName},
+			&sql.Column{Name: StatsTableColName, Type: types.Text, DatabaseSource: dbName},
+			&sql.Column{Name: StatsIndexColName, Type: types.Text, DatabaseSource: dbName},
 			&sql.Column{Name: StatsRowCountColName, Type: types.Int64, DatabaseSource: dbName},
 			&sql.Column{Name: StatsDistinctCountColName, Type: types.Int64, DatabaseSource: dbName},
 			&sql.Column{Name: StatsNullCountColName, Type: types.Int64, DatabaseSource: dbName},
@@ -88,7 +89,6 @@ func StatsTableSqlSchema(dbName string) sql.PrimaryKeySchema {
 			&sql.Column{Name: StatsMcv4ColName, Type: types.Text, DatabaseSource: dbName},
 			&sql.Column{Name: StatsMcvCountsColName, Type: types.Text, DatabaseSource: dbName},
 		},
-		PkOrdinals: []int{0, 1},
 	}
 }
 
@@ -96,20 +96,14 @@ var StatsTableDoltSchema = StatsTableDoltSchemaGen()
 
 func StatsTableDoltSchemaGen() Schema {
 	colColl := NewColCollection(
-		NewColumn(StatsDbColName, StatsDbTag, stypes.StringKind, true, NotNullConstraint{}),
-		NewColumn(StatsTableColName, StatsTableTag, stypes.StringKind, true, NotNullConstraint{}),
-		NewColumn(StatsIndexColName, StatsIndexTag, stypes.StringKind, true, NotNullConstraint{}),
-		NewColumn(StatsPositionColName, StatsPositionTag, stypes.IntKind, true, NotNullConstraint{}),
+		NewColumn(StatsPrefixLenName, StatsPrefixLenTag, stypes.IntKind, true, NotNullConstraint{}),
+		NewColumn(StatsCommitHashColName, StatsCommitHashTag, stypes.StringKind, true, NotNullConstraint{}),
 		NewColumn(StatsVersionColName, StatsVersionTag, stypes.IntKind, false,
NotNullConstraint{}), - NewColumn(StatsCommitHashColName, StatsCommitHashTag, stypes.StringKind, false, NotNullConstraint{}), NewColumn(StatsRowCountColName, StatsRowCountTag, stypes.IntKind, false, NotNullConstraint{}), NewColumn(StatsDistinctCountColName, StatsDistinctCountTag, stypes.IntKind, false, NotNullConstraint{}), NewColumn(StatsNullCountColName, StatsNullCountTag, stypes.IntKind, false, NotNullConstraint{}), - NewColumn(StatsColumnsColName, StatsColumnsTag, stypes.StringKind, false, NotNullConstraint{}), - NewColumn(StatsTypesColName, StatsTypesTag, stypes.StringKind, false, NotNullConstraint{}), NewColumn(StatsUpperBoundColName, StatsUpperBoundTag, stypes.StringKind, false, NotNullConstraint{}), NewColumn(StatsUpperBoundCntColName, StatsUpperBoundCntTag, stypes.IntKind, false, NotNullConstraint{}), - NewColumn(StatsCreatedAtColName, StatsCreatedAtTag, stypes.TimestampKind, false, NotNullConstraint{}), NewColumn(StatsMcv1ColName, StatsMcv1Tag, stypes.StringKind, false), NewColumn(StatsMcv2ColName, StatsMcv2Tag, stypes.StringKind, false), NewColumn(StatsMcv3ColName, StatsMcv3Tag, stypes.StringKind, false), diff --git a/go/libraries/doltcore/sqle/clusterdb/database.go b/go/libraries/doltcore/sqle/clusterdb/database.go index dd741a9a205..4577d2f3c4d 100644 --- a/go/libraries/doltcore/sqle/clusterdb/database.go +++ b/go/libraries/doltcore/sqle/clusterdb/database.go @@ -162,6 +162,10 @@ func (db database) RequestedName() string { return db.Name() } +func (db database) AliasedName() string { + return db.Name() +} + type noopRepoStateWriter struct{} var _ env.RepoStateWriter = noopRepoStateWriter{} diff --git a/go/libraries/doltcore/sqle/database.go b/go/libraries/doltcore/sqle/database.go index f75e5f52997..10c5e154999 100644 --- a/go/libraries/doltcore/sqle/database.go +++ b/go/libraries/doltcore/sqle/database.go @@ -694,6 +694,9 @@ func (db Database) getTableInsensitive(ctx *sql.Context, head *doltdb.Commit, ds if err != nil { return nil, false, err } + if branch == "" { + branch = db.Revision() + } dt, found = dtables.NewStatisticsTable(ctx, db.Name(), db.schemaName, branch, tables), true case doltdb.ProceduresTableName: found = true diff --git a/go/libraries/doltcore/sqle/database_provider.go b/go/libraries/doltcore/sqle/database_provider.go index 293e9d7be00..bea3f7fa059 100644 --- a/go/libraries/doltcore/sqle/database_provider.go +++ b/go/libraries/doltcore/sqle/database_provider.go @@ -966,7 +966,7 @@ func (p *DoltDatabaseProvider) databaseForRevision(ctx *sql.Context, revisionQua } } - db, err := revisionDbForBranch(ctx, srcDb, resolvedRevSpec, requestedName) + db, err := RevisionDbForBranch(ctx, srcDb, resolvedRevSpec, requestedName) // preserve original user case in the case of not found if sql.ErrDatabaseNotFound.Is(err) { return nil, false, sql.ErrDatabaseNotFound.New(revisionQualifiedName) @@ -1507,8 +1507,8 @@ func isTag(ctx context.Context, db dsess.SqlDatabase, tagName string) (string, b return "", false, nil } -// revisionDbForBranch returns a new database that is tied to the branch named by revSpec -func revisionDbForBranch(ctx context.Context, srcDb dsess.SqlDatabase, revSpec string, requestedName string) (dsess.SqlDatabase, error) { +// RevisionDbForBranch returns a new database that is tied to the branch named by revSpec +func RevisionDbForBranch(ctx context.Context, srcDb dsess.SqlDatabase, revSpec string, requestedName string) (dsess.SqlDatabase, error) { static := staticRepoState{ branch: ref.NewBranchRef(revSpec), RepoStateWriter: srcDb.DbData().Rsw, diff 
--git a/go/libraries/doltcore/sqle/dprocedures/init.go b/go/libraries/doltcore/sqle/dprocedures/init.go index 499d4209886..7603093e3ba 100644 --- a/go/libraries/doltcore/sqle/dprocedures/init.go +++ b/go/libraries/doltcore/sqle/dprocedures/init.go @@ -47,12 +47,14 @@ var DoltProcedures = []sql.ExternalStoredProcedureDetails{ {Name: "dolt_tag", Schema: int64Schema("status"), Function: doltTag}, {Name: "dolt_verify_constraints", Schema: int64Schema("violations"), Function: doltVerifyConstraints}, - {Name: "dolt_stats_drop", Schema: statsFuncSchema, Function: statsFunc(statsDrop)}, {Name: "dolt_stats_restart", Schema: statsFuncSchema, Function: statsFunc(statsRestart)}, {Name: "dolt_stats_stop", Schema: statsFuncSchema, Function: statsFunc(statsStop)}, - {Name: "dolt_stats_status", Schema: statsFuncSchema, Function: statsFunc(statsStatus)}, - {Name: "dolt_stats_prune", Schema: statsFuncSchema, Function: statsFunc(statsPrune)}, + {Name: "dolt_stats_info", Schema: statsFuncSchema, Function: statsFunc(statsInfo)}, {Name: "dolt_stats_purge", Schema: statsFuncSchema, Function: statsFunc(statsPurge)}, + {Name: "dolt_stats_wait", Schema: statsFuncSchema, Function: statsFunc(statsWait)}, + {Name: "dolt_stats_gc", Schema: statsFuncSchema, Function: statsFunc(statsGc)}, + {Name: "dolt_stats_sync", Schema: statsFuncSchema, Function: statsFunc(statsBranchSync)}, + {Name: "dolt_stats_validate", Schema: statsFuncSchema, Function: statsFunc(statsValidate)}, } // stringSchema returns a non-nullable schema with all columns as LONGTEXT. diff --git a/go/libraries/doltcore/sqle/dprocedures/stats_funcs.go b/go/libraries/doltcore/sqle/dprocedures/stats_funcs.go index 139bec5e5d2..18ea0fe6cd1 100644 --- a/go/libraries/doltcore/sqle/dprocedures/stats_funcs.go +++ b/go/libraries/doltcore/sqle/dprocedures/stats_funcs.go @@ -15,13 +15,14 @@ package dprocedures import ( + "context" + "encoding/json" "fmt" "strings" "github.com/dolthub/go-mysql-server/sql" gmstypes "github.com/dolthub/go-mysql-server/sql/types" - "github.com/dolthub/dolt/go/libraries/doltcore/env" "github.com/dolthub/dolt/go/libraries/doltcore/ref" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" ) @@ -35,7 +36,12 @@ var statsFuncSchema = []*sql.Column{ } func statsFunc(fn func(ctx *sql.Context) (interface{}, error)) func(ctx *sql.Context, args ...string) (sql.RowIter, error) { - return func(ctx *sql.Context, args ...string) (sql.RowIter, error) { + return func(ctx *sql.Context, args ...string) (iter sql.RowIter, err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("stats function unexpectedly panicked: %s", r) + } + }() res, err := fn(ctx) if err != nil { return nil, err @@ -44,124 +50,181 @@ func statsFunc(fn func(ctx *sql.Context) (interface{}, error)) func(ctx *sql.Con } } -// AutoRefreshStatsProvider is a sql.StatsProvider that exposes hooks for +type StatsInfo struct { + DbCnt int `json:"dbCnt"` + ReadCnt int `json:"readCnt"` + Active bool `json:"active"` + DbSeedCnt int `json:"dbSeedCnt"` + EstBucketCnt int `json:"estBucketCnt"` + CachedBucketCnt int `json:"cachedBucketCnt"` + StatCnt int `json:"statCnt"` + GcCounter int `json:"gcCounter"` + BranchCounter int `json:"branchCounter"` +} + +func (si StatsInfo) ToJson() string { + jsonData, err := json.Marshal(si) + if err != nil { + return "" + } + return string(jsonData) +} + +// ToggableStats is a sql.StatsProvider that exposes hooks for // observing and manipulating background database auto refresh threads. 
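+// The concrete implementation is *statspro.StatsCoord, whose hooks act on a shared background job queue.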
-type AutoRefreshStatsProvider interface {
+type ToggableStats interface {
 	sql.StatsProvider
-	CancelRefreshThread(string)
-	StartRefreshThread(*sql.Context, dsess.DoltDatabaseProvider, string, *env.DoltEnv, dsess.SqlDatabase) error
-	ThreadStatus(string) string
-	Prune(ctx *sql.Context) error
+	FlushQueue(ctx context.Context) error
+	Restart(context.Context) error
+	Info() StatsInfo
 	Purge(ctx *sql.Context) error
+	WaitForDbSync(ctx *sql.Context) error
+	Gc(ctx *sql.Context) error
+	BranchSync(ctx *sql.Context) error
+	ValidateState(ctx context.Context) error
+	Init(context.Context, []dsess.SqlDatabase) error
 }
 
 type BranchStatsProvider interface {
 	DropBranchDbStats(ctx *sql.Context, branch, db string, flush bool) error
 }
 
-// statsRestart tries to stop and then start a refresh thread
+// statsRestart flushes the current job queue and re-initializes all
+// statistics databases.
 func statsRestart(ctx *sql.Context) (interface{}, error) {
 	dSess := dsess.DSessFromSess(ctx.Session)
 	statsPro := dSess.StatsProvider()
-	dbName := strings.ToLower(ctx.GetCurrentDatabase())
 
-	if afp, ok := statsPro.(AutoRefreshStatsProvider); ok {
-		pro := dSess.Provider()
-		newFs, err := pro.FileSystemForDatabase(dbName)
+	if afp, ok := statsPro.(ToggableStats); ok {
+		err := afp.FlushQueue(ctx)
 		if err != nil {
-			return nil, fmt.Errorf("failed to restart stats collection: %w", err)
+			return nil, fmt.Errorf("failed to restart collection: %w", err)
 		}
-		dEnv := env.Load(ctx, env.GetCurrentUserHomeDir, newFs, pro.DbFactoryUrl(), "TODO")
-
-		sqlDb, ok := pro.BaseDatabase(ctx, dbName)
-		if !ok {
-			return nil, fmt.Errorf("failed to restart stats collection: database not found: %s", dbName)
+		dbs := dSess.Provider().AllDatabases(ctx)
+		var sqlDbs []dsess.SqlDatabase
+		for _, db := range dbs {
+			sqlDb, ok := db.(dsess.SqlDatabase)
+			if ok {
+				sqlDbs = append(sqlDbs, sqlDb)
+			}
 		}
-
-		afp.CancelRefreshThread(dbName)
-
-		err = afp.StartRefreshThread(ctx, pro, dbName, dEnv, sqlDb)
-		if err != nil {
-			return nil, fmt.Errorf("failed to restart collection: %w", err)
+		if err := afp.Init(ctx, sqlDbs); err != nil {
+			return nil, err
 		}
+		if err := afp.Restart(ctx); err != nil {
+			return nil, err
+		}
+		return fmt.Sprintf("restarted stats collection: %s", ref.StatsRef{}.String()), nil
 	}
-	return nil, fmt.Errorf("provider does not implement AutoRefreshStatsProvider")
+	return nil, fmt.Errorf("provider does not implement ToggableStats")
 }
 
-// statsStatus returns the last update for a stats thread
-func statsStatus(ctx *sql.Context) (interface{}, error) {
+// statsInfo returns a JSON summary of the stats provider's current state
+func statsInfo(ctx *sql.Context) (interface{}, error) {
 	dSess := dsess.DSessFromSess(ctx.Session)
-	dbName := strings.ToLower(ctx.GetCurrentDatabase())
 	pro := dSess.StatsProvider()
-	if afp, ok := pro.(AutoRefreshStatsProvider); ok {
-		return afp.ThreadStatus(dbName), nil
+	if afp, ok := pro.(ToggableStats); ok {
+		info := afp.Info()
+		return info.ToJson(), nil
 	}
-	return nil, fmt.Errorf("provider does not implement AutoRefreshStatsProvider")
+	return nil, fmt.Errorf("provider does not implement ToggableStats")
 }
 
-// statsStop cancels a refresh thread
-func statsStop(ctx *sql.Context) (interface{}, error) {
+// statsWait blocks until the job queue executes two full loops
+// of instructions, which will (1) pick up and (2) commit new
+// sets of index-bucket dependencies.
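+// Callers can use this after writes so that later reads of dolt_statistics see the refreshed buckets.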
+func statsWait(ctx *sql.Context) (interface{}, error) {
 	dSess := dsess.DSessFromSess(ctx.Session)
-	statsPro := dSess.StatsProvider()
-	dbName := strings.ToLower(ctx.GetCurrentDatabase())
-
-	if afp, ok := statsPro.(AutoRefreshStatsProvider); ok {
-		afp.CancelRefreshThread(dbName)
-		return fmt.Sprintf("stopped thread: %s", dbName), nil
+	pro := dSess.StatsProvider()
+	if afp, ok := pro.(ToggableStats); ok {
+		err := afp.WaitForDbSync(ctx)
+		return nil, err
 	}
-	return nil, fmt.Errorf("provider does not implement AutoRefreshStatsProvider")
+	return nil, fmt.Errorf("provider does not implement ToggableStats")
 }
 
-// statsDrop deletes the stats ref
-func statsDrop(ctx *sql.Context) (interface{}, error) {
+// statsGc rewrites the cache to only include objects reachable
+// by the current root value.
+func statsGc(ctx *sql.Context) (interface{}, error) {
 	dSess := dsess.DSessFromSess(ctx.Session)
 	pro := dSess.StatsProvider()
-	dbName := strings.ToLower(ctx.GetCurrentDatabase())
-
-	branch, err := dSess.GetBranch()
-	if err != nil {
-		return nil, fmt.Errorf("failed to drop stats: %w", err)
+	if afp, ok := pro.(ToggableStats); ok {
+		return nil, afp.Gc(ctx)
 	}
+	return nil, fmt.Errorf("provider does not implement ToggableStats")
+}
 
-	if afp, ok := pro.(AutoRefreshStatsProvider); ok {
-		// currently unsafe to drop stats while running refresh
-		afp.CancelRefreshThread(dbName)
-	}
-	if bsp, ok := pro.(BranchStatsProvider); ok {
-		err := bsp.DropBranchDbStats(ctx, branch, dbName, true)
-		if err != nil {
-			return nil, fmt.Errorf("failed to drop stats: %w", err)
-		}
+// statsBranchSync updates database branch tracking based on the
+// most recent session.
+func statsBranchSync(ctx *sql.Context) (interface{}, error) {
+	dSess := dsess.DSessFromSess(ctx.Session)
+	pro := dSess.StatsProvider()
+	if afp, ok := pro.(ToggableStats); ok {
+		return nil, afp.BranchSync(ctx)
 	}
-
-	return fmt.Sprintf("deleted stats ref for %s", dbName), nil
+	return nil, fmt.Errorf("provider does not implement ToggableStats")
 }
 
-// statsPrune replaces the current disk contents with only the currently
-// tracked in memory statistics.
-func statsPrune(ctx *sql.Context) (interface{}, error) {
+// statsValidate returns inconsistencies if the kv cache is out of date
+func statsValidate(ctx *sql.Context) (interface{}, error) {
 	dSess := dsess.DSessFromSess(ctx.Session)
-	pro, ok := dSess.StatsProvider().(AutoRefreshStatsProvider)
-	if !ok {
-		return nil, fmt.Errorf("stats not persisted, cannot purge")
+	pro := dSess.StatsProvider()
+	if afp, ok := pro.(ToggableStats); ok {
+		return fmt.Sprintf("%v", afp.ValidateState(ctx)), nil // a nil (valid) result prints as "<nil>" rather than panicking via Error()
 	}
-	if err := pro.Prune(ctx); err != nil {
-		return "failed to prune stats databases", err
+	return nil, fmt.Errorf("provider does not implement ToggableStats")
+}
+
+// statsStop flushes the job queue and leaves the stats provider
+// in a paused state.
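+// Collection can be resumed with dolt_stats_restart().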
+func statsStop(ctx *sql.Context) (interface{}, error) { + dSess := dsess.DSessFromSess(ctx.Session) + statsPro := dSess.StatsProvider() + dbName := strings.ToLower(ctx.GetCurrentDatabase()) + + if afp, ok := statsPro.(ToggableStats); ok { + if err := afp.FlushQueue(ctx); err != nil { + return nil, err + } + return fmt.Sprintf("stopped thread: %s", dbName), nil } - return "pruned all stats databases", nil + return nil, fmt.Errorf("provider does not implement ToggableStats") } -// statsPurge removes the stats database from disk +// statsPurge flushes the job queue, deletes the current caches +// and storage targets, re-initializes the tracked database +// states, and returns with stats collection paused. func statsPurge(ctx *sql.Context) (interface{}, error) { dSess := dsess.DSessFromSess(ctx.Session) - pro, ok := dSess.StatsProvider().(AutoRefreshStatsProvider) + pro, ok := dSess.StatsProvider().(ToggableStats) if !ok { return nil, fmt.Errorf("stats not persisted, cannot purge") } + + err := pro.FlushQueue(ctx) + if err != nil { + return nil, fmt.Errorf("failed to flush queue: %w", err) + } + if err := pro.Purge(ctx); err != nil { - return "failed to purged databases", err + return "failed to purge stats", err + } + + dbs := dSess.Provider().AllDatabases(ctx) + var sqlDbs []dsess.SqlDatabase + for _, db := range dbs { + sqlDb, ok := db.(dsess.SqlDatabase) + if ok { + sqlDbs = append(sqlDbs, sqlDb) + } } + + // init is currently the safest way to reset state + if err := pro.Init(ctx, sqlDbs); err != nil { + return "failed to purge stats", err + } + return "purged all database stats", nil } diff --git a/go/libraries/doltcore/sqle/dsess/session_db_provider.go b/go/libraries/doltcore/sqle/dsess/session_db_provider.go index 3d4969bb114..05e72971747 100644 --- a/go/libraries/doltcore/sqle/dsess/session_db_provider.go +++ b/go/libraries/doltcore/sqle/dsess/session_db_provider.go @@ -122,6 +122,7 @@ type SqlDatabase interface { sql.Database sql.SchemaDatabase sql.DatabaseSchema + sql.AliasedDatabase SessionDatabase RevisionDatabase diff --git a/go/libraries/doltcore/sqle/dsess/variables.go b/go/libraries/doltcore/sqle/dsess/variables.go index 848ed2218ec..0d8e0fd4edb 100644 --- a/go/libraries/doltcore/sqle/dsess/variables.go +++ b/go/libraries/doltcore/sqle/dsess/variables.go @@ -59,12 +59,12 @@ const ( DoltClusterRoleEpochVariable = "dolt_cluster_role_epoch" DoltClusterAckWritesTimeoutSecs = "dolt_cluster_ack_writes_timeout_secs" - DoltStatsAutoRefreshEnabled = "dolt_stats_auto_refresh_enabled" - DoltStatsBootstrapEnabled = "dolt_stats_bootstrap_enabled" - DoltStatsAutoRefreshThreshold = "dolt_stats_auto_refresh_threshold" - DoltStatsAutoRefreshInterval = "dolt_stats_auto_refresh_interval" - DoltStatsMemoryOnly = "dolt_stats_memory_only" - DoltStatsBranches = "dolt_stats_branches" + DoltStatsEnabled = "dolt_stats_enabled" + DoltStatsMemoryOnly = "dolt_stats_memory_only" + DoltStatsBranches = "dolt_stats_branches" + DoltStatsJobInterval = "dolt_stats_job_interval" + DoltStatsBranchInterval = "dolt_stats_branch_interval" + DoltStatsGCInterval = "dolt_stats_gc_interval" ) const URLTemplateDatabasePlaceholder = "{database}" diff --git a/go/libraries/doltcore/sqle/dtables/statistics_table.go b/go/libraries/doltcore/sqle/dtables/statistics_table.go index fda463e7e49..f73cfaf192b 100644 --- a/go/libraries/doltcore/sqle/dtables/statistics_table.go +++ b/go/libraries/doltcore/sqle/dtables/statistics_table.go @@ -68,7 +68,7 @@ func (st *StatisticsTable) DataLength(ctx *sql.Context) (uint64, error) { } type 
BranchStatsProvider interface { - GetTableDoltStats(ctx *sql.Context, branch, db, schema, table string) ([]sql.Statistic, error) + GetTableDoltStats(ctx *sql.Context, branch, db, schema, table string) ([]*stats.Statistic, error) } // RowCount implements sql.StatisticsTable @@ -119,14 +119,19 @@ func (st *StatisticsTable) Partitions(*sql.Context) (sql.PartitionIter, error) { // PartitionRows is a sql.Table interface function that gets a row iterator for a partition func (st *StatisticsTable) PartitionRows(ctx *sql.Context, _ sql.Partition) (sql.RowIter, error) { dSess := dsess.DSessFromSess(ctx.Session) - statsPro := dSess.StatsProvider().(BranchStatsProvider) + statsPro, ok := dSess.StatsProvider().(BranchStatsProvider) + if !ok { + return sql.RowsToRowIter(), nil + } var dStats []sql.Statistic for _, table := range st.tableNames { dbStats, err := statsPro.GetTableDoltStats(ctx, st.branch, st.dbName, st.schemaName, table) if err != nil { return nil, err } - dStats = append(dStats, dbStats...) + for _, s := range dbStats { + dStats = append(dStats, s) + } } return stats.NewStatsIter(ctx, dStats...) } diff --git a/go/libraries/doltcore/sqle/enginetest/dolt_engine_test.go b/go/libraries/doltcore/sqle/enginetest/dolt_engine_test.go index ac958a8084e..d1591f58636 100644 --- a/go/libraries/doltcore/sqle/enginetest/dolt_engine_test.go +++ b/go/libraries/doltcore/sqle/enginetest/dolt_engine_test.go @@ -17,6 +17,7 @@ package enginetest import ( "context" "fmt" + "github.com/dolthub/dolt/go/libraries/doltcore/ref" "os" "runtime" "sync" @@ -1681,11 +1682,6 @@ func TestStatsStorage(t *testing.T) { RunStatsStorageTests(t, h) } -func TestStatsIOWithoutReload(t *testing.T) { - h := newDoltEnginetestHarness(t) - RunStatsIOTestsWithoutReload(t, h) -} - func TestJoinStats(t *testing.T) { h := newDoltEnginetestHarness(t) RunJoinStatsTests(t, h) @@ -1971,22 +1967,23 @@ func TestStatsAutoRefreshConcurrency(t *testing.T) { // Setting an interval of 0 and a threshold of 0 will result // in the stats being updated after every operation - intervalSec := time.Duration(0) - thresholdf64 := 0. - bThreads := sql.NewBackgroundThreads() - branches := []string{"main"} - statsProv := engine.EngineAnalyzer().Catalog.StatsProvider.(*statspro.Provider) + //intervalSec := time.Duration(0) + //thresholdf64 := 0. 
+ //bThreads := sql.NewBackgroundThreads() + //branches := []string{"main"} + statsProv := engine.EngineAnalyzer().Catalog.StatsProvider.(*statspro.StatsCoord) // it is important to use new sessions for this test, to avoid working root conflicts readCtx := enginetest.NewSession(harness) writeCtx := enginetest.NewSession(harness) refreshCtx := enginetest.NewSession(harness) - newCtx := func(context.Context) (*sql.Context, error) { - return refreshCtx, nil - } - err := statsProv.InitAutoRefreshWithParams(newCtx, sqlDb.Name(), bThreads, intervalSec, thresholdf64, branches) + fs, err := engine.EngineAnalyzer().Catalog.DbProvider.(*sqle.DoltDatabaseProvider).FileSystemForDatabase(sqlDb.AliasedName()) + require.NoError(t, err) + + done, err := statsProv.Add(refreshCtx, sqlDb, ref.NewBranchRef("main"), fs) require.NoError(t, err) + <-done execQ := func(ctx *sql.Context, q string, id int, tag string) { _, iter, _, err := engine.Query(ctx, q) diff --git a/go/libraries/doltcore/sqle/enginetest/dolt_engine_tests.go b/go/libraries/doltcore/sqle/enginetest/dolt_engine_tests.go index efd221635f4..d53dc74921a 100755 --- a/go/libraries/doltcore/sqle/enginetest/dolt_engine_tests.go +++ b/go/libraries/doltcore/sqle/enginetest/dolt_engine_tests.go @@ -268,7 +268,6 @@ func RunQueryTestPlans(t *testing.T, harness DoltEnginetestHarness) { } defer harness.Close() - sql.SystemVariables.SetGlobal(dsess.DoltStatsBootstrapEnabled, 0) enginetest.TestQueryPlans(t, harness, queries.PlanTests) } @@ -1562,27 +1561,12 @@ func RunStatsStorageTests(t *testing.T, h DoltEnginetestHarness) { for _, script := range append(DoltStatsStorageTests, DoltHistogramTests...) { func() { h = h.NewHarness(t).WithConfigureStats(true) - defer h.Close() e := mustNewEngine(t, h) if enginetest.IsServerEngine(e) { return } defer e.Close() - TestProviderReloadScriptWithEngine(t, e, h, script) - }() - } -} - -func RunStatsIOTestsWithoutReload(t *testing.T, h DoltEnginetestHarness) { - for _, script := range append(DoltStatsStorageTests, DoltHistogramTests...) 
{ - func() { - h = h.NewHarness(t).WithConfigureStats(true) defer h.Close() - e := mustNewEngine(t, h) - if enginetest.IsServerEngine(e) { - return - } - defer e.Close() enginetest.TestScriptWithEngine(t, e, h, script) }() } diff --git a/go/libraries/doltcore/sqle/enginetest/dolt_harness.go b/go/libraries/doltcore/sqle/enginetest/dolt_harness.go index c599c61da79..7a4f9cec641 100644 --- a/go/libraries/doltcore/sqle/enginetest/dolt_harness.go +++ b/go/libraries/doltcore/sqle/enginetest/dolt_harness.go @@ -17,10 +17,7 @@ package enginetest import ( "context" "fmt" - "runtime" - "strings" - "testing" - + "github.com/dolthub/dolt/go/libraries/doltcore/ref" gms "github.com/dolthub/go-mysql-server" "github.com/dolthub/go-mysql-server/enginetest" "github.com/dolthub/go-mysql-server/enginetest/scriptgen/setup" @@ -29,6 +26,9 @@ import ( "github.com/dolthub/go-mysql-server/sql/mysql_db" "github.com/dolthub/go-mysql-server/sql/rowexec" "github.com/stretchr/testify/require" + "runtime" + "strings" + "testing" "github.com/dolthub/dolt/go/libraries/doltcore/branch_control" "github.com/dolthub/dolt/go/libraries/doltcore/dtestutils" @@ -36,7 +36,6 @@ import ( "github.com/dolthub/dolt/go/libraries/doltcore/sqle" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/kvexec" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statsnoms" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/writer" "github.com/dolthub/dolt/go/libraries/utils/filesys" @@ -46,7 +45,7 @@ import ( type DoltHarness struct { t *testing.T provider dsess.DoltDatabaseProvider - statsPro sql.StatsProvider + statsPro *statspro.StatsCoord multiRepoEnv *env.MultiRepoEnv session *dsess.DoltSession branchControl *branch_control.Controller @@ -246,13 +245,23 @@ func (d *DoltHarness) NewEngine(t *testing.T) (enginetest.QueryEngine, error) { d.gcSafepointController = dsess.NewGCSafepointController() - statsProv := statspro.NewProvider(d.provider.(*sqle.DoltDatabaseProvider), statsnoms.NewNomsStatsFactory(d.multiRepoEnv.RemoteDialProvider())) - d.statsPro = statsProv - var err error d.session, err = dsess.NewDoltSession(enginetest.NewBaseSession(), d.provider, d.multiRepoEnv.Config(), d.branchControl, d.statsPro, writer.NewWriteSession, d.gcSafepointController) require.NoError(t, err) + sqlCtx := enginetest.NewContext(d) + bThreads := sql.NewBackgroundThreads() + + ctxGen := func(ctx context.Context) (*sql.Context, error) { + return d.NewContext(), nil + } + statsPro := statspro.NewStatsCoord(doltProvider, ctxGen, sqlCtx.Session.GetLogger().Logger, bThreads, d.multiRepoEnv.GetEnv(d.multiRepoEnv.GetFirstDatabase())) + err = statsPro.Restart(ctx) + if err != nil { + return nil, err + } + d.statsPro = statsPro + e, err := enginetest.NewEngine(t, d, d.provider, d.setupData, d.statsPro) if err != nil { return nil, err @@ -260,8 +269,8 @@ func (d *DoltHarness) NewEngine(t *testing.T) (enginetest.QueryEngine, error) { e.Analyzer.ExecBuilder = rowexec.NewOverrideBuilder(kvexec.Builder{}) d.engine = e - sqlCtx := enginetest.NewContext(d) databases := pro.AllDatabases(sqlCtx) + d.setupDbs = make(map[string]struct{}) var dbs []string for _, db := range databases { @@ -281,24 +290,23 @@ func (d *DoltHarness) NewEngine(t *testing.T) (enginetest.QueryEngine, error) { require.NoError(t, err) } - if d.configureStats { - bThreads := sql.NewBackgroundThreads() - e = e.WithBackgroundThreads(bThreads) + e = e.WithBackgroundThreads(bThreads) + if 
d.configureStats { dSess := dsess.DSessFromSess(sqlCtx.Session) dbCache := dSess.DatabaseCache(sqlCtx) - dsessDbs := make([]dsess.SqlDatabase, len(dbs)) for i, dbName := range dbs { dsessDbs[i], _ = dbCache.GetCachedRevisionDb(fmt.Sprintf("%s/main", dbName), dbName) - } - - ctxFact := func(context.Context) (*sql.Context, error) { - sess := d.newSessionWithClient(sql.Client{Address: "localhost", User: "root"}) - return sql.NewContext(context.Background(), sql.WithSession(sess)), nil - } - if err = statsProv.Configure(sqlCtx, ctxFact, bThreads, dsessDbs); err != nil { - return nil, err + fs, err := doltProvider.FileSystemForDatabase(dsessDbs[i].AliasedName()) + if err != nil { + return nil, err + } + done, err := statsPro.Add(sqlCtx, dsessDbs[i], ref.NewBranchRef("main"), fs) + if err != nil { + return nil, err + } + <-done } statsOnlyQueries := filterStatsOnlyQueries(d.setupData) @@ -309,13 +317,20 @@ func (d *DoltHarness) NewEngine(t *testing.T) (enginetest.QueryEngine, error) { } // Reset the mysql DB table to a clean state for this new engine + ctx := enginetest.NewContext(d) + d.engine.Analyzer.Catalog.MySQLDb = mysql_db.CreateEmptyMySQLDb() d.engine.Analyzer.Catalog.MySQLDb.AddRootAccount() - d.engine.Analyzer.Catalog.StatsProvider = statspro.NewProvider(d.provider.(*sqle.DoltDatabaseProvider), statsnoms.NewNomsStatsFactory(d.multiRepoEnv.RemoteDialProvider())) - var err error - sqlCtx := enginetest.NewContext(d) - e, err := enginetest.RunSetupScripts(sqlCtx, d.engine, d.resetScripts(), d.SupportsNativeIndexCreation()) + ctxGen := func(ctx context.Context) (*sql.Context, error) { + return d.NewContext(), nil + } + bThreads := sql.NewBackgroundThreads() + statsPro := statspro.NewStatsCoord(d.provider.(*sqle.DoltDatabaseProvider), ctxGen, ctx.Session.GetLogger().Logger, bThreads, d.multiRepoEnv.GetEnv(d.multiRepoEnv.GetFirstDatabase())) + require.NoError(t, statsPro.Restart(ctx)) + d.engine.Analyzer.Catalog.StatsProvider = statsPro + + e, err := enginetest.RunSetupScripts(ctx, d.engine, d.resetScripts(), d.SupportsNativeIndexCreation()) // Get a fresh session after running setup scripts, since some setup scripts can change the session state d.session, err = dsess.NewDoltSession(enginetest.NewBaseSession(), d.provider, d.multiRepoEnv.Config(), d.branchControl, d.statsPro, writer.NewWriteSession, nil) @@ -430,7 +445,6 @@ func (d *DoltHarness) NewDatabases(names ...string) []sql.Database { doltProvider, ok := pro.(*sqle.DoltDatabaseProvider) require.True(d.t, ok) d.provider = doltProvider - d.statsPro = statspro.NewProvider(doltProvider, statsnoms.NewNomsStatsFactory(d.multiRepoEnv.RemoteDialProvider())) var err error d.session, err = dsess.NewDoltSession(enginetest.NewBaseSession(), doltProvider, d.multiRepoEnv.Config(), d.branchControl, d.statsPro, writer.NewWriteSession, nil) @@ -502,7 +516,10 @@ func (d *DoltHarness) NewDatabaseProvider() sql.MutableDatabaseProvider { func (d *DoltHarness) Close() { d.closeProvider() - sql.SystemVariables.SetGlobal(dsess.DoltStatsAutoRefreshEnabled, int8(0)) + if d.statsPro != nil { + d.statsPro.Close() + } + sql.SystemVariables.SetGlobal(dsess.DoltStatsEnabled, int8(0)) } func (d *DoltHarness) closeProvider() { diff --git a/go/libraries/doltcore/sqle/enginetest/stats_queries.go b/go/libraries/doltcore/sqle/enginetest/stats_queries.go index fedb7297d5f..d3c737619cb 100644 --- a/go/libraries/doltcore/sqle/enginetest/stats_queries.go +++ b/go/libraries/doltcore/sqle/enginetest/stats_queries.go @@ -16,18 +16,11 @@ package enginetest import ( "fmt" - 
"strings" - "testing" - - gms "github.com/dolthub/go-mysql-server" - "github.com/dolthub/go-mysql-server/enginetest" + "github.com/dolthub/dolt/go/libraries/doltcore/schema" "github.com/dolthub/go-mysql-server/enginetest/queries" "github.com/dolthub/go-mysql-server/sql" "github.com/dolthub/go-mysql-server/sql/types" - "github.com/stretchr/testify/require" - - "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" + "strings" ) // fillerVarchar pushes the tree into level 3 @@ -510,8 +503,6 @@ var DoltStatsStorageTests = []queries.ScriptTest{ { Name: "incremental stats deletes auto", SetUpScript: []string{ - "set @@PERSIST.dolt_stats_auto_refresh_interval = 0;", - "set @@PERSIST.dolt_stats_auto_refresh_threshold = 0;", "CREATE table xy (x bigint primary key, y int, z varchar(500), key(y,z));", "insert into xy select x, 1, 1 from (with recursive inputs(x) as (select 4 union select x+1 from inputs where x < 1000) select * from inputs) dt;", "analyze table xy", @@ -525,10 +516,7 @@ var DoltStatsStorageTests = []queries.ScriptTest{ Query: "delete from xy where x > 500", }, { - Query: "call dolt_stats_restart()", - }, - { - Query: "select sleep(.1)", + Query: "analyze table xy", }, { Query: "select count(*) from dolt_statistics group by table_name, index_name", @@ -540,8 +528,6 @@ var DoltStatsStorageTests = []queries.ScriptTest{ // https://github.com/dolthub/dolt/issues/8504 Name: "alter index column type", SetUpScript: []string{ - "set @@PERSIST.dolt_stats_auto_refresh_interval = 0;", - "set @@PERSIST.dolt_stats_auto_refresh_threshold = 0;", "CREATE table xy (x bigint primary key, y varchar(16))", "insert into xy values (0,'0'), (1,'1'), (2,'2')", "analyze table xy", @@ -569,78 +555,9 @@ var DoltStatsStorageTests = []queries.ScriptTest{ }, }, }, - { - Name: "differentiate table cases", - SetUpScript: []string{ - "set @@PERSIST.dolt_stats_auto_refresh_interval = 0;", - "set @@PERSIST.dolt_stats_auto_refresh_threshold = 0;", - "set @@PERSIST.dolt_stats_branches ='main'", - "CREATE table XY (x bigint primary key, y varchar(16))", - "insert into XY values (0,'0'), (1,'1'), (2,'2')", - "analyze table XY", - }, - Assertions: []queries.ScriptTestAssertion{ - { - Query: "select table_name, upper_bound from dolt_statistics", - Expected: []sql.Row{{"xy", "2"}}, - }, - }, - }, - { - Name: "deleted table loads OK", - SetUpScript: []string{ - "set @@PERSIST.dolt_stats_auto_refresh_interval = 0;", - "set @@PERSIST.dolt_stats_auto_refresh_threshold = 0;", - "set @@PERSIST.dolt_stats_branches ='main'", - "CREATE table xy (x bigint primary key, y varchar(16))", - "insert into xy values (0,'0'), (1,'1'), (2,'2')", - "analyze table xy", - "CREATE table uv (u bigint primary key, v varchar(16))", - "insert into uv values (0,'0'), (1,'1'), (2,'2')", - "analyze table uv", - "drop table uv", - }, - Assertions: []queries.ScriptTestAssertion{ - { - Query: "select table_name, upper_bound from dolt_statistics", - Expected: []sql.Row{{"xy", "2"}}, - }, - }, - }, - { - Name: "differentiate branch names", - SetUpScript: []string{ - "set @@PERSIST.dolt_stats_auto_refresh_interval = 0;", - "set @@PERSIST.dolt_stats_auto_refresh_threshold = 0;", - "set @@PERSIST.dolt_stats_branches ='main,feat'", - "CREATE table xy (x bigint primary key, y varchar(16))", - "insert into xy values (0,'0'), (1,'1'), (2,'2')", - "analyze table xy", - "call dolt_checkout('-b', 'feat')", - "CREATE table xy (x varchar(16) primary key, y bigint, z bigint)", - "insert into xy values 
(3,'3',3)", - "analyze table xy", - "call dolt_checkout('main')", - }, - Assertions: []queries.ScriptTestAssertion{ - { - Query: "select table_name, upper_bound from dolt_statistics", - Expected: []sql.Row{{"xy", "2"}}, - }, - { - Query: "call dolt_checkout('feat')", - }, - { - Query: "select table_name, upper_bound from dolt_statistics", - Expected: []sql.Row{{"xy", "3"}}, - }, - }, - }, { Name: "drop primary key", SetUpScript: []string{ - "set @@PERSIST.dolt_stats_auto_refresh_interval = 0;", - "set @@PERSIST.dolt_stats_auto_refresh_threshold = 0;", "CREATE table xy (x bigint primary key, y varchar(16))", "insert into xy values (0,'0'), (1,'1'), (2,'2')", "analyze table xy", @@ -657,10 +574,7 @@ var DoltStatsStorageTests = []queries.ScriptTest{ Query: "insert into xy values ('3', '3')", }, { - Query: "call dolt_stats_restart()", - }, - { - Query: "select sleep(.2)", + Query: "analyze table xy", }, { Query: "select count(*) from dolt_statistics group by table_name, index_name", @@ -994,94 +908,6 @@ var StatProcTests = []queries.ScriptTest{ }, } -// TestProviderReloadScriptWithEngine runs the test script given with the engine provided. -func TestProviderReloadScriptWithEngine(t *testing.T, e enginetest.QueryEngine, harness enginetest.Harness, script queries.ScriptTest) { - ctx := enginetest.NewContext(harness) - err := enginetest.CreateNewConnectionForServerEngine(ctx, e) - require.NoError(t, err, nil) - - t.Run(script.Name, func(t *testing.T) { - for _, statement := range script.SetUpScript { - if sh, ok := harness.(enginetest.SkippingHarness); ok { - if sh.SkipQueryTest(statement) { - t.Skip() - } - } - ctx = ctx.WithQuery(statement) - enginetest.RunQueryWithContext(t, e, harness, ctx, statement) - } - - assertions := script.Assertions - if len(assertions) == 0 { - assertions = []queries.ScriptTestAssertion{ - { - Query: script.Query, - Expected: script.Expected, - ExpectedErr: script.ExpectedErr, - ExpectedIndexes: script.ExpectedIndexes, - }, - } - } - - { - // reload provider, get disk stats - eng, ok := e.(*gms.Engine) - if !ok { - t.Errorf("expected *gms.Engine but found: %T", e) - } - - branches := eng.Analyzer.Catalog.StatsProvider.(*statspro.Provider).TrackedBranches("mydb") - brCopy := make([]string, len(branches)) - copy(brCopy, branches) - err := eng.Analyzer.Catalog.StatsProvider.DropDbStats(ctx, "mydb", false) - require.NoError(t, err) - for _, branch := range brCopy { - err = eng.Analyzer.Catalog.StatsProvider.(*statspro.Provider).LoadStats(ctx, "mydb", branch) - require.NoError(t, err) - } - } - - for _, assertion := range assertions { - t.Run(assertion.Query, func(t *testing.T) { - if assertion.NewSession { - th, ok := harness.(enginetest.TransactionHarness) - require.True(t, ok, "ScriptTestAssertion requested a NewSession, "+ - "but harness doesn't implement TransactionHarness") - ctx = th.NewSession() - } - - if sh, ok := harness.(enginetest.SkippingHarness); ok && sh.SkipQueryTest(assertion.Query) { - t.Skip() - } - if assertion.Skip { - t.Skip() - } - - if assertion.ExpectedErr != nil { - enginetest.AssertErr(t, e, harness, assertion.Query, nil, assertion.ExpectedErr) - } else if assertion.ExpectedErrStr != "" { - enginetest.AssertErrWithCtx(t, e, harness, ctx, assertion.Query, nil, nil, assertion.ExpectedErrStr) - } else if assertion.ExpectedWarning != 0 { - enginetest.AssertWarningAndTestQuery(t, e, nil, harness, assertion.Query, - assertion.Expected, nil, assertion.ExpectedWarning, assertion.ExpectedWarningsCount, - assertion.ExpectedWarningMessageSubstring, 
assertion.SkipResultsCheck) - } else if assertion.SkipResultsCheck { - enginetest.RunQueryWithContext(t, e, harness, nil, assertion.Query) - } else if assertion.CheckIndexedAccess { - enginetest.TestQueryWithIndexCheck(t, ctx, e, harness, assertion.Query, assertion.Expected, assertion.ExpectedColumns, assertion.Bindings) - } else { - var expected = assertion.Expected - if enginetest.IsServerEngine(e) && assertion.SkipResultCheckOnServerEngine { - // TODO: remove this check in the future - expected = nil - } - enginetest.TestQueryWithContext(t, ctx, e, harness, assertion.Query, expected, assertion.ExpectedColumns, assertion.Bindings, nil) - } - }) - } - }) -} - func mustNewStatQual(s string) sql.StatQualifier { qual, _ := sql.NewQualifierFromString(s) return qual diff --git a/go/libraries/doltcore/sqle/sqlddl_test.go b/go/libraries/doltcore/sqle/sqlddl_test.go index 7e50899d881..5fee7a12c19 100644 --- a/go/libraries/doltcore/sqle/sqlddl_test.go +++ b/go/libraries/doltcore/sqle/sqlddl_test.go @@ -1128,6 +1128,7 @@ func newTestEngine(ctx context.Context, dEnv *env.DoltEnv) (*gms.Engine, *sql.Co IsServerLocked: false, }), sqlCtx } + func TestIndexOverwrite(t *testing.T) { ctx := context.Background() dEnv := dtestutils.CreateTestEnv() diff --git a/go/libraries/doltcore/sqle/statsnoms/database.go b/go/libraries/doltcore/sqle/statsnoms/database.go deleted file mode 100644 index 6a972a3b103..00000000000 --- a/go/libraries/doltcore/sqle/statsnoms/database.go +++ /dev/null @@ -1,488 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package statsnoms - -import ( - "context" - "errors" - "fmt" - "path" - "strings" - "sync" - - "github.com/dolthub/go-mysql-server/sql" - - "github.com/dolthub/dolt/go/libraries/doltcore/dbfactory" - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" - "github.com/dolthub/dolt/go/libraries/doltcore/env" - "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" - "github.com/dolthub/dolt/go/libraries/doltcore/table/editor" - "github.com/dolthub/dolt/go/libraries/utils/earl" - "github.com/dolthub/dolt/go/libraries/utils/filesys" - "github.com/dolthub/dolt/go/store/datas" - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/prolly" - "github.com/dolthub/dolt/go/store/types" -) - -func NewNomsStatsFactory(dialPro dbfactory.GRPCDialProvider) *NomsStatsFactory { - return &NomsStatsFactory{dialPro: dialPro} -} - -type NomsStatsFactory struct { - dialPro dbfactory.GRPCDialProvider -} - -var _ statspro.StatsFactory = NomsStatsFactory{} - -func (sf NomsStatsFactory) Init(ctx *sql.Context, sourceDb dsess.SqlDatabase, prov *sqle.DoltDatabaseProvider, fs filesys.Filesys, hdp env.HomeDirProvider) (statspro.Database, error) { - params := make(map[string]interface{}) - params[dbfactory.GRPCDialProviderParam] = sf.dialPro - - var urlPath string - u, err := earl.Parse(prov.DbFactoryUrl()) - if u.Scheme == dbfactory.MemScheme { - urlPath = path.Join(prov.DbFactoryUrl(), dbfactory.DoltDataDir) - } else if u.Scheme == dbfactory.FileScheme { - urlPath = doltdb.LocalDirDoltDB - } - - statsFs, err := fs.WithWorkingDir(dbfactory.DoltStatsDir) - if err != nil { - return nil, err - } - - var dEnv *env.DoltEnv - exists, isDir := statsFs.Exists("") - if !exists { - err := statsFs.MkDirs("") - if err != nil { - return nil, fmt.Errorf("unable to make directory '%s', cause: %s", dbfactory.DoltStatsDir, err.Error()) - } - - dEnv = env.Load(context.Background(), hdp, statsFs, urlPath, "test") - sess := dsess.DSessFromSess(ctx.Session) - err = dEnv.InitRepo(ctx, types.Format_Default, sess.Username(), sess.Email(), prov.DefaultBranch()) - if err != nil { - return nil, err - } - } else if !isDir { - return nil, fmt.Errorf("file exists where the dolt stats directory should be") - } else { - dEnv = env.LoadWithoutDB(ctx, hdp, statsFs, "", "") - } - - dEnv.LoadDoltDBWithParams(ctx, types.Format_Default, urlPath, statsFs, params) - - deaf := dEnv.DbEaFactory(ctx) - - tmpDir, err := dEnv.TempTableFilesDir() - if err != nil { - return nil, err - } - opts := editor.Options{ - Deaf: deaf, - Tempdir: tmpDir, - } - statsDb, err := sqle.NewDatabase(ctx, "stats", dEnv.DbData(ctx), opts) - if err != nil { - return nil, err - } - return NewNomsStats(sourceDb, statsDb), nil -} - -func NewNomsStats(sourceDb, statsDb dsess.SqlDatabase) *NomsStatsDatabase { - return &NomsStatsDatabase{mu: &sync.Mutex{}, destDb: statsDb, sourceDb: sourceDb} -} - -type dbStats map[sql.StatQualifier]*statspro.DoltStats - -type NomsStatsDatabase struct { - mu *sync.Mutex - destDb dsess.SqlDatabase - sourceDb dsess.SqlDatabase - stats []dbStats - branches []string - tableHashes []map[string]hash.Hash - schemaHashes []map[string]hash.Hash - dirty []*prolly.MutableMap -} - -var _ statspro.Database = (*NomsStatsDatabase)(nil) - -func (n *NomsStatsDatabase) Close() error { - return n.destDb.DbData().Ddb.Close() -} - -func (n *NomsStatsDatabase) Branches() 
[]string { - return n.branches -} - -func (n *NomsStatsDatabase) LoadBranchStats(ctx *sql.Context, branch string) error { - branchQDbName := statspro.BranchQualifiedDatabase(n.sourceDb.Name(), branch) - - dSess := dsess.DSessFromSess(ctx.Session) - sqlDb, err := dSess.Provider().Database(ctx, branchQDbName) - if err != nil { - ctx.GetLogger().Debugf("statistics load: branch not found: %s; `call dolt_stats_prune()` to delete stale statistics", branch) - return nil - } - branchQDb, ok := sqlDb.(dsess.SqlDatabase) - if !ok { - return fmt.Errorf("branch/database not found: %s", branchQDbName) - } - - if ok, err := n.SchemaChange(ctx, branch, branchQDb); err != nil { - return err - } else if ok { - ctx.GetLogger().Debugf("statistics load: detected schema change incompatility, purging %s/%s", branch, n.sourceDb.Name()) - if err := n.DeleteBranchStats(ctx, branch, true); err != nil { - return err - } - } - - statsMap, err := n.destDb.DbData().Ddb.GetStatistics(ctx, branch) - if errors.Is(err, doltdb.ErrNoStatistics) { - return n.trackBranch(ctx, branch) - } else if errors.Is(err, datas.ErrNoBranchStats) { - return n.trackBranch(ctx, branch) - } else if err != nil { - return err - } - if cnt, err := statsMap.Count(); err != nil { - return err - } else if cnt == 0 { - return n.trackBranch(ctx, branch) - } - - doltStats, err := loadStats(ctx, branchQDb, statsMap) - if err != nil { - return err - } - n.branches = append(n.branches, branch) - n.stats = append(n.stats, doltStats) - n.dirty = append(n.dirty, nil) - n.tableHashes = append(n.tableHashes, make(map[string]hash.Hash)) - n.schemaHashes = append(n.schemaHashes, make(map[string]hash.Hash)) - return nil -} - -func (n *NomsStatsDatabase) SchemaChange(ctx *sql.Context, branch string, branchQDb dsess.SqlDatabase) (bool, error) { - root, err := branchQDb.GetRoot(ctx) - if err != nil { - return false, err - } - tables, err := branchQDb.GetTableNames(ctx) - if err != nil { - return false, err - } - - var keys []string - var schHashes []hash.Hash - for _, tableName := range tables { - table, ok, err := root.GetTable(ctx, doltdb.TableName{Name: tableName}) - if err != nil { - return false, err - } - if !ok { - return false, nil - } - curHash, err := table.GetSchemaHash(ctx) - if err != nil { - return false, err - } - - keys = append(keys, n.schemaTupleKey(branch, tableName)) - schHashes = append(schHashes, curHash) - } - - ddb := n.destDb.DbData().Ddb - var schemaChange bool - for i, key := range keys { - curHash := schHashes[i] - if val, ok, err := ddb.GetTuple(ctx, key); err != nil { - return false, err - } else if ok { - oldHash := hash.Parse(string(val)) - if !ok || !oldHash.Equal(curHash) { - schemaChange = true - break - } - } - } - if schemaChange { - for _, key := range keys { - ddb.DeleteTuple(ctx, key) - } - return true, nil - } - return false, nil -} - -func (n *NomsStatsDatabase) getBranchStats(branch string) dbStats { - for i, b := range n.branches { - if strings.EqualFold(b, branch) { - return n.stats[i] - } - } - return nil -} - -func (n *NomsStatsDatabase) GetStat(branch string, qual sql.StatQualifier) (*statspro.DoltStats, bool) { - n.mu.Lock() - defer n.mu.Unlock() - stats := n.getBranchStats(branch) - ret, ok := stats[qual] - return ret, ok -} - -func (n *NomsStatsDatabase) ListStatQuals(branch string) []sql.StatQualifier { - n.mu.Lock() - defer n.mu.Unlock() - stats := n.getBranchStats(branch) - var ret []sql.StatQualifier - for qual, _ := range stats { - ret = append(ret, qual) - } - return ret -} - -func (n *NomsStatsDatabase) 
setStat(ctx context.Context, branch string, qual sql.StatQualifier, stats *statspro.DoltStats) error { - var statsMap *prolly.MutableMap - for i, b := range n.branches { - if strings.EqualFold(branch, b) { - n.stats[i][qual] = stats - if n.dirty[i] == nil { - if err := n.initMutable(ctx, i); err != nil { - return err - } - } - statsMap = n.dirty[i] - } - } - if statsMap == nil { - if err := n.trackBranch(ctx, branch); err != nil { - return err - } - statsMap = n.dirty[len(n.branches)-1] - n.stats[len(n.branches)-1][qual] = stats - } - - return n.replaceStats(ctx, statsMap, stats) -} -func (n *NomsStatsDatabase) SetStat(ctx context.Context, branch string, qual sql.StatQualifier, stats *statspro.DoltStats) error { - n.mu.Lock() - defer n.mu.Unlock() - - return n.setStat(ctx, branch, qual, stats) -} - -func (n *NomsStatsDatabase) trackBranch(ctx context.Context, branch string) error { - n.branches = append(n.branches, branch) - n.stats = append(n.stats, make(dbStats)) - n.tableHashes = append(n.tableHashes, make(map[string]hash.Hash)) - n.schemaHashes = append(n.schemaHashes, make(map[string]hash.Hash)) - - kd, vd := schema.StatsTableDoltSchema.GetMapDescriptors() - newMap, err := prolly.NewMapFromTuples(ctx, n.destDb.DbData().Ddb.NodeStore(), kd, vd) - if err != nil { - return err - } - n.dirty = append(n.dirty, newMap.Mutate()) - return n.destDb.DbData().Ddb.SetStatisics(ctx, branch, newMap.HashOf()) -} - -func (n *NomsStatsDatabase) initMutable(ctx context.Context, i int) error { - statsMap, err := n.destDb.DbData().Ddb.GetStatistics(ctx, n.branches[i]) - if err != nil { - return err - } - n.dirty[i] = statsMap.Mutate() - return nil -} - -func (n *NomsStatsDatabase) DeleteStats(ctx *sql.Context, branch string, quals ...sql.StatQualifier) { - n.mu.Lock() - defer n.mu.Unlock() - - for i, b := range n.branches { - if strings.EqualFold(b, branch) { - for _, qual := range quals { - ctx.GetLogger().Debugf("statistics refresh: deleting index statistics: %s/%s", branch, qual) - delete(n.stats[i], qual) - } - } - } -} - -func (n *NomsStatsDatabase) DeleteBranchStats(ctx *sql.Context, branch string, flush bool) error { - n.mu.Lock() - defer n.mu.Unlock() - - ctx.GetLogger().Debugf("statistics refresh: deleting branch statistics: %s", branch) - - for i, b := range n.branches { - if strings.EqualFold(b, branch) { - n.branches = append(n.branches[:i], n.branches[i+1:]...) - n.dirty = append(n.dirty[:i], n.dirty[i+1:]...) - n.stats = append(n.stats[:i], n.stats[i+1:]...) - n.tableHashes = append(n.tableHashes[:i], n.tableHashes[i+1:]...) - n.schemaHashes = append(n.schemaHashes[:i], n.schemaHashes[i+1:]...) 
- } - } - if flush { - return n.destDb.DbData().Ddb.DropStatisics(ctx, branch) - } - return nil -} - -func (n *NomsStatsDatabase) ReplaceChunks(ctx context.Context, branch string, qual sql.StatQualifier, targetHashes []hash.Hash, dropChunks, newChunks []sql.HistogramBucket) error { - n.mu.Lock() - defer n.mu.Unlock() - - var dbStat dbStats - for i, b := range n.branches { - if strings.EqualFold(b, branch) { - // naive merge the new with old - dbStat = n.stats[i] - } - } - - if dbStat == nil { - if err := n.trackBranch(ctx, branch); err != nil { - return err - } - dbStat = n.stats[len(n.branches)-1] - } - - if _, ok := dbStat[qual]; ok { - oldChunks := dbStat[qual].Hist - targetBuckets, err := statspro.MergeNewChunks(targetHashes, oldChunks, newChunks) - if err != nil { - return err - } - newStat, err := dbStat[qual].WithHistogram(targetBuckets) - if err != nil { - return err - } - dbStat[qual] = newStat.(*statspro.DoltStats) - } else { - dbStat[qual] = statspro.NewDoltStats() - } - dbStat[qual].Chunks = targetHashes - dbStat[qual].UpdateActive() - - // let |n.SetStats| update memory and disk - return n.setStat(ctx, branch, qual, dbStat[qual]) -} - -func (n *NomsStatsDatabase) Flush(ctx context.Context, branch string) error { - n.mu.Lock() - defer n.mu.Unlock() - - for i, b := range n.branches { - if strings.EqualFold(b, branch) { - if n.dirty[i] != nil { - flushedMap, err := n.dirty[i].Map(ctx) - if err != nil { - return err - } - n.dirty[i] = nil - if err := n.destDb.DbData().Ddb.SetStatisics(ctx, branch, flushedMap.HashOf()); err != nil { - return err - } - return nil - } - } - } - return nil -} - -func (n *NomsStatsDatabase) GetTableHash(branch, tableName string) hash.Hash { - n.mu.Lock() - defer n.mu.Unlock() - for i, b := range n.branches { - if strings.EqualFold(branch, b) { - return n.tableHashes[i][tableName] - } - } - return hash.Hash{} -} - -func (n *NomsStatsDatabase) SetTableHash(branch, tableName string, h hash.Hash) { - n.mu.Lock() - defer n.mu.Unlock() - for i, b := range n.branches { - if strings.EqualFold(branch, b) { - n.tableHashes[i][tableName] = h - break - } - } -} - -func (n *NomsStatsDatabase) GetSchemaHash(ctx context.Context, branch, tableName string) (hash.Hash, error) { - n.mu.Lock() - defer n.mu.Unlock() - for i, b := range n.branches { - if strings.EqualFold(branch, b) { - return n.schemaHashes[i][tableName], nil - } - if val, ok, err := n.destDb.DbData().Ddb.GetTuple(ctx, n.schemaTupleKey(branch, tableName)); ok { - if err != nil { - return hash.Hash{}, err - } - h := hash.Parse(string(val)) - n.schemaHashes[i][tableName] = h - return h, nil - } else if err != nil { - return hash.Hash{}, err - } - break - } - return hash.Hash{}, nil -} - -func (n *NomsStatsDatabase) schemaTupleKey(branch, tableName string) string { - return n.sourceDb.Name() + "/" + branch + "/" + tableName -} - -func (n *NomsStatsDatabase) SetSchemaHash(ctx context.Context, branch, tableName string, h hash.Hash) error { - n.mu.Lock() - defer n.mu.Unlock() - branchIdx := -1 - for i, b := range n.branches { - if strings.EqualFold(branch, b) { - branchIdx = i - break - } - } - if branchIdx < 0 { - branchIdx = len(n.branches) - if err := n.trackBranch(ctx, branch); err != nil { - return err - } - } - - n.schemaHashes[branchIdx][tableName] = h - key := n.schemaTupleKey(branch, tableName) - if err := n.destDb.DbData().Ddb.DeleteTuple(ctx, key); err != doltdb.ErrTupleNotFound { - return err - } - - return n.destDb.DbData().Ddb.SetTuple(ctx, key, []byte(h.String())) -} diff --git 
a/go/libraries/doltcore/sqle/statsnoms/iter.go b/go/libraries/doltcore/sqle/statsnoms/iter.go deleted file mode 100644 index 59b9456eed6..00000000000 --- a/go/libraries/doltcore/sqle/statsnoms/iter.go +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package statsnoms - -import ( - "fmt" - "strings" - "time" - - "github.com/dolthub/go-mysql-server/sql" - "github.com/dolthub/go-mysql-server/sql/planbuilder" - "gopkg.in/errgo.v2/errors" - - "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/prolly" - "github.com/dolthub/dolt/go/store/prolly/tree" - "github.com/dolthub/dolt/go/store/val" -) - -var ErrIncompatibleVersion = errors.New("client stats version mismatch") - -func NewStatsIter(ctx *sql.Context, schemaName string, m prolly.Map) (*statsIter, error) { - iter, err := m.IterAll(ctx) - if err != nil { - return nil, err - } - kd, vd := m.Descriptors() - keyBuilder := val.NewTupleBuilder(kd) - valueBuilder := val.NewTupleBuilder(vd) - ns := m.NodeStore() - - return &statsIter{ - iter: iter, - kb: keyBuilder, - vb: valueBuilder, - ns: ns, - schemaName: schemaName, - planb: planbuilder.New(ctx, nil, nil, nil), - }, nil -} - -// statsIter reads histogram buckets into string-compatible types. -// Values that are SQL rows should be converted with statsIter.ParseRow. -// todo: make a JSON compatible container for sql.Row w/ types so that we -// can eagerly convert to sql.Row without sacrificing string printing. 
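Before the struct definition, a quick orientation on how this iterator was driven. The following is a minimal sketch reconstructed from the loadStats function removed later in this patch; it is not code from the patch itself, and ctx, schemaName, and m are assumed to be in scope:

	// Sketch: consuming the removed statsIter the way loadStats did.
	iter, err := NewStatsIter(ctx, schemaName, m)
	if err != nil {
		return nil, err
	}
	for {
		row, err := iter.Next(ctx)
		if errors.Is(err, io.EOF) {
			break
		} else if err != nil {
			return nil, err
		}
		// Bound and MCV fields arrive string-encoded; decode them with
		// iter.ParseRow (or DecodeRow) before treating them as typed rows.
		_ = row
	}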
-type statsIter struct { - iter prolly.MapIter - kb, vb *val.TupleBuilder - ns tree.NodeStore - planb *planbuilder.Builder - currentQual string - schemaName string - currentTypes []sql.Type -} - -var _ sql.RowIter = (*statsIter)(nil) - -func (s *statsIter) Next(ctx *sql.Context) (sql.Row, error) { - k, v, err := s.iter.Next(ctx) - if err != nil { - return nil, err - } - - // deserialize K, V - version, err := tree.GetField(ctx, s.vb.Desc, 0, v, s.ns) - if err != nil { - return nil, err - } - if version != schema.StatsVersion { - return nil, fmt.Errorf("%w: write version %d does not match read version %d", ErrIncompatibleVersion, version, schema.StatsVersion) - } - - var row sql.Row - for i := 0; i < s.kb.Desc.Count(); i++ { - f, err := tree.GetField(ctx, s.kb.Desc, i, k, s.ns) - if err != nil { - return nil, err - } - row = append(row, f) - } - - for i := 0; i < s.vb.Desc.Count(); i++ { - f, err := tree.GetField(ctx, s.vb.Desc, i, v, s.ns) - if err != nil { - return nil, err - } - row = append(row, f) - } - - dbName := row[schema.StatsDbTag].(string) - tableName := row[schema.StatsTableTag].(string) - indexName := row[schema.StatsIndexTag].(string) - position := row[schema.StatsPositionTag].(int64) - _ = row[schema.StatsVersionTag] - commit := hash.Parse(row[schema.StatsCommitHashTag].(string)) - rowCount := row[schema.StatsRowCountTag].(int64) - distinctCount := row[schema.StatsDistinctCountTag].(int64) - nullCount := row[schema.StatsNullCountTag].(int64) - columnsStr := row[schema.StatsColumnsTag].(string) - typesStr := row[schema.StatsTypesTag].(string) - upperBoundStr := row[schema.StatsUpperBoundTag].(string) - upperBoundCnt := row[schema.StatsUpperBoundCntTag].(int64) - createdAt := row[schema.StatsCreatedAtTag].(time.Time) - - typs := strings.Split(typesStr, "\n") - for i, t := range typs { - typs[i] = strings.TrimSpace(t) - } - - qual := sql.NewStatQualifier(dbName, s.schemaName, tableName, indexName) - if curQual := qual.String(); !strings.EqualFold(curQual, s.currentQual) { - s.currentQual = curQual - s.currentTypes, err = parseTypeStrings(typs) - if err != nil { - return nil, err - } - } - - mcvCountsStr := row[schema.StatsMcvCountsTag].(string) - - numMcvs := schema.StatsMcvCountsTag - schema.StatsMcv1Tag - mcvs := make([]string, numMcvs) - for i, v := range row[schema.StatsMcv1Tag:schema.StatsMcvCountsTag] { - if v != nil { - mcvs[i] = v.(string) - } - } - - return sql.Row{ - dbName, - tableName, - indexName, - int(position), - version, - commit.String(), - uint64(rowCount), - uint64(distinctCount), - uint64(nullCount), - columnsStr, - typesStr, - upperBoundStr, - uint64(upperBoundCnt), - createdAt, - mcvs[0], mcvs[1], mcvs[2], mcvs[3], - mcvCountsStr, - }, nil -} - -func (s *statsIter) ParseRow(rowStr string) (sql.Row, error) { - var row sql.Row - for i, v := range strings.Split(rowStr, ",") { - val, _, err := s.currentTypes[i].Convert(v) - if err != nil { - return nil, err - } - row = append(row, val) - } - return row, nil -} - -func (s *statsIter) Close(context *sql.Context) error { - return nil -} diff --git a/go/libraries/doltcore/sqle/statsnoms/load.go b/go/libraries/doltcore/sqle/statsnoms/load.go deleted file mode 100644 index 72051260260..00000000000 --- a/go/libraries/doltcore/sqle/statsnoms/load.go +++ /dev/null @@ -1,308 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package statsnoms - -import ( - "errors" - "fmt" - "io" - "strconv" - "strings" - "time" - - "github.com/dolthub/go-mysql-server/sql" - "github.com/dolthub/go-mysql-server/sql/planbuilder" - "github.com/dolthub/go-mysql-server/sql/stats" - - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" - "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/prolly" - "github.com/dolthub/dolt/go/store/prolly/tree" - "github.com/dolthub/dolt/go/store/val" -) - -func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.StatQualifier]*statspro.DoltStats, error) { - qualToStats := make(map[sql.StatQualifier]*statspro.DoltStats) - schemaName := db.SchemaName() - iter, err := NewStatsIter(ctx, schemaName, m) - if err != nil { - return nil, err - } - currentStat := statspro.NewDoltStats() - invalidTables := make(map[string]bool) - for { - row, err := iter.Next(ctx) - if errors.Is(err, io.EOF) { - break - } else if err != nil { - return nil, err - } - - // deserialize K, V - dbName := row[schema.StatsDbTag].(string) - tableName := row[schema.StatsTableTag].(string) - indexName := row[schema.StatsIndexTag].(string) - _ = row[schema.StatsVersionTag] - commit := hash.Parse(row[schema.StatsCommitHashTag].(string)) - rowCount := row[schema.StatsRowCountTag].(uint64) - distinctCount := row[schema.StatsDistinctCountTag].(uint64) - nullCount := row[schema.StatsNullCountTag].(uint64) - columns := strings.Split(row[schema.StatsColumnsTag].(string), ",") - typesStr := row[schema.StatsTypesTag].(string) - boundRowStr := row[schema.StatsUpperBoundTag].(string) - upperBoundCnt := row[schema.StatsUpperBoundCntTag].(uint64) - createdAt := row[schema.StatsCreatedAtTag].(time.Time) - - typs := strings.Split(typesStr, "\n") - for i, t := range typs { - typs[i] = strings.TrimSpace(t) - } - - qual := sql.NewStatQualifier(dbName, schemaName, tableName, indexName) - if _, ok := invalidTables[tableName]; ok { - continue - } - - if currentStat.Statistic.Qual.String() != qual.String() { - if !currentStat.Statistic.Qual.Empty() { - currentStat.UpdateActive() - qualToStats[currentStat.Statistic.Qual] = currentStat - } - - currentStat = statspro.NewDoltStats() - - tab, ok, err := db.GetTableInsensitive(ctx, qual.Table()) - if ok { - currentStat.Statistic.Qual = qual - currentStat.Statistic.Cols = columns - currentStat.Statistic.LowerBnd, currentStat.Tb, currentStat.Statistic.Fds, currentStat.Statistic.Colset, err = loadRefdProps(ctx, db, tab, currentStat.Statistic.Qual, len(currentStat.Columns())) - if err != nil { - return nil, err - } - } else if !ok { - ctx.GetLogger().Debugf("stats load: table previously collected is missing from root: %s", tableName) - invalidTables[qual.Table()] = true - continue - } else if err != nil { - return nil, err - } - } - - numMcvs := schema.StatsMcvCountsTag - schema.StatsMcv1Tag - - mcvCountsStr 
:= strings.Split(row[schema.StatsMcvCountsTag].(string), ",") - mcvCnts := make([]uint64, numMcvs) - for i, v := range mcvCountsStr { - if v == "" { - continue - } - val, err := strconv.Atoi(v) - if err != nil { - return nil, err - } - mcvCnts[i] = uint64(val) - } - - mcvs := make([]sql.Row, numMcvs) - for i, v := range row[schema.StatsMcv1Tag:schema.StatsMcvCountsTag] { - if v != nil && v != "" { - row, err := DecodeRow(ctx, m.NodeStore(), v.(string), currentStat.Tb) - if err != nil { - return nil, err - } - mcvs[i] = row - } - } - - for i, v := range mcvCnts { - if v == 0 { - mcvs = mcvs[:i] - mcvCnts = mcvCnts[:i] - break - } - } - - if currentStat.Statistic.Hist == nil { - currentStat.Statistic.Typs, err = parseTypeStrings(typs) - if err != nil { - return nil, err - } - currentStat.Statistic.Qual = qual - } - - boundRow, err := DecodeRow(ctx, m.NodeStore(), boundRowStr, currentStat.Tb) - if err != nil { - return nil, err - } - - bucket := statspro.DoltBucket{ - Chunk: commit, - Created: createdAt, - Bucket: &stats.Bucket{ - RowCnt: uint64(rowCount), - DistinctCnt: uint64(distinctCount), - NullCnt: uint64(nullCount), - McvVals: mcvs, - McvsCnt: mcvCnts, - BoundCnt: upperBoundCnt, - BoundVal: boundRow, - }, - } - - currentStat.Hist = append(currentStat.Hist, bucket) - currentStat.Statistic.RowCnt += uint64(rowCount) - currentStat.Statistic.DistinctCnt += uint64(distinctCount) - currentStat.Statistic.NullCnt += uint64(rowCount) - if currentStat.Statistic.Created.Before(createdAt) { - currentStat.Statistic.Created = createdAt - } - } - if !currentStat.Qualifier().Empty() { - currentStat.UpdateActive() - qualToStats[currentStat.Statistic.Qual] = currentStat - } - return qualToStats, nil -} - -func parseTypeStrings(typs []string) ([]sql.Type, error) { - var ret []sql.Type - for _, typ := range typs { - ct, err := planbuilder.ParseColumnTypeString(typ) - if err != nil { - return nil, err - } - ret = append(ret, ct) - } - return ret, nil -} - -func loadRefdProps(ctx *sql.Context, db dsess.SqlDatabase, sqlTable sql.Table, qual sql.StatQualifier, cols int) (sql.Row, *val.TupleBuilder, *sql.FuncDepSet, sql.ColSet, error) { - root, err := db.GetRoot(ctx) - if err != nil { - return nil, nil, nil, sql.ColSet{}, err - } - - iat, ok := sqlTable.(sql.IndexAddressable) - if !ok { - return nil, nil, nil, sql.ColSet{}, nil - } - - indexes, err := iat.GetIndexes(ctx) - if err != nil { - return nil, nil, nil, sql.ColSet{}, err - } - - var sqlIdx sql.Index - for _, i := range indexes { - if strings.EqualFold(i.ID(), qual.Index()) { - sqlIdx = i - break - } - } - - if sqlIdx == nil { - return nil, nil, nil, sql.ColSet{}, fmt.Errorf("%w: index not found: '%s'", statspro.ErrFailedToLoad, qual.Index()) - } - - fds, colset, err := stats.IndexFds(qual.Table(), sqlTable.Schema(), sqlIdx) - if err != nil { - return nil, nil, nil, sql.ColSet{}, err - } - table, ok, err := root.GetTable(ctx, doltdb.TableName{Name: sqlTable.Name()}) - if !ok { - return nil, nil, nil, sql.ColSet{}, sql.ErrTableNotFound.New(qual.Table()) - } - if err != nil { - return nil, nil, nil, sql.ColSet{}, err - } - - var idx durable.Index - if qual.Index() == "primary" { - idx, err = table.GetRowData(ctx) - } else { - idx, err = table.GetIndexRowData(ctx, qual.Index()) - } - if err != nil { - return nil, nil, nil, sql.ColSet{}, err - } - - prollyMap := durable.ProllyMapFromIndex(idx) - keyBuilder := val.NewTupleBuilder(prollyMap.KeyDesc().PrefixDesc(cols)) - buffPool := prollyMap.NodeStore().Pool() - - if cnt, err := prollyMap.Count(); err != nil 
{ - return nil, nil, nil, sql.ColSet{}, err - } else if cnt == 0 { - return nil, keyBuilder, nil, sql.ColSet{}, nil - } - firstIter, err := prollyMap.IterOrdinalRange(ctx, 0, 1) - if err != nil { - return nil, nil, nil, sql.ColSet{}, err - } - keyBytes, _, err := firstIter.Next(ctx) - if err != nil { - return nil, nil, nil, sql.ColSet{}, err - } - for i := range keyBuilder.Desc.Types { - keyBuilder.PutRaw(i, keyBytes.GetField(i)) - } - - firstKey := keyBuilder.Build(buffPool) - firstRow := make(sql.Row, keyBuilder.Desc.Count()) - for i := 0; i < keyBuilder.Desc.Count(); i++ { - firstRow[i], err = tree.GetField(ctx, prollyMap.KeyDesc(), i, firstKey, prollyMap.NodeStore()) - if err != nil { - return nil, nil, nil, sql.ColSet{}, err - } - } - return firstRow, keyBuilder, fds, colset, nil -} - -func loadFuncDeps(ctx *sql.Context, db dsess.SqlDatabase, qual sql.StatQualifier) (*sql.FuncDepSet, sql.ColSet, error) { - tab, ok, err := db.GetTableInsensitive(ctx, qual.Table()) - if err != nil { - return nil, sql.ColSet{}, err - } else if !ok { - return nil, sql.ColSet{}, fmt.Errorf("%w: table not found: '%s'", statspro.ErrFailedToLoad, qual.Table()) - } - - iat, ok := tab.(sql.IndexAddressable) - if !ok { - return nil, sql.ColSet{}, fmt.Errorf("%w: table does not have indexes: '%s'", statspro.ErrFailedToLoad, qual.Table()) - } - - indexes, err := iat.GetIndexes(ctx) - if err != nil { - return nil, sql.ColSet{}, err - } - - var idx sql.Index - for _, i := range indexes { - if strings.EqualFold(i.ID(), qual.Index()) { - idx = i - break - } - } - - if idx == nil { - return nil, sql.ColSet{}, fmt.Errorf("%w: index not found: '%s'", statspro.ErrFailedToLoad, qual.Index()) - } - - return stats.IndexFds(qual.Table(), tab.Schema(), idx) -} diff --git a/go/libraries/doltcore/sqle/statsnoms/write.go b/go/libraries/doltcore/sqle/statsnoms/write.go deleted file mode 100644 index c23e1d93dc8..00000000000 --- a/go/libraries/doltcore/sqle/statsnoms/write.go +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package statsnoms - -import ( - "context" - "errors" - "io" - "strings" - - "github.com/dolthub/go-mysql-server/sql" - "github.com/dolthub/go-mysql-server/sql/stats" - "github.com/dolthub/go-mysql-server/sql/types" - - "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" - "github.com/dolthub/dolt/go/store/prolly" - "github.com/dolthub/dolt/go/store/prolly/tree" - "github.com/dolthub/dolt/go/store/val" -) - -// About ~200 20 byte address fit in a ~4k chunk. Chunk sizes -// are approximate, but certainly shouldn't reach the square -// of the expected size. 
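To make the arithmetic in that comment explicit: a ~4k chunk of ~20-byte addresses holds roughly 4096/20 ≈ 200 child addresses, so two levels of tree fanout cover about 200 * 200 = 40,000 buckets. That is the bound the constant below encodes, and deleteIndexRows below uses maxBucketFanout+1 as the exclusive upper key when clearing an index's previous rows.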
-const maxBucketFanout = 200 * 200 - -var mcvsTypes = []sql.Type{types.Int64, types.Int64, types.Int64} - -func (n *NomsStatsDatabase) replaceStats(ctx context.Context, statsMap *prolly.MutableMap, dStats *statspro.DoltStats) error { - if err := deleteIndexRows(ctx, statsMap, dStats); err != nil { - return err - } - return putIndexRows(ctx, statsMap, dStats) -} - -func deleteIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *statspro.DoltStats) error { - if ctx.Err() != nil { - return ctx.Err() - } - sch := schema.StatsTableDoltSchema - kd, _ := sch.GetMapDescriptors() - - keyBuilder := val.NewTupleBuilder(kd) - - qual := dStats.Qualifier() - pool := statsMap.NodeStore().Pool() - - // delete previous entries for this index -> (db, table, index, pos) - keyBuilder.PutString(0, qual.Database) - keyBuilder.PutString(1, qual.Table()) - keyBuilder.PutString(2, qual.Index()) - keyBuilder.PutInt64(3, 0) - firstKey := keyBuilder.Build(pool) - keyBuilder.PutString(0, qual.Database) - keyBuilder.PutString(1, qual.Table()) - keyBuilder.PutString(2, qual.Index()) - keyBuilder.PutInt64(3, maxBucketFanout+1) - maxKey := keyBuilder.Build(pool) - - // there is a limit on the number of buckets for a given index, iter - // will terminate before maxBucketFanout - iter, err := statsMap.IterKeyRange(ctx, firstKey, maxKey) - if err != nil { - return err - } - - for { - k, _, err := iter.Next(ctx) - if errors.Is(err, io.EOF) { - break - } else if err != nil { - return err - } - err = statsMap.Put(ctx, k, nil) - if err != nil { - return err - } - } - return nil -} - -func putIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *statspro.DoltStats) error { - if ctx.Err() != nil { - return ctx.Err() - } - sch := schema.StatsTableDoltSchema - kd, vd := sch.GetMapDescriptors() - - keyBuilder := val.NewTupleBuilder(kd) - valueBuilder := val.NewTupleBuilder(vd) - - qual := dStats.Qualifier() - pool := statsMap.NodeStore().Pool() - - // now add new buckets - typesB := strings.Builder{} - sep := "" - for _, t := range dStats.Statistic.Typs { - typesB.WriteString(sep + t.String()) - sep = "\n" - } - typesStr := typesB.String() - - var pos int64 - for _, h := range dStats.Hist { - keyBuilder.PutString(0, qual.Database) - keyBuilder.PutString(1, qual.Tab) - keyBuilder.PutString(2, qual.Idx) - keyBuilder.PutInt64(3, pos) - - valueBuilder.PutInt64(0, schema.StatsVersion) - valueBuilder.PutString(1, statspro.DoltBucketChunk(h).String()) - valueBuilder.PutInt64(2, int64(h.RowCount())) - valueBuilder.PutInt64(3, int64(h.DistinctCount())) - valueBuilder.PutInt64(4, int64(h.NullCount())) - valueBuilder.PutString(5, strings.Join(dStats.Columns(), ",")) - valueBuilder.PutString(6, typesStr) - boundRow, err := EncodeRow(ctx, statsMap.NodeStore(), h.UpperBound(), dStats.Tb) - if err != nil { - return err - } - valueBuilder.PutString(7, string(boundRow)) - valueBuilder.PutInt64(8, int64(h.BoundCount())) - valueBuilder.PutDatetime(9, statspro.DoltBucketCreated(h)) - for i, r := range h.Mcvs() { - mcvRow, err := EncodeRow(ctx, statsMap.NodeStore(), r, dStats.Tb) - if err != nil { - return err - } - valueBuilder.PutString(10+i, string(mcvRow)) - } - var mcvCntsRow sql.Row - for _, v := range h.McvCounts() { - mcvCntsRow = append(mcvCntsRow, int(v)) - } - valueBuilder.PutString(14, stats.StringifyKey(mcvCntsRow, mcvsTypes)) - - key := keyBuilder.Build(pool) - value := valueBuilder.Build(pool) - statsMap.Put(ctx, key, value) - pos++ - } - return nil -} - -func EncodeRow(ctx context.Context, ns tree.NodeStore, r 
sql.Row, tb *val.TupleBuilder) ([]byte, error) { - for i, v := range r { - if v == nil { - continue - } - if err := tree.PutField(ctx, ns, tb, i, v); err != nil { - return nil, err - } - } - return tb.Build(ns.Pool()), nil -} - -func DecodeRow(ctx context.Context, ns tree.NodeStore, s string, tb *val.TupleBuilder) (sql.Row, error) { - tup := []byte(s) - r := make(sql.Row, tb.Desc.Count()) - var err error - for i, _ := range r { - r[i], err = tree.GetField(ctx, tb.Desc, i, tup, ns) - if err != nil { - return nil, err - } - } - return r, nil -} diff --git a/go/libraries/doltcore/sqle/statspro/analyze.go b/go/libraries/doltcore/sqle/statspro/analyze.go deleted file mode 100644 index faa1869315c..00000000000 --- a/go/libraries/doltcore/sqle/statspro/analyze.go +++ /dev/null @@ -1,343 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package statspro - -import ( - "fmt" - "strings" - - "github.com/dolthub/go-mysql-server/sql" - - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" - "github.com/dolthub/dolt/go/libraries/doltcore/env" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/prolly/tree" -) - -const ( - boostrapRowLimit = 2e6 -) - -func (p *Provider) RefreshTableStats(ctx *sql.Context, table sql.Table, db string) error { - dSess := dsess.DSessFromSess(ctx.Session) - branch, err := dSess.GetBranch() - if err != nil { - return err - } - return p.RefreshTableStatsWithBranch(ctx, table, db, branch) -} - -func (p *Provider) BootstrapDatabaseStats(ctx *sql.Context, db string) error { - dSess := dsess.DSessFromSess(ctx.Session) - branches := p.getStatsBranches(ctx) - var rows uint64 - for _, branch := range branches { - sqlDb, err := dSess.Provider().Database(ctx, BranchQualifiedDatabase(db, branch)) - if err != nil { - if sql.ErrDatabaseNotFound.Is(err) { - // default branch is not valid - continue - } - return err - } - tables, err := sqlDb.GetTableNames(ctx) - if err != nil { - return err - } - for _, table := range tables { - sqlTable, _, err := GetLatestTable(ctx, table, sqlDb) - if err != nil { - return err - } - - if st, ok := sqlTable.(sql.StatisticsTable); ok { - cnt, ok, err := st.RowCount(ctx) - if ok && err == nil { - rows += cnt - } - } - if rows >= boostrapRowLimit { - return fmt.Errorf("stats bootstrap aborted because %s exceeds the default row limit; manually run \"ANALYZE \" or \"call dolt_stats_restart()\" to collect statistics", db) - } - - if err := p.RefreshTableStatsWithBranch(ctx, sqlTable, db, branch); err != nil { - return err - } - } - } - return nil -} - -func (p *Provider) RefreshTableStatsWithBranch(ctx *sql.Context, table sql.Table, db string, branch string) error { - if !p.TryLockForUpdate(branch, db, table.Name()) { - return fmt.Errorf("already updating statistics") - } - defer p.UnlockTable(branch, db, 
table.Name()) - - dSess := dsess.DSessFromSess(ctx.Session) - - sqlDb, err := dSess.Provider().Database(ctx, BranchQualifiedDatabase(db, branch)) - if err != nil { - return err - } - - // lock only after accessing DatabaseProvider - - tableName := strings.ToLower(table.Name()) - dbName := strings.ToLower(db) - var schemaName string - if schTab, ok := table.(sql.DatabaseSchemaTable); ok { - schemaName = strings.ToLower(schTab.DatabaseSchema().SchemaName()) - } - - iat, ok := table.(sql.IndexAddressableTable) - if !ok { - return nil - } - indexes, err := iat.GetIndexes(ctx) - if err != nil { - return err - } - - // it's important to update WORKING session references every call - sqlTable, dTab, err := GetLatestTable(ctx, tableName, sqlDb) - if err != nil { - return err - } - - statDb, ok := p.getStatDb(dbName) - if !ok { - // if the stats database does not exist, initialize one - fs, err := p.pro.FileSystemForDatabase(dbName) - if err != nil { - return err - } - sourceDb, ok := p.pro.BaseDatabase(ctx, dbName) - if !ok { - return sql.ErrDatabaseNotFound.New(dbName) - } - statDb, err = p.sf.Init(ctx, sourceDb, p.pro, fs, env.GetCurrentUserHomeDir) - if err != nil { - ctx.Warn(0, err.Error()) - return nil - } - p.setStatDb(dbName, statDb) - } - - schHash, err := dTab.GetSchemaHash(ctx) - if err != nil { - return err - } - - if oldSchHash, err := statDb.GetSchemaHash(ctx, branch, tableName); oldSchHash.IsEmpty() { - if err := statDb.SetSchemaHash(ctx, branch, tableName, schHash); err != nil { - return fmt.Errorf("set schema hash error: %w", err) - } - } else if oldSchHash != schHash { - ctx.GetLogger().Debugf("statistics refresh: detected table schema change: %s,%s/%s", dbName, table, branch) - if err := statDb.SetSchemaHash(ctx, branch, tableName, schHash); err != nil { - return err - } - - stats, err := p.GetTableDoltStats(ctx, branch, dbName, schemaName, tableName) - if err != nil { - return err - } - for _, stat := range stats { - statDb.DeleteStats(ctx, branch, stat.Qualifier()) - } - } else if err != nil { - return err - } - - tablePrefix := fmt.Sprintf("%s.", tableName) - var idxMetas []indexMeta - for _, idx := range indexes { - cols := make([]string, len(idx.Expressions())) - for i, c := range idx.Expressions() { - cols[i] = strings.TrimPrefix(strings.ToLower(c), tablePrefix) - } - - qual := sql.NewStatQualifier(db, schemaName, table.Name(), strings.ToLower(idx.ID())) - curStat, ok := statDb.GetStat(branch, qual) - if !ok { - curStat = NewDoltStats() - curStat.Statistic.Qual = qual - } - idxMeta, err := newIdxMeta(ctx, curStat, dTab, idx, cols) - if err != nil { - return err - } - idxMetas = append(idxMetas, idxMeta) - } - - newTableStats, err := createNewStatsBuckets(ctx, sqlTable, dTab, indexes, idxMetas) - if err != nil { - return err - } - - // merge new chunks with preexisting chunks - for _, idxMeta := range idxMetas { - stat := newTableStats[idxMeta.qual] - targetChunks, err := MergeNewChunks(idxMeta.allAddrs, idxMeta.keepChunks, stat.Hist) - if err != nil { - return err - } - if targetChunks == nil { - // empty table - continue - } - stat.SetChunks(idxMeta.allAddrs) - stat.Hist = targetChunks - stat.UpdateActive() - if err := statDb.SetStat(ctx, branch, idxMeta.qual, stat); err != nil { - return err - } - } - - p.UpdateStatus(dbName, fmt.Sprintf("refreshed %s", dbName)) - return statDb.Flush(ctx, branch) -} - -// BranchQualifiedDatabase returns a branch qualified database. If the database -// is already branch suffixed no duplication is applied. 
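Concretely, the removed helper behaved as follows on representative inputs (inferred from its body just below; the database and branch names are made up):

	BranchQualifiedDatabase("mydb", "main")      // "mydb/main"
	BranchQualifiedDatabase("mydb/main", "main") // "mydb/main" (suffix not duplicated)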
-func BranchQualifiedDatabase(db, branch string) string { - suffix := fmt.Sprintf("/%s", branch) - if !strings.HasSuffix(db, suffix) { - return fmt.Sprintf("%s%s", db, suffix) - } - return db -} - -// GetLatestTable will get the WORKING root table for the current database/branch -func GetLatestTable(ctx *sql.Context, tableName string, sqlDb sql.Database) (sql.Table, *doltdb.Table, error) { - var db sqle.Database - switch d := sqlDb.(type) { - case sqle.Database: - db = d - case sqle.ReadReplicaDatabase: - db = d.Database - default: - return nil, nil, fmt.Errorf("expected sqle.Database, found %T", sqlDb) - } - sqlTable, ok, err := db.GetTableInsensitive(ctx, tableName) - if err != nil { - return nil, nil, err - } - if !ok { - return nil, nil, fmt.Errorf("statistics refresh error: table not found %s", tableName) - } - - var dTab *doltdb.Table - switch t := sqlTable.(type) { - case *sqle.AlterableDoltTable: - dTab, err = t.DoltTable.DoltTable(ctx) - case *sqle.WritableDoltTable: - dTab, err = t.DoltTable.DoltTable(ctx) - case *sqle.DoltTable: - dTab, err = t.DoltTable(ctx) - default: - err = fmt.Errorf("failed to unwrap dolt table from type: %T", sqlTable) - } - if err != nil { - return nil, nil, err - } - return sqlTable, dTab, nil -} - -func newIdxMeta(ctx *sql.Context, curStats *DoltStats, doltTable *doltdb.Table, sqlIndex sql.Index, cols []string) (indexMeta, error) { - var idx durable.Index - var err error - if strings.EqualFold(sqlIndex.ID(), "PRIMARY") { - idx, err = doltTable.GetRowData(ctx) - } else { - idx, err = doltTable.GetIndexRowData(ctx, sqlIndex.ID()) - } - if err != nil { - return indexMeta{}, err - } - - prollyMap := durable.ProllyMapFromIndex(idx) - - if cnt, err := prollyMap.Count(); err != nil { - return indexMeta{}, err - } else if cnt == 0 { - return indexMeta{ - qual: curStats.Statistic.Qual, - cols: cols, - }, nil - } - - // get newest histogram target level hashes - levelNodes, err := tree.GetHistogramLevel(ctx, prollyMap.Tuples(), bucketLowCnt) - if err != nil { - return indexMeta{}, err - } - - var addrs []hash.Hash - var keepChunks []sql.HistogramBucket - var missingAddrs float64 - var missingChunks []tree.Node - var missingOffsets []updateOrdinal - var offset uint64 - - for _, n := range levelNodes { - // Compare the previous histogram chunks to the newest tree chunks. - // Partition the newest chunks into 1) preserved or 2) missing. - // Missing chunks will need to be scanned on a stats update, so - // track the (start, end) ordinal offsets to simplify the read iter. 
- treeCnt, err := n.TreeCount() - if err != nil { - return indexMeta{}, err - } - - addrs = append(addrs, n.HashOf()) - if bucketIdx, ok := curStats.Active[n.HashOf()]; !ok { - missingChunks = append(missingChunks, n) - missingOffsets = append(missingOffsets, updateOrdinal{offset, offset + uint64(treeCnt)}) - missingAddrs++ - } else { - keepChunks = append(keepChunks, curStats.Hist[bucketIdx]) - } - offset += uint64(treeCnt) - } - - var dropChunks []sql.HistogramBucket - for _, h := range curStats.Chunks { - var match bool - for _, b := range keepChunks { - if DoltBucketChunk(b) == h { - match = true - break - } - } - if !match { - dropChunks = append(dropChunks, curStats.Hist[curStats.Active[h]]) - } - } - - return indexMeta{ - qual: curStats.Statistic.Qual, - cols: cols, - newNodes: missingChunks, - updateOrdinals: missingOffsets, - keepChunks: keepChunks, - dropChunks: dropChunks, - allAddrs: addrs, - }, nil -} diff --git a/go/libraries/doltcore/sqle/statspro/auto_refresh.go b/go/libraries/doltcore/sqle/statspro/auto_refresh.go deleted file mode 100644 index 3322065f809..00000000000 --- a/go/libraries/doltcore/sqle/statspro/auto_refresh.go +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package statspro - -import ( - "context" - "fmt" - "strings" - "time" - - "github.com/dolthub/go-mysql-server/sql" - types2 "github.com/dolthub/go-mysql-server/sql/types" - - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" -) - -const asyncAutoRefreshStats = "async_auto_refresh_stats" - -func (p *Provider) InitAutoRefresh(ctxFactory func(ctx context.Context) (*sql.Context, error), dbName string, bThreads *sql.BackgroundThreads) error { - _, threshold, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshThreshold) - _, interval, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshInterval) - interval64, _, _ := types2.Int64.Convert(interval) - intervalSec := time.Second * time.Duration(interval64.(int64)) - thresholdf64 := threshold.(float64) - - ctx, err := ctxFactory(context.Background()) - if err != nil { - return err - } - - branches := p.getStatsBranches(ctx) - - return p.InitAutoRefreshWithParams(ctxFactory, dbName, bThreads, intervalSec, thresholdf64, branches) -} - -func (p *Provider) InitAutoRefreshWithParams(ctxFactory func(ctx context.Context) (*sql.Context, error), dbName string, bThreads *sql.BackgroundThreads, checkInterval time.Duration, updateThresh float64, branches []string) error { - // this is only called after initial statistics are finished loading - // launch a thread that periodically checks freshness - - p.mu.Lock() - defer p.mu.Unlock() - - dropDbCtx, dbStatsCancel := context.WithCancel(context.Background()) - p.autoCtxCancelers[dbName] = dbStatsCancel - - return bThreads.Add(fmt.Sprintf("%s_%s", asyncAutoRefreshStats, dbName), func(ctx context.Context) { - ticker := time.NewTicker(checkInterval + time.Nanosecond) - for { - select { - case <-ctx.Done(): - ticker.Stop() - return - case <-ticker.C: - select { - case <-dropDbCtx.Done(): - ticker.Stop() - return - default: - } - - sqlCtx, err := ctxFactory(ctx) - if err != nil { - return - } - - dSess := dsess.DSessFromSess(sqlCtx.Session) - ddb, ok := dSess.GetDoltDB(sqlCtx, dbName) - if !ok { - sqlCtx.GetLogger().Debugf("statistics refresh error: database not found %s", dbName) - return - } - for _, branch := range branches { - if br, ok, err := ddb.HasBranch(ctx, branch); ok { - sqlCtx.GetLogger().Debugf("starting statistics refresh check for '%s': %s", dbName, time.Now().String()) - // update WORKING session references - sqlDb, err := dSess.Provider().Database(sqlCtx, BranchQualifiedDatabase(dbName, branch)) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - return - } - - if err := p.checkRefresh(sqlCtx, sqlDb, dbName, br, updateThresh); err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - return - } - } else if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: branch check error %s", err.Error()) - } else { - sqlCtx.GetLogger().Debugf("statistics refresh error: branch not found %s", br) - } - } - } - } - }) -} - -func (p *Provider) checkRefresh(ctx *sql.Context, sqlDb sql.Database, dbName, branch string, updateThresh float64) error { - if !p.TryLockForUpdate(branch, dbName, "") { - return fmt.Errorf("database already being updated: %s/%s", branch, dbName) - } - defer p.UnlockTable(branch, dbName, "") - - // Iterate all dbs, tables, indexes. Each db will collect - // []indexMeta above refresh threshold. We read and process those - // chunks' statistics. We merge updated chunks with precomputed - // chunks. 
The full set of statistics for each database lands - // 1) in the provider's most recent set of database statistics, and - // 2) on disk in the database's statistics ref'd prolly.Map. - statDb, ok := p.getStatDb(dbName) - if !ok { - return sql.ErrDatabaseNotFound.New(dbName) - } - - var deletedStats []sql.StatQualifier - qualExists := make(map[sql.StatQualifier]bool) - tableExistsAndSkipped := make(map[string]bool) - - tables, err := sqlDb.GetTableNames(ctx) - if err != nil { - return err - } - - for _, table := range tables { - if !p.TryLockForUpdate(branch, dbName, table) { - ctx.GetLogger().Debugf("statistics refresh: table is already being updated: %s/%s.%s", branch, dbName, table) - return fmt.Errorf("table already being updated: %s", table) - } - defer p.UnlockTable(branch, dbName, table) - - sqlTable, dTab, err := GetLatestTable(ctx, table, sqlDb) - if err != nil { - return err - } - - tableHash, err := dTab.GetRowDataHash(ctx) - if err != nil { - return err - } - - if statDb.GetTableHash(branch, table) == tableHash { - // no data changes since last check - tableExistsAndSkipped[table] = true - ctx.GetLogger().Debugf("statistics refresh: table hash unchanged since last check: %s", tableHash) - continue - } else { - ctx.GetLogger().Debugf("statistics refresh: new table hash: %s", tableHash) - } - - schHash, err := dTab.GetSchemaHash(ctx) - if err != nil { - return err - } - - var schemaName string - if schTab, ok := sqlTable.(sql.DatabaseSchemaTable); ok { - schemaName = strings.ToLower(schTab.DatabaseSchema().SchemaName()) - } - - if oldSchHash, err := statDb.GetSchemaHash(ctx, branch, table); oldSchHash.IsEmpty() { - if err := statDb.SetSchemaHash(ctx, branch, table, schHash); err != nil { - return err - } - } else if oldSchHash != schHash { - ctx.GetLogger().Debugf("statistics refresh: detected table schema change: %s,%s/%s", dbName, table, branch) - if err := statDb.SetSchemaHash(ctx, branch, table, schHash); err != nil { - return err - } - stats, err := p.GetTableDoltStats(ctx, branch, dbName, schemaName, table) - if err != nil { - return err - } - for _, stat := range stats { - statDb.DeleteStats(ctx, branch, stat.Qualifier()) - } - } else if err != nil { - return err - } - - iat, ok := sqlTable.(sql.IndexAddressableTable) - if !ok { - return fmt.Errorf("table does not support indexes %s", table) - } - - indexes, err := iat.GetIndexes(ctx) - if err != nil { - return err - } - - // collect indexes and ranges to be updated - var idxMetas []indexMeta - for _, index := range indexes { - qual := sql.NewStatQualifier(dbName, schemaName, table, strings.ToLower(index.ID())) - qualExists[qual] = true - curStat, ok := statDb.GetStat(branch, qual) - if !ok { - curStat = NewDoltStats() - curStat.Statistic.Qual = qual - - cols := make([]string, len(index.Expressions())) - tablePrefix := fmt.Sprintf("%s.", table) - for i, c := range index.Expressions() { - cols[i] = strings.TrimPrefix(strings.ToLower(c), tablePrefix) - } - curStat.Statistic.Cols = cols - } - ctx.GetLogger().Debugf("statistics refresh index: %s", qual.String()) - - updateMeta, err := newIdxMeta(ctx, curStat, dTab, index, curStat.Columns()) - if err != nil { - ctx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - curCnt := float64(len(curStat.Active)) - updateCnt := float64(len(updateMeta.newNodes)) - deleteCnt := float64(len(curStat.Active) - len(updateMeta.keepChunks)) - ctx.GetLogger().Debugf("statistics current: %d, new: %d, delete: %d", int(curCnt), int(updateCnt), int(deleteCnt)) - - if 
curCnt == 0 || (deleteCnt+updateCnt)/curCnt > updateThresh { - if curCnt == 0 && updateCnt == 0 { - continue - } - ctx.GetLogger().Debugf("statistics updating: %s", updateMeta.qual) - // mark index for updating - idxMetas = append(idxMetas, updateMeta) - // update latest hash if we haven't already - statDb.SetTableHash(branch, table, tableHash) - } - } - - // get new buckets for index chunks to update - newTableStats, err := createNewStatsBuckets(ctx, sqlTable, dTab, indexes, idxMetas) - if err != nil { - return err - } - - // merge new chunks with preexisting chunks - for _, updateMeta := range idxMetas { - stat := newTableStats[updateMeta.qual] - if stat != nil { - var err error - if _, ok := statDb.GetStat(branch, updateMeta.qual); !ok { - err = statDb.SetStat(ctx, branch, updateMeta.qual, stat) - } else { - err = statDb.ReplaceChunks(ctx, branch, updateMeta.qual, updateMeta.allAddrs, updateMeta.dropChunks, stat.Hist) - } - if err != nil { - return err - } - p.UpdateStatus(dbName, fmt.Sprintf("refreshed %s", dbName)) - } - } - } - - for _, q := range statDb.ListStatQuals(branch) { - // table or index delete leaves hole in stats - // this is separate from threshold check - if !tableExistsAndSkipped[q.Table()] && !qualExists[q] { - // only delete stats we've verified are deleted - deletedStats = append(deletedStats, q) - } - } - - statDb.DeleteStats(ctx, branch, deletedStats...) - - if err := statDb.Flush(ctx, branch); err != nil { - return err - } - - return nil -} diff --git a/go/libraries/doltcore/sqle/statspro/update.go b/go/libraries/doltcore/sqle/statspro/bucket_builder.go similarity index 52% rename from go/libraries/doltcore/sqle/statspro/update.go rename to go/libraries/doltcore/sqle/statspro/bucket_builder.go index 562e82c5679..f521ebe83bd 100644 --- a/go/libraries/doltcore/sqle/statspro/update.go +++ b/go/libraries/doltcore/sqle/statspro/bucket_builder.go @@ -17,19 +17,10 @@ package statspro import ( "container/heap" "context" - "errors" - "fmt" - "io" - "sort" - "strings" - "time" - "github.com/dolthub/go-mysql-server/sql" "github.com/dolthub/go-mysql-server/sql/stats" + "sort" - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" - "github.com/dolthub/dolt/go/store/hash" "github.com/dolthub/dolt/go/store/prolly" "github.com/dolthub/dolt/go/store/prolly/tree" "github.com/dolthub/dolt/go/store/val" @@ -40,153 +31,7 @@ const ( mcvCnt = 3 ) -// createNewStatsBuckets builds histograms for a list of index statistic metadata. -// We only read chunk ranges indicated by |indexMeta.updateOrdinals|. If -// the returned buckets are a subset of the index the caller is responsible -// for reconciling the difference. 
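The reconciliation mentioned in that comment happened in the callers; a condensed sketch of the pattern from RefreshTableStatsWithBranch earlier in this patch (error handling elided):

	newTableStats, _ := createNewStatsBuckets(ctx, sqlTable, dTab, indexes, idxMetas)
	for _, m := range idxMetas {
		stat := newTableStats[m.qual]
		// Stitch freshly built buckets together with the preserved ones.
		merged, _ := MergeNewChunks(m.allAddrs, m.keepChunks, stat.Hist)
		stat.Hist = merged
		stat.SetChunks(m.allAddrs)
		stat.UpdateActive()
	}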
-func createNewStatsBuckets(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Table, indexes []sql.Index, idxMetas []indexMeta) (map[sql.StatQualifier]*DoltStats, error) { - nameToIdx := make(map[string]sql.Index) - for _, idx := range indexes { - nameToIdx[strings.ToLower(idx.ID())] = idx - } - - ret := make(map[sql.StatQualifier]*DoltStats) - - for _, meta := range idxMetas { - var idx durable.Index - var err error - if strings.EqualFold(meta.qual.Index(), "PRIMARY") { - idx, err = dTab.GetRowData(ctx) - } else { - idx, err = dTab.GetIndexRowData(ctx, meta.qual.Index()) - } - if err != nil { - return nil, err - } - - prollyMap := durable.ProllyMapFromIndex(idx) - keyBuilder := val.NewTupleBuilder(prollyMap.KeyDesc()) - - sqlIdx := nameToIdx[strings.ToLower(meta.qual.Index())] - fds, colSet, err := stats.IndexFds(meta.qual.Table(), sqlTable.Schema(), sqlIdx) - if err != nil { - return nil, err - } - - var types []sql.Type - for _, cet := range nameToIdx[strings.ToLower(meta.qual.Index())].ColumnExpressionTypes() { - types = append(types, cet.Type) - } - - if cnt, err := prollyMap.Count(); err != nil { - return nil, err - } else if cnt == 0 { - // table is empty - ret[meta.qual] = NewDoltStats() - ret[meta.qual].Statistic.Created = time.Now() - ret[meta.qual].Statistic.Cols = meta.cols - ret[meta.qual].Statistic.Typs = types - ret[meta.qual].Statistic.Qual = meta.qual - - ret[meta.qual].Statistic.Fds = fds - ret[meta.qual].Statistic.Colset = colSet - ret[meta.qual].Tb = val.NewTupleBuilder(prollyMap.KeyDesc().PrefixDesc(len(meta.cols))) - - continue - } - - firstRow, err := firstRowForIndex(ctx, prollyMap, keyBuilder, len(meta.cols)) - if err != nil { - return nil, err - } - - updater := newBucketBuilder(meta.qual, len(meta.cols), prollyMap.KeyDesc()) - ret[meta.qual] = NewDoltStats() - ret[meta.qual].Chunks = meta.allAddrs - ret[meta.qual].Statistic.Created = time.Now() - ret[meta.qual].Statistic.Cols = meta.cols - ret[meta.qual].Statistic.Typs = types - ret[meta.qual].Statistic.Qual = meta.qual - ret[meta.qual].Tb = val.NewTupleBuilder(prollyMap.KeyDesc().PrefixDesc(len(meta.cols))) - - var start, stop uint64 - // read leaf rows for each bucket - for i, chunk := range meta.newNodes { - // each node is a bucket - updater.newBucket() - - // we read exclusive range [node first key, next node first key) - start, stop = meta.updateOrdinals[i].start, meta.updateOrdinals[i].stop - iter, err := prollyMap.IterOrdinalRange(ctx, start, stop) - if err != nil { - return nil, err - } - for { - // stats key will be a prefix of the index key - keyBytes, _, err := iter.Next(ctx) - if errors.Is(err, io.EOF) { - break - } else if err != nil { - return nil, err - } - // build full key - for i := range keyBuilder.Desc.Types { - keyBuilder.PutRaw(i, keyBytes.GetField(i)) - } - - updater.add(keyBuilder.BuildPrefixNoRecycle(prollyMap.Pool(), updater.prefixLen)) - keyBuilder.Recycle() - } - - // finalize the aggregation - bucket, err := updater.finalize(ctx, prollyMap.NodeStore()) - if err != nil { - return nil, err - } - bucket.Chunk = chunk.HashOf() - ret[updater.qual].Hist = append(ret[updater.qual].Hist, bucket) - } - - ret[updater.qual].Statistic.DistinctCnt = uint64(updater.globalDistinct) - ret[updater.qual].Statistic.RowCnt = uint64(updater.globalCount) - ret[updater.qual].Statistic.LowerBnd = firstRow - ret[updater.qual].Statistic.Fds = fds - ret[updater.qual].Statistic.Colset = colSet - ret[updater.qual].UpdateActive() - } - return ret, nil -} - -// MergeNewChunks combines a set of old and new chunks 
to create -// the desired target histogram. Undefined behavior if a |targetHash| -// does not exist in either |oldChunks| or |newChunks|. -func MergeNewChunks(inputHashes []hash.Hash, oldChunks, newChunks []sql.HistogramBucket) ([]sql.HistogramBucket, error) { - hashToPos := make(map[hash.Hash]int, len(inputHashes)) - for i, h := range inputHashes { - hashToPos[h] = i - } - - var cnt int - targetBuckets := make([]sql.HistogramBucket, len(inputHashes)) - for _, c := range oldChunks { - if idx, ok := hashToPos[DoltBucketChunk(c)]; ok { - cnt++ - targetBuckets[idx] = c - } - } - for _, c := range newChunks { - if idx, ok := hashToPos[DoltBucketChunk(c)]; ok && targetBuckets[idx] == nil { - cnt++ - targetBuckets[idx] = c - } - } - if cnt != len(inputHashes) { - return nil, fmt.Errorf("encountered invalid statistic chunks") - } - return targetBuckets, nil -} - -func firstRowForIndex(ctx *sql.Context, prollyMap prolly.Map, keyBuilder *val.TupleBuilder, prefixLen int) (sql.Row, error) { +func firstRowForIndex(ctx *sql.Context, prollyMap prolly.Map, keyBuilder *val.TupleBuilder) (sql.Row, error) { if cnt, err := prollyMap.Count(); err != nil { return nil, err } else if cnt == 0 { @@ -208,9 +53,9 @@ func firstRowForIndex(ctx *sql.Context, prollyMap prolly.Map, keyBuilder *val.Tu keyBuilder.PutRaw(i, keyBytes.GetField(i)) } - firstKey := keyBuilder.BuildPrefixNoRecycle(buffPool, prefixLen) - firstRow := make(sql.Row, prefixLen) - for i := 0; i < prefixLen; i++ { + firstKey := keyBuilder.Build(buffPool) + firstRow := make(sql.Row, firstKey.Count()) + for i := range firstRow { firstRow[i], err = tree.GetField(ctx, prollyMap.KeyDesc(), i, firstKey, prollyMap.NodeStore()) if err != nil { return nil, err @@ -266,7 +111,7 @@ func (u *bucketBuilder) newBucket() { // finalize converts the current aggregation stats into a histogram bucket, // which includes deserializing most common value tuples into sql.Rows. 
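For orientation, the builder's call sequence as used by createNewStatsBuckets earlier in this patch; a sketch with the per-chunk iteration abbreviated, where qual, prefixLen, prefixKey (a val.Tuple), and prollyMap are assumed to be in scope:

	updater := newBucketBuilder(qual, prefixLen, prollyMap.KeyDesc())
	updater.newBucket()    // one histogram bucket per tree chunk
	updater.add(prefixKey) // once per leaf key in the chunk's ordinal range
	// finalize returns *stats.Bucket after this patch (previously DoltBucket).
	bucket, err := updater.finalize(ctx, prollyMap.NodeStore())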
-func (u *bucketBuilder) finalize(ctx context.Context, ns tree.NodeStore) (DoltBucket, error) { +func (u *bucketBuilder) finalize(ctx context.Context, ns tree.NodeStore) (*stats.Bucket, error) { // update MCV in case we've ended on a run of many identical keys u.updateMcv() @@ -276,27 +121,25 @@ func (u *bucketBuilder) finalize(ctx context.Context, ns tree.NodeStore) (DoltBu // convert the MCV tuples into SQL rows (most efficient to only do this once) mcvRows, err := u.mcvs.Values(ctx, u.tupleDesc, ns, u.prefixLen) if err != nil { - return DoltBucket{}, err + return nil, err } upperBound := make(sql.Row, u.prefixLen) if u.currentKey != nil { for i := 0; i < u.prefixLen; i++ { upperBound[i], err = tree.GetField(ctx, u.tupleDesc, i, u.currentKey, ns) if err != nil { - return DoltBucket{}, err + return nil, err } } } - return DoltBucket{ - Bucket: &stats.Bucket{ - RowCnt: uint64(u.count), - DistinctCnt: uint64(u.distinct), - BoundCnt: uint64(u.currentCnt), - McvVals: mcvRows, - McvsCnt: u.mcvs.Counts(), - BoundVal: upperBound, - NullCnt: uint64(u.nulls), - }, + return &stats.Bucket{ + RowCnt: uint64(u.count), + DistinctCnt: uint64(u.distinct), + BoundCnt: uint64(u.currentCnt), + McvVals: mcvRows, + McvsCnt: u.mcvs.Counts(), + BoundVal: upperBound, + NullCnt: uint64(u.nulls), }, nil } diff --git a/go/libraries/doltcore/sqle/statspro/update_test.go b/go/libraries/doltcore/sqle/statspro/bucket_builder_test.go similarity index 92% rename from go/libraries/doltcore/sqle/statspro/update_test.go rename to go/libraries/doltcore/sqle/statspro/bucket_builder_test.go index ef670e19c8b..e97ad343755 100644 --- a/go/libraries/doltcore/sqle/statspro/update_test.go +++ b/go/libraries/doltcore/sqle/statspro/bucket_builder_test.go @@ -61,27 +61,27 @@ func TestBucketBuilder(t *testing.T) { name string keys []sql.Row keyDesc val.TupleDesc - bucket DoltBucket + bucket *stats.Bucket }{ { name: "ints", keys: []sql.Row{{1}, {1}, {1}, {2}, {2}, {2}, {2}, {3}, {3}, {3}, {4}, {4}, {4}, {5}, {5}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: false}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 15, DistinctCnt: 5, McvVals: []sql.Row{}, McvsCnt: []uint64{}, BoundVal: sql.Row{int64(5)}, BoundCnt: 2, - }}, + }, }, { // technically nulls should be at beginning name: "ints with middle nulls", keys: []sql.Row{{1}, {1}, {1}, {2}, {2}, {2}, {2}, {nil}, {nil}, {nil}, {3}, {4}, {4}, {4}, {5}, {5}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: true}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 16, DistinctCnt: 6, NullCnt: 3, @@ -89,13 +89,13 @@ func TestBucketBuilder(t *testing.T) { McvsCnt: []uint64{}, BoundVal: sql.Row{int64(5)}, BoundCnt: 2, - }}, + }, }, { name: "ints with beginning nulls", keys: []sql.Row{{nil}, {nil}, {1}, {2}, {2}, {2}, {2}, {3}, {3}, {3}, {4}, {4}, {4}, {5}, {5}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: true}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 15, DistinctCnt: 6, NullCnt: 2, @@ -103,86 +103,86 @@ func TestBucketBuilder(t *testing.T) { McvsCnt: []uint64{}, BoundVal: sql.Row{int64(5)}, BoundCnt: 2, - }}, + }, }, { name: "more ints", keys: []sql.Row{{1}, {1}, {1}, {2}, {2}, {2}, {2}, {3}, {3}, {3}, {4}, {4}, {4}, {5}, {5}, {5}, {5}, {6}, {6}, {6}, {6}, {7}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: false}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 22, 
DistinctCnt: 7, BoundCnt: 1, McvVals: []sql.Row{}, McvsCnt: []uint64{}, BoundVal: sql.Row{int64(7)}, - }}, + }, }, { name: "2-ints", keys: []sql.Row{{1, 1}, {1, 1}, {1, 2}, {2, 1}, {2, 2}, {2, 3}, {2, 3}, {3, 1}, {3, 2}, {3, 3}, {4, 1}, {4, 1}, {4, 1}, {5, 1}, {5, 2}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: false}, val.Type{Enc: val.Int64Enc, Nullable: false}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 15, DistinctCnt: 11, McvVals: []sql.Row{{int64(4), int64(1)}}, McvsCnt: []uint64{3}, BoundVal: sql.Row{int64(5), int64(2)}, BoundCnt: 1, - }}, + }, }, { name: "2-ints with nulls", keys: []sql.Row{{nil, 1}, {1, nil}, {1, 2}, {2, nil}, {2, 2}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: true}, val.Type{Enc: val.Int64Enc, Nullable: true}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 5, DistinctCnt: 5, NullCnt: 3, McvVals: []sql.Row{}, McvsCnt: []uint64{}, BoundVal: sql.Row{int64(2), int64(2)}, - BoundCnt: 1}, + BoundCnt: 1, }, }, { name: "varchars", keys: []sql.Row{{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, {"e"}, {"f"}, {"g"}, {"g"}, {"g"}, {"h"}, {"h"}, {"h"}, {"i"}, {"i"}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.StringEnc, Nullable: false}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 15, DistinctCnt: 9, McvVals: []sql.Row{}, McvsCnt: []uint64{}, BoundVal: sql.Row{"i"}, BoundCnt: 2, - }}, + }, }, { name: "varchar-ints", keys: []sql.Row{{"a", 1}, {"b", 1}, {"c", 1}, {"d", 1}, {"e", 1}, {"e", 2}, {"f", 1}, {"g", 1}, {"g", 2}, {"g", 2}, {"h", 1}, {"h", 1}, {"h", 2}, {"i", 1}, {"i", 1}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.StringEnc, Nullable: false}, val.Type{Enc: val.Int64Enc, Nullable: false}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 15, DistinctCnt: 12, McvVals: []sql.Row{}, McvsCnt: []uint64{}, BoundVal: sql.Row{"i", int64(1)}, BoundCnt: 2, - }}, + }, }, { name: "mcvs", keys: []sql.Row{{1}, {2}, {3}, {4}, {5}, {6}, {7}, {7}, {7}, {7}, {8}, {9}, {10}, {10}, {10}, {11}, {12}, {13}, {14}, {15}, {20}, {21}, {22}}, keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: false}), - bucket: DoltBucket{Bucket: &stats.Bucket{ + bucket: &stats.Bucket{ RowCnt: 23, DistinctCnt: 18, McvVals: []sql.Row{{int64(10)}, {int64(7)}}, McvsCnt: []uint64{3, 4}, BoundVal: sql.Row{int64(22)}, BoundCnt: 1, - }}, + }, }, } diff --git a/go/libraries/doltcore/sqle/statspro/configure.go b/go/libraries/doltcore/sqle/statspro/configure.go deleted file mode 100644 index f8492a08b61..00000000000 --- a/go/libraries/doltcore/sqle/statspro/configure.go +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package statspro - -import ( - "context" - "fmt" - "strings" - "time" - - "github.com/dolthub/go-mysql-server/sql" - types2 "github.com/dolthub/go-mysql-server/sql/types" - - "github.com/dolthub/dolt/go/libraries/doltcore/env" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/utils/filesys" -) - -var helpMsg = "call dolt_stats_purge() to reset statistics" - -func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Context) (*sql.Context, error), bThreads *sql.BackgroundThreads, dbs []dsess.SqlDatabase) error { - p.SetStarter(NewStatsInitDatabaseHook(p, ctxFactory, bThreads)) - - if _, disabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsMemoryOnly); disabled == int8(1) { - return nil - } - - loadCtx, err := ctxFactory(ctx) - if err != nil { - return err - } - - branches := p.getStatsBranches(loadCtx) - - var autoEnabled bool - var startupEnabled bool - var intervalSec time.Duration - var thresholdf64 float64 - if _, enabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshEnabled); enabled == int8(1) { - autoEnabled = true - _, threshold, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshThreshold) - _, interval, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshInterval) - interval64, _, _ := types2.Int64.Convert(interval) - intervalSec = time.Second * time.Duration(interval64.(int64)) - thresholdf64 = threshold.(float64) - - p.pro.InitDatabaseHooks = append(p.pro.InitDatabaseHooks, NewStatsInitDatabaseHook(p, ctxFactory, bThreads)) - p.pro.DropDatabaseHooks = append([]sqle.DropDatabaseHook{NewStatsDropDatabaseHook(p)}, p.pro.DropDatabaseHooks...) - } else if _, startupStats, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsBootstrapEnabled); startupStats == int8(1) { - startupEnabled = true - } - - eg, ctx := loadCtx.NewErrgroup() - for _, db := range dbs { - // copy closure variables - db := db - eg.Go(func() (err error) { - defer func() { - if r := recover(); r != nil { - if str, ok := r.(fmt.Stringer); ok { - err = fmt.Errorf("%w: %s", ErrFailedToLoad, str.String()) - } else { - err = fmt.Errorf("%w: %v", ErrFailedToLoad, r) - } - return - } - }() - - fs, err := p.pro.FileSystemForDatabase(db.Name()) - if err != nil { - return err - } - - if p.Load(loadCtx, fs, db, branches); err != nil { - return err - } - if autoEnabled { - return p.InitAutoRefreshWithParams(ctxFactory, db.Name(), bThreads, intervalSec, thresholdf64, branches) - } else if startupEnabled { - if err := p.BootstrapDatabaseStats(loadCtx, db.Name()); err != nil { - return err - } - } - return nil - }) - } - return eg.Wait() -} - -// getStatsBranches returns the set of branches whose statistics are tracked. -// The order of precedence is (1) global variable, (2) session current branch, -// (3) engine default branch. 
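The global-variable side of that precedence can be set programmatically; a hedged sketch using the same SetGlobal API this patch uses elsewhere (the branch names are illustrative, and the comma-separated, whitespace-tolerant format matches the parsing in the function below):

	// Track statistics for exactly these branches.
	sql.SystemVariables.SetGlobal(dsess.DoltStatsBranches, "main, feature-branch")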
-func (p *Provider) getStatsBranches(ctx *sql.Context) []string {
-	dSess := dsess.DSessFromSess(ctx.Session)
-	var branches []string
-	if _, bs, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsBranches); bs == "" {
-		defaultBranch, _ := dSess.GetBranch()
-		if defaultBranch != "" {
-			branches = append(branches, defaultBranch)
-		}
-	} else {
-		for _, branch := range strings.Split(bs.(string), ",") {
-			branches = append(branches, strings.TrimSpace(branch))
-		}
-	}
-
-	if branches == nil {
-		branches = append(branches, p.pro.DefaultBranch())
-	}
-	return branches
-}
-
-func (p *Provider) LoadStats(ctx *sql.Context, db, branch string) error {
-	if statDb, ok := p.getStatDb(db); ok {
-		return statDb.LoadBranchStats(ctx, branch)
-	}
-	return nil
-}
-
-// Load scans the statistics tables, populating the |stats| attribute.
-// Statistics are not available for reading until we've finished loading.
-func (p *Provider) Load(ctx *sql.Context, fs filesys.Filesys, db dsess.SqlDatabase, branches []string) {
-	// |statPath| is either file://./stat or mem://stat
-	statsDb, err := p.sf.Init(ctx, db, p.pro, fs, env.GetCurrentUserHomeDir)
-	if err != nil {
-		ctx.GetLogger().Errorf("initialize stats failure for %s: %s; %s\n", db.Name(), err.Error(), helpMsg)
-		return
-	}
-
-	for _, branch := range branches {
-		if err = statsDb.LoadBranchStats(ctx, branch); err != nil {
-			// if branch name is invalid, continue loading rest
-			// TODO: differentiate bad branch name from other errors
-			ctx.GetLogger().Errorf("load stats init failure for %s: %s; %s\n", db.Name(), err.Error(), helpMsg)
-			continue
-		}
-		if err := statsDb.Flush(ctx, branch); err != nil {
-			ctx.GetLogger().Errorf("load stats flush failure for %s: %s; %s\n", db.Name(), err.Error(), helpMsg)
-			continue
-		}
-	}
-
-	p.setStatDb(strings.ToLower(db.Name()), statsDb)
-	return
-}
diff --git a/go/libraries/doltcore/sqle/statspro/doc.go b/go/libraries/doltcore/sqle/statspro/doc.go
new file mode 100644
index 00000000000..281ae80f16e
--- /dev/null
+++ b/go/libraries/doltcore/sqle/statspro/doc.go
@@ -0,0 +1,79 @@
+// Copyright 2025 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statspro
+
+// Package statspro provides an event loop that manages table statistics
+// collection and access.
+//
+// At any given time there is one thread responsible for pulling work
+// from the job queue to execute. The thread has exclusive ownership
+// over the job channel.
+//
+// All stats are persisted within a single database. If there are multiple
+// databases, one is selected at random as the storage target. If during
+// initialization multiple databases have stats, one will be chosen at
+// random as the target. If a database changes between server restarts,
+// the stored stats will be useless but will not impair operations, because
+// storage is only ever a best-effort, content-addressed persistence layer;
+// buckets will be regenerated if they are missing. If the database acting
+// If the database acting as a storage target is deleted, we swap the cache
+// to write to a new storage target that still exists.
+//
+// The main data structures:
+//   - Table statistics map, which returns a list of table index statistics
+//     for a specific branch, database, and table name.
+//   - Object caches:
+//     - Bucket cache: Chunk-addressed histogram buckets. All provider
+//       histogram references should be in the bucket cache. This is an LRU
+//       that is sized to always fit the current active set, and doubles
+//       when the provider bucket counter reaches the threshold. Backed
+//       by a best-effort on-disk prolly.Map to make restarts faster.
+//     - Template cache: Table-schema/index-addressed stats.Statistic
+//       objects for a specific index.
+//     - Bound cache: Chunk-addressed first row for an index histogram.
+//
+// Work is broken down into:
+//   - A basic update cycle of (1) seed database tables, (2) create or pull
+//     buckets from disk, (3) commit statistics accessed by the provider.
+//   - GC cycle: Mark and sweep the most recent context's active set into
+//     new cache/prolly.Map objects.
+//   - Branch sync: Update the tracked set of branch-qualified databases.
+//
+// Regular jobs, GC, and branch-sync are all driven by top-level tickers
+// that control the maximum rate at which each is called. GC and
+// branch-sync are prioritized before jobs, and are therefore rate-limited
+// to allow the job queue to flush between calls.
+//
+// DDL operations and branch create/delete run concurrently with the event
+// loop. We require an extra fixed-size queue as an intermediary to the
+// job queue to protect the main thread's ownership. DDL acquiring the
+// provider lock is a deadlock risk -- we cannot do any provider checks
+// while holding the db lock. And lastly, the way update jobs are split
+// up over time means we need to do special checks when finalizing a set
+// of database stats. A race between deleting a database and finalizing
+// statistics needs to end with no statistics, which requires a delete check
+// after finalize.
+//
+// The stats lifecycle can be controlled with:
+//   - dolt_stats_stop: clear queue and disable thread
+//   - dolt_stats_restart: clear queue, refresh queue, start thread
+//   - dolt_stats_purge: clear queue, clear cache, refresh queue,
+//     disable thread
+//   - dolt_stats_validate: return report of cache misses for current
+//     root value.
+//
+// `dolt_stats_wait` is additionally useful for blocking on a full
+// queue cycle and then validating whether the session head is caught up.
+package statspro
diff --git a/go/libraries/doltcore/sqle/statspro/dolt_stats.go b/go/libraries/doltcore/sqle/statspro/dolt_stats.go
deleted file mode 100644
index 4c5d43250c9..00000000000
--- a/go/libraries/doltcore/sqle/statspro/dolt_stats.go
+++ /dev/null
@@ -1,290 +0,0 @@
-// Copyright 2024 Dolthub, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -package statspro - -import ( - "context" - "fmt" - "sync" - "time" - - "github.com/dolthub/go-mysql-server/sql" - "github.com/dolthub/go-mysql-server/sql/stats" - - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/val" -) - -type DoltStats struct { - Statistic *stats.Statistic - mu *sync.Mutex - // Chunks is a list of addresses for the histogram fanout level - Chunks []hash.Hash - // Active maps a chunk/bucket address to its position in - // the histogram. 1-indexed to differentiate from an empty - // field on disk - Active map[hash.Hash]int - Hist sql.Histogram - Tb *val.TupleBuilder -} - -func (s *DoltStats) Clone(_ context.Context) sql.JSONWrapper { - return s -} - -var _ sql.Statistic = (*DoltStats)(nil) - -func (s *DoltStats) SetChunks(h []hash.Hash) { - s.mu.Lock() - defer s.mu.Unlock() - s.Chunks = h -} - -func (s *DoltStats) WithColSet(set sql.ColSet) sql.Statistic { - ret := *s - ret.Statistic = ret.Statistic.WithColSet(set).(*stats.Statistic) - return &ret -} - -func (s *DoltStats) WithFuncDeps(set *sql.FuncDepSet) sql.Statistic { - ret := *s - ret.Statistic = ret.Statistic.WithFuncDeps(set).(*stats.Statistic) - return &ret -} - -func (s *DoltStats) WithDistinctCount(u uint64) sql.Statistic { - ret := *s - ret.Statistic = ret.Statistic.WithDistinctCount(u).(*stats.Statistic) - return &ret -} - -func (s *DoltStats) WithRowCount(u uint64) sql.Statistic { - ret := *s - ret.Statistic = ret.Statistic.WithRowCount(u).(*stats.Statistic) - return &ret -} - -func (s *DoltStats) WithNullCount(u uint64) sql.Statistic { - ret := *s - ret.Statistic = ret.Statistic.WithNullCount(u).(*stats.Statistic) - return &ret -} - -func (s *DoltStats) WithAvgSize(u uint64) sql.Statistic { - ret := *s - ret.Statistic = ret.Statistic.WithAvgSize(u).(*stats.Statistic) - return &ret -} - -func (s *DoltStats) WithLowerBound(row sql.Row) sql.Statistic { - ret := *s - ret.Statistic = ret.Statistic.WithLowerBound(row).(*stats.Statistic) - return &ret -} - -func (s *DoltStats) RowCount() uint64 { - return s.Statistic.RowCount() -} - -func (s *DoltStats) DistinctCount() uint64 { - return s.Statistic.DistinctCount() -} - -func (s *DoltStats) NullCount() uint64 { - return s.Statistic.NullCount() - -} - -func (s *DoltStats) AvgSize() uint64 { - return s.Statistic.AvgSize() - -} - -func (s *DoltStats) CreatedAt() time.Time { - return s.Statistic.CreatedAt() - -} - -func (s *DoltStats) Columns() []string { - return s.Statistic.Columns() -} - -func (s *DoltStats) Types() []sql.Type { - return s.Statistic.Types() -} - -func (s *DoltStats) Qualifier() sql.StatQualifier { - return s.Statistic.Qualifier() -} - -func (s *DoltStats) IndexClass() sql.IndexClass { - return s.Statistic.IndexClass() -} - -func (s *DoltStats) FuncDeps() *sql.FuncDepSet { - return s.Statistic.FuncDeps() -} - -func (s *DoltStats) ColSet() sql.ColSet { - return s.Statistic.ColSet() -} - -func (s *DoltStats) LowerBound() sql.Row { - return s.Statistic.LowerBound() -} - -func NewDoltStats() *DoltStats { - return &DoltStats{mu: &sync.Mutex{}, Active: make(map[hash.Hash]int), Statistic: &stats.Statistic{}} -} - -func (s *DoltStats) ToInterface() (interface{}, error) { - statVal, err := s.Statistic.ToInterface() - if err != nil { - return nil, err - } - ret := statVal.(map[string]interface{}) - - var hist sql.Histogram - for _, b := range s.Hist { - hist = append(hist, b) - } - histVal, err := hist.ToInterface() - if err != nil { - return nil, err - } - ret["statistic"].(map[string]interface{})["buckets"] = histVal - return 
ret, nil -} - -func (s *DoltStats) WithHistogram(h sql.Histogram) (sql.Statistic, error) { - s.mu.Lock() - defer s.mu.Unlock() - ret := *s - ret.Hist = nil - for _, b := range h { - doltB, ok := b.(DoltBucket) - if !ok { - return nil, fmt.Errorf("invalid bucket type: %T, %s", b, h.DebugString()) - } - ret.Hist = append(ret.Hist, doltB) - } - return &ret, nil -} - -func (s *DoltStats) Histogram() sql.Histogram { - s.mu.Lock() - defer s.mu.Unlock() - return s.Hist -} - -func DoltStatsFromSql(stat sql.Statistic) (*DoltStats, error) { - hist, err := DoltHistFromSql(stat.Histogram(), stat.Types()) - if err != nil { - return nil, err - } - ret := &DoltStats{ - mu: &sync.Mutex{}, - Hist: hist, - Statistic: stats.NewStatistic(stat.RowCount(), stat.DistinctCount(), stat.NullCount(), stat.AvgSize(), stat.CreatedAt(), stat.Qualifier(), stat.Columns(), stat.Types(), nil, stat.IndexClass(), stat.LowerBound()), - Active: make(map[hash.Hash]int), - } - ret.Statistic.Fds = stat.FuncDeps() - ret.Statistic.Colset = stat.ColSet() - return ret, nil -} - -func (s *DoltStats) UpdateActive() { - s.mu.Lock() - defer s.mu.Unlock() - newActive := make(map[hash.Hash]int) - for i, hash := range s.Chunks { - newActive[hash] = i - } - s.Active = newActive -} - -type DoltHistogram []DoltBucket - -type DoltBucket struct { - Bucket *stats.Bucket - Chunk hash.Hash - Created time.Time -} - -func (d DoltBucket) RowCount() uint64 { - return d.Bucket.RowCount() -} - -func (d DoltBucket) DistinctCount() uint64 { - return d.Bucket.DistinctCount() -} - -func (d DoltBucket) NullCount() uint64 { - return d.Bucket.NullCount() -} - -func (d DoltBucket) BoundCount() uint64 { - return d.Bucket.BoundCount() -} - -func (d DoltBucket) UpperBound() sql.Row { - return d.Bucket.UpperBound() -} - -func (d DoltBucket) McvCounts() []uint64 { - return d.Bucket.McvCounts() -} - -func (d DoltBucket) Mcvs() []sql.Row { - return d.Bucket.Mcvs() -} - -func DoltBucketChunk(b sql.HistogramBucket) hash.Hash { - return b.(DoltBucket).Chunk -} - -func DoltBucketCreated(b sql.HistogramBucket) time.Time { - return b.(DoltBucket).Created -} - -var _ sql.HistogramBucket = (*DoltBucket)(nil) - -func DoltHistFromSql(hist sql.Histogram, types []sql.Type) (sql.Histogram, error) { - ret := make(sql.Histogram, len(hist)) - var err error - for i, b := range hist { - upperBound := make(sql.Row, len(b.UpperBound())) - for i, v := range b.UpperBound() { - upperBound[i], _, err = types[i].Convert(v) - if err != nil { - return nil, fmt.Errorf("failed to convert %v to type %s", v, types[i].String()) - } - } - mcvs := make([]sql.Row, len(b.Mcvs())) - for i, mcv := range b.Mcvs() { - for _, v := range mcv { - conv, _, err := types[i].Convert(v) - if err != nil { - return nil, fmt.Errorf("failed to convert %v to type %s", v, types[i].String()) - } - mcvs[i] = append(mcvs[i], conv) - } - } - ret[i] = DoltBucket{ - Bucket: stats.NewHistogramBucket(b.RowCount(), b.DistinctCount(), b.NullCount(), b.BoundCount(), upperBound, b.McvCounts(), mcvs).(*stats.Bucket), - } - } - return ret, nil -} diff --git a/go/libraries/doltcore/sqle/statspro/gc.go b/go/libraries/doltcore/sqle/statspro/gc.go new file mode 100644 index 00000000000..6d476e37d06 --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/gc.go @@ -0,0 +1,196 @@ +// Copyright 2025 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statspro
+
+import (
+	"context"
+	"errors"
+	"log"
+	"strconv"
+	"strings"
+
+	"github.com/dolthub/go-mysql-server/sql"
+
+	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
+	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
+	"github.com/dolthub/dolt/go/store/prolly/tree"
+	"github.com/dolthub/dolt/go/store/val"
+)
+
+type GcMarkJob struct {
+	sqlDb dsess.SqlDatabase
+	done  chan struct{}
+}
+
+func NewGcMarkJob(sqlDb dsess.SqlDatabase) GcMarkJob {
+	return GcMarkJob{
+		sqlDb: sqlDb,
+		done:  make(chan struct{}),
+	}
+}
+
+func (j GcMarkJob) Finish() {
+	close(j.done)
+}
+
+func (j GcMarkJob) String() string {
+	b := strings.Builder{}
+	b.WriteString("gcMark: ")
+	b.WriteString(j.sqlDb.RevisionQualifiedName())
+	return b.String()
+}
+
+func (sc *StatsCoord) runGc(ctx context.Context, done chan struct{}) (err error) {
+	defer func() {
+		if err != nil {
+			sc.enableGc.Store(true)
+			close(done)
+		}
+	}()
+
+	if !sc.enableGc.Swap(false) {
+		// GC is disabled or already running; close |done| so callers
+		// blocking on it are not stranded.
+		close(done)
+		return nil
+	}
+
+	if sc.Debug {
+		log.Println("stats gc number: ", strconv.Itoa(int(sc.gcCounter.Load())))
+	}
+
+	sc.gcCounter.Add(1)
+
+	sc.gcMu.Lock()
+	defer sc.gcMu.Unlock()
+
+	sqlCtx, err := sc.ctxGen(ctx)
+	if err != nil {
+		return err
+	}
+
+	if err := sc.kv.StartGc(ctx, int(sc.bucketCap)); err != nil {
+		return err
+	}
+
+	// Can't take |dbMu| and provider lock, so copy dbs out.
+	// Unlike branch updates, it is OK if GC misses databases
+	// added in-between GC start and end because stats collection
+	// is paused for the duration.
+	sc.dbMu.Lock()
+	dbs := make([]dsess.SqlDatabase, len(sc.dbs))
+	copy(dbs, sc.dbs)
+	sc.ddlGuard = true
+	sc.dbMu.Unlock()
+
+	var bucketCnt int
+	for _, db := range dbs {
+		j := NewGcMarkJob(db)
+		cnt, err := sc.gcMark(sqlCtx, j)
+		if sql.ErrDatabaseNotFound.Is(err) {
+			// concurrent delete
+			continue
+		} else if errors.Is(err, doltdb.ErrWorkingSetNotFound) {
+			// branch registered but no data
+			continue
+		} else if err != nil {
+			return err
+		}
+		bucketCnt += cnt
+	}
+
+	//sc.bucketCnt.Store(int64(bucketCnt))
+	sc.bucketCap = sc.kv.Cap()
+	sc.kv.FinishGc()
+
+	// Avoid GC starving the loop; only re-enable after
+	// letting a block of other work through.
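+	// Note: we cannot simply flip |enableGc| here. The event loop owns
+	// the job channel, so the re-enable below is queued as a ControlJob
+	// through the interrupt queue, which guarantees at least one batch
+	// of pending jobs runs before the next GC can start.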
+	if err := sc.unsafeAsyncSend(ctx, NewControl("re-enable GC", func(sc *StatsCoord) error {
+		sc.enableGc.Store(true)
+		close(done)
+		return nil
+	})); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (sc *StatsCoord) gcMark(sqlCtx *sql.Context, j GcMarkJob) (int, error) {
+	dSess := dsess.DSessFromSess(sqlCtx.Session)
+	db, err := dSess.Provider().Database(sqlCtx, j.sqlDb.AliasedName())
+	if err != nil {
+		return 0, err
+	}
+	sqlDb, err := sqle.RevisionDbForBranch(sqlCtx, db.(dsess.SqlDatabase), j.sqlDb.Revision(), j.sqlDb.Revision()+"/"+j.sqlDb.AliasedName())
+	if err != nil {
+		return 0, err
+	}
+	tableNames, err := sqlDb.GetTableNames(sqlCtx)
+	if err != nil {
+		return 0, err
+	}
+
+	var bucketCnt int
+	for _, tableName := range tableNames {
+		sqlTable, dTab, err := GetLatestTable(sqlCtx, tableName, j.sqlDb)
+		if err != nil {
+			return 0, err
+		}
+		indexes, err := sqlTable.GetIndexes(sqlCtx)
+		if err != nil {
+			return 0, err
+		}
+
+		for _, sqlIdx := range indexes {
+			var idx durable.Index
+			var err error
+			if strings.EqualFold(sqlIdx.ID(), "PRIMARY") {
+				idx, err = dTab.GetRowData(sqlCtx)
+			} else {
+				idx, err = dTab.GetIndexRowData(sqlCtx, sqlIdx.ID())
+			}
+			if err != nil {
+				return 0, err
+			}
+
+			schHash, _, err := sqlTable.IndexCacheKey(sqlCtx)
+			if err != nil {
+				return 0, err
+			}
+			key := templateCacheKey{h: schHash.Hash, idxName: sqlIdx.ID()}
+			// touch the template so the mark phase retains it
+			sc.kv.GetTemplate(key)
+
+			idxLen := len(sqlIdx.Expressions())
+
+			prollyMap := durable.ProllyMapFromIndex(idx)
+			levelNodes, err := tree.GetHistogramLevel(sqlCtx, prollyMap.Tuples(), bucketLowCnt)
+			if err != nil {
+				return 0, err
+			}
+
+			if len(levelNodes) == 0 {
+				continue
+			}
+
+			bucketCnt += len(levelNodes)
+
+			firstNodeHash := levelNodes[0].HashOf()
+			// touch the histogram's lower bound so the mark phase retains it
+			sc.kv.GetBound(firstNodeHash, idxLen)
+
+			for _, n := range levelNodes {
+				err = sc.kv.MarkBucket(sqlCtx, n.HashOf(), val.NewTupleBuilder(prollyMap.KeyDesc().PrefixDesc(idxLen)))
+				if err != nil {
+					return 0, err
+				}
+			}
+		}
+	}
+	return bucketCnt, nil
+}
diff --git a/go/libraries/doltcore/sqle/statspro/initdbhook.go b/go/libraries/doltcore/sqle/statspro/initdbhook.go
index 8e11408ea59..d0b11604254 100644
--- a/go/libraries/doltcore/sqle/statspro/initdbhook.go
+++ b/go/libraries/doltcore/sqle/statspro/initdbhook.go
@@ -15,10 +15,6 @@
 package statspro
 
 import (
-	"context"
-	"fmt"
-	"strings"
-
 	"github.com/dolthub/go-mysql-server/sql"
 
 	"github.com/dolthub/dolt/go/libraries/doltcore/env"
@@ -26,67 +22,35 @@ import (
 	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
 )
 
-func NewStatsInitDatabaseHook(
-	statsProv *Provider,
-	ctxFactory func(ctx context.Context) (*sql.Context, error),
-	bThreads *sql.BackgroundThreads,
-) sqle.InitDatabaseHook {
+func NewInitDatabaseHook(sc *StatsCoord) sqle.InitDatabaseHook {
 	return func(
 		ctx *sql.Context,
-		pro *sqle.DoltDatabaseProvider,
+		_ *sqle.DoltDatabaseProvider,
 		name string,
 		denv *env.DoltEnv,
 		db dsess.SqlDatabase,
 	) error {
-		dbName := strings.ToLower(db.Name())
-		if statsDb, ok := statsProv.getStatDb(dbName); !ok {
-			statsDb, err := statsProv.sf.Init(ctx, db, statsProv.pro, denv.FS, env.GetCurrentUserHomeDir)
-			if err != nil {
-				ctx.GetLogger().Debugf("statistics load error: %s", err.Error())
-				return nil
-			}
-			statsProv.setStatDb(dbName, statsDb)
-		} else {
-			dSess := dsess.DSessFromSess(ctx.Session)
-			for _, br := range statsDb.Branches() {
-				branchQDbName := BranchQualifiedDatabase(dbName, br)
-				sqlDb, err := dSess.Provider().Database(ctx, branchQDbName)
-				if err != nil {
-					ctx.GetLogger().Logger.Errorf("branch not found: %s", br)
-					continue
-				}
-				branchQDb, ok := 
sqlDb.(dsess.SqlDatabase) - if !ok { - return fmt.Errorf("branch/database not found: %s", branchQDbName) - } + head := denv.RepoState.Head + + sqlDb, ok := db.(sqle.Database) + if !ok { + sc.logger.Debugf("stats initialize db failed, expected *sqle.Database, found %T", db) + return nil + } - if ok, err := statsDb.SchemaChange(ctx, br, branchQDb); err != nil { - return err - } else if ok { - if err := statsDb.DeleteBranchStats(ctx, br, true); err != nil { - return err - } - } - } - ctx.GetLogger().Debugf("statistics init error: preexisting stats db: %s", dbName) + // call should only fail if backpressure in secondary queue + _, err := sc.Add(ctx, sqlDb, head.Ref, denv.FS) + if err != nil { + sc.logger.Debugf("cannot initialize db stats for %s; queue is closed", sqlDb.AliasedName()) } - ctx.GetLogger().Debugf("statistics refresh: initialize %s", name) - return statsProv.InitAutoRefresh(ctxFactory, name, bThreads) + return nil } } -func NewStatsDropDatabaseHook(statsProv *Provider) sqle.DropDatabaseHook { +func NewDropDatabaseHook(sc *StatsCoord) sqle.DropDatabaseHook { return func(ctx *sql.Context, name string) { - statsProv.CancelRefreshThread(name) - if err := statsProv.DropDbStats(ctx, name, false); err != nil { + if err := sc.DropDbStats(ctx, name, false); err != nil { ctx.GetLogger().Debugf("failed to close stats database: %s", err) } - - if db, ok := statsProv.getStatDb(name); ok { - if err := db.Close(); err != nil { - ctx.GetLogger().Debugf("failed to close stats database: %s", err) - } - delete(statsProv.statDbs, name) - } } } diff --git a/go/libraries/doltcore/sqle/statspro/interface.go b/go/libraries/doltcore/sqle/statspro/interface.go deleted file mode 100644 index 5a423466f91..00000000000 --- a/go/libraries/doltcore/sqle/statspro/interface.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package statspro - -import ( - "context" - - "github.com/dolthub/go-mysql-server/sql" - - "github.com/dolthub/dolt/go/libraries/doltcore/env" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/utils/filesys" - "github.com/dolthub/dolt/go/store/hash" -) - -// Database is a backing store for a collection of DoltStats. -// Each stats database tracks a user database, with multiple -// branches potentially each having their own statistics. -type Database interface { - // ListStatQuals returns the list of index statistics for a branch. - ListStatQuals(branch string) []sql.StatQualifier - // LoadBranchStats starts tracking a specific branch's statistics. - LoadBranchStats(ctx *sql.Context, branch string) error - // DeleteBranchStats removes references to in memory index statistics. - // If |flush| is true delete the data from storage. - DeleteBranchStats(ctx *sql.Context, branch string, flush bool) error - // GetStat returns a branch's index statistics. 
- GetStat(branch string, qual sql.StatQualifier) (*DoltStats, bool) - //SetStat bulk replaces the statistic, deleting any previous version - SetStat(ctx context.Context, branch string, qual sql.StatQualifier, stats *DoltStats) error - //DeleteStats deletes a list of index statistics. - DeleteStats(ctx *sql.Context, branch string, quals ...sql.StatQualifier) - // ReplaceChunks is an update interface that lets a stats implementation - // decide how to edit stats for a stats refresh. - ReplaceChunks(ctx context.Context, branch string, qual sql.StatQualifier, targetHashes []hash.Hash, dropChunks, newChunks []sql.HistogramBucket) error - // Flush instructs the database to sync any partial state to disk - Flush(ctx context.Context, branch string) error - // Close finalizes any file references. - Close() error - // SetTableHash updates the most recently tracked table stats table hash - SetTableHash(branch, tableName string, h hash.Hash) - // GetTableHash returns the most recently tracked table stats table hash - GetTableHash(branch, tableName string) hash.Hash - // SetSchemaHash updates the most recently stored table stat's schema hash - SetSchemaHash(ctx context.Context, branch, tableName string, h hash.Hash) error - // GetSchemaHash returns the schema hash for the latest stored statistics - GetSchemaHash(ctx context.Context, branch, tableName string) (hash.Hash, error) - // Branches returns the set of branches with tracked statistics databases - Branches() []string - // SchemaChange returns false if any table schema in the session - // root is incompatible with the latest schema used to create a stored - // set of statistics. - SchemaChange(ctx *sql.Context, branch string, branchQdb dsess.SqlDatabase) (bool, error) -} - -// StatsFactory instances construct statistic databases. -type StatsFactory interface { - // Init gets a reference to the stats database for a dolt database - // rooted at the given filesystem. It will create the database if - // it does not exist. 
- Init(ctx *sql.Context, sourceDb dsess.SqlDatabase, prov *sqle.DoltDatabaseProvider, fs filesys.Filesys, hdp env.HomeDirProvider) (Database, error) -} diff --git a/go/libraries/doltcore/sqle/statspro/noop_provider.go b/go/libraries/doltcore/sqle/statspro/noop_provider.go new file mode 100644 index 00000000000..f54e84d51b3 --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/noop_provider.go @@ -0,0 +1,67 @@ +package statspro + +import ( + "github.com/dolthub/dolt/go/libraries/doltcore/env" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" + "github.com/dolthub/go-mysql-server/sql" +) + +type StatsNoop struct{} + +func (s StatsNoop) GetTableStats(ctx *sql.Context, db string, table sql.Table) ([]sql.Statistic, error) { + return nil, nil +} + +func (s StatsNoop) RefreshTableStats(ctx *sql.Context, table sql.Table, db string) error { + return nil +} + +func (s StatsNoop) SetStats(ctx *sql.Context, stats sql.Statistic) error { + return nil +} + +func (s StatsNoop) GetStats(ctx *sql.Context, qual sql.StatQualifier, cols []string) (sql.Statistic, bool) { + return nil, false +} + +func (s StatsNoop) DropStats(ctx *sql.Context, qual sql.StatQualifier, cols []string) error { + return nil +} + +func (s StatsNoop) DropDbStats(ctx *sql.Context, db string, flush bool) error { + return nil +} + +func (s StatsNoop) RowCount(ctx *sql.Context, db string, table sql.Table) (uint64, error) { + return 0, nil +} + +func (s StatsNoop) DataLength(ctx *sql.Context, db string, table sql.Table) (uint64, error) { + return 0, nil +} + +func (s StatsNoop) CancelRefreshThread(string) { + return +} + +func (s StatsNoop) StartRefreshThread(*sql.Context, dsess.DoltDatabaseProvider, string, *env.DoltEnv, dsess.SqlDatabase) error { + return nil +} + +func (s StatsNoop) ThreadStatus(string) string { + return "stats disabled" +} + +func (s StatsNoop) Prune(ctx *sql.Context) error { + return nil +} + +func (s StatsNoop) Purge(ctx *sql.Context) error { + return nil +} + +func (s StatsNoop) WaitForDbSync(ctx *sql.Context) error { + return nil +} + +var _ sql.StatsProvider = StatsNoop{} diff --git a/go/libraries/doltcore/sqle/statspro/provider.go b/go/libraries/doltcore/sqle/statspro/provider.go new file mode 100644 index 00000000000..ea79b20c8a2 --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/provider.go @@ -0,0 +1,582 @@ +// Copyright 2025 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
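+
+// provider.go implements sql.StatsProvider for StatsCoord on top of the
+// stats job queue; see doc.go for the event loop design.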
+
+package statspro
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"path"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/dolthub/go-mysql-server/sql"
+	"github.com/dolthub/go-mysql-server/sql/stats"
+	"golang.org/x/sync/errgroup"
+
+	"github.com/dolthub/dolt/go/cmd/dolt/doltversion"
+	"github.com/dolthub/dolt/go/libraries/doltcore/dbfactory"
+	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
+	"github.com/dolthub/dolt/go/libraries/doltcore/env"
+	"github.com/dolthub/dolt/go/libraries/doltcore/ref"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
+	"github.com/dolthub/dolt/go/libraries/doltcore/table/editor"
+	"github.com/dolthub/dolt/go/libraries/utils/earl"
+	"github.com/dolthub/dolt/go/libraries/utils/filesys"
+	"github.com/dolthub/dolt/go/store/types"
+)
+
+var _ sql.StatsProvider = (*StatsCoord)(nil)
+
+func (sc *StatsCoord) GetTableStats(ctx *sql.Context, db string, table sql.Table) ([]sql.Statistic, error) {
+	dSess := dsess.DSessFromSess(ctx.Session)
+	branch, err := dSess.GetBranch()
+	if err != nil {
+		return nil, err
+	}
+	key := tableIndexesKey{
+		db:     db,
+		branch: branch,
+		table:  table.Name(),
+	}
+	sc.statsMu.Lock()
+	defer sc.statsMu.Unlock()
+	st := sc.Stats[key]
+	var ret []sql.Statistic
+	for _, s := range st {
+		ret = append(ret, s)
+	}
+	return ret, nil
+}
+
+func (sc *StatsCoord) RefreshTableStats(ctx *sql.Context, table sql.Table, dbName string) error {
+	dSess := dsess.DSessFromSess(ctx.Session)
+	branch, err := dSess.GetBranch()
+	if err != nil {
+		return err
+	}
+
+	if branch == "" {
+		branch = "main"
+	}
+
+	var sqlDb dsess.SqlDatabase
+	func() {
+		sc.dbMu.Lock()
+		defer sc.dbMu.Unlock()
+		for _, db := range sc.dbs {
+			if db.AliasedName() == dbName && db.Revision() == branch {
+				sqlDb = db
+				return
+			}
+		}
+	}()
+
+	if sqlDb == nil {
+		return fmt.Errorf("qualified database not found: %s/%s", branch, dbName)
+	}
+
+	after := NewControl("finish analyze", func(sc *StatsCoord) error { return nil })
+	analyze := NewAnalyzeJob(ctx, sqlDb, []string{table.String()}, after)
+
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-sc.Done:
+		return fmt.Errorf("stat queue was interrupted")
+	case sc.Jobs <- analyze: //TODO send jobs
+	}
+
+	// wait for finalize to finish before returning
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-sc.Done:
+		return fmt.Errorf("stat queue was interrupted")
+	case <-after.done:
+		return nil
+	}
+}
+
+func (sc *StatsCoord) SetStats(ctx *sql.Context, s sql.Statistic) error {
+	sc.statsMu.Lock()
+	defer sc.statsMu.Unlock()
+	ss, ok := s.(*stats.Statistic)
+	if !ok {
+		return fmt.Errorf("expected *stats.Statistic, found %T", s)
+	}
+	key, err := sc.statsKey(ctx, ss.Qualifier().Db(), ss.Qualifier().Table())
+	if err != nil {
+		return err
+	}
+	sc.Stats[key] = sc.Stats[key][:0]
+	sc.Stats[key] = append(sc.Stats[key], ss)
+	return nil
+}
+
+func (sc *StatsCoord) GetStats(ctx *sql.Context, qual sql.StatQualifier, cols []string) (sql.Statistic, bool) {
+	sc.statsMu.Lock()
+	defer sc.statsMu.Unlock()
+	key, err := sc.statsKey(ctx, qual.Database, qual.Table())
+	if err != nil {
+		return nil, false
+	}
+	for _, s := range sc.Stats[key] {
+		if strings.EqualFold(s.Qualifier().Index(), qual.Index()) {
+			return s, true
+		}
+	}
+	return nil, false
+}
+
+func (sc *StatsCoord) GetTableDoltStats(ctx *sql.Context, branch, db, schema, table string) ([]*stats.Statistic, error) {
+	sc.statsMu.Lock()
+	defer sc.statsMu.Unlock()
+	key := tableIndexesKey{
+		db:     db,
+		branch: branch,
+		table:  table,
+		schema: schema,
+	}
+	if sc.Debug {
+		log.Printf("get stat: %s/%s/%s\n", branch, db, table)
+		for k, ss := range sc.Stats {
+			log.Println("  stats exist " + k.String() + " " + strconv.Itoa(len(ss)))
+		}
+	}
+	return sc.Stats[key], nil
+}
+
+func (sc *StatsCoord) DropStats(ctx *sql.Context, qual sql.StatQualifier, cols []string) error {
+	key, err := sc.statsKey(ctx, qual.Database, qual.Table())
+	if err != nil {
+		return err
+	}
+	sc.statsMu.Lock()
+	defer sc.statsMu.Unlock()
+	delete(sc.Stats, key)
+	return nil
+}
+
+func (sc *StatsCoord) DropDbStats(ctx *sql.Context, dbName string, flush bool) error {
+	var doSwap bool
+	func() {
+		sc.dbMu.Lock()
+		defer sc.dbMu.Unlock()
+		sc.ddlGuard = true
+
+		doSwap = strings.EqualFold(sc.statsBackingDb, dbName)
+		for i := 0; i < len(sc.dbs); i++ {
+			db := sc.dbs[i]
+			if strings.EqualFold(db.AliasedName(), dbName) {
+				sc.dbs = append(sc.dbs[:i], sc.dbs[i+1:]...)
+				i--
+			}
+		}
+		delete(sc.Branches, dbName)
+	}()
+
+	if doSwap {
+		if err := sc.rotateStorage(ctx); err != nil {
+			return err
+		}
+	}
+
+	sc.setGc()
+
+	// stats lock is more contentious, do last; release it before taking
+	// |dbMu| to keep the lock order consistent with Init
+	func() {
+		sc.statsMu.Lock()
+		defer sc.statsMu.Unlock()
+		var deleteKeys []tableIndexesKey
+		for k := range sc.Stats {
+			if strings.EqualFold(dbName, k.db) {
+				deleteKeys = append(deleteKeys, k)
+			}
+		}
+		for _, k := range deleteKeys {
+			delete(sc.Stats, k)
+		}
+	}()
+
+	sc.dbMu.Lock()
+	defer sc.dbMu.Unlock()
+	delete(sc.dbFs, dbName)
+
+	return nil
+}
+
+func (sc *StatsCoord) statsKey(ctx *sql.Context, dbName, table string) (tableIndexesKey, error) {
+	dSess := dsess.DSessFromSess(ctx.Session)
+	branch, err := dSess.GetBranch()
+	if err != nil {
+		return tableIndexesKey{}, err
+	}
+	key := tableIndexesKey{
+		db:     dbName,
+		branch: branch,
+		table:  table,
+	}
+	return key, nil
+}
+
+func (sc *StatsCoord) RowCount(ctx *sql.Context, dbName string, table sql.Table) (uint64, error) {
+	key, err := sc.statsKey(ctx, dbName, table.Name())
+	if err != nil {
+		return 0, err
+	}
+	sc.statsMu.Lock()
+	defer sc.statsMu.Unlock()
+	for _, s := range sc.Stats[key] {
+		if strings.EqualFold(s.Qualifier().Index(), "PRIMARY") {
+			return s.RowCnt, nil
+		}
+	}
+	return 0, nil
+}
+
+// DataLength currently mirrors RowCount, approximating data size with the
+// primary index row count.
+func (sc *StatsCoord) DataLength(ctx *sql.Context, dbName string, table sql.Table) (uint64, error) {
+	key, err := sc.statsKey(ctx, dbName, table.Name())
+	if err != nil {
+		return 0, err
+	}
+	sc.statsMu.Lock()
+	defer sc.statsMu.Unlock()
+	for _, s := range sc.Stats[key] {
+		if strings.EqualFold(s.Qualifier().Index(), "PRIMARY") {
+			return s.RowCnt, nil
+		}
+	}
+	return 0, nil
+}
+
+func (sc *StatsCoord) FlushQueue(ctx context.Context) error {
+	sc.Stop()
+	select {
+	case <-ctx.Done():
+		return context.Cause(ctx)
+	case <-sc.Done:
+	}
+	oldCap := cap(sc.Jobs)
+	close(sc.Jobs)
+	for range sc.Jobs {
+	}
+	sc.Jobs = make(chan StatsJob, oldCap)
+	sc.seedCnt.Store(0)
+	sc.readCounter.Store(0)
+	return nil
+}
+
+func (sc *StatsCoord) StartRefreshThread(ctx *sql.Context, sqlDb dsess.SqlDatabase, branch ref.DoltRef) error {
+	fs, err := sc.pro.FileSystemForDatabase(sqlDb.AliasedName())
+	if err != nil {
+		return err
+	}
+
+	done, err := sc.Add(ctx, sqlDb, branch, fs)
+	if err != nil {
+		return err
+	}
+	<-done
+	return nil
+}
+
+func (sc *StatsCoord) Init(ctx context.Context, dbs []dsess.SqlDatabase) error {
+	sc.dbMu.Lock()
+	sc.statsMu.Lock()
+
+	sc.dbs = sc.dbs[:0]
+	sc.Stats = make(map[tableIndexesKey][]*stats.Statistic)
+	sc.Branches = make(map[string][]ref.DoltRef)
+	sc.dbFs = make(map[string]filesys.Filesys)
+	sc.dbMu.Unlock()
+	sc.statsMu.Unlock()
+
+	sc.bucketCnt.Store(0)
+
+	_, memOnly, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsMemoryOnly)
+	sc.SetMemOnly(memOnly.(int8) == 1)
+
+	typ, jobI, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsJobInterval)
+	_, gcI, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsGCInterval)
+	_, brI, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsBranchInterval)
+
+	jobInterval, _, _ := typ.GetType().Convert(jobI)
+	gcInterval, _, _ := typ.GetType().Convert(gcI)
+	brInterval, _, _ := typ.GetType().Convert(brI)
+
+	sc.SetEnableGc(false)
+	sc.enableBrSync.Store(false)
+	sc.JobInterval = 1
+	defer sc.SetTimers(jobInterval.(int64), gcInterval.(int64), brInterval.(int64))
+	defer sc.SetEnableGc(true)
+	defer sc.enableBrSync.Store(true)
+
+	sqlCtx, err := sc.ctxGen(ctx)
+	if err != nil {
+		return err
+	}
+
+	if err := sc.Restart(sqlCtx); err != nil {
+		return err
+	}
+	eg := errgroup.Group{}
+	for _, db := range dbs {
+		if db, ok := db.(dsess.SqlDatabase); ok {
+			br, err := db.DbData().Ddb.GetBranches(ctx)
+			if err != nil {
+				return err
+			}
+			fs, err := sc.pro.FileSystemForDatabase(db.AliasedName())
+			if err != nil {
+				return err
+			}
+			for _, b := range br {
+				// copy closure variable (pre-Go 1.22 range semantics)
+				b := b
+				eg.Go(func() error {
+					done, err := sc.Add(sqlCtx, db, b, fs)
+					if err != nil {
+						return err
+					}
+					<-done
+					return nil
+				})
+			}
+		}
+	}
+	if err := eg.Wait(); err != nil {
+		return err
+	}
+	eg.Go(func() error {
+		done, err := sc.Control(ctx, "enable gc", func(sc *StatsCoord) error {
+			return nil
+		})
+		if err != nil {
+			return err
+		}
+		<-done
+		sc.Stop()
+		return nil
+	})
+	if err := eg.Wait(); err != nil {
+		return err
+	}
+	<-sc.Done
+	return nil
+}
+
+func (sc *StatsCoord) Purge(ctx *sql.Context) error {
+	if err := sc.rotateStorage(ctx); err != nil {
+		return err
+	}
+	if err := sc.kv.StartGc(ctx, 0); err != nil {
+		return err
+	}
+	sc.kv.FinishGc()
+	sc.bucketCnt.Store(0)
+
+	return nil
+}
+
+func (sc *StatsCoord) rotateStorage(ctx *sql.Context) error {
+	sc.dbMu.Lock()
+	defer sc.dbMu.Unlock()
+	if sc.statsBackingDb != "" {
+		if err := sc.rm(sc.statsBackingDb); err != nil {
+			return err
+		}
+	}
+
+	var mem *memStats
+	switch kv := sc.kv.(type) {
+	case *prollyStats:
+		mem = kv.mem
+	case *memStats:
+		mem = kv
+	default:
+		mem = NewMemStats()
+	}
+
+	if len(sc.dbs) == 0 {
+		sc.kv = mem
+		sc.statsBackingDb = ""
+		return nil
+	}
+
+	newStorageTarget := sc.dbs[0]
+	if err := sc.rm(newStorageTarget.AliasedName()); err != nil {
+		return err
+	}
+
+	newKv, err := sc.initStorage(ctx, newStorageTarget)
+	if err != nil {
+		return err
+	}
+
+	newKv.mem = mem
+	sc.kv = newKv
+	sc.statsBackingDb = newStorageTarget.AliasedName()
+	return nil
+}
+
+func (sc *StatsCoord) rm(db string) error {
+	fs, ok := sc.dbFs[db]
+	if !ok {
+		return fmt.Errorf("failed to remove stats db: %s filesys not found", db)
+	}
+
+	statsFs, err := fs.WithWorkingDir(dbfactory.DoltStatsDir)
+	if err != nil {
+		return err
+	}
+
+	if ok, _ := statsFs.Exists(""); ok {
+		if err := statsFs.Delete("", true); err != nil {
+			return err
+		}
+	}
+
+	dropDbLoc, err := statsFs.Abs("")
+	if err != nil {
+		return err
+	}
+
+	if err = dbfactory.DeleteFromSingletonCache(filepath.ToSlash(dropDbLoc + "/.dolt/noms")); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (sc *StatsCoord) initStorage(ctx *sql.Context, storageTarget dsess.SqlDatabase) (*prollyStats, error) {
+	fs, ok := sc.dbFs[strings.ToLower(storageTarget.AliasedName())]
+	if !ok {
+		return nil, fmt.Errorf("failed to initialize stats db: %s filesys not found", storageTarget.AliasedName())
+	}
+
+	params := make(map[string]interface{})
+	params[dbfactory.GRPCDialProviderParam] = sc.dialPro
+
+	var urlPath string
+	u, err := earl.Parse(sc.pro.DbFactoryUrl())
+	if err != nil {
+		return nil, err
+	}
+	if u.Scheme == dbfactory.MemScheme {
+		urlPath = path.Join(sc.pro.DbFactoryUrl(), dbfactory.DoltDataDir)
+	} else if u.Scheme == dbfactory.FileScheme {
+		urlPath = doltdb.LocalDirDoltDB
+	}
+
+	statsFs, err := fs.WithWorkingDir(dbfactory.DoltStatsDir)
+	if err != nil {
+		return nil, err
+	}
+
+	var dEnv *env.DoltEnv
+	exists, isDir := statsFs.Exists("")
+	if !exists {
+		err := statsFs.MkDirs("")
+		if err != nil {
+			return nil, fmt.Errorf("unable to make directory '%s', cause: %s", dbfactory.DoltStatsDir, err.Error())
+		}
+
+		dEnv = env.Load(ctx, sc.hdp, statsFs, urlPath, doltversion.Version)
+		sess := dsess.DSessFromSess(ctx.Session)
+		err = dEnv.InitRepo(ctx, types.Format_Default, sess.Username(), sess.Email(), storageTarget.AliasedName())
+		if err != nil {
+			return nil, err
+		}
+	} else if !isDir {
+		return nil, fmt.Errorf("file exists where the dolt stats directory should be")
+	} else {
+		dEnv = env.LoadWithoutDB(ctx, sc.hdp, statsFs, "", doltversion.Version)
+	}
+
+	if err := dEnv.LoadDoltDBWithParams(ctx, types.Format_Default, urlPath, statsFs, params); err != nil {
+		return nil, err
+	}
+
+	deaf := dEnv.DbEaFactory(ctx)
+
+	tmpDir, err := dEnv.TempTableFilesDir()
+	if err != nil {
+		return nil, err
+	}
+	opts := editor.Options{
+		Deaf:    deaf,
+		Tempdir: tmpDir,
+	}
+	statsDb, err := sqle.NewDatabase(ctx, "stats", dEnv.DbData(ctx), opts)
+	if err != nil {
+		return nil, err
+	}
+	return NewProllyStats(ctx, statsDb)
+}
+
+func (sc *StatsCoord) unsafeAsyncSend(ctx context.Context, j StatsJob) error {
+	// The |Jobs| queue can change; the interrupts queue
+	// does not, and is safe to send a blocking write to.
+	ji := NewControl("interrupt: '"+j.String()+"'", func(sc *StatsCoord) error {
+		return sc.sendJobs(ctx, j)
+	})
+
+	select {
+	case sc.Interrupts <- ji:
+		return nil
+	default:
+		return fmt.Errorf("async queue overflowed, failed to put job %s", j.String())
+	}
+}
+
+func (sc *StatsCoord) WaitForDbSync(ctx *sql.Context) error {
+	// Wait until the control job finishes before returning.
+	// We want to do two cycles -- to pick up new seeds and
+	// execute the finalize jobs that update statistics.
+	for range 2 {
+		j := NewControl("wait for sync", func(sc *StatsCoord) error { return nil })
+		if err := sc.unsafeAsyncSend(ctx, j); err != nil {
+			return err
+		}
+
+		select {
+		case <-ctx.Done():
+			return context.Cause(ctx)
+		case <-sc.Done:
+			return fmt.Errorf("stats queue closed")
+		case <-j.done:
+		}
+	}
+
+	return sc.ValidateState(ctx)
+}
+
+func (sc *StatsCoord) Gc(ctx *sql.Context) error {
+	done := make(chan struct{})
+	if err := sc.runGc(ctx, done); err != nil {
+		return err
+	}
+	select {
+	case <-ctx.Done():
+		return context.Cause(ctx)
+	case <-done:
+		return nil
+	}
+}
+
+func (sc *StatsCoord) BranchSync(ctx *sql.Context) error {
+	done := make(chan struct{})
+	newJobs, err := sc.runBranchSync(ctx, done)
+	if err != nil {
+		return err
+	}
+	for _, j := range newJobs {
+		// have to go through interrupts queue for thread safety
+		sc.Interrupts <- j
+	}
+	select {
+	case <-ctx.Done():
+		return context.Cause(ctx)
+	case <-done:
+		return nil
+	}
+}
diff --git a/go/libraries/doltcore/sqle/statspro/scheduler.go b/go/libraries/doltcore/sqle/statspro/scheduler.go
new file mode 100644
index 00000000000..4e971fdd48a
--- /dev/null
+++ b/go/libraries/doltcore/sqle/statspro/scheduler.go
@@ -0,0 +1,1037 @@
+// Copyright 2025 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statspro
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/dolthub/go-mysql-server/sql"
+	"github.com/dolthub/go-mysql-server/sql/stats"
+	"github.com/sirupsen/logrus"
+
+	"github.com/dolthub/dolt/go/libraries/doltcore/dbfactory"
+	"github.com/dolthub/dolt/go/libraries/doltcore/env"
+	"github.com/dolthub/dolt/go/libraries/doltcore/ref"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dprocedures"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
+	"github.com/dolthub/dolt/go/libraries/utils/filesys"
+	"github.com/dolthub/dolt/go/store/hash"
+	"github.com/dolthub/dolt/go/store/prolly"
+	"github.com/dolthub/dolt/go/store/prolly/tree"
+	"github.com/dolthub/dolt/go/store/val"
+)
+
+type StatsJob interface {
+	Finish()
+	String() string
+}
+
+var _ StatsJob = (*ReadJob)(nil)
+var _ StatsJob = (*SeedDbTablesJob)(nil)
+var _ StatsJob = (*ControlJob)(nil)
+var _ StatsJob = (*FinalizeJob)(nil)
+
+func NewSeedJob(sqlDb dsess.SqlDatabase) SeedDbTablesJob {
+	return SeedDbTablesJob{
+		sqlDb:  sqlDb,
+		tables: nil,
+		done:   make(chan struct{}),
+	}
+}
+
+// todo refactor so we can count buckets globally
+type tableStatsInfo struct {
+	name        string
+	schHash     hash.Hash
+	idxRoots    []hash.Hash
+	bucketCount int
+}
+
+type SeedDbTablesJob struct {
+	sqlDb  dsess.SqlDatabase
+	tables []tableStatsInfo
+	done   chan struct{}
+}
+
+func (j SeedDbTablesJob) Finish() {
+	close(j.done)
+}
+
+func (j SeedDbTablesJob) String() string {
+	b := strings.Builder{}
+	b.WriteString("seed db: ")
+	b.WriteString(j.sqlDb.RevisionQualifiedName())
+	b.WriteString("[")
+
+	var sep = ""
+	for _, ti := range j.tables {
+		b.WriteString(sep)
+		b.WriteString("(" + ti.name + ": " + ti.schHash.String()[:5] + ")")
+		sep = ", "
+	}
+	b.WriteString("]")
+
+	return b.String()
+}
+
+func NewAnalyzeJob(ctx *sql.Context, sqlDb dsess.SqlDatabase, tables []string, after ControlJob) AnalyzeJob {
+	return AnalyzeJob{ctx: ctx, sqlDb: sqlDb, tables: tables, after: after, done: make(chan struct{})}
+}
+
+type AnalyzeJob struct {
+	ctx    *sql.Context
+	sqlDb  dsess.SqlDatabase
+	tables []string
+	after  ControlJob
+	done   chan struct{}
+}
+
+func (j AnalyzeJob) String() string {
+	return "analyze: [" + strings.Join(j.tables, ", ") + "]"
+}
+
+func (j AnalyzeJob) Finish() {
+	close(j.done)
+}
+
+type ReadJob struct {
+	// |ctx|/|db| track a specific working set
+	ctx      *sql.Context
+	db       dsess.SqlDatabase
+	table    string
+	key      templateCacheKey
+	template stats.Statistic
+	m        prolly.Map
+	first    bool
+	nodes    []tree.Node
+	ordinals []updateOrdinal
+	idxLen   int
+	done     chan struct{}
+}
+
+func (j ReadJob) Finish() {
+	close(j.done)
+}
+
+func (j ReadJob) String() string {
+	b := strings.Builder{}
+	b.WriteString("read: " + j.db.RevisionQualifiedName() + "/" + j.table + ": ")
+	sep := ""
+	for i, o := range j.ordinals {
+		b.WriteString(fmt.Sprintf("%s[%s:%d-%d]", 
sep, j.nodes[i].HashOf().String()[:5], o.start, o.stop)) + sep = ", " + } + return b.String() +} + +type finalizeStruct struct { + buckets []hash.Hash + tupB *val.TupleBuilder +} + +type FinalizeJob struct { + sqlDb dsess.SqlDatabase + tableKey tableIndexesKey + keepIndexes map[sql.StatQualifier]bool + editIndexes map[templateCacheKey]finalizeStruct + done chan struct{} +} + +func (j FinalizeJob) Finish() { + close(j.done) +} + +func (j FinalizeJob) String() string { + b := strings.Builder{} + b.WriteString("finalize " + j.tableKey.String()) + b.WriteString(": ") + sep := "" + for idx, fs := range j.editIndexes { + b.WriteString(fmt.Sprintf("%s(%s: ", sep, idx.idxName)) + sep = "" + for _, h := range fs.buckets { + b.WriteString(fmt.Sprintf("%s%s", sep, h.String()[:5])) + sep = ", " + } + b.WriteString(")") + sep = ", " + } + return b.String() +} + +func NewControl(desc string, cb func(sc *StatsCoord) error) ControlJob { + return ControlJob{cb: cb, desc: desc, done: make(chan struct{})} +} + +type ControlJob struct { + cb func(sc *StatsCoord) error + desc string + done chan struct{} +} + +func (j ControlJob) Finish() { + close(j.done) +} + +func (j ControlJob) String() string { + return "ControlJob: " + j.desc +} + +type ctxFactory func(ctx context.Context) (*sql.Context, error) + +func NewStatsCoord(pro *sqle.DoltDatabaseProvider, ctxGen ctxFactory, logger *logrus.Logger, threads *sql.BackgroundThreads, dEnv *env.DoltEnv) *StatsCoord { + done := make(chan struct{}) + close(done) + kv := NewMemStats() + return &StatsCoord{ + dbMu: &sync.Mutex{}, + statsMu: &sync.Mutex{}, + logger: logger, + Jobs: make(chan StatsJob, 1024), + Done: done, + Interrupts: make(chan StatsJob, 1024), + JobInterval: 50 * time.Millisecond, + gcInterval: 24 * time.Hour, + branchInterval: 24 * time.Hour, + enableGc: atomic.Bool{}, + bucketCap: kv.Cap(), + Stats: make(map[tableIndexesKey][]*stats.Statistic), + Branches: make(map[string][]ref.DoltRef), + dbFs: make(map[string]filesys.Filesys), + threads: threads, + kv: kv, + pro: pro, + hdp: dEnv.GetUserHomeDir, + dialPro: env.NewGRPCDialProviderFromDoltEnv(dEnv), + ctxGen: ctxGen, + } +} + +func (sc *StatsCoord) SetMemOnly(v bool) { + sc.dbMu.Lock() + defer sc.dbMu.Unlock() + sc.memOnly = v +} + +func (sc *StatsCoord) SetEnableGc(v bool) { + sc.enableGc.Store(v) +} + +func (sc *StatsCoord) SetTimers(job, gc, branch int64) { + sc.JobInterval = time.Duration(job) * time.Millisecond + sc.gcInterval = time.Duration(gc) * time.Millisecond + sc.branchInterval = time.Duration(branch) * time.Millisecond +} + +type tableIndexesKey struct { + db string + branch string + table string + schema string +} + +func (k tableIndexesKey) String() string { + return k.db + "/" + k.branch + "/" + k.table +} + +type StatsCoord struct { + logger *logrus.Logger + threads *sql.BackgroundThreads + pro *sqle.DoltDatabaseProvider + statsBackingDb string + dialPro dbfactory.GRPCDialProvider + hdp env.HomeDirProvider + // ctxGen lets us fetch the most recent working root + ctxGen ctxFactory + + JobInterval time.Duration + gcInterval time.Duration + branchInterval time.Duration + memOnly bool + Debug bool + + Jobs chan StatsJob + // Interrupts skip the job queue and are processed first, + // but has a fixed size and will block + Interrupts chan StatsJob + Done chan struct{} + + // XXX: do not hold the |dbMu| while accessing |pro| + dbMu *sync.Mutex + // dbs is a list of branch-qualified databases. 
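+	// Each entry is a revision database pinned to a single tracked branch;
+	// entries are created with sqle.RevisionDbForBranch and named like
+	// "<branch>/<db>".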
+	dbs  []dsess.SqlDatabase
+	dbFs map[string]filesys.Filesys
+	// Branches lists the branches tracked for each database.
+	// Should track |dbs|.
+	Branches map[string][]ref.DoltRef
+
+	// kv is a content-addressed cache of histogram objects:
+	// buckets, first bounds, and schema-specific statistic
+	// templates.
+	kv StatsKv
+
+	// Stats tracks table statistics accessible to sessions.
+	Stats   map[tableIndexesKey][]*stats.Statistic
+	statsMu *sync.Mutex
+
+	branchCounter atomic.Uint64
+	gcCounter     atomic.Uint64
+
+	readCounter atomic.Int32
+
+	doGc         atomic.Bool
+	enableGc     atomic.Bool
+	enableBrSync atomic.Bool
+	gcMu         sync.Mutex
+
+	// ddlGuard is a compare-and-swap flag that keeps |updateBranches|
+	// safe and nonblocking
+	ddlGuard     bool
+	doBranchSync atomic.Bool
+	doCapCheck   atomic.Bool
+	bucketCnt    atomic.Int64
+	seedCnt      atomic.Int64
+	bucketCap    int64
+}
+
+func (sc *StatsCoord) Stop() {
+	select {
+	case <-sc.Done:
+	default:
+		close(sc.Done)
+	}
+}
+
+func (sc *StatsCoord) Restart(ctx context.Context) error {
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-sc.Done:
+	default:
+		// have the loop stop itself to avoid accidentally closing
+		// the channel twice
+		j := NewControl("stop thread", func(sc *StatsCoord) error {
+			sc.Stop()
+			return nil
+		})
+		if err := sc.unsafeAsyncSend(ctx, j); err != nil {
+			return err
+		}
+		select {
+		case <-ctx.Done():
+			return context.Cause(ctx)
+		case <-j.done:
+		case <-sc.Done:
+		}
+	}
+
+	sc.Done = make(chan struct{})
+	return sc.threads.Add("stats", func(ctx context.Context) {
+		sc.run(ctx)
+	})
+}
+
+func (sc *StatsCoord) Close() {
+	sc.Stop()
+}
+
+func (sc *StatsCoord) Add(ctx *sql.Context, db dsess.SqlDatabase, branch ref.DoltRef, fs filesys.Filesys) (chan struct{}, error) {
+	db, err := sqle.RevisionDbForBranch(ctx, db, branch.GetPath(), branch.GetPath()+"/"+db.AliasedName())
+	if err != nil {
+		sc.error(ControlJob{desc: "add db"}, err)
+		ret := make(chan struct{})
+		close(ret)
+		return ret, nil
+	}
+
+	sc.dbMu.Lock()
+	defer sc.dbMu.Unlock()
+	sc.ddlGuard = true
+
+	sc.Branches[db.AliasedName()] = append(sc.Branches[db.AliasedName()], ref.NewBranchRef(db.Revision()))
+	sc.dbs = append(sc.dbs, db)
+	sc.dbFs[db.AliasedName()] = fs
+	ret, err := sc.Seed(ctx, db)
+	if err != nil {
+		return nil, err
+	}
+
+	if len(sc.dbs) == 1 {
+		sc.statsBackingDb = db.AliasedName()
+		var mem *memStats
+		switch kv := sc.kv.(type) {
+		case *memStats:
+			mem = kv
+		case *prollyStats:
+			mem = kv.mem
+		default:
+			mem = NewMemStats()
+		}
+		if sc.memOnly {
+			return ret, nil
+		}
+		newKv, err := sc.initStorage(ctx, db)
+		if err != nil {
+			sc.error(ControlJob{desc: "add db"}, err)
+			close(ret)
+			return ret, nil
+		}
+		newKv.mem = mem
+		sc.kv = newKv
+	}
+
+	return ret, nil
+}
+
+func (sc *StatsCoord) Info() dprocedures.StatsInfo {
+	sc.dbMu.Lock()
+	dbCnt := len(sc.dbs)
+	cachedBucketCnt := sc.kv.Len()
+	defer sc.dbMu.Unlock()
+
+	sc.statsMu.Lock()
+	statCnt := len(sc.Stats)
+	defer sc.statsMu.Unlock()
+
+	var active bool
+	select {
+	case <-sc.Done:
+	default:
+		active = true
+	}
+
+	return dprocedures.StatsInfo{
+		DbCnt:           dbCnt,
+		ReadCnt:         int(sc.readCounter.Load()),
+		Active:          active,
+		DbSeedCnt:       int(sc.seedCnt.Load()),
+		EstBucketCnt:    int(sc.bucketCnt.Load()),
+		CachedBucketCnt: cachedBucketCnt,
+		StatCnt:         statCnt,
+		GcCounter:       int(sc.gcCounter.Load()),
+		BranchCounter:   int(sc.branchCounter.Load()),
+	}
+}
+
+// captureFlushQueue is a debug method that lets us inspect and
+// restore the job queue
+func (sc *StatsCoord) captureFlushQueue(ctx context.Context) ([]StatsJob, error) {
+	select {
+	case <-sc.Done:
+		// an inactive event loop cannot be interrupted; safe to drain
+	default:
+		return nil, fmt.Errorf("cannot read queue while event loop is active")
+	}
+	var ret []StatsJob
+	for range len(sc.Jobs) {
+		select {
+		case <-ctx.Done():
+			return nil, nil
+		case j, ok := <-sc.Jobs:
+			if !ok {
+				return nil, nil
+			}
+			ret = append(ret, j)
+		}
+	}
+	return ret, nil
+}
+
+func (sc *StatsCoord) Seed(ctx context.Context, sqlDb dsess.SqlDatabase) (chan struct{}, error) {
+	j := NewSeedJob(sqlDb)
+	if err := sc.unsafeAsyncSend(ctx, j); err != nil {
+		return nil, err
+	}
+	sc.seedCnt.Add(1)
+	return j.done, nil
+}
+
+func (sc *StatsCoord) Control(ctx context.Context, desc string, cb func(sc *StatsCoord) error) (chan struct{}, error) {
+	j := NewControl(desc, cb)
+	if err := sc.unsafeAsyncSend(ctx, j); err != nil {
+		return nil, err
+	}
+	return j.done, nil
+}
+
+func (sc *StatsCoord) Interrupt(desc string, cb func(sc *StatsCoord) error) chan struct{} {
+	j := NewControl(desc, cb)
+	sc.Interrupts <- j
+	return j.done
+}
+
+func (sc *StatsCoord) error(j StatsJob, err error) {
+	if sc.Debug {
+		log.Println("stats error: ", err.Error())
+	}
+	sc.logger.Errorf("stats error; job detail: %s; verbose: %s", j.String(), err)
+}
+
+// run operates on stats jobs until the coordinator is stopped or the
+// context is canceled
+func (sc *StatsCoord) run(ctx context.Context) error {
+	jobTimer := time.NewTimer(0)
+	gcTicker := time.NewTicker(sc.gcInterval)
+	branchTicker := time.NewTicker(sc.branchInterval)
+
+	for {
+		// sequentially test:
+		// (1) ctx done/thread canceled
+		// (2) GC check
+		// (3) branch check
+		// (4) interrupt queue
+		// (5) job and other tickers
+		select {
+		case <-sc.Done:
+			return nil
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		if sc.doGc.Swap(false) {
+			if err := sc.runGc(ctx, make(chan struct{})); err != nil {
+				sc.error(ControlJob{desc: "gc"}, err)
+			}
+		}
+
+		if sc.doBranchSync.Swap(false) {
+			j := ControlJob{desc: "branches update"}
+			newJobs, err := sc.runBranchSync(ctx, make(chan struct{}))
+			if err != nil {
+				sc.error(j, err)
+			}
+			err = sc.sendJobs(ctx, newJobs...)
+ if err != nil { + sc.error(j, err) + } + } + + select { + case <-sc.Done: + return nil + case <-ctx.Done(): + return ctx.Err() + case j, ok := <-sc.Interrupts: + if !ok { + return nil + } + if sc.Debug { + log.Println("stats interrupt job: ", j.String()) + } + err := sc.executeJob(ctx, j) + if err != nil { + sc.error(j, err) + } + default: + } + + select { + case <-sc.Done: + return nil + case <-ctx.Done(): + return ctx.Err() + case j, ok := <-sc.Interrupts: + if !ok { + return nil + } + if sc.Debug { + log.Println("stats interrupt job: ", j.String()) + } + err := sc.executeJob(ctx, j) + if err != nil { + sc.error(j, err) + } + case <-jobTimer.C: + select { + case <-ctx.Done(): + return ctx.Err() + case j, ok := <-sc.Jobs: + if !ok { + return nil + } + if sc.Debug { + log.Println("stats execute job: ", j.String()) + } + err := sc.executeJob(ctx, j) + if err != nil { + sc.error(j, err) + } + default: + } + case <-gcTicker.C: + sc.setGc() + case <-branchTicker.C: + sc.doBranchSync.Store(true) + } + jobTimer.Reset(sc.JobInterval) + } +} + +func (sc *StatsCoord) sendJobs(ctx context.Context, jobs ...StatsJob) error { + // jobs can double and access is concurrent + sc.dbMu.Lock() + defer sc.dbMu.Unlock() + + for i := 0; i < len(jobs); i++ { + j := jobs[i] + if j == nil { + continue + } + select { + case <-ctx.Done(): + return ctx.Err() + case sc.Jobs <- j: + if _, ok := j.(ReadJob); ok { + sc.readCounter.Add(1) + } + default: + sc.doubleChannelSize(ctx) + i-- + } + } + return nil +} + +func (sc *StatsCoord) executeJob(ctx context.Context, j StatsJob) (err error) { + //defer func() { + // if r := recover(); r != nil { + // fmt.Println("Recovered in f", r) + // err = fmt.Errorf("stats job %s panicked: %s", j.String(), r) + // } + //}() + var newJobs []StatsJob + switch j := j.(type) { + case SeedDbTablesJob: + newJobs, err = sc.seedDbTables(ctx, j) + case ReadJob: + sc.readCounter.Add(-1) + newJobs, err = sc.readChunks(ctx, j) + case FinalizeJob: + newJobs, err = sc.finalizeUpdate(ctx, j) + case ControlJob: + if err := j.cb(sc); err != nil { + sc.error(j, err) + } + case AnalyzeJob: + newJobs, err = sc.runAnalyze(ctx, j) + default: + return fmt.Errorf("unknown job type: %T", j) + } + if err != nil { + return err + } + err = sc.sendJobs(ctx, newJobs...) 
+	if err != nil {
+		sc.error(j, err)
+	}
+	j.Finish()
+	return nil
+}
+
+func (sc *StatsCoord) doubleChannelSize(ctx context.Context) {
+	close(sc.Jobs)
+	ch := make(chan StatsJob, cap(sc.Jobs)*2)
+	for j := range sc.Jobs {
+		ch <- j
+	}
+	sc.Jobs = ch
+}
+
+func (sc *StatsCoord) dropTableJob(sqlDb dsess.SqlDatabase, tableName string) StatsJob {
+	return FinalizeJob{
+		tableKey: tableIndexesKey{
+			db:     sqlDb.AliasedName(),
+			branch: sqlDb.Revision(),
+			table:  tableName,
+		},
+		editIndexes: nil,
+		done:        make(chan struct{}),
+	}
+}
+
+func (sc *StatsCoord) readChunks(ctx context.Context, j ReadJob) ([]StatsJob, error) {
+	// Check whether each chunk is already in the cache; if not, see if it
+	// is on disk and just needs loading; otherwise perform a read to
+	// create the bucket, write it to disk, and update the in-memory ref.
+
+	prollyMap := j.m
+	updater := newBucketBuilder(sql.StatQualifier{}, j.idxLen, prollyMap.KeyDesc())
+	keyBuilder := val.NewTupleBuilder(prollyMap.KeyDesc().PrefixDesc(j.idxLen))
+
+	// all kv puts are guarded by |gcMu| to avoid concurrent
+	// GC with stale data discarding some or all state
+	sc.gcMu.Lock()
+	defer sc.gcMu.Unlock()
+
+	if j.first {
+		sc.kv.PutTemplate(j.key, j.template)
+
+		firstNodeHash := j.nodes[0].HashOf()
+		if _, ok := sc.kv.GetBound(firstNodeHash, j.idxLen); !ok {
+			firstRow, err := firstRowForIndex(j.ctx, prollyMap, keyBuilder)
+			if err != nil {
+				return nil, err
+			}
+			if sc.Debug {
+				log.Printf("put bound: %s | %s: %v\n", j.table, firstNodeHash.String()[:5], firstRow)
+			}
+			sc.kv.PutBound(firstNodeHash, firstRow)
+		}
+	}
+
+	for i, n := range j.nodes {
+		if _, ok, err := sc.kv.GetBucket(ctx, n.HashOf(), keyBuilder); err != nil {
+			return nil, err
+		} else if ok {
+			// concurrent reads overestimate shared buckets
+			sc.bucketCnt.Add(-1)
+			continue
+		}
+		// each node is a bucket
+		updater.newBucket()
+
+		// we read exclusive range [node first key, next node first key)
+		start, stop := j.ordinals[i].start, j.ordinals[i].stop
+		iter, err := j.m.IterOrdinalRange(ctx, start, stop)
+		if err != nil {
+			return nil, err
+		}
+		for {
+			// stats key will be a prefix of the index key
+			keyBytes, _, err := iter.Next(ctx)
+			if errors.Is(err, io.EOF) {
+				break
+			} else if err != nil {
+				return nil, err
+			}
+			// build full key
+			for i := range keyBuilder.Desc.Types {
+				keyBuilder.PutRaw(i, keyBytes.GetField(i))
+			}
+
+			updater.add(keyBuilder.BuildPrefixNoRecycle(prollyMap.Pool(), updater.prefixLen))
+			keyBuilder.Recycle()
+		}
+
+		// finalize the aggregation
+		bucket, err := updater.finalize(ctx, prollyMap.NodeStore())
+		if err != nil {
+			return nil, err
+		}
+		err = sc.kv.PutBucket(ctx, n.HashOf(), bucket, keyBuilder)
+		if err != nil {
+			return nil, err
+		}
+	}
+	return nil, nil
+}
+
+func (sc *StatsCoord) runAnalyze(_ context.Context, j AnalyzeJob) ([]StatsJob, error) {
+	var ret []StatsJob
+	for _, tableName := range j.tables {
+		readJobs, _, err := sc.readJobsForTable(j.ctx, j.sqlDb, tableStatsInfo{name: tableName})
+		if err != nil {
+			return nil, err
+		}
+		ret = append(ret, readJobs...)
+func (sc *StatsCoord) runAnalyze(_ context.Context, j AnalyzeJob) ([]StatsJob, error) {
+	var ret []StatsJob
+	for _, tableName := range j.tables {
+		readJobs, _, err := sc.readJobsForTable(j.ctx, j.sqlDb, tableStatsInfo{name: tableName})
+		if err != nil {
+			return nil, err
+		}
+		ret = append(ret, readJobs...)
+	}
+	if j.after.done != nil {
+		ret = append(ret, j.after)
+	}
+	return ret, nil
+}
+
+func (sc *StatsCoord) finalizeUpdate(ctx context.Context, j FinalizeJob) ([]StatsJob, error) {
+	if len(j.editIndexes) == 0 {
+		// delete table
+		sc.statsMu.Lock()
+		delete(sc.Stats, j.tableKey)
+		sc.statsMu.Unlock()
+		return nil, nil
+	}
+
+	var newStats []*stats.Statistic
+	for _, s := range sc.Stats[j.tableKey] {
+		if ok := j.keepIndexes[s.Qual]; ok {
+			newStats = append(newStats, s)
+		}
+	}
+	for key, fs := range j.editIndexes {
+		if len(fs.buckets) == 0 {
+			continue
+		}
+
+		template, ok := sc.kv.GetTemplate(key)
+		if !ok {
+			return nil, fmt.Errorf("missing template dependency for table: %s", key)
+		}
+		template.Qual = sql.NewStatQualifier(j.tableKey.db, "", j.tableKey.table, key.idxName)
+
+		for i, bh := range fs.buckets {
+			if i == 0 {
+				bnd, ok := sc.kv.GetBound(bh, fs.tupB.Desc.Count())
+				if !ok {
+					log.Println("chunks: ", fs.buckets)
+					return nil, fmt.Errorf("missing read job bound dependency for chunk %s: %s", key, bh)
+				}
+				template.LowerBnd = bnd[:fs.tupB.Desc.Count()]
+			}
+			// accumulate counts
+			if b, ok, err := sc.kv.GetBucket(ctx, bh, fs.tupB); err != nil {
+				return nil, err
+			} else if !ok {
+				log.Println("need chunks: ", fs.buckets)
+				return nil, fmt.Errorf("missing read job bucket dependency for chunk: %s", bh)
+			} else {
+				template.RowCnt += b.RowCnt
+				template.DistinctCnt += b.DistinctCnt
+				template.NullCnt += b.NullCnt
+				template.Hist = append(template.Hist, b)
+			}
+		}
+		newStats = append(newStats, &template)
+	}
+
+	// We cannot mutex protect concurrent db drops
+	// and finalization. We need to check afterward
+	// whether there was a db/stats race. We check
+	// separately for database and branch deletes.
+
+	sc.dbMu.Lock()
+	sc.ddlGuard = false
+	sc.dbMu.Unlock()
+
+	sc.statsMu.Lock()
+	sc.Stats[j.tableKey] = newStats
+	sc.statsMu.Unlock()
+
+	sc.dbMu.Lock()
+	if sc.ddlGuard {
+		sqlCtx, err := sc.ctxGen(ctx)
+		if err != nil {
+			return nil, err
+		}
+
+		if _, err := j.sqlDb.GetRoot(sqlCtx); err != nil {
+			sc.statsMu.Lock()
+			delete(sc.Stats, j.tableKey)
+			sc.statsMu.Unlock()
+		}
+	}
+	sc.dbMu.Unlock()
+
+	sqlCtx, err := sc.ctxGen(ctx)
+	if err != nil {
+		return nil, err
+	}
+	if _, err := j.sqlDb.GetRoot(sqlCtx); err != nil {
+		sc.statsMu.Lock()
+		delete(sc.Stats, j.tableKey)
+		sc.statsMu.Unlock()
+	}
+
+	return nil, nil
+}
+
+type dbBranchKey struct {
+	db     string
+	branch string
+}
+
+func (sc *StatsCoord) runBranchSync(ctx context.Context, done chan struct{}) ([]StatsJob, error) {
+	if !sc.enableBrSync.Swap(false) {
+		close(done)
+		return nil, nil
+	}
+
+	if sc.Debug {
+		log.Println("stats branch check number: ", strconv.Itoa(int(sc.branchCounter.Load())))
+	}
+	sc.branchCounter.Add(1)
+
+	j := ControlJob{desc: "branch update"}
+	sqlCtx, err := sc.ctxGen(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	newBranches := make(map[string][]ref.DoltRef)
+	var newDbs []dsess.SqlDatabase
+
+	// updateBranches is sensitive to concurrent add/drop database. We use
+	// |ddlGuard| as a compare-and-swap check after collecting the new dbs,
+	// branches, and stats; a failed guard check retries from scratch.
+	// Incrementally applying adds and deletes would make |ddlGuard|
+	// unnecessary, but would be more complex and potentially more blocking.
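+	//
+	// The flow is roughly:
+	//   1. clear |ddlGuard| under |dbMu| and snapshot the tracked dbs/branches
+	//   2. rebuild the branch and database lists without holding |dbMu|
+	//   3. retake |dbMu|; if a DDL hook set |ddlGuard| in the meantime, retry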
+	sc.dbMu.Lock()
+	sc.ddlGuard = false
+	dbBranches := make(map[string][]ref.DoltRef)
+	for k, v := range sc.Branches {
+		dbBranches[k] = v
+	}
+	dbs := make([]dsess.SqlDatabase, len(sc.dbs))
+	copy(dbs, sc.dbs)
+	sc.dbMu.Unlock()
+
+	{
+		// filter out dbs whose branches have been deleted
+		var w int
+		for i := 0; i < len(dbs); i++ {
+			if _, err := dbs[i].GetRoot(sqlCtx); err != nil {
+				continue
+			}
+			dbs[w] = dbs[i]
+			w++
+		}
+
+		dbs = dbs[:w]
+	}
+
+	var ret []StatsJob
+	for dbName, branches := range dbBranches {
+		var sqlDb dsess.SqlDatabase
+		for _, db := range dbs {
+			if strings.EqualFold(db.AliasedName(), dbName) {
+				sqlDb = db
+				break
+			}
+		}
+
+		if sqlDb == nil {
+			sc.error(j, fmt.Errorf("database in branches list is not tracked: %s", dbName))
+			continue
+		}
+
+		// check that the db is still valid
+		dSess := dsess.DSessFromSess(sqlCtx.Session)
+		dbd, ok := dSess.GetDbData(sqlCtx, sqlDb.AliasedName())
+		if !ok {
+			sc.error(j, fmt.Errorf("database in branches list does not exist: %s", dbName))
+			continue
+		}
+		curBranches, err := dbd.Ddb.GetBranches(sqlCtx)
+		if err != nil {
+			sc.error(j, err)
+			continue
+		}
+
+		newBranches[sqlDb.AliasedName()] = curBranches
+
+		i := 0
+		k := 0
+		for i < len(branches) && k < len(curBranches) {
+			br := curBranches[k]
+			switch strings.Compare(branches[i].GetPath(), curBranches[k].GetPath()) {
+			case 0:
+				i++
+				k++
+				sqlDb, err := sqle.RevisionDbForBranch(sqlCtx, sqlDb, br.GetPath(), br.GetPath()+"/"+dbName)
+				if err != nil {
+					sc.error(j, err)
+					continue
+				}
+				newDbs = append(newDbs, sqlDb)
+			case -1:
+				i++
+			case +1:
+				k++
+				sqlDb, err := sqle.RevisionDbForBranch(sqlCtx, sqlDb, br.GetPath(), br.GetPath()+"/"+dbName)
+				if err != nil {
+					sc.error(j, err)
+					continue
+				}
+				_, err = sqlDb.GetRoot(sqlCtx)
+				if err != nil {
+					continue
+				}
+
+				newDbs = append(newDbs, sqlDb)
+				ret = append(ret, NewSeedJob(sqlDb))
+				sc.seedCnt.Add(1)
+			}
+		}
+		for k < len(curBranches) {
+			br := curBranches[k]
+			k++
+			sqlDb, err := sqle.RevisionDbForBranch(sqlCtx, sqlDb, br.GetPath(), br.GetPath()+"/"+dbName)
+			if err != nil {
+				sc.error(j, err)
+				continue
+			}
+
+			newDbs = append(newDbs, sqlDb)
+			ret = append(ret, NewSeedJob(sqlDb))
+			sc.seedCnt.Add(1)
+		}
+	}
+
+	sc.dbMu.Lock()
+
+	if sc.ddlGuard {
+		// ddl interrupted the branch refresh; retry
+		sc.dbMu.Unlock()
+		return sc.runBranchSync(ctx, done)
+	}
+
+	sc.Branches = newBranches
+	sc.dbs = newDbs
+
+	statKeys := make(map[dbBranchKey]bool)
+	for _, db := range sc.dbs {
+		statKeys[dbBranchKey{db.AliasedName(), db.Revision()}] = true
+	}
+	sc.dbMu.Unlock()
+
+	newStats := make(map[tableIndexesKey][]*stats.Statistic)
+	sc.statsMu.Lock()
+	for k, s := range sc.Stats {
+		if statKeys[dbBranchKey{db: k.db, branch: k.branch}] {
+			newStats[k] = s
+		}
+	}
+	sc.Stats = newStats
+	sc.statsMu.Unlock()
+
+	// To avoid branch checks starving the loop, only re-enable them after
+	// letting a block of other work through.
+	ret = append(ret, NewControl("re-enable branch check", func(sc *StatsCoord) error {
+		sc.enableBrSync.Store(true)
+		close(done)
+		return nil
+	}))
+
+	return ret, nil
+}
+
+func (sc *StatsCoord) setGc() {
+	if sc.enableGc.Load() {
+		sc.doGc.Store(true)
+	}
+}
diff --git a/go/libraries/doltcore/sqle/statspro/scheduler_test.go b/go/libraries/doltcore/sqle/statspro/scheduler_test.go
new file mode 100644
index 00000000000..a376febdfbe
--- /dev/null
+++ b/go/libraries/doltcore/sqle/statspro/scheduler_test.go
@@ -0,0 +1,1498 @@
+// Copyright 2025 Dolthub, Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statspro + +import ( + "context" + "fmt" + "github.com/dolthub/dolt/go/libraries/doltcore/branch_control" + "github.com/dolthub/dolt/go/libraries/doltcore/dtestutils" + "github.com/dolthub/dolt/go/libraries/doltcore/env" + "github.com/dolthub/dolt/go/libraries/doltcore/ref" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/writer" + "github.com/dolthub/dolt/go/store/prolly/tree" + gms "github.com/dolthub/go-mysql-server" + "github.com/dolthub/go-mysql-server/sql" + "github.com/dolthub/go-mysql-server/sql/analyzer" + "github.com/dolthub/go-mysql-server/sql/stats" + lru "github.com/hashicorp/golang-lru/v2" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" + "io" + "log" + "os" + "strconv" + "strings" + "sync" + "testing" + "time" +) + +func TestScheduleLoop(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, sqlDbs := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + + { + // add more data + b := strings.Repeat("b", 100) + require.NoError(t, executeQuery(ctx, sqlEng, "create table ab (a int primary key, b varchar(100), key (b,a))")) + abIns := strings.Builder{} + abIns.WriteString("insert into ab values") + for i := range 200 { + if i > 0 { + abIns.WriteString(", ") + } + abIns.WriteString(fmt.Sprintf("(%d, '%s')", i, b)) + } + require.NoError(t, executeQuery(ctx, sqlEng, abIns.String())) + + // run two cycles -> (1) seed, (2) populate + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + ReadJob{ + db: sqlDbs[0], table: "ab", + ordinals: []updateOrdinal{{0, 47}, {47, 59}, {59, 94}, {94, 125}, {125, 159}, {159, 191}, {191, 200}}, + }, + ReadJob{ + db: sqlDbs[0], table: "ab", + ordinals: []updateOrdinal{{0, 26}, {26, 55}, {55, 92}, {92, 110}, {110, 147}, {147, 189}, {189, 200}}, + }, + FinalizeJob{ + tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "ab"}, + editIndexes: map[templateCacheKey]finalizeStruct{ + templateCacheKey{idxName: "PRIMARY"}: {}, + templateCacheKey{idxName: "b"}: {}, + }}, + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "ab"}, {name: "xy"}}}, + }) + + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "ab"}, {name: "xy"}}}, + }) + + // 4 old + 2*7 new ab + kv := sc.kv.(*memStats) + require.Equal(t, 18, kv.buckets.Len()) + require.Equal(t, 4, len(kv.bounds)) + require.Equal(t, 4, len(kv.templates)) + require.Equal(t, 2, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{"mydb", "main", "ab", ""}] + require.Equal(t, 7, len(stat[0].Hist)) + require.Equal(t, 7, len(stat[1].Hist)) + } + + require.NoError(t, executeQuery(ctx, sqlEng, "drop table xy")) + runAndPause(t, ctx, sc, &wg) + runAndPause(t, ctx, sc, &wg) + + doGcCycle(t, ctx, sc) + + kv := sc.kv.(*memStats) + 
require.Equal(t, 14, kv.buckets.Len()) + require.Equal(t, 2, len(kv.bounds)) + require.Equal(t, 2, len(kv.templates)) + require.Equal(t, 1, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{"mydb", "main", "ab", ""}] + require.Equal(t, 2, len(stat)) + require.Equal(t, 7, len(stat[0].Hist)) + require.Equal(t, 7, len(stat[1].Hist)) +} + +func TestAnalyze(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, sqlDbs := defaultSetup(t, threads, true) + + sc.captureFlushQueue(ctx) + + wg := sync.WaitGroup{} + + require.NoError(t, executeQuery(ctx, sqlEng, "insert into xy values (-1,-1)")) + + analyze := NewAnalyzeJob(ctx, sqlDbs[0], []string{"xy"}, ControlJob{}) + sc.Jobs <- analyze + + validateJobState(t, ctx, sc, []StatsJob{ + AnalyzeJob{ + sqlDb: sqlDbs[0], + tables: []string{"xy"}, + }, + }) + + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + ReadJob{db: sqlDbs[0], table: "xy", nodes: []tree.Node{{}}, ordinals: []updateOrdinal{{0, 416}}}, + ReadJob{db: sqlDbs[0], table: "xy", nodes: []tree.Node{{}}, ordinals: []updateOrdinal{{0, 241}}}, + FinalizeJob{ + tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "xy"}, + editIndexes: map[templateCacheKey]finalizeStruct{ + templateCacheKey{idxName: "PRIMARY"}: {}, + templateCacheKey{idxName: "y"}: {}, + }}, + }) + + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{}) + kv := sc.kv.(*memStats) + require.Equal(t, uint64(0), sc.gcCounter.Load()) + require.Equal(t, 6, kv.buckets.Len()) + require.Equal(t, 4, len(kv.bounds)) + require.Equal(t, 2, len(kv.templates)) + require.Equal(t, 1, len(sc.Stats)) + for _, tableStats := range sc.Stats { + require.Equal(t, 2, len(tableStats)) + } +} + +func TestModifyColumn(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, sqlDbs := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + sc.enableGc.Store(false) + { + require.NoError(t, executeQuery(ctx, sqlEng, "alter table xy modify column y bigint")) + + // expect finalize, no GC + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + ReadJob{db: sqlDbs[0], table: "xy", ordinals: []updateOrdinal{{0, 210}, {210, 415}, {415, 470}, {470, 500}}}, + ReadJob{db: sqlDbs[0], table: "xy", ordinals: []updateOrdinal{{0, 267}, {267, 500}}}, + FinalizeJob{ + tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "xy"}, + editIndexes: map[templateCacheKey]finalizeStruct{ + templateCacheKey{idxName: "PRIMARY"}: {}, + templateCacheKey{idxName: "y"}: {}, + }}, + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}}, + }) + + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}}, + }) + + kv := sc.kv.(*memStats) + require.Equal(t, 10, kv.buckets.Len()) + require.Equal(t, 4, len(kv.bounds)) + require.Equal(t, 4, len(kv.templates)) + require.Equal(t, 1, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{"mydb", "main", "xy", ""}] + require.Equal(t, 4, len(stat[0].Hist)) + require.Equal(t, 2, len(stat[1].Hist)) + require.Equal(t, int64(6), sc.bucketCnt.Load()) + + doGcCycle(t, ctx, sc) + require.Equal(t, int64(6), sc.bucketCnt.Load()) + require.Equal(t, 6, kv.buckets.Len()) + } +} + +func TestAddColumn(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, sqlDbs := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + sc.enableGc.Store(false) + 
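+	// GC stays off here so the cache contents can be inspected directly:
+	// adding a column rewrites the schema templates but reuses the data
+	// buckets, since no rows changed.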
+ { + require.NoError(t, executeQuery(ctx, sqlEng, "alter table xy add column z int")) + + // schema but no data change + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + FinalizeJob{ + tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "xy"}, + editIndexes: map[templateCacheKey]finalizeStruct{ + templateCacheKey{idxName: "PRIMARY"}: {}, + }, + }, + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}}, + }) + + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}}, + }) + + kv := sc.kv.(*memStats) + require.Equal(t, 4, kv.buckets.Len()) + require.Equal(t, 2, len(kv.bounds)) + require.Equal(t, 4, len(kv.templates)) // +2 for new schema + require.Equal(t, 1, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{"mydb", "main", "xy", ""}] + require.Equal(t, 2, len(stat[0].Hist)) + require.Equal(t, 2, len(stat[1].Hist)) + require.Equal(t, int64(4), sc.bucketCnt.Load()) + } +} + +func TestDropIndex(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, sqlDbs := defaultSetup(t, threads, true) + sc.enableGc.Store(false) + + wg := sync.WaitGroup{} + + { + require.NoError(t, executeQuery(ctx, sqlEng, "alter table xy drop index y")) + + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + FinalizeJob{ + tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "xy"}, + editIndexes: map[templateCacheKey]finalizeStruct{ + templateCacheKey{idxName: "PRIMARY"}: {}, + }, + }, + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}}, + }) + + runAndPause(t, ctx, sc, &wg) + validateJobState(t, ctx, sc, []StatsJob{ + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}}, + }) + + kv := sc.kv.(*memStats) + require.Equal(t, 4, kv.buckets.Len()) + require.Equal(t, 2, len(kv.bounds)) + require.Equal(t, 3, len(kv.templates)) + require.Equal(t, 1, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{"mydb", "main", "xy", ""}] + require.Equal(t, 1, len(stat)) + require.Equal(t, 2, len(stat[0].Hist)) + require.Equal(t, int64(2), sc.bucketCnt.Load()) + + doGcCycle(t, ctx, sc) + + kv = sc.kv.(*memStats) + require.Equal(t, 2, kv.buckets.Len()) + require.Equal(t, 1, len(kv.bounds)) + require.Equal(t, 1, len(kv.templates)) + require.Equal(t, 1, len(sc.Stats)) + stat = sc.Stats[tableIndexesKey{"mydb", "main", "xy", ""}] + require.Equal(t, 1, len(stat)) + require.Equal(t, 2, len(stat[0].Hist)) + require.Equal(t, int64(2), sc.bucketCnt.Load()) + } +} + +func TestDropTable(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, sqlDbs := defaultSetup(t, threads, true) + sc.enableGc.Store(false) + + wg := sync.WaitGroup{} + { + require.NoError(t, executeQuery(ctx, sqlEng, "create table ab (a int primary key, b int)")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into ab values (0,0)")) + require.NoError(t, executeQuery(ctx, sqlEng, "drop table xy")) + + runAndPause(t, ctx, sc, &wg) + + validateJobState(t, ctx, sc, []StatsJob{ + ReadJob{db: sqlDbs[0], table: "ab", ordinals: []updateOrdinal{{0, 1}}}, + FinalizeJob{ + tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "ab"}, + editIndexes: map[templateCacheKey]finalizeStruct{ + templateCacheKey{idxName: "PRIMARY"}: {}, + }, + }, + FinalizeJob{ + tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "xy"}, + editIndexes: nil, + }, + SeedDbTablesJob{sqlDb: 
sqlDbs[0], tables: []tableStatsInfo{{name: "ab"}}}, + }) + + runAndPause(t, ctx, sc, &wg) + + kv := sc.kv.(*memStats) + require.Equal(t, 5, kv.buckets.Len()) + require.Equal(t, 3, len(kv.bounds)) + require.Equal(t, 3, len(kv.templates)) + require.Equal(t, 1, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{"mydb", "main", "ab", ""}] + require.Equal(t, 1, len(stat)) + require.Equal(t, 1, len(stat[0].Hist)) + + doGcCycle(t, ctx, sc) + + kv = sc.kv.(*memStats) + require.Equal(t, 1, kv.buckets.Len()) + require.Equal(t, 1, len(kv.bounds)) + require.Equal(t, 1, len(kv.templates)) + require.Equal(t, 1, len(sc.Stats)) + stat = sc.Stats[tableIndexesKey{"mydb", "main", "ab", ""}] + require.Equal(t, 1, len(stat)) + require.Equal(t, 1, len(stat[0].Hist)) + require.Equal(t, int64(1), sc.bucketCnt.Load()) + } +} + +func TestDeleteAboveBoundary(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, true) + sc.enableGc.Store(false) + + wg := sync.WaitGroup{} + + require.NoError(t, executeQuery(ctx, sqlEng, "alter table xy drop index y")) + + { + require.NoError(t, executeQuery(ctx, sqlEng, "delete from xy where x > 498")) + + runAndPause(t, ctx, sc, &wg) // seed + runAndPause(t, ctx, sc, &wg) // finalize + + kv := sc.kv.(*memStats) + require.Equal(t, 5, kv.buckets.Len()) // 1 for new chunk + require.Equal(t, 2, len(kv.bounds)) + require.Equal(t, 3, len(kv.templates)) // +1 for schema change + require.Equal(t, 1, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{db: "mydb", branch: "main", table: "xy"}] + require.Equal(t, 2, len(stat[0].Hist)) + require.Equal(t, int64(2), sc.bucketCnt.Load()) + + doGcCycle(t, ctx, sc) + require.Equal(t, 2, kv.buckets.Len()) + require.Equal(t, int64(2), sc.bucketCnt.Load()) + } +} + +func TestDeleteBelowBoundary(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, true) + sc.enableGc.Store(false) + + wg := sync.WaitGroup{} + + require.NoError(t, executeQuery(ctx, sqlEng, "alter table xy drop index y")) + + { + require.NoError(t, executeQuery(ctx, sqlEng, "delete from xy where x > 410")) + + runAndPause(t, ctx, sc, &wg) // seed + runAndPause(t, ctx, sc, &wg) // finalize + + kv := sc.kv.(*memStats) + + require.Equal(t, 5, kv.buckets.Len()) // +1 rewrite partial chunk + require.Equal(t, 3, len(kv.bounds)) // +1 rewrite first chunk + require.Equal(t, 3, len(kv.templates)) + require.Equal(t, 1, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{db: "mydb", branch: "main", table: "xy"}] + require.Equal(t, 1, len(stat[0].Hist)) + require.Equal(t, int64(1), sc.bucketCnt.Load()) + + doGcCycle(t, ctx, sc) + require.Equal(t, 1, kv.buckets.Len()) + require.Equal(t, int64(1), sc.bucketCnt.Load()) + } +} + +func TestDeleteOnBoundary(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, true) + sc.enableGc.Store(false) + + wg := sync.WaitGroup{} + + require.NoError(t, executeQuery(ctx, sqlEng, "alter table xy drop index y")) + + { + // PRIMARY boundary chunk -> rewrite y_idx's second + require.NoError(t, executeQuery(ctx, sqlEng, "delete from xy where x > 414")) + + runAndPause(t, ctx, sc, &wg) // seed + runAndPause(t, ctx, sc, &wg) // finalize + + kv := sc.kv.(*memStats) + require.Equal(t, 4, kv.buckets.Len()) + require.Equal(t, 2, len(kv.bounds)) + require.Equal(t, 3, len(kv.templates)) // +1 schema change + require.Equal(t, 1, len(sc.Stats)) + stat := 
sc.Stats[tableIndexesKey{db: "mydb", branch: "main", table: "xy"}] + require.Equal(t, 1, len(stat[0].Hist)) + require.Equal(t, int64(1), sc.bucketCnt.Load()) + + doGcCycle(t, ctx, sc) + require.Equal(t, 1, kv.buckets.Len()) + require.Equal(t, int64(1), sc.bucketCnt.Load()) + } +} + +func TestAddDropDatabases(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, sqlDbs := defaultSetup(t, threads, true) + sc.enableGc.Store(false) + + wg := sync.WaitGroup{} + + var otherDb sqle.Database + { + require.NoError(t, executeQuery(ctx, sqlEng, "create database otherdb")) + require.NoError(t, executeQuery(ctx, sqlEng, "use otherdb")) + require.NoError(t, executeQuery(ctx, sqlEng, "create table t (i int primary key)")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into t values (0), (1)")) + + for _, db := range sqlEng.Analyzer.Catalog.DbProvider.AllDatabases(ctx) { + if db.Name() == "otherdb" { + dsessDb, err := sqle.RevisionDbForBranch(ctx, db.(dsess.SqlDatabase), "main", "main/"+db.Name()) + require.NoError(t, err) + otherDb = dsessDb.(sqle.Database) + } + } + + // finish queue of read/finalize + runAndPause(t, ctx, sc, &wg) // pull seeds out of interrupt + runAndPause(t, ctx, sc, &wg) + + validateJobState(t, ctx, sc, []StatsJob{ + ReadJob{db: otherDb, table: "t", ordinals: []updateOrdinal{{0, 2}}}, + FinalizeJob{ + tableKey: tableIndexesKey{db: "otherdb", branch: "main", table: "t"}, + editIndexes: map[templateCacheKey]finalizeStruct{ + templateCacheKey{idxName: "PRIMARY"}: {}, + }}, + SeedDbTablesJob{sqlDb: otherDb, tables: []tableStatsInfo{{name: "t"}}}, + SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}}, + }) + + runAndPause(t, ctx, sc, &wg) + + // xy and t + kv := sc.kv.(*memStats) + require.Equal(t, 5, kv.buckets.Len()) + require.Equal(t, 3, len(kv.bounds)) + require.Equal(t, 3, len(kv.templates)) + require.Equal(t, 2, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{db: "otherdb", branch: "main", table: "t"}] + require.Equal(t, 1, len(stat)) + } + + dropHook := NewDropDatabaseHook(sc) + { + require.NoError(t, executeQuery(ctx, sqlEng, "drop database otherdb")) + dropHook(ctx, "otherdb") + + _, ok := sc.Stats[tableIndexesKey{db: "otherdb", branch: "main", table: "t"}] + require.False(t, ok) + } +} + +func TestGC(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + + { + require.NoError(t, executeQuery(ctx, sqlEng, "create database otherdb")) + require.NoError(t, executeQuery(ctx, sqlEng, "use otherdb")) + require.NoError(t, executeQuery(ctx, sqlEng, "create table t (i int primary key)")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into t values (0), (1)")) + + require.NoError(t, executeQuery(ctx, sqlEng, "create database thirddb")) + require.NoError(t, executeQuery(ctx, sqlEng, "use thirddb")) + require.NoError(t, executeQuery(ctx, sqlEng, "create table s (i int primary key, j int, key (j))")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into s values (0,0), (1,1), (2,2)")) + + runAndPause(t, ctx, sc, &wg) // seed interrupt + runAndPause(t, ctx, sc, &wg) // read jobs + runAndPause(t, ctx, sc, &wg) // finalize + + dropHook := NewDropDatabaseHook(sc) + require.NoError(t, executeQuery(ctx, sqlEng, "drop database otherdb")) + dropHook(ctx, "otherdb") + + require.NoError(t, executeQuery(ctx, sqlEng, "alter table s drop index j")) + + runAndPause(t, ctx, sc, &wg) // pick up table drop + 
runAndPause(t, ctx, sc, &wg) // finalize + + doGcCycle(t, ctx, sc) + + // test for cleanup + kv := sc.kv.(*memStats) + require.Equal(t, 5, kv.buckets.Len()) + require.Equal(t, 3, len(kv.bounds)) + require.Equal(t, 3, len(kv.templates)) + require.Equal(t, 2, len(sc.Stats)) + } +} + +func TestBranches(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + sc.enableGc.Store(true) + + { + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_commit('-Am', 'add xy')")) + + require.NoError(t, executeQuery(ctx, sqlEng, "create database otherdb")) + require.NoError(t, executeQuery(ctx, sqlEng, "use otherdb")) + require.NoError(t, executeQuery(ctx, sqlEng, "create table t (i int primary key)")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into t values (0), (1)")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_commit('-Am', 'add t')")) + + require.NoError(t, executeQuery(ctx, sqlEng, "create database thirddb")) + require.NoError(t, executeQuery(ctx, sqlEng, "use thirddb")) + require.NoError(t, executeQuery(ctx, sqlEng, "create table s (i int primary key, j int, key (j))")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into s values (0,0), (1,1), (2,2)")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_commit('-Am', 'add s')")) + + runAndPause(t, ctx, sc, &wg) // seed interrupt + runAndPause(t, ctx, sc, &wg) // read jobs + runAndPause(t, ctx, sc, &wg) // finalize + + require.NoError(t, executeQuery(ctx, sqlEng, "use mydb")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('-b', 'feat1')")) + + require.NoError(t, executeQuery(ctx, sqlEng, "use otherdb")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('-b', 'feat2')")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into t values (2), (3)")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_commit('-Am', 'insert into t')")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('-b', 'feat3')")) + require.NoError(t, executeQuery(ctx, sqlEng, "drop table t")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_commit('-Am', 'drop t')")) + + require.NoError(t, executeQuery(ctx, sqlEng, "use thirddb")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('-b', 'feat1')")) + require.NoError(t, executeQuery(ctx, sqlEng, "alter table s drop index j")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_commit('-Am', 'drop index j')")) + + runAndPause(t, ctx, sc, &wg) // pick up table changes + runAndPause(t, ctx, sc, &wg) // finalize + + sc.doBranchSync.Store(true) + runAndPause(t, ctx, sc, &wg) // new branches + + require.Equal(t, 7, len(sc.dbs)) + stat, ok := sc.Stats[tableIndexesKey{"otherdb", "feat2", "t", ""}] + require.False(t, ok) + stat, ok = sc.Stats[tableIndexesKey{"otherdb", "feat3", "t", ""}] + require.False(t, ok) + stat, ok = sc.Stats[tableIndexesKey{"thirddb", "feat1", "s", ""}] + require.False(t, ok) + stat, ok = sc.Stats[tableIndexesKey{"otherdb", "main", "t", ""}] + require.Equal(t, 1, len(stat)) + stat = sc.Stats[tableIndexesKey{"thirddb", "main", "s", ""}] + require.Equal(t, 2, len(stat)) + + runAndPause(t, ctx, sc, &wg) // seed new branches + runAndPause(t, ctx, sc, &wg) // finalize branches + + require.Equal(t, 7, len(sc.dbs)) + + stat, ok = sc.Stats[tableIndexesKey{"mydb", "feat1", "xy", ""}] + require.True(t, ok) + require.Equal(t, 2, len(stat)) + stat, ok = sc.Stats[tableIndexesKey{"otherdb", 
"feat2", "t", ""}] + require.True(t, ok) + require.Equal(t, 1, len(stat)) + stat, ok = sc.Stats[tableIndexesKey{"otherdb", "feat3", "t", ""}] + require.False(t, ok) + stat, ok = sc.Stats[tableIndexesKey{"thirddb", "feat1", "s", ""}] + require.True(t, ok) + require.Equal(t, 1, len(stat)) + + // mydb: 4 shared + // otherdb: 1 + 1 + // thirddb: 2 + shared + kv := sc.kv.(*memStats) + require.Equal(t, 4+2+2, kv.buckets.Len()) + require.Equal(t, 2+(1+1)+2, len(kv.bounds)) + require.Equal(t, 2+1+(2+1), len(kv.templates)) + require.Equal(t, 7-1, len(sc.Stats)) + + dropHook := NewDropDatabaseHook(sc) + require.NoError(t, executeQuery(ctx, sqlEng, "drop database otherdb")) + dropHook(ctx, "otherdb") + + runAndPause(t, ctx, sc, &wg) // finalize drop otherdb + + require.Equal(t, 4, len(sc.dbs)) + stat, ok = sc.Stats[tableIndexesKey{"otherdb", "feat2", "t", ""}] + require.False(t, ok) + stat, ok = sc.Stats[tableIndexesKey{"otherdb", "main", "t", ""}] + require.False(t, ok) + + require.NoError(t, executeQuery(ctx, sqlEng, "use mydb")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('main')")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_branch('-D', 'feat1')")) + + sc.doBranchSync.Store(true) + runAndPause(t, ctx, sc, &wg) // detect deleted branch + runAndPause(t, ctx, sc, &wg) // finalize branch delete + + require.Equal(t, 3, len(sc.dbs)) + stat, ok = sc.Stats[tableIndexesKey{"mydb", "feat1", "xy", ""}] + require.False(t, ok) + stat, ok = sc.Stats[tableIndexesKey{"mydb", "main", "xy", ""}] + require.True(t, ok) + + doGcCycle(t, ctx, sc) + + // 3 dbs remaining, mydb/main, thirddb/feat1, thirddb/main + kv = sc.kv.(*memStats) + require.Equal(t, 4+2, kv.buckets.Len()) + require.Equal(t, 4, len(kv.bounds)) + require.Equal(t, 5, len(kv.templates)) + require.Equal(t, 3, len(sc.Stats)) + } +} + +func TestBucketDoubling(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + + cur := sc.kv.(*memStats).buckets + newB, _ := lru.New[bucketKey, *stats.Bucket](4) + for _, k := range cur.Keys() { + v, _ := cur.Get(k) + newB.Add(k, v) + } + sc.kv.(*memStats).buckets = newB + sc.bucketCap = 4 + + // add more data + b := strings.Repeat("b", 100) + require.NoError(t, executeQuery(ctx, sqlEng, "create table ab (a int primary key, b varchar(100), key (b,a))")) + abIns := strings.Builder{} + abIns.WriteString("insert into ab values") + for i := range 200 { + if i > 0 { + abIns.WriteString(", ") + } + abIns.WriteString(fmt.Sprintf("(%d, '%s')", i, b)) + } + require.NoError(t, executeQuery(ctx, sqlEng, abIns.String())) + + sc.enableGc.Store(true) + + runAndPause(t, ctx, sc, &wg) // track ab + runAndPause(t, ctx, sc, &wg) // finalize ab + + // 4 old + 2*7 new ab + kv := sc.kv.(*memStats) + require.Equal(t, 18, kv.buckets.Len()) + require.Equal(t, 4, len(kv.bounds)) + require.Equal(t, 4, len(kv.templates)) + require.Equal(t, 2, len(sc.Stats)) + stat := sc.Stats[tableIndexesKey{"mydb", "main", "ab", ""}] + require.Equal(t, 7, len(stat[0].Hist)) + require.Equal(t, 7, len(stat[1].Hist)) +} + +func TestBucketCounting(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + + // add more data + b := strings.Repeat("b", 100) + require.NoError(t, executeQuery(ctx, sqlEng, "create table ab (a int primary key, b varchar(100), key (b,a))")) + abIns := strings.Builder{} + 
abIns.WriteString("insert into ab values") + for i := range 200 { + if i > 0 { + abIns.WriteString(", ") + } + abIns.WriteString(fmt.Sprintf("(%d, '%s')", i, b)) + } + require.NoError(t, executeQuery(ctx, sqlEng, abIns.String())) + + sc.enableGc.Store(false) + + runAndPause(t, ctx, sc, &wg) // track ab + runAndPause(t, ctx, sc, &wg) // finalize ab + + // 4 old + 2*7 new ab + kv := sc.kv.(*memStats) + require.Equal(t, 18, kv.buckets.Len()) + require.Equal(t, 2, len(sc.Stats)) + + require.NoError(t, executeQuery(ctx, sqlEng, "create table cd (c int primary key, d varchar(200), key (d,c))")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into cd select a,b from ab")) + + runAndPause(t, ctx, sc, &wg) // track ab + runAndPause(t, ctx, sc, &wg) // finalize ab + + // no new buckets + kv = sc.kv.(*memStats) + require.Equal(t, 18, kv.buckets.Len()) + require.Equal(t, 3, len(sc.Stats)) +} + +func TestDropOnlyDb(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, false) + + require.NoError(t, sc.Restart(ctx)) + + _, ok := sc.kv.(*prollyStats) + require.True(t, ok) + require.Equal(t, "mydb", sc.statsBackingDb) + + // what happens when we drop the only database? swap to memory? + // add first database, switch to prolly? + require.NoError(t, executeQuery(ctx, sqlEng, "drop database mydb")) + + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()")) + + sc.Stop() + + // empty memory KV + _, ok = sc.kv.(*memStats) + require.True(t, ok) + require.Equal(t, "", sc.statsBackingDb) + + require.NoError(t, executeQuery(ctx, sqlEng, "create database otherdb")) + + // empty prollyKv + _, ok = sc.kv.(*prollyStats) + require.True(t, ok) + require.Equal(t, "otherdb", sc.statsBackingDb) +} + +func TestRotateBackingDb(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, startDbs := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + + prollyKv, err := NewProllyStats(ctx, startDbs[0]) + require.NoError(t, err) + prollyKv.mem = sc.kv.(*memStats) + + require.NoError(t, executeQuery(ctx, sqlEng, "create database backupdb")) + + require.NoError(t, executeQuery(ctx, sqlEng, "use backupdb")) + require.NoError(t, executeQuery(ctx, sqlEng, "create table xy (x int primary key, y int)")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into xy values (0,0), (1,1), (2,2)")) + + runAndPause(t, ctx, sc, &wg) // seed + runAndPause(t, ctx, sc, &wg) // track xy + runAndPause(t, ctx, sc, &wg) // finalize xy + + require.Equal(t, 5, sc.kv.Len()) + require.Equal(t, 2, len(sc.Stats)) + + require.NoError(t, executeQuery(ctx, sqlEng, "drop database mydb")) + + prollyKv, ok := sc.kv.(*prollyStats) + require.True(t, ok) + require.Equal(t, "backupdb", sc.statsBackingDb) + + // lost the backing storage, previous in-memory moves into new kv + require.Equal(t, 5, sc.kv.Len()) + require.Equal(t, 1, len(sc.Stats)) + +} + +func TestReadCounter(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := defaultSetup(t, threads, true) + wg := sync.WaitGroup{} + + { + require.Equal(t, 0, sc.Info().ReadCnt) + + require.NoError(t, executeQuery(ctx, sqlEng, "insert into xy values (501, 0)")) + runAndPause(t, ctx, sc, &wg) + + require.Equal(t, 2, sc.Info().ReadCnt) + } +} + +func TestJobQueueDoubling(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + dEnv := dtestutils.CreateTestEnv() + sqlEng, ctx := 
newTestEngine(context.Background(), dEnv, threads)
+	defer sqlEng.Close()
+
+	sc := sqlEng.Analyzer.Catalog.StatsProvider.(*StatsCoord)
+	sc.Jobs = make(chan StatsJob, 1)
+
+	var jobs []StatsJob
+	for range 1025 {
+		jobs = append(jobs, ControlJob{})
+	}
+	require.NoError(t, sc.sendJobs(ctx, jobs...))
+	require.Equal(t, 1025, len(sc.Jobs))
+	require.Equal(t, 2048, cap(sc.Jobs))
+}
+
+func TestEmptyTable(t *testing.T) {
+	threads := sql.NewBackgroundThreads()
+	defer threads.Shutdown()
+	ctx, sqlEng, sc, sqlDbs := emptySetup(t, threads, false)
+	wg := sync.WaitGroup{}
+
+	require.NoError(t, executeQuery(ctx, sqlEng, "create table xy (x int primary key, y varchar(10), key (y,x))"))
+
+	runAndPause(t, ctx, sc, &wg)
+	validateJobState(t, ctx, sc, []StatsJob{
+		FinalizeJob{
+			tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "xy"},
+			editIndexes: map[templateCacheKey]finalizeStruct{
+				templateCacheKey{idxName: "PRIMARY"}: {},
+				templateCacheKey{idxName: "y"}:       {},
+			}},
+		SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}},
+	})
+}
+
+func TestPanic(t *testing.T) {
+	threads := sql.NewBackgroundThreads()
+	defer threads.Shutdown()
+	ctx, sqlEng, sc, _ := emptySetup(t, threads, false)
+	sc.SetEnableGc(true)
+
+	require.NoError(t, sc.Restart(ctx))
+
+	sc.Control(ctx, "panic", func(sc *StatsCoord) error {
+		panic("test panic")
+	})
+
+	require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()"))
+}
+
+func TestValidate(t *testing.T) {
+	threads := sql.NewBackgroundThreads()
+	defer threads.Shutdown()
+	ctx, sqlEng, sc, _ := emptySetup(t, threads, false)
+	sc.SetEnableGc(true)
+
+	require.NoError(t, sc.Restart(ctx))
+
+	require.NoError(t, executeQuery(ctx, sqlEng, "create table xy (x int primary key, y varchar(10), key (y,x))"))
+	require.NoError(t, executeQuery(ctx, sqlEng, "insert into xy values (0,0), (1,1), (2,2)"))
+
+	require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()"))
+
+	// the cache, stats map, and tracked dbs should agree once updates settle
+	require.NoError(t, sc.ValidateState(ctx))
+}
+
+func TestPurge(t *testing.T) {
+	threads := sql.NewBackgroundThreads()
+	defer threads.Shutdown()
+	ctx, sqlEng, sc, _ := emptySetup(t, threads, false)
+	sc.SetEnableGc(true)
+
+	require.NoError(t, sc.Restart(ctx))
+
+	require.NoError(t, executeQuery(ctx, sqlEng, "create table xy (x int primary key, y varchar(10), key (y,x))"))
+	require.NoError(t, executeQuery(ctx, sqlEng, "insert into xy values (0,0), (1,1), (2,2)"))
+	require.NoError(t, executeQuery(ctx, sqlEng, "create database other"))
+	require.NoError(t, executeQuery(ctx, sqlEng, "use other"))
+	require.NoError(t, executeQuery(ctx, sqlEng, "create table ab (a int primary key, b varchar(10), key (b,a))"))
+	require.NoError(t, executeQuery(ctx, sqlEng, "insert into ab values (0,0), (1,1), (2,2)"))
+
+	require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()"))
+
+	sc.Stop()
+
+	kv := sc.kv.(*prollyStats)
+	require.Equal(t, 2, kv.Len())
+	require.Equal(t, 4, len(kv.mem.templates))
+	require.Equal(t, 2, len(kv.mem.bounds))
+	m, err := kv.m.Map(ctx)
+	require.NoError(t, err)
+	cmpCnt, err := m.Count()
+	require.NoError(t, err)
+	require.Equal(t, 2, cmpCnt)
+
+	require.NoError(t, sc.Purge(ctx))
+
+	kv = sc.kv.(*prollyStats)
+	require.Equal(t, 0, kv.Len())
+	require.Equal(t, 0, len(kv.mem.templates))
+	require.Equal(t, 0, len(kv.mem.bounds))
+	m, err = kv.m.Map(ctx)
+	require.NoError(t, err)
+	cmpCnt, err = m.Count()
+	require.NoError(t, err)
+	require.Equal(t, 0, cmpCnt)
+}
+
+func emptySetup(t *testing.T, threads *sql.BackgroundThreads, memOnly bool) (*sql.Context, *gms.Engine, *StatsCoord, []sqle.Database) {
+	dEnv := dtestutils.CreateTestEnv()
+	sqlEng, ctx := newTestEngine(context.Background(), dEnv, threads)
+	ctx.Session.SetClient(sql.Client{
+		User:    "billy boy",
+		Address: "bigbillie@fake.horse",
+	})
+
+	sc := sqlEng.Analyzer.Catalog.StatsProvider.(*StatsCoord)
+	sc.SetEnableGc(false)
+	sc.enableBrSync.Store(false)
+	require.NoError(t, sc.Restart(ctx))
+
+	ctx, _ = sc.ctxGen(ctx)
+	ctx.Session.SetClient(sql.Client{
+		User:    "billy boy",
+		Address: "bigbillie@fake.horse",
+	})
+	require.NoError(t, executeQuery(ctx, sqlEng, "create database mydb"))
+	require.NoError(t, executeQuery(ctx, sqlEng, "use mydb"))
+
+	require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()"))
+	sc.Stop()
+
+	var sqlDbs []sqle.Database
+	for _, db := range sqlEng.Analyzer.Catalog.DbProvider.AllDatabases(ctx) {
+		if sqlDb, ok := db.(sqle.Database); ok {
+			branch := ref.NewBranchRef("main")
+			db, err := sqle.RevisionDbForBranch(ctx, sqlDb, branch.GetPath(), branch.GetPath()+"/"+sqlDb.AliasedName())
+			require.NoError(t, err)
+			sqlDbs = append(sqlDbs, db.(sqle.Database))
+		}
+	}
+
+	if memOnly {
+		statsKv := NewMemStats()
+		sc.kv = statsKv
+	}
+
+	sc.enableBrSync.Store(true)
+
+	return ctx, sqlEng, sc, sqlDbs
+}
+
+func defaultSetup(t *testing.T, threads *sql.BackgroundThreads, memOnly bool) (*sql.Context, *gms.Engine, *StatsCoord, []sqle.Database) {
+	ctx, sqlEng, sc, sqlDbs := emptySetup(t, threads, memOnly)
+	//sc.Debug = true
+
+	wg := sync.WaitGroup{}
+
+	require.NoError(t, executeQuery(ctx, sqlEng, "create table xy (x int primary key, y int, key (y,x))"))
+
+	xyIns := strings.Builder{}
+	xyIns.WriteString("insert into xy values")
+	for i := range 500 {
+		if i > 0 {
+			xyIns.WriteString(", ")
+		}
+		xyIns.WriteString(fmt.Sprintf("(%d, %d)", i, i%25))
+	}
+	require.NoError(t, executeQuery(ctx, sqlEng, xyIns.String()))
+
+	{
+		// seed creates read jobs
+		runAndPause(t, ctx, sc, &wg)
+		validateJobState(t, ctx, sc, []StatsJob{
+			ReadJob{db: sqlDbs[0], table: "xy", ordinals: []updateOrdinal{{0, 415}, {415, 500}}},
+			ReadJob{db: sqlDbs[0], table: "xy", ordinals: []updateOrdinal{{0, 240}, {240, 500}}},
+			FinalizeJob{
+				tableKey: tableIndexesKey{db: "mydb", branch: "main", table: "xy"},
+				editIndexes: map[templateCacheKey]finalizeStruct{
+					templateCacheKey{idxName: "PRIMARY"}: {},
+					templateCacheKey{idxName: "y"}:       {},
+				}},
+			SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}},
+		})
+	}
+
+	{
+		// read jobs populate the cache
+		runAndPause(t, ctx, sc, &wg)
+
+		validateJobState(t, ctx, sc, []StatsJob{
+			SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}},
+		})
+
+		var kv *memStats
+		switch s := sc.kv.(type) {
+		case *memStats:
+			kv = s
+		case *prollyStats:
+			kv = s.mem
+		}
+		require.Equal(t, 4, kv.buckets.Len())
+		require.Equal(t, 2, len(kv.bounds))
+		require.Equal(t, 2, len(kv.templates))
+		require.Equal(t, 1, len(sc.Stats))
+		for _, tableStats := range sc.Stats {
+			require.Equal(t, 2, len(tableStats))
+		}
+	}
+
+	{
+		// a seed with no changes yields no new jobs
+		runAndPause(t, ctx, sc, &wg)
+
+		validateJobState(t, ctx, sc, []StatsJob{
+			SeedDbTablesJob{sqlDb: sqlDbs[0], tables: []tableStatsInfo{{name: "xy"}}},
+		})
+
+		var kv *memStats
+		switch s := sc.kv.(type) {
+		case *memStats:
+			kv = s
+		case *prollyStats:
+			kv = s.mem
+		}
+		require.Equal(t, 4, kv.buckets.Len())
+		require.Equal(t, 2, len(kv.bounds))
+		require.Equal(t, 2, len(kv.templates))
+		require.Equal(t, 1, len(sc.Stats))
+		for _, tableStats := range sc.Stats {
+			require.Equal(t, 2, len(tableStats))
+		}
+	}
+	return ctx, sqlEng, sc, sqlDbs
+}
+
+// validateJobState flushes the current job queue, compares it against
+// |expected|, and then re-queues the flushed jobs in order so the
+// scheduler can resume.
+func validateJobState(t *testing.T, ctx context.Context, sc *StatsCoord, expected []StatsJob) {
+	jobs, err := sc.captureFlushQueue(ctx)
+	require.NoError(t, err)
+
+	require.Equal(t, len(expected), len(jobs), fmt.Sprintf("expected: %s; found: %s", expected, jobs))
+	for i, j := range jobs {
+		switch j := j.(type) {
+		case SeedDbTablesJob:
+			ej, ok := expected[i].(SeedDbTablesJob)
+			require.True(t, ok)
+			for i := range ej.tables {
+				require.Equal(t, ej.tables[i].name, j.tables[i].name)
+			}
+			require.Equal(t, ej.sqlDb.AliasedName(), j.sqlDb.AliasedName())
+			require.Equal(t, ej.sqlDb.Revision(), j.sqlDb.Revision())
+		case ReadJob:
+			ej, ok := expected[i].(ReadJob)
+			require.True(t, ok)
+			require.Equal(t, ej.table, j.table)
+			require.Equal(t, ej.ordinals, j.ordinals)
+			require.Equal(t, ej.db.AliasedName(), j.db.AliasedName())
+			require.Equal(t, ej.db.Revision(), j.db.Revision())
+		case FinalizeJob:
+			ej, ok := expected[i].(FinalizeJob)
+			require.True(t, ok)
+			require.Equal(t, ej.tableKey, j.tableKey)
+			idx := make(map[string]bool)
+			for k := range j.editIndexes {
+				idx[k.idxName] = true
+			}
+			for k := range ej.editIndexes {
+				if _, ok := idx[k.idxName]; !ok {
+					require.Fail(t, "missing index: "+k.idxName)
+				}
+			}
+		case ControlJob:
+			ej, ok := expected[i].(ControlJob)
+			require.True(t, ok)
+			require.Equal(t, ej.desc, j.desc)
+		case AnalyzeJob:
+			ej, ok := expected[i].(AnalyzeJob)
+			require.True(t, ok)
+			require.Equal(t, ej.tables, j.tables)
+			require.Equal(t, ej.sqlDb, j.sqlDb)
+		}
+	}
+
+	// expect the queue to fit all jobs, otherwise this deadlocks;
+	// since we stopped accepting before running this, it should
+	// just roundtrip to/from the same buffer
+	for _, j := range jobs {
+		select {
+		case <-ctx.Done():
+			return
+		default:
+			sc.Jobs <- j
+		}
+	}
+}
+
+func waitOnJob(wg *sync.WaitGroup, done chan struct{}) {
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		<-done
+	}()
+}
+
+func doGcCycle(t *testing.T, ctx *sql.Context, sc *StatsCoord) {
+	sc.enableGc.Store(true)
+	sc.doGc.Store(true)
+	defer sc.enableGc.Store(false)
+
+	wg := sync.WaitGroup{}
+	runAndPause(t, ctx, sc, &wg) // do GC
+	runAndPause(t, ctx, sc, &wg) // pick up finish GC job
+
+	sc.gcMu.Lock()
+	defer sc.gcMu.Unlock()
+	require.False(t, sc.doGc.Load())
+}
+
+// runAndPause schedules a stop job and then restarts the scheduler. The stop
+// job closes the controller's done channel before the job finishes, and
+// before the next run loop, so the loop is effectively inactive even if the
+// goroutine is still closing by the time we flush and validate the queue.
+func runAndPause(t *testing.T, ctx *sql.Context, sc *StatsCoord, wg *sync.WaitGroup) {
+	j := NewControl("pause", func(sc *StatsCoord) error {
+		sc.Stop()
+		return nil
+	})
+	sc.Jobs <- j
+	waitOnJob(wg, j.done)
+	require.NoError(t, sc.Restart(ctx))
+	wg.Wait()
+}
+ j := NewControl("pause", func(sc *StatsCoord) error { + sc.Stop() + return nil + }) + sc.Jobs <- j + waitOnJob(wg, j.done) + require.NoError(t, sc.Restart(ctx)) + wg.Wait() + return +} + +func executeQuery(ctx *sql.Context, eng *gms.Engine, query string) error { + _, iter, _, err := eng.Query(ctx, query) + if err != nil { + return err + } + for { + _, err = iter.Next(ctx) + if err == io.EOF { + break + } + if err != nil { + return err + } + } + return iter.Close(ctx) // tx commit +} + +func executeQueryResults(ctx *sql.Context, eng *gms.Engine, query string) ([]sql.Row, error) { + _, iter, _, err := eng.Query(ctx, query) + if err != nil { + return nil, err + } + var ret []sql.Row + for { + r, err := iter.Next(ctx) + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + ret = append(ret, r) + } + return ret, iter.Close(ctx) // tx commit +} + +func newTestEngine(ctx context.Context, dEnv *env.DoltEnv, threads *sql.BackgroundThreads) (*gms.Engine, *sql.Context) { + pro, err := sqle.NewDoltDatabaseProviderWithDatabases("main", dEnv.FS, nil, nil) + if err != nil { + panic(err) + } + + mrEnv, err := env.MultiEnvForDirectory(ctx, dEnv.Config.WriteableConfig(), dEnv.FS, dEnv.Version, dEnv) + if err != nil { + panic(err) + } + + sc := NewStatsCoord(pro, nil, logrus.StandardLogger(), threads, dEnv) + + gcSafepointController := dsess.NewGCSafepointController() + + doltSession, err := dsess.NewDoltSession(sql.NewBaseSession(), pro, dEnv.Config.WriteableConfig(), branch_control.CreateDefaultController(ctx), sc, writer.NewWriteSession, gcSafepointController) + if err != nil { + panic(err) + } + + sqlCtx := sql.NewContext(ctx, sql.WithSession(doltSession)) + sqlCtx.SetCurrentDatabase(mrEnv.GetFirstDatabase()) + + sc.ctxGen = func(ctx context.Context) (*sql.Context, error) { + doltSession, err := dsess.NewDoltSession(sql.NewBaseSession(), pro, dEnv.Config.WriteableConfig(), branch_control.CreateDefaultController(ctx), sc, writer.NewWriteSession, gcSafepointController) + if err != nil { + return nil, err + } + return sql.NewContext(ctx, sql.WithSession(doltSession)), nil + } + + pro.InitDatabaseHooks = append(pro.InitDatabaseHooks, NewInitDatabaseHook(sc)) + pro.DropDatabaseHooks = append(pro.DropDatabaseHooks, NewDropDatabaseHook(sc)) + + sqlEng := gms.New(analyzer.NewBuilder(pro).Build(), &gms.Config{ + IsReadOnly: false, + IsServerLocked: false, + }) + sqlEng.Analyzer.Catalog.StatsProvider = sc + return sqlEng, sqlCtx +} + +func TestStatsGcConcurrency(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := emptySetup(t, threads, false) + sc.SetEnableGc(true) + sc.JobInterval = 1 * time.Nanosecond + sc.gcInterval = 100 * time.Nanosecond + sc.branchInterval = 50 * time.Nanosecond + require.NoError(t, sc.Restart(ctx)) + + addDb := func(ctx *sql.Context, dbName string) { + require.NoError(t, executeQuery(ctx, sqlEng, "create database "+dbName)) + } + + addData := func(ctx *sql.Context, dbName string, i int) { + //log.Println("add ", dbName) + require.NoError(t, executeQuery(ctx, sqlEng, "use "+dbName)) + require.NoError(t, executeQuery(ctx, sqlEng, "create table xy (x int primary key, y int)")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into xy values (0,0),(1,1),(2,2),(3,3),(4,4),(5,5), (6,"+strconv.Itoa(i)+")")) + } + + dropDb := func(dropCtx *sql.Context, dbName string) { + //log.Println("drop ", dbName) + require.NoError(t, executeQuery(ctx, sqlEng, "use mydb")) + require.NoError(t, executeQuery(ctx, sqlEng, "drop 
database "+dbName)) + } + + // it is important to use new sessions for this test, to avoid working root conflicts + addCtx, _ := sc.ctxGen(context.Background()) + writeCtx, _ := sc.ctxGen(context.Background()) + dropCtx, _ := sc.ctxGen(context.Background()) + + iters := 200 + dbs := make(chan string, iters) + + { + wg := sync.WaitGroup{} + wg.Add(2) + + addCnt := 0 + go func() { + for i := range iters { + addCnt++ + dbName := "db" + strconv.Itoa(i) + addDb(addCtx, dbName) + addData(writeCtx, dbName, i) + dbs <- dbName + } + close(dbs) + wg.Done() + }() + + dropCnt := 0 + go func() { + i := 0 + for db := range dbs { + if i%2 == 0 { + time.Sleep(50 * time.Millisecond) + dropCnt++ + dropDb(dropCtx, db) + } + i++ + } + wg.Done() + }() + + wg.Wait() + + sc.doBranchSync.Store(true) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()")) + sc.doGc.Store(true) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()")) + sc.Stop() + + // 101 dbs, 100 with stats (not main) + require.Equal(t, iters/2+1, len(sc.dbs)) + require.Equal(t, iters/2, len(sc.Stats)) + require.NoError(t, sc.ValidateState(ctx)) + require.Equal(t, iters/2, sc.kv.Len()) + } +} + +func TestStatsBranchConcurrency(t *testing.T) { + threads := sql.NewBackgroundThreads() + defer threads.Shutdown() + ctx, sqlEng, sc, _ := emptySetup(t, threads, false) + sc.SetEnableGc(true) + + sc.JobInterval = 10 + sc.gcInterval = 100 + sc.branchInterval = 100 + require.NoError(t, sc.Restart(ctx)) + + addBranch := func(ctx *sql.Context, i int) { + branchName := "branch" + strconv.Itoa(i) + require.NoError(t, executeQuery(ctx, sqlEng, "use mydb")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('main')")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('-b', '"+branchName+"')")) + } + + addData := func(ctx *sql.Context, i int) { + branchName := "branch" + strconv.Itoa(i) + require.NoError(t, executeQuery(ctx, sqlEng, "use mydb")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('"+branchName+"')")) + require.NoError(t, executeQuery(ctx, sqlEng, "create table xy (x int primary key, y int)")) + require.NoError(t, executeQuery(ctx, sqlEng, "insert into xy values (0,0),(1,1),(2,2),(3,3),(4,4),(5,5), (6,"+strconv.Itoa(i)+")")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()")) + + } + + dropBranch := func(dropCtx *sql.Context, branchName string) { + //log.Println("delete branch: ", branchName) + require.NoError(t, executeQuery(ctx, sqlEng, "use mydb")) + del := "call dolt_branch('-d', '" + branchName + "')" + require.NoError(t, executeQuery(ctx, sqlEng, del)) + } + + // it is important to use new sessions for this test, to avoid working root conflicts + addCtx, _ := sc.ctxGen(context.Background()) + dropCtx, _ := sc.ctxGen(context.Background()) + + iters := 100 + { + branches := make(chan string, iters) + + wg := sync.WaitGroup{} + wg.Add(2) + + go func() { + for i := range iters { + addBranch(addCtx, i) + addData(addCtx, i) + branches <- "branch" + strconv.Itoa(i) + } + close(branches) + wg.Done() + }() + + go func() { + i := 0 + for br := range branches { + if i%2 == 0 { + dropBranch(dropCtx, br) + time.Sleep(50 * time.Millisecond) + } + i++ + } + wg.Done() + }() + + wg.Wait() + + sc.doBranchSync.Store(true) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()")) + sc.doGc.Store(true) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()")) + sc.Stop() + + // at the end we should still have |iters/2| databases 
+func TestStatsCacheGrowth(t *testing.T) {
+	//t.Skip("expensive test")
+
+	threads := sql.NewBackgroundThreads()
+	defer threads.Shutdown()
+	ctx, sqlEng, sc, _ := emptySetup(t, threads, false)
+	sc.SetEnableGc(true)
+
+	sc.JobInterval = 10
+	sc.gcInterval = 100
+	sc.branchInterval = 100
+	require.NoError(t, sc.Restart(ctx))
+
+	addBranch := func(ctx *sql.Context, i int) {
+		branchName := "branch" + strconv.Itoa(i)
+		require.NoError(t, executeQuery(ctx, sqlEng, "use mydb"))
+		require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('main')"))
+		require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('-b', '"+branchName+"')"))
+	}
+
+	addData := func(ctx *sql.Context, i int) {
+		branchName := "branch" + strconv.Itoa(i)
+		require.NoError(t, executeQuery(ctx, sqlEng, "use mydb"))
+		require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_checkout('"+branchName+"')"))
+		require.NoError(t, executeQuery(ctx, sqlEng, "create table xy (x int primary key, y int)"))
+		require.NoError(t, executeQuery(ctx, sqlEng, "insert into xy values (0,0),(1,1),(2,2),(3,3),(4,4),(5,5), (6,"+strconv.Itoa(i)+")"))
+	}
+
+	// it is important to use new sessions for this test, to avoid working root conflicts
+	iters := 2000
+	if os.Getenv("CI") != "" {
+		iters = 1025
+	}
+	{
+		branches := make(chan string, iters)
+
+		go func() {
+			addCtx, _ := sc.ctxGen(context.Background())
+			for i := range iters {
+				addBranch(addCtx, i)
+				addData(addCtx, i)
+				branches <- "branch" + strconv.Itoa(i)
+				if i%500 == 0 {
+					log.Println("branches: ", strconv.Itoa(i))
+					require.NoError(t, executeQuery(addCtx, sqlEng, "call dolt_stats_wait()"))
+				}
+			}
+			close(branches)
+		}()
+
+		// drain so the writer never blocks on the channel
+		for range branches {
+		}
+
+		sc.doBranchSync.Store(true)
+		require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()"))
+		sc.doGc.Store(true)
+		require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()"))
+		sc.Stop()
+
+		// nothing is dropped here, so at the end we should have stats for all |iters| branches
+		require.Equal(t, iters, len(sc.Stats))
+		require.NoError(t, sc.ValidateState(ctx))
+		require.Equal(t, iters, sc.kv.Len())
+	}
+}
diff --git a/go/libraries/doltcore/sqle/statspro/script_test.go b/go/libraries/doltcore/sqle/statspro/script_test.go
new file mode 100644
index 00000000000..f5ceace6f44
--- /dev/null
+++ b/go/libraries/doltcore/sqle/statspro/script_test.go
@@ -0,0 +1,532 @@
+// Copyright 2025 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statspro
+
+import (
+	"testing"
+
+	"github.com/dolthub/go-mysql-server/sql"
+	"github.com/stretchr/testify/require"
+)
+
+type scriptTest struct {
+	name       string
+	setup      []string
+	assertions []assertion
+}
+
+type assertion struct {
+	query string
+	res   []sql.Row
+}
+
+func TestStatScripts(t *testing.T) {
+	threads := sql.NewBackgroundThreads()
+	defer threads.Shutdown()
+
+	scripts := []scriptTest{
+		{
+			name: "track updates",
+			setup: []string{
+				"create table xy (x int primary key, y varchar(16), key (y,x))",
+				"insert into xy values (0,'zero'), (1, 'one')",
+			},
+			assertions: []assertion{
+				{
+					query: "select database_name, table_name, index_name from dolt_statistics order by index_name",
+					res:   []sql.Row{{"mydb", "xy", "primary"}, {"mydb", "xy", "y"}},
+				},
+				{
+					query: "insert into xy select x, 1 from (with 
recursive inputs(x) as (select 4 union select x+1 from inputs where x < 1000) select * from inputs) dt;", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(9)}}, + }, + { + query: "update xy set y = 2 where x between 100 and 800", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(9)}}, + }, + }, + }, + { + name: "track deletes", + setup: []string{ + "create table xy (x int primary key, y varchar(16), key (y,x))", + "insert into xy values (0,'zero'), (1, 'one')", + }, + assertions: []assertion{ + { + query: "select database_name, table_name, index_name from dolt_statistics order by index_name", + res: []sql.Row{{"mydb", "xy", "primary"}, {"mydb", "xy", "y"}}, + }, + { + query: "insert into xy select x, 1 from (with recursive inputs(x) as (select 4 union select x+1 from inputs where x < 1000) select * from inputs) dt;", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(9)}}, + }, + { + query: "delete from xy where x > 600", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(4)}}, + }, + }, + }, + { + name: "ddl table", + setup: []string{ + "create table xy (x int primary key, y varchar(16), key (y,x))", + "insert into xy values (0,'0'), (1,'0'), (2,'0')", + }, + assertions: []assertion{ + { + query: "select database_name, table_name, index_name from dolt_statistics order by index_name", + res: []sql.Row{{"mydb", "xy", "primary"}, {"mydb", "xy", "y"}}, + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(2)}}, + }, + { + query: "truncate table xy", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(0)}}, + }, + { + query: "insert into xy values (0,'0'), (1,'0'), (2,'0')", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(2)}}, + }, + { + query: "drop table xy", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(0)}}, + }, + }, + }, + { + name: "ddl index", + setup: []string{ + "create table xy (x int primary key, y varchar(16), key (y,x))", + "insert into xy values (0,'0'), (1,'0'), (2,'0')", + }, + assertions: []assertion{ + { + query: "select database_name, table_name, index_name from dolt_statistics order by index_name", + res: []sql.Row{{"mydb", "xy", "primary"}, {"mydb", "xy", "y"}}, + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(2)}}, + }, + { + query: "alter table xy drop index y", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(1)}}, + }, + { + query: "alter table xy add index yx (y,x)", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(2)}}, + }, + { + query: "select types, upper_bound from dolt_statistics where index_name = 'yx'", + res: []sql.Row{{"varchar(16),int", "0,2"}}, + }, + { + query: "alter table xy modify column y int", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select types, upper_bound from dolt_statistics where index_name = 'yx'", + res: []sql.Row{{"int,int", "0,2"}}, + }, + { + query: "select count(*) from 
dolt_statistics", + res: []sql.Row{{int64(2)}}, + }, + }, + }, + { + name: "mcv counts", + setup: []string{ + "create table xy (x int primary key, y int, key (y,x))", + "alter table xy add index y2 (y)", + "alter table xy add index x2 (x,y)", + "insert into xy values (0,0), (1,0), (2,0), (3,0), (4,0), (5,0), (6,1), (7,1), (8,1), (9,1),(10,3),(11,4),(12,5),(13,6),(14,7),(15,8),(16,9),(17,10),(18,11)", + }, + assertions: []assertion{ + { + query: "select mcv1, mcv2, mcv_counts from dolt_statistics where index_name = 'y2'", + res: []sql.Row{{"1", "0", "4,6"}}, + }, + { + query: "select mcv_counts from dolt_statistics where index_name = 'y'", + res: []sql.Row{{""}}, + }, + { + query: "select mcv_counts from dolt_statistics where index_name = 'x2'", + res: []sql.Row{{""}}, + }, + }, + }, + { + name: "caps testing", + setup: []string{ + "create table XY (x int primary key, Y int, key Yx (Y,x))", + "alter table xy add index y2 (y)", + "insert into xy values (0,0), (1,0), (2,0)", + }, + assertions: []assertion{ + { + query: "select database_name, table_name, index_name from dolt_statistics order by index_name", + res: []sql.Row{{"mydb", "xy", "primary"}, {"mydb", "xy", "y2"}, {"mydb", "xy", "yx"}}, + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(3)}}, + }, + { + query: "insert into xy select x, 1 from (with recursive inputs(x) as (select 4 union select x+1 from inputs where x < 1000) select * from inputs) dt;", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(12)}}, + }, + { + query: "delete from xy where x > 500", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(6)}}, + }, + }, + }, + { + name: "database ddl", + setup: []string{ + "create table mydb.xy (x int primary key, y int, key (y,x))", + "insert into xy values (0,0), (1,0), (2,0)", + "create database repo2", + "create table repo2.xy (x int primary key, y int, key (y,x))", + "insert into repo2.xy values (0,0), (1,0), (2,0)", + "create table repo2.ab (a int primary key, b int, key (b,a))", + "insert into repo2.ab values (0,0), (1,0), (2,0)", + }, + assertions: []assertion{ + { + query: "select database_name, table_name, index_name from dolt_statistics order by index_name", + res: []sql.Row{ + {"mydb", "xy", "primary"}, {"mydb", "xy", "y"}, + }, + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(2)}}, + }, + { + query: "select database_name, table_name, index_name from repo2.dolt_statistics order by index_name", + res: []sql.Row{ + {"repo2", "ab", "b"}, {"repo2", "ab", "primary"}, + {"repo2", "xy", "primary"}, {"repo2", "xy", "y"}, + }, + }, + { + query: "use repo2", + }, + { + query: "select database_name, table_name, index_name from dolt_statistics order by index_name", + res: []sql.Row{ + {"repo2", "ab", "b"}, {"repo2", "ab", "primary"}, + {"repo2", "xy", "primary"}, {"repo2", "xy", "y"}, + }, + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(4)}}, + }, + { + query: "insert into repo2.xy select x, 1 from (with recursive inputs(x) as (select 4 union select x+1 from inputs where x < 1000) select * from inputs) dt;", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(10)}}, + }, + { + query: "drop database repo2", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "use mydb", + }, + { + query: "select count(*) 
from dolt_statistics", + res: []sql.Row{{int64(2)}}, + }, + }, + }, + { + name: "recreate table without index", + setup: []string{ + "create table xy (x int primary key, y int, key (y,x))", + "insert into xy values (0,0), (1,0), (2,0)", + }, + assertions: []assertion{ + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(2)}}, + }, + { + query: "drop table xy", + }, + { + query: "create table xy (x int primary key, y int)", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "select count(*) from dolt_statistics", + res: []sql.Row{{int64(1)}}, + }, + }, + }, + { + name: "stats info", + setup: []string{ + "create table xy (x int primary key, y int, key (y,x))", + "insert into xy values (0,0), (1,0), (2,0)", + "call dolt_add('-A')", + "call dolt_commit('-m', 'create xy')", + "call dolt_checkout('-b', 'feat')", + "call dolt_checkout('main')", + }, + assertions: []assertion{ + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":2,"readCnt":0,"active":true,"dbSeedCnt":2,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":2,"gcCounter":1,"branchCounter":1}`}}, + }, + { + query: "call dolt_checkout('feat')", + }, + { + query: "drop table xy", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "call dolt_stats_gc()", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "call dolt_stats_gc()", + }, + { + query: "call dolt_stats_sync()", + }, + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":2,"readCnt":0,"active":true,"dbSeedCnt":2,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":1,"gcCounter":3,"branchCounter":2}`}}, + }, + { + query: "call dolt_checkout('main')", + }, + { + query: "call dolt_branch('-D', 'feat')", + }, + { + query: "call dolt_stats_sync()", + }, + { + query: "call dolt_stats_gc()", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":1,"readCnt":0,"active":true,"dbSeedCnt":1,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":1,"gcCounter":4,"branchCounter":3}`}}, + }, + }, + }, + { + name: "stats stop/start", + setup: []string{ + "create table xy (x int primary key, y int, key (y,x))", + "insert into xy values (0,0), (1,0), (2,0)", + "call dolt_add('-A')", + "call dolt_commit('-m', 'create xy')", + "call dolt_checkout('-b', 'feat')", + "call dolt_checkout('main')", + }, + assertions: []assertion{ + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":2,"readCnt":0,"active":true,"dbSeedCnt":2,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":2,"gcCounter":1,"branchCounter":1}`}}, + }, + { + query: "call dolt_stats_stop()", + }, + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":2,"readCnt":0,"active":false,"dbSeedCnt":0,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":2,"gcCounter":1,"branchCounter":1}`}}, + }, + { + query: "call dolt_stats_restart()", + }, + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":2,"readCnt":0,"active":true,"dbSeedCnt":2,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":2,"gcCounter":1,"branchCounter":1}`}}, + }, + }, + }, + { + name: "stats purge", + setup: []string{ + "create table xy (x int primary key, y int, key (y,x))", + "insert into xy values (0,0), (1,0), (2,0)", + "call dolt_add('-A')", + "call dolt_commit('-m', 'create xy')", + "call dolt_checkout('-b', 'feat')", + "call dolt_checkout('main')", + }, + assertions: []assertion{ + { + query: "call dolt_stats_info()", + res: 
[]sql.Row{{`{"dbCnt":2,"readCnt":0,"active":true,"dbSeedCnt":2,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":2,"gcCounter":1,"branchCounter":1}`}}, + }, + { + query: "call dolt_stats_purge()", + }, + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":2,"readCnt":0,"active":false,"dbSeedCnt":2,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":2,"gcCounter":1,"branchCounter":1}`}}, + }, + { + query: "call dolt_stats_restart()", + }, + { + query: "call dolt_stats_wait()", + }, + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":2,"readCnt":0,"active":true,"dbSeedCnt":2,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":2,"gcCounter":1,"branchCounter":1}`}}, + }, + }, + }, + { + name: "stats validate", + setup: []string{ + "create table xy (x int primary key, y int, key (y,x))", + "insert into xy values (0,0), (1,0), (2,0)", + "call dolt_add('-A')", + "call dolt_commit('-m', 'create xy')", + "call dolt_checkout('-b', 'feat')", + "call dolt_checkout('main')", + }, + assertions: []assertion{ + { + query: "call dolt_stats_info()", + res: []sql.Row{{`{"dbCnt":2,"readCnt":0,"active":true,"dbSeedCnt":2,"estBucketCnt":2,"cachedBucketCnt":2,"statCnt":2,"gcCounter":1,"branchCounter":1}`}}, + }, + { + query: "call dolt_stats_stop()", + }, + { + query: "create table ab (a int primary key, b int)", + }, + { + query: "insert into ab values (0,0), (1,1), (2,2)", + }, + { + query: "call dolt_stats_validate()", + res: []sql.Row{{"(mydb/main) missing template (PRIMARY/e29in)\n(mydb/main) missing bound (d9aov)\n(mydb/main) missing chunk (d9aov)\n"}}, + }, + }, + }, + } + + for _, tt := range scripts { + t.Run(tt.name, func(t *testing.T) { + ctx, sqlEng, sc, _ := emptySetup(t, threads, false) + sc.SetEnableGc(true) + + require.NoError(t, sc.Restart(ctx)) + + sc.Debug = true + + for _, s := range tt.setup { + require.NoError(t, executeQuery(ctx, sqlEng, s)) + } + + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_sync()")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_wait()")) + require.NoError(t, executeQuery(ctx, sqlEng, "call dolt_stats_gc()")) + + for _, a := range tt.assertions { + rows, err := executeQueryResults(ctx, sqlEng, a.query) + require.NoError(t, err) + if a.res != nil { + require.Equal(t, a.res, rows) + } + } + }) + } +} diff --git a/go/libraries/doltcore/sqle/statspro/seed_job.go b/go/libraries/doltcore/sqle/statspro/seed_job.go new file mode 100644 index 00000000000..fab444c936d --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/seed_job.go @@ -0,0 +1,382 @@ +// Copyright 2023 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
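+
+// seedDbTables drives incremental stats collection for one database: it
+// diffs the database's current table list against the last-seen snapshot
+// with a two-pointer merge, emits read jobs for new or changed indexes and
+// drop jobs for removed tables, then re-enqueues itself so seeding repeats.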
+
+package statspro
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
+	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
+	"github.com/dolthub/dolt/go/store/hash"
+	"github.com/dolthub/dolt/go/store/prolly"
+	"github.com/dolthub/dolt/go/store/prolly/tree"
+	"github.com/dolthub/dolt/go/store/val"
+	"github.com/dolthub/go-mysql-server/sql"
+	"github.com/dolthub/go-mysql-server/sql/stats"
+	"strings"
+)
+
+func (sc *StatsCoord) seedDbTables(ctx context.Context, j SeedDbTablesJob) (ret []StatsJob, err error) {
+	// get list of tables, get list of indexes, partition index ranges into ordinal blocks
+	// return list of IO jobs for table/index/ordinal blocks
+	defer func() {
+		if errors.Is(err, doltdb.ErrWorkingSetNotFound) {
+			err = nil
+			ret = []StatsJob{NewSeedJob(j.sqlDb)}
+		} else if err != nil {
+			sc.seedCnt.Add(-1)
+		}
+	}()
+
+	sqlCtx, err := sc.ctxGen(ctx)
+	if err != nil {
+		return nil, err
+	}
+	dSess := dsess.DSessFromSess(sqlCtx.Session)
+	db, err := dSess.Provider().Database(sqlCtx, j.sqlDb.AliasedName())
+	if err != nil {
+		return nil, err
+	}
+	sqlDb, err := sqle.RevisionDbForBranch(sqlCtx, db.(dsess.SqlDatabase), j.sqlDb.Revision(), j.sqlDb.Revision()+"/"+j.sqlDb.AliasedName())
+	if err != nil {
+		return nil, err
+	}
+	tableNames, err := sqlDb.GetTableNames(sqlCtx)
+	if err != nil {
+		return nil, err
+	}
+
+	var newTableInfo []tableStatsInfo
+	var bucketDiff int
+
+	i := 0
+	k := 0
+	for i < len(tableNames) && k < len(j.tables) {
+		var jobs []StatsJob
+		var ti tableStatsInfo
+		switch strings.Compare(tableNames[i], j.tables[k].name) {
+		case 0:
+			// table in both snapshots; re-read and count only the bucket delta
+			jobs, ti, err = sc.readJobsForTable(sqlCtx, sqlDb, j.tables[k])
+			bucketDiff += ti.bucketCount - j.tables[k].bucketCount
+			i++
+			k++
+		case -1:
+			// new table
+			jobs, ti, err = sc.readJobsForTable(sqlCtx, sqlDb, tableStatsInfo{name: tableNames[i]})
+			bucketDiff += ti.bucketCount
+			i++
+		case +1:
+			// dropped table
+			jobs = append(jobs, sc.dropTableJob(sqlDb, j.tables[k].name))
+			bucketDiff -= j.tables[k].bucketCount
+			k++
+		}
+		if err != nil {
+			return nil, err
+		}
+		if ti.name != "" {
+			newTableInfo = append(newTableInfo, ti)
+		}
+		ret = append(ret, jobs...)
+	}
+	for i < len(tableNames) {
+		jobs, ti, err := sc.readJobsForTable(sqlCtx, sqlDb, tableStatsInfo{name: tableNames[i]})
+		if err != nil {
+			return nil, err
+		}
+		bucketDiff += ti.bucketCount
+		newTableInfo = append(newTableInfo, ti)
+		ret = append(ret, jobs...)
+ i++ + } + + for k < len(j.tables) { + ret = append(ret, sc.dropTableJob(sqlDb, j.tables[k].name)) + bucketDiff -= j.tables[k].bucketCount + k++ + } + + sc.bucketCnt.Add(int64(bucketDiff)) + + for sc.bucketCnt.Load() > sc.bucketCap { + sc.bucketCap *= 2 + sc.doGc.Store(true) + } + + // retry again after finishing planned work + ret = append(ret, SeedDbTablesJob{tables: newTableInfo, sqlDb: sqlDb, done: make(chan struct{})}) + return ret, nil +} + +// GetLatestTable will get the WORKING root table for the current database/branch +func GetLatestTable(ctx *sql.Context, tableName string, sqlDb sql.Database) (*sqle.DoltTable, *doltdb.Table, error) { + var db sqle.Database + switch d := sqlDb.(type) { + case sqle.Database: + db = d + case sqle.ReadReplicaDatabase: + db = d.Database + default: + return nil, nil, fmt.Errorf("expected sqle.Database, found %T", sqlDb) + } + sqlTable, ok, err := db.GetTableInsensitive(ctx, tableName) + if err != nil { + return nil, nil, err + } + if !ok { + return nil, nil, fmt.Errorf("statistics refresh error: table not found %s", tableName) + } + + var dTab *doltdb.Table + var sqleTable *sqle.DoltTable + switch t := sqlTable.(type) { + case *sqle.AlterableDoltTable: + sqleTable = t.DoltTable + dTab, err = t.DoltTable.DoltTable(ctx) + case *sqle.WritableDoltTable: + sqleTable = t.DoltTable + dTab, err = t.DoltTable.DoltTable(ctx) + case *sqle.DoltTable: + sqleTable = t + dTab, err = t.DoltTable(ctx) + default: + err = fmt.Errorf("failed to unwrap dolt table from type: %T", sqlTable) + } + if err != nil { + return nil, nil, err + } + return sqleTable, dTab, nil +} + +func (sc *StatsCoord) readJobsForTable(ctx *sql.Context, sqlDb dsess.SqlDatabase, tableInfo tableStatsInfo) ([]StatsJob, tableStatsInfo, error) { + var ret []StatsJob + var bucketCnt int + sqlTable, dTab, err := GetLatestTable(ctx, tableInfo.name, sqlDb) + if err != nil { + return nil, tableStatsInfo{}, err + } + indexes, err := sqlTable.GetIndexes(ctx) + if err != nil { + return nil, tableStatsInfo{}, err + } + + schHashKey, _, err := sqlTable.IndexCacheKey(ctx) + if err != nil { + return nil, tableStatsInfo{}, err + } + + schemaChanged := !tableInfo.schHash.Equal(schHashKey.Hash) + if !tableInfo.schHash.IsEmpty() && schemaChanged { + sc.setGc() + } + + var dataChanged bool + var isNewData bool + var newIdxRoots []hash.Hash + + keepIndexes := make(map[sql.StatQualifier]bool) + fullIndexBuckets := make(map[templateCacheKey]finalizeStruct) + for i, sqlIdx := range indexes { + var idx durable.Index + var err error + if strings.EqualFold(sqlIdx.ID(), "PRIMARY") { + idx, err = dTab.GetRowData(ctx) + } else { + idx, err = dTab.GetIndexRowData(ctx, sqlIdx.ID()) + } + if err != nil { + return nil, tableStatsInfo{}, err + } + + prollyMap := durable.ProllyMapFromIndex(idx) + + idxRoot := prollyMap.Node().HashOf() + newIdxRoots = append(newIdxRoots, idxRoot) + + levelNodes, err := tree.GetHistogramLevel(ctx, prollyMap.Tuples(), bucketLowCnt) + if err != nil { + return nil, tableStatsInfo{}, err + } + + bucketCnt += len(levelNodes) + + indexKey := templateCacheKey{h: schHashKey.Hash, idxName: sqlIdx.ID()} + + if i < len(tableInfo.idxRoots) && idxRoot.Equal(tableInfo.idxRoots[i]) && !schemaChanged { + qual := sql.StatQualifier{ + Tab: tableInfo.name, + Database: strings.ToLower(sqlDb.AliasedName()), + Idx: strings.ToLower(sqlIdx.ID()), + } + keepIndexes[qual] = true + continue + } + dataChanged = true + + var buckets []hash.Hash + for _, n := range levelNodes { + buckets = append(buckets, n.HashOf()) + } + 
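+			// record the level-node hashes and a tuple builder sized to the
+			// index prefix; the FinalizeJob appended at the end of this
+			// function uses them to assemble the index's histogram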
fullIndexBuckets[indexKey] = finalizeStruct{
+			buckets: buckets,
+			tupB:    val.NewTupleBuilder(prollyMap.KeyDesc().PrefixDesc(len(sqlIdx.Expressions()))),
+		}
+
+		key, template, err := sc.getTemplate(ctx, sqlTable, sqlIdx)
+		if err != nil {
+			sc.logger.Errorf("stats collection failed to generate a statistic template: %s.%s.%s (%T); %s", sqlDb.RevisionQualifiedName(), tableInfo.name, sqlIdx.ID(), sqlIdx, err)
+			continue
+		}
+
+		readJobs, err := sc.partitionStatReadJobs(ctx, sqlDb, tableInfo.name, key, template, levelNodes, prollyMap, len(sqlIdx.Expressions()))
+		if err != nil {
+			return nil, tableStatsInfo{}, err
+		}
+		ret = append(ret, readJobs...)
+		isNewData = isNewData || dataChanged
+	}
+	if len(ret) > 0 || isNewData || schemaChanged {
+		// if there are any reads to perform, we follow those reads with a table finalize
+		ret = append(ret, FinalizeJob{
+			sqlDb: sqlDb,
+			tableKey: tableIndexesKey{
+				db:     sqlDb.AliasedName(),
+				branch: sqlDb.Revision(),
+				table:  tableInfo.name,
+			},
+			keepIndexes: keepIndexes,
+			editIndexes: fullIndexBuckets,
+			done:        make(chan struct{}),
+		})
+	}
+
+	return ret, tableStatsInfo{name: tableInfo.name, schHash: schHashKey.Hash, idxRoots: newIdxRoots, bucketCount: bucketCnt}, nil
+}
+
+type updateOrdinal struct {
+	start, stop uint64
+}
+
+func (sc *StatsCoord) partitionStatReadJobs(ctx *sql.Context, sqlDb dsess.SqlDatabase, tableName string, key templateCacheKey, template stats.Statistic, levelNodes []tree.Node, prollyMap prolly.Map, idxCnt int) ([]StatsJob, error) {
+	if cnt, err := prollyMap.Count(); err != nil {
+		return nil, err
+	} else if cnt == 0 {
+		return nil, nil
+	}
+
+	curCnt := 0
+	jobSize := 100_000
+	var jobs []StatsJob
+	var batchOrdinals []updateOrdinal
+	var nodes []tree.Node
+	var offset uint64
+	for _, n := range levelNodes {
+		treeCnt, err := n.TreeCount()
+		if err != nil {
+			return nil, err
+		}
+		ord := updateOrdinal{
+			start: offset,
+			stop:  offset + uint64(treeCnt),
+		}
+		offset += uint64(treeCnt)
+
+		if _, ok, err := sc.kv.GetBucket(ctx, n.HashOf(), val.NewTupleBuilder(prollyMap.KeyDesc().PrefixDesc(idxCnt))); err != nil {
+			return nil, err
+		} else if ok {
+			// skip redundant work
+			continue
+		}
+
+		curCnt += treeCnt
+		batchOrdinals = append(batchOrdinals, ord)
+		nodes = append(nodes, n)
+
+		if curCnt > jobSize {
+			first := batchOrdinals[0].start == 0
+			jobs = append(jobs, ReadJob{ctx: ctx, db: sqlDb, first: first, table: tableName, key: key, template: template, m: prollyMap, nodes: nodes, ordinals: batchOrdinals, idxLen: idxCnt, done: make(chan struct{})})
+			curCnt = 0
+			// start fresh slices; the ReadJob above retains the old backing arrays
+			batchOrdinals = nil
+			nodes = nil
+		}
+	}
+	if curCnt > 0 {
+		first := batchOrdinals[0].start == 0
+		jobs = append(jobs, ReadJob{ctx: ctx, db: sqlDb, first: first, table: tableName, key: key, template: template, m: prollyMap, nodes: nodes, ordinals: batchOrdinals, idxLen: idxCnt, done: make(chan struct{})})
+	}
+
+	return jobs, nil
+}
+
+type templateCacheKey struct {
+	h       hash.Hash
+	idxName string
+}
+
+func (k templateCacheKey) String() string {
+	return k.idxName + "/" + k.h.String()[:5]
+}
+
+func (sc *StatsCoord) getTemplate(ctx *sql.Context, sqlTable *sqle.DoltTable, sqlIdx sql.Index) (templateCacheKey, stats.Statistic, error) {
+	schHash, _, err := sqlTable.IndexCacheKey(ctx)
+	if err != nil {
+		return templateCacheKey{}, stats.Statistic{}, err
+	}
+	key := templateCacheKey{h: schHash.Hash, idxName: sqlIdx.ID()}
+	if template, ok := sc.kv.GetTemplate(key); ok {
+		return key, template, nil
+	}
+	fds, colset, err := stats.IndexFds(strings.ToLower(sqlTable.Name()), sqlTable.Schema(), sqlIdx)
+	if err != nil {
+		return templateCacheKey{}, stats.Statistic{}, err
+	}
+
+	var class sql.IndexClass
+	switch {
+	case sqlIdx.IsSpatial():
+		class = sql.IndexClassSpatial
+	case sqlIdx.IsFullText():
+		class = sql.IndexClassFulltext
+	default:
+		class = sql.IndexClassDefault
+	}
+
+	var types []sql.Type
+	for _, cet := range sqlIdx.ColumnExpressionTypes() {
+		types = append(types, cet.Type)
+	}
+
+	// index expressions are lowercased below, so the prefix we strip must
+	// be lowercased as well
+	tablePrefix := strings.ToLower(sqlTable.Name()) + "."
+	cols := make([]string, len(sqlIdx.Expressions()))
+	for i, c := range sqlIdx.Expressions() {
+		cols[i] = strings.TrimPrefix(strings.ToLower(c), tablePrefix)
+	}
+
+	template := stats.Statistic{
+		Cols:     cols,
+		Typs:     types,
+		IdxClass: uint8(class),
+		Fds:      fds,
+		Colset:   colset,
+	}
+
+	// We put the template twice: once here, covering schema changes with
+	// no data changes, and again when chunks are written, so that GC
+	// cannot drop a template before its finalize job runs.
+	sc.kv.PutTemplate(key, template)
+
+	return key, template, nil
+}
diff --git a/go/libraries/doltcore/sqle/statspro/stats_kv.go b/go/libraries/doltcore/sqle/statspro/stats_kv.go
new file mode 100644
index 00000000000..87bddef7cb9
--- /dev/null
+++ b/go/libraries/doltcore/sqle/statspro/stats_kv.go
@@ -0,0 +1,551 @@
+// Copyright 2025 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
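+
+// stats_kv.go defines StatsKv, the storage interface for histogram buckets,
+// statistic templates, and index bounds. memStats is an LRU-backed,
+// in-memory implementation; prollyStats layers a durable prolly map behind
+// the same in-memory cache. Both implement a mark-and-sweep GC: StartGc
+// opens a "next" generation, reads and MarkBucket calls copy live entries
+// forward, and FinishGc swaps the generations.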
+
+package statspro
+
+import (
+	"context"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
+	"github.com/dolthub/dolt/go/store/hash"
+	"github.com/dolthub/dolt/go/store/prolly"
+	"github.com/dolthub/dolt/go/store/prolly/tree"
+	"github.com/dolthub/dolt/go/store/val"
+	"github.com/dolthub/go-mysql-server/sql"
+	"github.com/dolthub/go-mysql-server/sql/stats"
+	"github.com/dolthub/go-mysql-server/sql/types"
+	lru "github.com/hashicorp/golang-lru/v2"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+)
+
+var ErrIncompatibleVersion = errors.New("client stats version mismatch")
+
+const defaultBucketSize = 1024 // must be > 0 to avoid panic
+
+type StatsKv interface {
+	PutBucket(ctx context.Context, h hash.Hash, b *stats.Bucket, tupB *val.TupleBuilder) error
+	GetBucket(ctx context.Context, h hash.Hash, tupB *val.TupleBuilder) (*stats.Bucket, bool, error)
+	GetTemplate(key templateCacheKey) (stats.Statistic, bool)
+	PutTemplate(key templateCacheKey, stat stats.Statistic)
+	GetBound(h hash.Hash, len int) (sql.Row, bool)
+	PutBound(h hash.Hash, r sql.Row)
+	Flush(ctx context.Context) error
+	StartGc(ctx context.Context, sz int) error
+	MarkBucket(ctx context.Context, h hash.Hash, tupB *val.TupleBuilder) error
+	FinishGc()
+	Len() int
+	Cap() int64
+}
+
+var _ StatsKv = (*prollyStats)(nil)
+var _ StatsKv = (*memStats)(nil)
+
+func NewMemStats() *memStats {
+	buckets, _ := lru.New[bucketKey, *stats.Bucket](defaultBucketSize)
+	m := &memStats{
+		mu:        sync.Mutex{},
+		buckets:   buckets,
+		templates: make(map[templateCacheKey]stats.Statistic),
+		bounds:    make(map[bucketKey]sql.Row),
+	}
+	// atomic.Int64 must not be copied, so initialize the field in place
+	m.gcCap.Store(defaultBucketSize)
+	return m
+}
+
+type memStats struct {
+	mu    sync.Mutex
+	doGc  bool
+	gcCap atomic.Int64
+
+	buckets     *lru.Cache[bucketKey, *stats.Bucket]
+	nextBuckets *lru.Cache[bucketKey, *stats.Bucket]
+
+	templates     map[templateCacheKey]stats.Statistic
+	nextTemplates map[templateCacheKey]stats.Statistic
+
+	bounds     map[bucketKey]sql.Row
+	nextBounds map[bucketKey]sql.Row
+}
+
+func (m *memStats) GetTemplate(key templateCacheKey) (stats.Statistic, bool) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	t, ok := m.templates[key]
+	if !ok {
+		return stats.Statistic{}, false
+	}
+	if m.doGc {
+		m.nextTemplates[key] = t
+	}
+	return t, true
+}
+
+func (m *memStats) PutTemplate(key templateCacheKey, stat stats.Statistic) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.templates[key] = stat
+	if m.doGc {
+		m.nextTemplates[key] = stat
+	}
+}
+
+type bucketKey [22]byte
+
+func getBucketKey(h hash.Hash, l int) bucketKey {
+	var k bucketKey
+	copy(k[:hash.ByteLen], h[:])
+	binary.BigEndian.PutUint16(k[hash.ByteLen:], uint16(l))
+	return k
+}
+
+func (m *memStats) GetBound(h hash.Hash, l int) (sql.Row, bool) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	k := getBucketKey(h, l)
+	r, ok := m.bounds[k]
+	if !ok {
+		return nil, false
+	}
+	if m.doGc {
+		m.nextBounds[k] = r
+	}
+	return r, true
+}
+
+func (m *memStats) PutBound(h hash.Hash, r sql.Row) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	k := getBucketKey(h, len(r))
+	m.bounds[k] = r
+	if m.doGc {
+		m.nextBounds[k] = r
+	}
+}
+
+func (m *memStats) StartGc(ctx context.Context, sz int) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.doGc = true
+	if sz == 0 {
+		sz = m.buckets.Len() * 2
+	}
+	if sz == 0 {
+		// empty cache; fall back to the default so lru.New cannot fail
+		sz = defaultBucketSize
+	}
+	m.gcCap.Store(int64(sz))
+	var err error
+	m.nextBuckets, err = lru.New[bucketKey, *stats.Bucket](sz)
+	if err != nil {
+		return err
+	}
+	m.nextBounds = make(map[bucketKey]sql.Row)
+	m.nextTemplates = make(map[templateCacheKey]stats.Statistic)
+	return nil
+}
+
+func (m *memStats) FinishGc() {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.buckets = m.nextBuckets
+	m.templates = m.nextTemplates
+	m.bounds = m.nextBounds
+	m.nextBuckets = nil
+	m.nextTemplates = nil
+	m.nextBounds = nil
+	m.doGc = false
+}
+
+func (m *memStats) Len() int {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return m.buckets.Len()
+}
+
+func (m *memStats) Cap() int64 {
+	return m.gcCap.Load()
+}
+
+func (m *memStats) PutBucket(_ context.Context, h hash.Hash, b *stats.Bucket, _ *val.TupleBuilder) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	k := getBucketKey(h, len(b.BoundVal))
+	m.buckets.Add(k, b)
+	return nil
+}
+
+func (m *memStats) MarkBucket(_ context.Context, h hash.Hash, tupB *val.TupleBuilder) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	k := getBucketKey(h, tupB.Desc.Count())
+	b, ok := m.buckets.Get(k)
+	if ok {
+		m.nextBuckets.Add(k, b)
+		gcCap := int(m.gcCap.Load())
+		if m.nextBuckets.Len() >= gcCap {
+			// the next generation outgrew the planned capacity; double it
+			m.gcCap.Store(int64(gcCap) * 2)
+			m.nextBuckets.Resize(gcCap * 2)
+		}
+	}
+	return nil
+}
+
+func (m *memStats) GetBucket(_ context.Context, h hash.Hash, tupB *val.TupleBuilder) (*stats.Bucket, bool, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	if h.IsEmpty() {
+		return nil, false, nil
+	}
+	k := getBucketKey(h, tupB.Desc.Count())
+	b, ok := m.buckets.Get(k)
+	return b, ok, nil
+}
+
+func (m *memStats) Flush(_ context.Context) error {
+	return nil
+}
+
+func NewProllyStats(ctx context.Context, destDb dsess.SqlDatabase) (*prollyStats, error) {
+	sch := schema.StatsTableDoltSchema
+	kd, vd := sch.GetMapDescriptors()
+
+	keyBuilder := val.NewTupleBuilder(kd)
+	valueBuilder := val.NewTupleBuilder(vd)
+	newMap, err := prolly.NewMapFromTuples(ctx, destDb.DbData().Ddb.NodeStore(), kd, vd)
+	if err != nil {
+		return nil, err
+	}
+
+	return &prollyStats{
+		mu:     sync.Mutex{},
+		destDb: destDb,
+		kb:     keyBuilder,
+		vb:     valueBuilder,
+		m:      newMap.Mutate(),
+		mem:    NewMemStats(),
+	}, nil
+}
+
+type prollyStats struct {
+	mu     sync.Mutex
+	destDb dsess.SqlDatabase
+	kb, vb *val.TupleBuilder
+	m      *prolly.MutableMap
+	newM   *prolly.MutableMap
+	mem    *memStats
+}
+
+func (p *prollyStats) Len() int {
+	return p.mem.Len()
+}
+
+func (p *prollyStats) Cap() int64 {
+	return p.mem.Cap()
+}
+
+func (p *prollyStats) GetTemplate(key templateCacheKey) (stats.Statistic, bool) {
+	return p.mem.GetTemplate(key)
+}
+
+func (p *prollyStats) PutTemplate(key templateCacheKey, stat stats.Statistic) {
+	p.mem.PutTemplate(key, stat)
+}
+
+func (p *prollyStats) GetBound(h hash.Hash, l int) (sql.Row, bool) {
+	return p.mem.GetBound(h, l)
+}
+
+func (p *prollyStats) PutBound(h hash.Hash, r sql.Row) {
+	p.mem.PutBound(h, r)
+}
+
+func (p *prollyStats) PutBucket(ctx context.Context, h hash.Hash, b *stats.Bucket, tupB *val.TupleBuilder) error {
+	if err := p.mem.PutBucket(ctx, h, b, tupB); err != nil {
+		return err
+	}
+
+	k, err := p.encodeHash(h, tupB.Desc.Count())
+	if err != nil {
+		return err
+	}
+	v, err := p.encodeBucket(ctx, b, tupB)
+	if err != nil {
+		return err
+	}
+
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	return p.m.Put(ctx, k, v)
+}
+
+func (p *prollyStats) GetBucket(ctx context.Context, h hash.Hash, tupB *val.TupleBuilder) (*stats.Bucket, bool, error) {
+	if h.IsEmpty() {
+		return nil, false, nil
+	}
+	b, ok, err := p.mem.GetBucket(ctx, h, tupB)
+	if err != nil {
+		return nil, false, err
+	}
+	if ok {
+		return b, true, nil
+	}
+
+	
// missing bucket and not GC'ing, try disk + k, err := p.encodeHash(h, tupB.Desc.Count()) + if err != nil { + return nil, false, err + } + + var v val.Tuple + err = p.m.Get(ctx, k, func(key val.Tuple, value val.Tuple) error { + if key != nil { + ok = true + v = value + } + return nil + }) + if !ok || err != nil { + return nil, false, err + } + + if tupB == nil { + // still function if treating like memStats + return nil, true, nil + } + + b, err = p.decodeBucketTuple(ctx, v, tupB) + if err != nil { + return nil, false, err + } + + p.mem.PutBucket(ctx, h, b, tupB) + return b, true, nil +} + +func (p *prollyStats) StartGc(ctx context.Context, sz int) error { + p.mu.Lock() + defer p.mu.Unlock() + if err := p.mem.StartGc(ctx, sz); err != nil { + return err + } + kd, vd := schema.StatsTableDoltSchema.GetMapDescriptors() + newMap, err := prolly.NewMapFromTuples(ctx, p.destDb.DbData().Ddb.NodeStore(), kd, vd) + if err != nil { + return err + } + p.newM = newMap.Mutate() + + return nil +} + +func (p *prollyStats) MarkBucket(ctx context.Context, h hash.Hash, tupB *val.TupleBuilder) error { + p.mem.MarkBucket(ctx, h, tupB) + + // try disk + k, err := p.encodeHash(h, tupB.Desc.Count()) + if err != nil { + return err + } + + p.mu.Lock() + defer p.mu.Unlock() + + var v val.Tuple + var ok bool + err = p.m.Get(ctx, k, func(key val.Tuple, value val.Tuple) error { + if key != nil { + ok = true + v = value + } + return nil + }) + if err != nil { + return err + } + if !ok { + return nil + } + + return p.newM.Put(ctx, k, v) +} + +func (p *prollyStats) FinishGc() { + p.mu.Lock() + defer p.mu.Unlock() + p.mem.FinishGc() + p.m = p.newM + p.newM = nil +} + +func (p *prollyStats) encodeHash(h hash.Hash, len int) (val.Tuple, error) { + p.mu.Lock() + defer p.mu.Unlock() + p.kb.PutInt64(0, int64(len)) + if err := p.kb.PutString(1, h.String()); err != nil { + return nil, err + } + return p.kb.Build(p.m.NodeStore().Pool()), nil +} + +func (p *prollyStats) decodeHashTuple(v val.Tuple) (int, hash.Hash, error) { + l, ok := p.kb.Desc.GetInt64(0, v) + hStr, ok := p.kb.Desc.GetString(1, v) + if !ok { + return 0, hash.Hash{}, fmt.Errorf("unexpected null hash") + } + return int(l), hash.Parse(hStr), nil +} + +func (p *prollyStats) decodeBucketTuple(ctx context.Context, v val.Tuple, tupB *val.TupleBuilder) (*stats.Bucket, error) { + var row []interface{} + for i := 0; i < p.vb.Desc.Count(); i++ { + f, err := tree.GetField(ctx, p.vb.Desc, i, v, p.m.NodeStore()) + if err != nil { + return nil, err + } + row = append(row, f) + } + + version := row[0] + if version != schema.StatsVersion { + return nil, fmt.Errorf("%w: write version %d does not match read version %d", ErrIncompatibleVersion, version, schema.StatsVersion) + } + rowCount := row[1].(int64) + distinctCount := row[2].(int64) + nullCount := row[3].(int64) + boundRowStr := row[4].(string) + upperBoundCnt := row[5].(int64) + mcvCountsStr := row[10].(string) + + boundRow, err := DecodeRow(ctx, p.m.NodeStore(), boundRowStr, tupB) + if err != nil { + return nil, err + } + + var mcvCnts []uint64 + if len(mcvCountsStr) > 0 { + for _, c := range strings.Split(mcvCountsStr, ",") { + cnt, err := strconv.ParseInt(c, 10, 64) + if err != nil { + return nil, err + } + mcvCnts = append(mcvCnts, uint64(cnt)) + } + } + + mcvs := make([]sql.Row, 4) + for i, v := range row[6:10] { + if v != nil && v != "" { + row, err := DecodeRow(ctx, p.m.NodeStore(), v.(string), tupB) + if err != nil { + return nil, err + } + mcvs[i] = row + } + } + + return &stats.Bucket{ + RowCnt: uint64(rowCount), + 
DistinctCnt: uint64(distinctCount),
+		NullCnt:     uint64(nullCount),
+		McvsCnt:     mcvCnts,
+		BoundCnt:    uint64(upperBoundCnt),
+		BoundVal:    boundRow,
+		McvVals:     mcvs,
+	}, nil
+}
+
+var mcvTypes = []sql.Type{types.Int16, types.Int16, types.Int16, types.Int16}
+
+func (p *prollyStats) encodeBucket(ctx context.Context, b *stats.Bucket, tupB *val.TupleBuilder) (val.Tuple, error) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	p.vb.PutInt64(0, schema.StatsVersion)
+	p.vb.PutInt64(1, int64(b.RowCount()))
+	p.vb.PutInt64(2, int64(b.DistinctCount()))
+	p.vb.PutInt64(3, int64(b.NullCount()))
+	boundRow, err := EncodeRow(ctx, p.m.NodeStore(), b.UpperBound(), tupB)
+	if err != nil {
+		return nil, err
+	}
+	p.vb.PutString(4, string(boundRow))
+	p.vb.PutInt64(5, int64(b.BoundCount()))
+	for i, r := range b.Mcvs() {
+		mcvRow, err := EncodeRow(ctx, p.m.NodeStore(), r, tupB)
+		if err != nil {
+			return nil, err
+		}
+		p.vb.PutString(6+i, string(mcvRow))
+	}
+	var mcvCntsRow sql.Row
+	for _, v := range b.McvCounts() {
+		mcvCntsRow = append(mcvCntsRow, int(v))
+	}
+	p.vb.PutString(10, stats.StringifyKey(mcvCntsRow, mcvTypes[:len(mcvCntsRow)]))
+
+	return p.vb.Build(p.m.NodeStore().Pool()), nil
+}
+
+func (p *prollyStats) Flush(ctx context.Context) error {
+	flushedMap, err := p.m.Map(ctx)
+	if err != nil {
+		return err
+	}
+	return p.destDb.DbData().Ddb.SetStatisics(ctx, "main", flushedMap.HashOf())
+}
+
+func (p *prollyStats) NewEmpty(ctx *sql.Context) (StatsKv, error) {
+	kd, vd := schema.StatsTableDoltSchema.GetMapDescriptors()
+	newMap, err := prolly.NewMapFromTuples(ctx, p.destDb.DbData().Ddb.NodeStore(), kd, vd)
+	if err != nil {
+		return nil, err
+	}
+	m := newMap.Mutate()
+	// include a fresh memory cache; Len/Cap and the template, bound, and
+	// bucket accessors all dereference |mem|
+	return &prollyStats{m: m, destDb: p.destDb, kb: p.kb, vb: p.vb, mem: NewMemStats()}, nil
+}
+
+func EncodeRow(ctx context.Context, ns tree.NodeStore, r sql.Row, tb *val.TupleBuilder) ([]byte, error) {
+	for i := range tb.Desc.Count() {
+		v := r[i]
+		if v == nil {
+			continue
+		}
+		if err := tree.PutField(ctx, ns, tb, i, v); err != nil {
+			return nil, err
+		}
+	}
+	return tb.Build(ns.Pool()), nil
+}
+
+func DecodeRow(ctx context.Context, ns tree.NodeStore, s string, tb *val.TupleBuilder) (sql.Row, error) {
+	tup := []byte(s)
+	r := make(sql.Row, tb.Desc.Count())
+	var err error
+	for i := range r {
+		r[i], err = tree.GetField(ctx, tb.Desc, i, tup, ns)
+		if err != nil {
+			return nil, err
+		}
+	}
+	return r, nil
+}
diff --git a/go/libraries/doltcore/sqle/statspro/stats_kv_test.go b/go/libraries/doltcore/sqle/statspro/stats_kv_test.go
new file mode 100644
index 00000000000..7c44f7f5cb8
--- /dev/null
+++ b/go/libraries/doltcore/sqle/statspro/stats_kv_test.go
@@ -0,0 +1,215 @@
+// Copyright 2025 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
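+
+// Exercises the prollyStats KV directly: bound, template, and bucket
+// round-trips, disk fallback after cache eviction, and mark-and-sweep GC,
+// including capacity growth when the next generation overflows.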
+ +package statspro + +import ( + "context" + "github.com/dolthub/dolt/go/libraries/doltcore/dtestutils" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" + "github.com/dolthub/dolt/go/store/hash" + "github.com/dolthub/dolt/go/store/val" + "github.com/dolthub/go-mysql-server/sql" + "github.com/dolthub/go-mysql-server/sql/stats" + "github.com/stretchr/testify/require" + "strconv" + "strings" + "testing" +) + +func TestProllyKv(t *testing.T) { + threads := sql.NewBackgroundThreads() + prollyKv := newTestProllyKv(t, threads) + + h := hash.Parse(strings.Repeat("a", hash.StringLen)) + h2 := hash.Parse(strings.Repeat("b", hash.StringLen)) + k := getBucketKey(h, 2) + + tupB := val.NewTupleBuilder(val.NewTupleDescriptor( + val.Type{Enc: val.Int64Enc, Nullable: true}, + val.Type{Enc: val.StringEnc, Nullable: true}, + )) + + t.Run("test bounds", func(t *testing.T) { + exp := sql.Row{1, 1} + prollyKv.PutBound(h, exp) + cmp, ok := prollyKv.GetBound(h, 2) + require.True(t, ok) + require.Equal(t, exp, cmp) + + _, ok = prollyKv.GetBound(h2, 2) + require.False(t, ok) + }) + + t.Run("test templates", func(t *testing.T) { + exp := stats.Statistic{RowCnt: 50, Qual: sql.StatQualifier{Database: "mydb", Tab: "xy"}} + key := templateCacheKey{ + h: h, + idxName: "PRIMARY", + } + prollyKv.PutTemplate(key, exp) + cmp, ok := prollyKv.GetTemplate(key) + require.True(t, ok) + require.Equal(t, exp, cmp) + + key2 := templateCacheKey{ + h: h2, + idxName: "PRIMARY", + } + _, ok = prollyKv.GetTemplate(key2) + require.False(t, ok) + }) + + t.Run("test buckets", func(t *testing.T) { + exp := stats.NewHistogramBucket(15, 7, 3, 4, sql.Row{int64(1), "one"}, []uint64{5, 4, 3, 1}, []sql.Row{{int64(5), "six"}, {int64(4), "three"}, {int64(3), "seven"}, {int64(1), "one"}}).(*stats.Bucket) + err := prollyKv.PutBucket(context.Background(), h, exp, tupB) + require.NoError(t, err) + cmp, ok, err := prollyKv.GetBucket(context.Background(), h, tupB) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, exp, cmp) + + _, ok, err = prollyKv.GetBucket(context.Background(), h2, tupB) + require.NoError(t, err) + require.False(t, ok) + + // delete from memory, should pull from disk when |tupB| supplied + prollyKv.mem.buckets.Remove(k) + + cmp, ok, err = prollyKv.GetBucket(context.Background(), h, tupB) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, (*stats.Bucket)(nil), cmp) + + cmp, ok, err = prollyKv.GetBucket(context.Background(), h, tupB) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, exp.RowCnt, cmp.RowCnt) + require.Equal(t, exp.DistinctCnt, cmp.DistinctCnt) + require.Equal(t, exp.NullCnt, cmp.NullCnt) + require.Equal(t, exp.McvsCnt, cmp.McvsCnt) + require.Equal(t, exp.McvVals[0], cmp.McvVals[0]) + require.Equal(t, exp.McvVals[1], cmp.McvVals[1]) + require.Equal(t, exp.McvVals[2], cmp.McvVals[2]) + require.Equal(t, exp.McvVals[3], cmp.McvVals[3]) + require.Equal(t, exp.BoundVal, cmp.BoundVal) + require.Equal(t, exp.BoundCnt, cmp.BoundCnt) + }) + + t.Run("test bucket GC", func(t *testing.T) { + prollyKv.StartGc(context.Background(), 10) + + // if we delete from memory, no more fallback to disk + prollyKv.mem.buckets.Remove(k) + _, ok, err := prollyKv.GetBucket(context.Background(), h2, tupB) + require.NoError(t, err) + require.False(t, ok) + + exp := stats.NewHistogramBucket(15, 7, 3, 4, sql.Row{int64(1), "one"}, []uint64{5, 4, 3, 1}, []sql.Row{{int64(5), "six"}, {int64(4), "three"}, {int64(3), "seven"}, {int64(1), "one"}}).(*stats.Bucket) + err = 
prollyKv.PutBucket(context.Background(), h, exp, tupB) + require.NoError(t, err) + + exp2 := stats.NewHistogramBucket(10, 7, 3, 4, sql.Row{int64(1), "one"}, []uint64{5, 4, 3, 1}, []sql.Row{{int64(5), "six"}, {int64(4), "three"}, {int64(3), "seven"}, {int64(1), "one"}}).(*stats.Bucket) + err = prollyKv.PutBucket(context.Background(), h2, exp2, tupB) + require.NoError(t, err) + + prollyKv.FinishGc() + + prollyKv.StartGc(context.Background(), 10) + cmp2, ok, err := prollyKv.GetBucket(context.Background(), h2, tupB) + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, exp2.BoundCount(), cmp2.BoundCnt) + prollyKv.FinishGc() + // only tagged one bucket + require.Equal(t, 1, prollyKv.Len()) + }) + + t.Run("test GC overflow", func(t *testing.T) { + prollyKv.StartGc(context.Background(), 8) + expLen := 1024 + var expected []hash.Hash + for i := range expLen { + exp := stats.NewHistogramBucket(uint64(i), 7, 3, 4, sql.Row{int64(1), "one"}, []uint64{5, 4, 3, 1}, []sql.Row{{int64(5), "six"}, {int64(4), "three"}, {int64(3), "seven"}, {int64(1), "one"}}).(*stats.Bucket) + nh := strconv.AppendInt(nil, int64(i), 10) + nh = append(nh, h[:hash.ByteLen-len(nh)]...) + newH := hash.New(nh) + expected = append(expected, newH) + err := prollyKv.PutBucket(context.Background(), newH, exp, tupB) + require.NoError(t, err) + } + prollyKv.FinishGc() + + for _, h := range expected { + _, ok, err := prollyKv.GetBucket(context.Background(), h, tupB) + require.NoError(t, err) + require.True(t, ok) + } + + require.Equal(t, 1024, prollyKv.Len()) + require.Equal(t, int64(2048), prollyKv.Cap()) + }) + + t.Run("test bounds GC", func(t *testing.T) { + exp := sql.Row{1, 1} + prollyKv.PutBound(h, exp) + prollyKv.PutBound(h2, exp) + + prollyKv.StartGc(context.Background(), 10) + prollyKv.GetBound(h2, 2) + prollyKv.FinishGc() + + require.Equal(t, 1, len(prollyKv.mem.bounds)) + }) + + t.Run("test templates GC", func(t *testing.T) { + exp := stats.Statistic{RowCnt: 50, Qual: sql.StatQualifier{Database: "mydb", Tab: "xy"}} + key := templateCacheKey{ + h: h, + idxName: "PRIMARY", + } + key2 := templateCacheKey{ + h: h2, + idxName: "PRIMARY", + } + prollyKv.PutTemplate(key, exp) + prollyKv.PutTemplate(key2, exp) + + prollyKv.StartGc(context.Background(), 10) + prollyKv.GetTemplate(key2) + prollyKv.FinishGc() + + require.Equal(t, 1, len(prollyKv.mem.templates)) + }) + +} + +func newTestProllyKv(t *testing.T, threads *sql.BackgroundThreads) *prollyStats { + dEnv := dtestutils.CreateTestEnv() + + sqlEng, ctx := newTestEngine(context.Background(), dEnv, threads) + ctx.Session.SetClient(sql.Client{ + User: "billy boy", + Address: "bigbillie@fake.horse", + }) + require.NoError(t, executeQuery(ctx, sqlEng, "create database mydb")) + require.NoError(t, executeQuery(ctx, sqlEng, "use mydb")) + + startDbs := sqlEng.Analyzer.Catalog.DbProvider.AllDatabases(ctx) + + kv, err := NewProllyStats(ctx, startDbs[0].(dsess.SqlDatabase)) + require.NoError(t, err) + + return kv +} diff --git a/go/libraries/doltcore/sqle/statspro/stats_provider.go b/go/libraries/doltcore/sqle/statspro/stats_provider.go deleted file mode 100644 index 573e20b638a..00000000000 --- a/go/libraries/doltcore/sqle/statspro/stats_provider.go +++ /dev/null @@ -1,535 +0,0 @@ -// Copyright 2023 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package statspro - -import ( - "context" - "errors" - "fmt" - "path/filepath" - "strings" - "sync" - - "github.com/dolthub/go-mysql-server/sql" - - "github.com/dolthub/dolt/go/libraries/doltcore/dbfactory" - "github.com/dolthub/dolt/go/libraries/doltcore/env" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/prolly/tree" -) - -var ErrFailedToLoad = errors.New("failed to load statistics") - -type indexMeta struct { - qual sql.StatQualifier - cols []string - newNodes []tree.Node - // updateOrdinals are [start, stop] tuples for each update chunk - updateOrdinals []updateOrdinal - keepChunks []sql.HistogramBucket - dropChunks []sql.HistogramBucket - allAddrs []hash.Hash -} - -type updateOrdinal struct { - start, stop uint64 -} - -func NewProvider(pro *sqle.DoltDatabaseProvider, sf StatsFactory) *Provider { - return &Provider{ - pro: pro, - sf: sf, - mu: &sync.Mutex{}, - statDbs: make(map[string]Database), - autoCtxCancelers: make(map[string]context.CancelFunc), - analyzeCtxCancelers: make(map[string]context.CancelFunc), - status: make(map[string]string), - lockedTables: make(map[string]bool), - } -} - -// Provider is the engine interface for reading and writing index statistics. -// Each database has its own statistics table that all tables/indexes in a db -// share. 
-type Provider struct { - mu *sync.Mutex - pro *sqle.DoltDatabaseProvider - sf StatsFactory - statDbs map[string]Database - autoCtxCancelers map[string]context.CancelFunc - analyzeCtxCancelers map[string]context.CancelFunc - starter sqle.InitDatabaseHook - status map[string]string - lockedTables map[string]bool -} - -// each database has one statistics table that is a collection of the -// table stats in the database -type dbToStats struct { - mu *sync.Mutex - dbName string - stats map[sql.StatQualifier]*DoltStats - statsDatabase Database - latestTableHashes map[string]hash.Hash -} - -func newDbStats(dbName string) *dbToStats { - return &dbToStats{ - mu: &sync.Mutex{}, - dbName: dbName, - stats: make(map[sql.StatQualifier]*DoltStats), - latestTableHashes: make(map[string]hash.Hash), - } -} - -var _ sql.StatsProvider = (*Provider)(nil) - -func (p *Provider) Close() error { - var lastErr error - for _, db := range p.statDbs { - if err := db.Close(); err != nil { - lastErr = err - } - } - return lastErr -} - -func (p *Provider) TryLockForUpdate(branch, db, table string) bool { - p.mu.Lock() - defer p.mu.Unlock() - lockId := fmt.Sprintf("%s.%s.%s", branch, db, table) - if ok := p.lockedTables[lockId]; ok { - return false - } - p.lockedTables[lockId] = true - return true -} - -func (p *Provider) UnlockTable(branch, db, table string) { - p.mu.Lock() - defer p.mu.Unlock() - lockId := fmt.Sprintf("%s.%s.%s", branch, db, table) - p.lockedTables[lockId] = false - return -} - -func (p *Provider) StartRefreshThread(ctx *sql.Context, pro dsess.DoltDatabaseProvider, name string, env *env.DoltEnv, db dsess.SqlDatabase) error { - err := p.starter(ctx, pro.(*sqle.DoltDatabaseProvider), name, env, db) - - if err != nil { - p.UpdateStatus(name, fmt.Sprintf("error restarting thread %s: %s", name, err.Error())) - return err - } - p.UpdateStatus(name, fmt.Sprintf("restarted thread: %s", name)) - return nil -} - -func (p *Provider) SetStarter(hook sqle.InitDatabaseHook) { - p.starter = hook -} - -func (p *Provider) CancelRefreshThread(dbName string) { - p.mu.Lock() - if cancel, ok := p.autoCtxCancelers[dbName]; ok { - cancel() - } - p.mu.Unlock() - p.UpdateStatus(dbName, fmt.Sprintf("cancelled thread: %s", dbName)) - -} - -func (p *Provider) ThreadStatus(dbName string) string { - p.mu.Lock() - defer p.mu.Unlock() - - if msg, ok := p.status[dbName]; ok { - return msg - } - return "no active stats thread" -} - -func (p *Provider) TrackedBranches(dbName string) []string { - db, ok := p.getStatDb(dbName) - if !ok { - return nil - } - return db.Branches() - -} - -func (p *Provider) GetTableStats(ctx *sql.Context, db string, table sql.Table) ([]sql.Statistic, error) { - dSess := dsess.DSessFromSess(ctx.Session) - branch, err := dSess.GetBranch() - if err != nil { - return nil, nil - } - - var schemaName string - if schTab, ok := table.(sql.DatabaseSchemaTable); ok { - schemaName = strings.ToLower(schTab.DatabaseSchema().SchemaName()) - } - - return p.GetTableDoltStats(ctx, branch, db, schemaName, table.Name()) -} - -func (p *Provider) GetTableDoltStats(ctx *sql.Context, branch, db, schema, table string) ([]sql.Statistic, error) { - statDb, ok := p.getStatDb(db) - if !ok || statDb == nil { - return nil, nil - } - - if branch == "" { - dSess := dsess.DSessFromSess(ctx.Session) - var err error - branch, err = dSess.GetBranch() - if err != nil { - return nil, nil - } - } - - var ret []sql.Statistic - for _, qual := range statDb.ListStatQuals(branch) { - if strings.EqualFold(db, qual.Database) && strings.EqualFold(schema, 
qual.Sch) && strings.EqualFold(table, qual.Tab) { - stat, _ := statDb.GetStat(branch, qual) - ret = append(ret, stat) - } - } - - return ret, nil -} - -func (p *Provider) setStatDb(name string, db Database) { - p.mu.Lock() - defer p.mu.Unlock() - p.statDbs[name] = db -} - -func (p *Provider) getStatDb(name string) (Database, bool) { - p.mu.Lock() - defer p.mu.Unlock() - statDb, ok := p.statDbs[strings.ToLower(name)] - return statDb, ok -} - -func (p *Provider) deleteStatDb(name string) { - p.mu.Lock() - defer p.mu.Unlock() - delete(p.statDbs, strings.ToLower(name)) -} - -func (p *Provider) SetStats(ctx *sql.Context, s sql.Statistic) error { - statDb, ok := p.getStatDb(s.Qualifier().Db()) - if !ok { - return nil - } - - dSess := dsess.DSessFromSess(ctx.Session) - branch, err := dSess.GetBranch() - if err != nil { - return nil - } - - doltStat, err := DoltStatsFromSql(s) - if err != nil { - return err - } - - p.UpdateStatus(s.Qualifier().Db(), fmt.Sprintf("refreshed %s", s.Qualifier().Db())) - - return statDb.SetStat(ctx, branch, s.Qualifier(), doltStat) -} - -func (p *Provider) getQualStats(ctx *sql.Context, qual sql.StatQualifier) (*DoltStats, bool) { - statDb, ok := p.getStatDb(qual.Db()) - if !ok { - return nil, false - } - - dSess := dsess.DSessFromSess(ctx.Session) - branch, err := dSess.GetBranch() - if err != nil { - return nil, false - } - - return statDb.GetStat(branch, qual) -} - -func (p *Provider) GetStats(ctx *sql.Context, qual sql.StatQualifier, _ []string) (sql.Statistic, bool) { - stat, ok := p.getQualStats(ctx, qual) - if !ok { - return nil, false - } - return stat, true -} - -func (p *Provider) DropBranchDbStats(ctx *sql.Context, branch, db string, flush bool) error { - statDb, ok := p.getStatDb(db) - if !ok { - return nil - } - - p.mu.Lock() - defer p.mu.Unlock() - - p.status[db] = "dropped" - - return statDb.DeleteBranchStats(ctx, branch, flush) -} - -func (p *Provider) DropDbStats(ctx *sql.Context, db string, flush bool) error { - statDb, ok := p.getStatDb(db) - if !ok { - return nil - } - for _, branch := range statDb.Branches() { - // remove provider access - p.DropBranchDbStats(ctx, branch, db, flush) - } - - if flush { - p.deleteStatDb(db) - } - - return nil -} - -func (p *Provider) DropStats(ctx *sql.Context, qual sql.StatQualifier, _ []string) error { - statDb, ok := p.getStatDb(qual.Db()) - if !ok { - return nil - } - - dSess := dsess.DSessFromSess(ctx.Session) - branch, err := dSess.GetBranch() - if err != nil { - return nil - } - - if _, ok := statDb.GetStat(branch, qual); ok { - statDb.DeleteStats(ctx, branch, qual) - p.UpdateStatus(qual.Db(), fmt.Sprintf("dropped statisic: %s", qual.String())) - } - - return nil -} - -func (p *Provider) UpdateStatus(db string, msg string) { - p.mu.Lock() - defer p.mu.Unlock() - - p.status[db] = msg -} - -func (p *Provider) RowCount(ctx *sql.Context, db string, table sql.Table) (uint64, error) { - statDb, ok := p.getStatDb(db) - if !ok { - return 0, sql.ErrDatabaseNotFound.New(db) - } - - dSess := dsess.DSessFromSess(ctx.Session) - branch, err := dSess.GetBranch() - if err != nil { - return 0, err - } - - var schemaName string - if schTab, ok := table.(sql.DatabaseSchemaTable); ok { - schemaName = strings.ToLower(schTab.DatabaseSchema().SchemaName()) - } - - priStats, ok := statDb.GetStat(branch, sql.NewStatQualifier(db, schemaName, table.Name(), "primary")) - if !ok { - return 0, nil - } - - return priStats.RowCount(), nil -} - -func (p *Provider) DataLength(ctx *sql.Context, db string, table sql.Table) (uint64, error) { - 
statDb, ok := p.getStatDb(db) - if !ok { - return 0, sql.ErrDatabaseNotFound.New(db) - } - - dSess := dsess.DSessFromSess(ctx.Session) - branch, err := dSess.GetBranch() - if err != nil { - return 0, err - } - - var schemaName string - if schTab, ok := table.(sql.DatabaseSchemaTable); ok { - schemaName = strings.ToLower(schTab.DatabaseSchema().SchemaName()) - } - - priStats, ok := statDb.GetStat(branch, sql.NewStatQualifier(db, schemaName, table.Name(), "primary")) - if !ok { - return 0, nil - } - - return priStats.AvgSize(), nil -} - -func (p *Provider) Prune(ctx *sql.Context) error { - dSess := dsess.DSessFromSess(ctx.Session) - - for _, sqlDb := range p.pro.DoltDatabases() { - dbName := strings.ToLower(sqlDb.Name()) - sqlDb, ok, err := dSess.Provider().SessionDatabase(ctx, dbName) - if err != nil { - return err - } - if !ok { - continue - } - statDb, ok := p.getStatDb(dbName) - if !ok { - continue - } - - // Canceling refresh thread prevents background thread from - // making progress. Prune should succeed. - p.CancelRefreshThread(dbName) - - tables, err := sqlDb.GetTableNames(ctx) - if err != nil { - return err - } - - for _, branch := range statDb.Branches() { - err := func() error { - // function closure ensures safe defers - var stats []sql.Statistic - for _, t := range tables { - // XXX: avoid races with ANALYZE with the table locks. - // Either concurrent purge or analyze (or both) will fail. - if !p.TryLockForUpdate(branch, dbName, t) { - p.mu.Lock() - fmt.Println(p.lockedTables) - p.mu.Unlock() - return fmt.Errorf("concurrent statistics update and prune; retry prune when update is finished") - } - defer p.UnlockTable(branch, dbName, t) - - tableStats, err := p.GetTableDoltStats(ctx, branch, dbName, sqlDb.SchemaName(), t) - if err != nil { - return err - } - stats = append(stats, tableStats...) - } - - if err := p.DropBranchDbStats(ctx, branch, dbName, true); err != nil { - return err - } - - for _, s := range stats { - ds, ok := s.(*DoltStats) - if !ok { - return fmt.Errorf("unexpected statistics type found: %T", s) - } - if err := statDb.SetStat(ctx, branch, ds.Qualifier(), ds); err != nil { - return err - } - } - if err := statDb.Flush(ctx, branch); err != nil { - return err - } - return nil - }() - if err != nil { - return err - } - } - } - return nil -} - -func (p *Provider) Purge(ctx *sql.Context) error { - for _, sqlDb := range p.pro.DoltDatabases() { - dbName := strings.ToLower(sqlDb.Name()) - - tables, err := sqlDb.GetTableNames(ctx) - if err != nil { - return err - } - - var branches []string - db, ok := p.getStatDb(dbName) - if ok { - // Canceling refresh thread prevents background thread from - // making progress. Purge should succeed. - p.CancelRefreshThread(dbName) - - branches = db.Branches() - for _, branch := range branches { - err := func() error { - for _, t := range tables { - // XXX: avoid races with ANALYZE with the table locks. - // Either concurrent purge or analyze (or both) will fail. 
- if !p.TryLockForUpdate(branch, dbName, t) { - return fmt.Errorf("concurrent statistics update and prune; retry purge when update is finished") - } - defer p.UnlockTable(branch, dbName, t) - } - - err := p.DropBranchDbStats(ctx, branch, dbName, true) - if err != nil { - return fmt.Errorf("failed to drop stats: %w", err) - } - return nil - }() - if err != nil { - return err - } - } - } - - // if the database's failed to load, we still want to delete the folder - - fs, err := p.pro.FileSystemForDatabase(dbName) - if err != nil { - return err - } - - //remove from filesystem - statsFs, err := fs.WithWorkingDir(dbfactory.DoltStatsDir) - if err != nil { - return err - } - - if ok, _ := statsFs.Exists(""); ok { - if err := statsFs.Delete("", true); err != nil { - return err - } - } - - dropDbLoc, err := statsFs.Abs("") - if err != nil { - return err - } - - if err = dbfactory.DeleteFromSingletonCache(filepath.ToSlash(dropDbLoc + "/.dolt/noms")); err != nil { - return err - } - if len(branches) == 0 { - // if stats db was invalid on startup, recreate from baseline - branches = p.getStatsBranches(ctx) - } - p.Load(ctx, fs, sqlDb, branches) - } - return nil -} diff --git a/go/libraries/doltcore/sqle/statspro/validate.go b/go/libraries/doltcore/sqle/statspro/validate.go new file mode 100644 index 00000000000..65ffda6bbc9 --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/validate.go @@ -0,0 +1,155 @@ +// Copyright 2023 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
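+
+// validate.go re-derives, from each database's working root, the complete
+// set of stats dependencies (templates, bounds, and histogram buckets)
+// that the coordinator is expected to have cached, so ValidateState can
+// report anything missing from the stats KV.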
+ +package statspro + +import ( + "context" + "fmt" + "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" + "github.com/dolthub/dolt/go/store/hash" + "github.com/dolthub/dolt/go/store/prolly/tree" + "github.com/dolthub/dolt/go/store/val" + "github.com/dolthub/go-mysql-server/sql" + "log" + "strings" +) + +func generateDeps( + sqlCtx *sql.Context, + sqlDb dsess.SqlDatabase, + tCb func(key templateCacheKey), + bCb func(h hash.Hash, cnt int), + hCb func(h hash.Hash, tupB *val.TupleBuilder) error, +) error { + dSess := dsess.DSessFromSess(sqlCtx.Session) + db, err := dSess.Provider().Database(sqlCtx, sqlDb.AliasedName()) + if err != nil { + return err + } + sqlDb, err = sqle.RevisionDbForBranch(sqlCtx, db.(dsess.SqlDatabase), sqlDb.Revision(), sqlDb.Revision()+"/"+sqlDb.AliasedName()) + if err != nil { + return err + } + tableNames, err := sqlDb.GetTableNames(sqlCtx) + if err != nil { + return err + } + + var bucketCnt int + for _, tableName := range tableNames { + sqlTable, dTab, err := GetLatestTable(sqlCtx, tableName, sqlDb) + if err != nil { + return err + } + indexes, err := sqlTable.GetIndexes(sqlCtx) + if err != nil { + return err + } + + for _, sqlIdx := range indexes { + var idx durable.Index + var err error + if strings.EqualFold(sqlIdx.ID(), "PRIMARY") { + idx, err = dTab.GetRowData(sqlCtx) + } else { + idx, err = dTab.GetIndexRowData(sqlCtx, sqlIdx.ID()) + } + if err != nil { + return err + } + + schHash, _, err := sqlTable.IndexCacheKey(sqlCtx) + key := templateCacheKey{h: schHash.Hash, idxName: sqlIdx.ID()} + tCb(key) + + idxLen := len(sqlIdx.Expressions()) + + prollyMap := durable.ProllyMapFromIndex(idx) + levelNodes, err := tree.GetHistogramLevel(sqlCtx, prollyMap.Tuples(), bucketLowCnt) + if err != nil { + return err + } + + if len(levelNodes) == 0 { + log.Println("db-table has no hashes: ", sqlDb.AliasedName()) + continue + } + + bucketCnt += len(levelNodes) + + firstNodeHash := levelNodes[0].HashOf() + bCb(firstNodeHash, idxLen) + + for _, n := range levelNodes { + err = hCb(n.HashOf(), val.NewTupleBuilder(prollyMap.KeyDesc().PrefixDesc(idxLen))) + if err != nil { + return err + } + } + } + } + return nil +} + +// ValidateState expects all tracked databases to be fully cached, +// and returns an error including any gaps. 
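+// For every tracked database it re-walks the expected templates, first-node
+// bounds, and bucket hashes via generateDeps and collects one line per
+// entry missing from the KV; the "missing template/bound/chunk" strings are
+// the same ones the stats_validate script test asserts against.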
diff --git a/go/libraries/doltcore/sqle/system_variables.go b/go/libraries/doltcore/sqle/system_variables.go
index 99e6c2f5a9b..0e3ff291a72 100644
--- a/go/libraries/doltcore/sqle/system_variables.go
+++ b/go/libraries/doltcore/sqle/system_variables.go
@@ -219,39 +219,39 @@ var DoltSystemVariables = []sql.SystemVariable{
 		Default: int8(1),
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsAutoRefreshEnabled,
+		Name:    dsess.DoltStatsEnabled,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemBoolType(dsess.DoltStatsAutoRefreshEnabled),
-		Default: int8(0),
+		Type:    types.NewSystemBoolType(dsess.DoltStatsEnabled),
+		Default: int8(1),
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsBootstrapEnabled,
+		Name:    dsess.DoltStatsMemoryOnly,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemBoolType(dsess.DoltStatsBootstrapEnabled),
+		Type:    types.NewSystemBoolType(dsess.DoltStatsMemoryOnly),
 		Default: int8(0),
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsMemoryOnly,
+		Name:    dsess.DoltStatsJobInterval,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemBoolType(dsess.DoltStatsMemoryOnly),
-		Default: int8(0),
+		Type:    types.NewSystemIntType(dsess.DoltStatsJobInterval, 0, math.MaxInt, false),
+		Default: 100,
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsAutoRefreshThreshold,
+		Name:    dsess.DoltStatsBranchInterval,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemDoubleType(dsess.DoltStatsAutoRefreshThreshold, 0, 10),
-		Default: float64(.5),
+		Type:    types.NewSystemIntType(dsess.DoltStatsBranchInterval, 0, math.MaxInt, false),
+		Default: 60 * 60 * 24,
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsAutoRefreshInterval,
+		Name:    dsess.DoltStatsGCInterval,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemIntType(dsess.DoltStatsAutoRefreshInterval, 0, math.MaxInt, false),
-		Default: 600,
+		Type:    types.NewSystemIntType(dsess.DoltStatsGCInterval, 0, math.MaxInt, false),
+		Default: 60 * 60 * 24,
 	},
 	&sql.MysqlSystemVariable{
 		Name:    dsess.DoltStatsBranches,
@@ -446,39 +446,39 @@ func AddDoltSystemVariables() {
 		Default: int8(0),
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsAutoRefreshEnabled,
+		Name:    dsess.DoltStatsEnabled,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemBoolType(dsess.DoltStatsAutoRefreshEnabled),
-		Default: int8(0),
+		Type:    types.NewSystemBoolType(dsess.DoltStatsEnabled),
+		Default: int8(1),
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsBootstrapEnabled,
+		Name:    dsess.DoltStatsGCInterval,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemBoolType(dsess.DoltStatsBootstrapEnabled),
-		Default: int8(0),
+		Type:    types.NewSystemIntType(dsess.DoltStatsGCInterval, 0, math.MaxInt, false),
+		Default: 60 * 60 * 24,
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsMemoryOnly,
+		Name:    dsess.DoltStatsJobInterval,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemBoolType(dsess.DoltStatsMemoryOnly),
-		Default: int8(0),
+		Type:    types.NewSystemIntType(dsess.DoltStatsJobInterval, 0, math.MaxInt, false),
+		Default: 100,
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsAutoRefreshThreshold,
+		Name:    dsess.DoltStatsBranchInterval,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemDoubleType(dsess.DoltStatsAutoRefreshThreshold, 0, 10),
-		Default: float64(.5),
+		Type:    types.NewSystemIntType(dsess.DoltStatsBranchInterval, 0, math.MaxInt, false),
+		Default: 60 * 60 * 24,
 	},
 	&sql.MysqlSystemVariable{
-		Name:    dsess.DoltStatsAutoRefreshInterval,
+		Name:    dsess.DoltStatsMemoryOnly,
 		Dynamic: true,
 		Scope:   sql.GetMysqlScope(sql.SystemVariableScope_Global),
-		Type:    types.NewSystemIntType(dsess.DoltStatsAutoRefreshInterval, 0, math.MaxInt, false),
-		Default: 120,
+		Type:    types.NewSystemBoolType(dsess.DoltStatsMemoryOnly),
+		Default: int8(0),
 	},
 	&sql.MysqlSystemVariable{
 		Name:    dsess.DoltStatsBranches,
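
The diff above replaces the auto-refresh knobs with four stats controls: dolt_stats_enabled (now on by default), dolt_stats_memory_only, and the job/branch/GC intervals. The interval units are not stated in this diff. A small sketch of reading the registered defaults, assuming AddDoltSystemVariables and the GMS variables package initialize as they do in this PR; this is an illustration, not a snippet from the patch:

package main

import (
	"fmt"

	gmssql "github.com/dolthub/go-mysql-server/sql"
	// Blank import initializes the global system-variable registry.
	_ "github.com/dolthub/go-mysql-server/sql/variables"

	"github.com/dolthub/dolt/go/libraries/doltcore/sqle"
	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
)

func main() {
	// Register Dolt's system variables (the server does this on startup).
	sqle.AddDoltSystemVariables()

	for _, name := range []string{
		dsess.DoltStatsEnabled,
		dsess.DoltStatsMemoryOnly,
		dsess.DoltStatsJobInterval,
		dsess.DoltStatsBranchInterval,
		dsess.DoltStatsGCInterval,
	} {
		// GetGlobal returns the variable definition, its current
		// value, and whether the name is registered.
		_, val, ok := gmssql.SystemVariables.GetGlobal(name)
		fmt.Println(name, val, ok)
	}
}

Note that the two registration sites originally disagreed on the dolt_stats_job_interval default (100 versus 60 * 60 * 24); the hunks above use 100 in both so the static list and AddDoltSystemVariables stay in sync.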
diff --git a/go/libraries/doltcore/sqle/tables.go b/go/libraries/doltcore/sqle/tables.go
index e8fb46ea5d1..06765360bff 100644
--- a/go/libraries/doltcore/sqle/tables.go
+++ b/go/libraries/doltcore/sqle/tables.go
@@ -127,12 +127,12 @@ func (t *DoltTable) LookupForExpressions(ctx *sql.Context, exprs ...sql.Expressi
 		return sql.IndexLookup{}, nil, nil, false, nil
 	}
 
-	dbState, ok, err := sess.LookupDbState(ctx, t.db.Name())
+	dbState, ok, err := sess.LookupDbState(ctx, t.db.AliasedName())
 	if err != nil {
 		return sql.IndexLookup{}, nil, nil, false, nil
 	}
 	if !ok {
-		return sql.IndexLookup{}, nil, nil, false, fmt.Errorf("no state for database %s", t.db.Name())
+		return sql.IndexLookup{}, nil, nil, false, fmt.Errorf("no state for database %s", t.db.AliasedName())
 	}
 
 	var lookupCols []expression.LookupColumn
diff --git a/go/libraries/doltcore/sqle/user_space_database.go b/go/libraries/doltcore/sqle/user_space_database.go
index e54c03b7eb3..c3689e13a61 100644
--- a/go/libraries/doltcore/sqle/user_space_database.go
+++ b/go/libraries/doltcore/sqle/user_space_database.go
@@ -141,6 +141,10 @@ func (db *UserSpaceDatabase) RequestedName() string {
 	return db.Name()
 }
 
+func (db *UserSpaceDatabase) AliasedName() string {
+	return db.Name()
+}
+
 func (db *UserSpaceDatabase) GetSchema(ctx *sql.Context, schemaName string) (sql.DatabaseSchema, bool, error) {
 	panic(fmt.Sprintf("GetSchema is not implemented for database %T", db))
 }
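
The mutator.go hunk below extends the sorted-edits assertion to include the offending keys, which makes ordering violations in the edit iterator diagnosable from the panic message alone. A standalone sketch of the invariant being asserted, using a hypothetical helper rather than the PR's code:

package main

import (
	"bytes"
	"fmt"
)

// applySorted replays edits whose keys must be strictly increasing, the
// contract that ApplyMutations asserts on its mutation iterator.
func applySorted(keys [][]byte) {
	var prev []byte
	for _, k := range keys {
		if prev != nil && bytes.Compare(k, prev) <= 0 {
			// Include both keys so the violation is diagnosable.
			panic(fmt.Sprintf("expected sorted edits: %v, %v", prev, k))
		}
		prev = k
	}
}

func main() {
	applySorted([][]byte{{1}, {2}, {3}}) // ok
	fmt.Println("sorted input accepted")
	defer func() { fmt.Println("recovered:", recover()) }()
	applySorted([][]byte{{2}, {1}}) // panics with both keys in the message
}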
diff --git a/go/store/prolly/tree/mutator.go b/go/store/prolly/tree/mutator.go
index e6474e16cbf..b65fdf8f101 100644
--- a/go/store/prolly/tree/mutator.go
+++ b/go/store/prolly/tree/mutator.go
@@ -17,7 +17,7 @@ package tree
 import (
 	"bytes"
 	"context"
-
+	"fmt"
 	"github.com/dolthub/dolt/go/store/prolly/message"
 )
 
@@ -132,7 +132,7 @@ func ApplyMutations[K ~[]byte, O Ordering[K], S message.Serializer](
 	prev := newKey
 	newKey, newValue = edits.NextMutation(ctx)
 	if newKey != nil {
-		assertTrue(order.Compare(K(newKey), K(prev)) > 0, "expected sorted edits")
+		assertTrue(order.Compare(K(newKey), K(prev)) > 0, fmt.Sprintf("expected sorted edits: %v, %v", prev, newKey))
 	}
 }
diff --git a/go/store/prolly/tree/stats.go b/go/store/prolly/tree/stats.go
index 1573d01893d..9611f3b583d 100644
--- a/go/store/prolly/tree/stats.go
+++ b/go/store/prolly/tree/stats.go
@@ -141,6 +141,11 @@ func GetChunksAtLevel[K, V ~[]byte, O Ordering[K]](ctx context.Context, m Static
 // GetHistogramLevel returns the highest internal level of the tree that has
 // more than |low| addresses.
 func GetHistogramLevel[K, V ~[]byte, O Ordering[K]](ctx context.Context, m StaticMap[K, V, O], low int) ([]Node, error) {
+	if cnt, err := m.Count(); err != nil {
+		return nil, err
+	} else if cnt == 0 {
+		return nil, nil
+	}
 	currentLevel := []Node{m.Root}
 	level := m.Root.Level()
 	for len(currentLevel) < low && level > 0 {
diff --git a/go/store/val/tuple_builder.go b/go/store/val/tuple_builder.go
index f92bc8ce1cb..9b3a50ea139 100644
--- a/go/store/val/tuple_builder.go
+++ b/go/store/val/tuple_builder.go
@@ -15,6 +15,8 @@
 package val
 
 import (
+	"log"
+	"strconv"
 	"time"
 
 	"github.com/dolthub/go-mysql-server/sql/analyzer/analyzererrors"
@@ -77,7 +79,7 @@ func NewTupleBuilder(desc TupleDesc) *TupleBuilder {
 func (tb *TupleBuilder) Build(pool pool.BuffPool) (tup Tuple) {
 	for i, typ := range tb.Desc.Types {
 		if !typ.Nullable && tb.fields[i] == nil {
-			panic("cannot write NULL to non-NULL field")
+			log.Println("cannot write NULL to non-NULL field: " + strconv.Itoa(i))
 		}
 	}
 	return tb.BuildPermissive(pool)
diff --git a/go/store/val/tuple_descriptor.go b/go/store/val/tuple_descriptor.go
index bd55519ab35..188c1f98829 100644
--- a/go/store/val/tuple_descriptor.go
+++ b/go/store/val/tuple_descriptor.go
@@ -639,7 +639,7 @@ func (td TupleDesc) formatValue(enc Encoding, i int, value []byte) string {
 	case StringAddrEnc:
 		return hex.EncodeToString(value)
 	case CommitAddrEnc:
-		return hex.EncodeToString(value)
+		return hash.New(value).String()[:5]
 	case CellEnc:
 		return hex.EncodeToString(value)
 	case ExtendedEnc: