diff --git a/.changelog/3183.internal.md b/.changelog/3183.internal.md new file mode 100644 index 00000000000..4b3409fc7d0 --- /dev/null +++ b/.changelog/3183.internal.md @@ -0,0 +1 @@ +go/extra/conbench: Add consensus benchmark subcommand diff --git a/go/.gitignore b/go/.gitignore index 22ff16ef970..e7230e12e5a 100644 --- a/go/.gitignore +++ b/go/.gitignore @@ -14,3 +14,4 @@ registry/gen_vectors/gen_vectors staking/gen_vectors/gen_vectors extra/extract-metrics/extract-metrics +extra/conbench/conbench diff --git a/go/Makefile b/go/Makefile index 0c29638992a..c96055651fa 100644 --- a/go/Makefile +++ b/go/Makefile @@ -17,7 +17,8 @@ generate: # Build. # List of Go binaries to build. go-binaries := oasis-node oasis-test-runner oasis-net-runner oasis-remote-signer \ - extra/extract-metrics oasis-test-runner/scenario/pluginsigner/example_signer_plugin + oasis-test-runner/scenario/pluginsigner/example_signer_plugin \ + extra/extract-metrics extra/conbench $(go-binaries): @$(ECHO) "$(MAGENTA)*** Building $@...$(OFF)" diff --git a/go/extra/conbench/cmd/conbench.go b/go/extra/conbench/cmd/conbench.go new file mode 100644 index 00000000000..00dcda1d7cf --- /dev/null +++ b/go/extra/conbench/cmd/conbench.go @@ -0,0 +1,573 @@ +package cmd + +import ( + "context" + "crypto" + "fmt" + "math" + "math/rand" + "sort" + "strings" + "sync/atomic" + "time" + + "github.com/spf13/cobra" + flag "github.com/spf13/pflag" + "github.com/spf13/viper" + + "github.com/oasisprotocol/oasis-core/go/common/crypto/drbg" + "github.com/oasisprotocol/oasis-core/go/common/crypto/mathrand" + "github.com/oasisprotocol/oasis-core/go/common/crypto/signature" + memorySigner "github.com/oasisprotocol/oasis-core/go/common/crypto/signature/signers/memory" + "github.com/oasisprotocol/oasis-core/go/common/entity" + "github.com/oasisprotocol/oasis-core/go/common/logging" + "github.com/oasisprotocol/oasis-core/go/common/quantity" + consensus "github.com/oasisprotocol/oasis-core/go/consensus/api" + "github.com/oasisprotocol/oasis-core/go/consensus/api/transaction" + "github.com/oasisprotocol/oasis-core/go/control/api" + "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common" + cmdFlags "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common/flags" + cmdGrpc "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common/grpc" + cmdSigner "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common/signer" + staking "github.com/oasisprotocol/oasis-core/go/staking/api" +) + +const ( + // Number of test accounts to create. + // This also controls the number of parallel transfers. + CfgNumAccounts = "num_accounts" + + // Number of samples (transfers) per account. + CfgNumSamples = "num_samples" + + // Timeout for SubmitTx. + CfgSubmitTxTimeout = "submit_timeout" + + // Use test entity for funding? + CfgUseTestEntity = "use_test_entity" + + // Gas price (should be set to the minimum gas price of validators). + CfgGasPrice = "gas_price" + + // Only fund accounts and exit (useful for multiple runs of the benchmark, + // since funding is the slowest part). + CfgFundAndExit = "fund_and_exit" + + // Only refund funding account and exit (counterpart of the above). + CfgRefundAndExit = "refund_and_exit" + + // Skip funding accounts. + CfgSkipFunding = "skip_funding" + + // Seed to use for the DRBG. + CfgSeed = "seed" + + // Placeholder value for cachedNonce and cachedGas in localAccount struct + // when they haven't been initialized yet. 
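The configuration constants above are read through viper after being bound as pflag flags in this file's init() (shown further down), and the sentinel constant that follows marks "not fetched yet" for the cached nonce and gas values. Below is a minimal, self-contained sketch of that flag/sentinel pattern; the flag set, names, and values here are illustrative only and not part of the PR.

```go
package main

import (
	"fmt"
	"math"

	flag "github.com/spf13/pflag"
	"github.com/spf13/viper"
)

const (
	cfgNumAccounts = "num_accounts"
	// Sentinel meaning "value not fetched yet", same idea as notYetCached below.
	notYetCached = uint64(math.MaxUint64)
)

func main() {
	// Define a flag and bind the flag set to viper, mirroring init() further down.
	fs := flag.NewFlagSet("example", flag.ContinueOnError)
	fs.Uint64(cfgNumAccounts, 10, "number of accounts")
	_ = fs.Parse([]string{"--num_accounts", "25"})
	_ = viper.BindPFlags(fs)

	// Anywhere in the program the value is then available via viper.
	fmt.Println(viper.GetUint64(cfgNumAccounts)) // 25
	fmt.Println(notYetCached)                    // 18446744073709551615
}
```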
+ notYetCached = uint64(math.MaxUint64) +) + +var ( + logger = logging.GetLogger("cmd/conbench") + conbenchCmd = &cobra.Command{ + Use: "conbench", + Short: "benchmark consensus layer", + Long: "Runs a consensus layer benchmark.", + RunE: doRun, + } +) + +type localAccount struct { + signer signature.Signer + addr staking.Address + cachedNonce uint64 + cachedGas uint64 +} + +func transfer(ctx context.Context, cc consensus.ClientBackend, from *localAccount, toAddr staking.Address, amount uint64, noCache bool) error { + var err error + + // Get sender's nonce if not yet cached (or if we're ignoring cache). + nonce := from.cachedNonce + if nonce == notYetCached || noCache { + nonce, err = cc.GetSignerNonce(ctx, &consensus.GetSignerNonceRequest{ + AccountAddress: from.addr, + Height: consensus.HeightLatest, + }) + if err != nil { + return fmt.Errorf("unable to get sender's nonce: %w", err) + } + atomic.StoreUint64(&from.cachedNonce, nonce) + } + + // Construct transfer transaction. + transfer := staking.Transfer{ + To: toAddr, + } + if err = transfer.Amount.FromUint64(amount); err != nil { + return fmt.Errorf("unable to convert given amount from uint64: %w", err) + } + + var fee transaction.Fee + tx := staking.NewTransferTx(nonce, &fee, &transfer) + + // Estimate gas if not yet cached (or if we're ignoring cache). + gas := from.cachedGas + if gas == notYetCached || noCache { + estGas, grr := cc.EstimateGas(ctx, &consensus.EstimateGasRequest{ + Signer: from.signer.Public(), + Transaction: tx, + }) + if grr != nil { + return fmt.Errorf("unable to estimate gas: %w", grr) + } + gas = uint64(estGas) + atomic.StoreUint64(&from.cachedGas, gas) + } + + tx.Fee.Gas = transaction.Gas(gas) + if err = tx.Fee.Amount.FromUint64(gas * viper.GetUint64(CfgGasPrice)); err != nil { + return fmt.Errorf("unable to convert fee amount from uint64: %w", err) + } + + signedTx, err := transaction.Sign(from.signer, tx) + if err != nil { + return fmt.Errorf("unable to sign transfer transaction: %w", err) + } + + // Increment cached nonce. + atomic.AddUint64(&from.cachedNonce, 1) + + // Submit with timeout to avoid blocking forever if the client node + // is skipping CheckTx checks. The timeout should be set large enough + // for the network to handle the submission. + timeout := viper.GetDuration(CfgSubmitTxTimeout) + submissionCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + if err = cc.SubmitTx(submissionCtx, signedTx); err != nil { + return err + } + return nil +} + +func refund(ctx context.Context, cc consensus.ClientBackend, sc staking.Backend, from *localAccount, toAddr staking.Address) error { + // Fetch account info. + acct, err := sc.Account(ctx, &staking.OwnerQuery{ + Height: consensus.HeightLatest, + Owner: from.addr, + }) + if err != nil { + return fmt.Errorf("unable to fetch account balance: %w", err) + } + + // Since we're dealing with tiny amounts, we can afford this hack. + amount := acct.General.Balance.ToBigInt().Uint64() + + if amount == 0 { + // Nothing to refund. + return nil + } + + // We don't want refunds to fail, so disable caching. + if err = transfer(ctx, cc, from, toAddr, amount, true); err != nil { + return fmt.Errorf("unable to refund from account: %w", err) + } + + return nil +} + +func refundMultiple(ctx context.Context, cc consensus.ClientBackend, sc staking.Backend, account []localAccount, toAddr staking.Address) { + // Do the refunds in parallel. 
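transfer() above fetches the sender's nonce only on first use (or when the cache is explicitly bypassed) and then advances it with atomic operations, since many goroutines can share one funding account. The following stand-alone sketch isolates just that caching pattern; the fetch callback stands in for ClientBackend.GetSignerNonce and everything else is hypothetical scaffolding.

```go
package main

import (
	"fmt"
	"math"
	"sync/atomic"
)

const notYetCached = uint64(math.MaxUint64)

type account struct {
	cachedNonce uint64 // notYetCached until the first on-chain lookup
}

// nextNonce returns the nonce to use for the next transaction and advances the
// cache, hitting the chain only on first use or when noCache forces a refresh.
func (a *account) nextNonce(fetch func() (uint64, error), noCache bool) (uint64, error) {
	nonce := atomic.LoadUint64(&a.cachedNonce)
	if nonce == notYetCached || noCache {
		fresh, err := fetch()
		if err != nil {
			return 0, err
		}
		atomic.StoreUint64(&a.cachedNonce, fresh)
		nonce = fresh
	}
	atomic.AddUint64(&a.cachedNonce, 1) // reserve this nonce for the caller
	return nonce, nil
}

func main() {
	acct := account{cachedNonce: notYetCached}
	fetch := func() (uint64, error) { return 7, nil } // pretend on-chain nonce
	for i := 0; i < 3; i++ {
		n, _ := acct.nextNonce(fetch, false)
		fmt.Println(n) // 7, 8, 9
	}
}
```

The sketch reads the cached value with atomic.LoadUint64 for symmetry with the stores; this is a defensive choice in the sketch, not a claim about the PR's code.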
+ doneCh := make(chan bool, len(account)) + for a := range account { + go func(a int) { + if err := refund(ctx, cc, sc, &account[a], toAddr); err != nil { + // Tough luck. + logger.Error("unable to refund from account", + "account_address", account[a].addr, + "err", err, + ) + } + doneCh <- true + }(a) + } + + // Wait for all goroutines to finish. + for range account { + <-doneCh + } +} + +func doRun(cmd *cobra.Command, args []string) error { // nolint: gocyclo + cmd.SilenceUsage = true + + if err := common.Init(); err != nil { + common.EarlyLogAndExit(err) + } + + numAccounts := viper.GetUint64(CfgNumAccounts) + numSamples := viper.GetUint64(CfgNumSamples) + + if numAccounts < 1 { + return fmt.Errorf("number of accounts must be >= 1") + } + if numSamples < 3 { + return fmt.Errorf("number of samples must be >= 3") + } + + fundAndExit := viper.GetBool(CfgFundAndExit) + refundAndExit := viper.GetBool(CfgRefundAndExit) + skipFunding := viper.GetBool(CfgSkipFunding) + + if fundAndExit && refundAndExit { + return fmt.Errorf("cannot both fund and refund at the same time") + } + if fundAndExit && skipFunding { + return fmt.Errorf("exiting") + } + if refundAndExit && skipFunding { + return fmt.Errorf("--skip_funding has no effect with --refund_and_exit") + } + + ctx := context.Background() + + // Connect to node. + logger.Debug("dialing node", "addr", viper.GetString(cmdGrpc.CfgAddress)) + conn, err := cmdGrpc.NewClient(cmd) + if err != nil { + return fmt.Errorf("unable to connect to node: %w", err) + } + defer conn.Close() + + cc := consensus.NewConsensusClient(conn) + sc := staking.NewStakingClient(conn) + ncc := api.NewNodeControllerClient(conn) + + // Set chain context from genesis document obtained from the node. + genDoc, err := cc.GetGenesisDocument(ctx) + if err != nil { + return fmt.Errorf("unable to obtain genesis document from node: %w", err) + } + genDoc.SetChainContext() + + // Create new DRBG. + src, err := drbg.New(crypto.SHA512, []byte(viper.GetString(CfgSeed)), nil, []byte("consensus benchmark")) + if err != nil { + return fmt.Errorf("unable to create deterministic random generator: %w", err) + } + rng := rand.New(mathrand.New(src)) + + // Wait for the node to sync. + logger.Debug("waiting for node sync") + if err = ncc.WaitSync(context.Background()); err != nil { + return fmt.Errorf("unable to wait for node sync: %w", err) + } + logger.Debug("node synced") + + // Create multiple accounts. + account := make([]localAccount, numAccounts) + msf := memorySigner.NewFactory() + for a := range account { + signer, grr := msf.Generate(signature.SignerEntity, rng) + if grr != nil { + return fmt.Errorf("unable to generate account %d: %w", a, grr) + } + account[a].signer = signer + account[a].addr = staking.NewAddress(signer.Public()) + account[a].cachedNonce = notYetCached + account[a].cachedGas = notYetCached + } + + var fundingSigner signature.Signer + + if !skipFunding { + if viper.GetBool(CfgUseTestEntity) { + // Use test entity for funding. + _, fundingSigner, _ = entity.TestEntity() + } else { + // Use given signer for funding. 
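The account-generation loop above is deterministic: the SHA-512 DRBG is seeded from --seed, so the same seed always yields the same ordered set of signers and addresses, which is why the flag help suggests changing it when running multiple benchmarks in parallel. A small recap sketch, using only the calls already present in this file (the seed string here is an example):

```go
package main

import (
	"crypto"
	"fmt"
	"math/rand"

	"github.com/oasisprotocol/oasis-core/go/common/crypto/drbg"
	"github.com/oasisprotocol/oasis-core/go/common/crypto/mathrand"
	"github.com/oasisprotocol/oasis-core/go/common/crypto/signature"
	memorySigner "github.com/oasisprotocol/oasis-core/go/common/crypto/signature/signers/memory"
	staking "github.com/oasisprotocol/oasis-core/go/staking/api"
)

func main() {
	// Seeded DRBG -> math/rand source -> deterministic in-memory signers.
	src, err := drbg.New(crypto.SHA512, []byte("example seed"), nil, []byte("consensus benchmark"))
	if err != nil {
		panic(err)
	}
	rng := rand.New(mathrand.New(src))

	msf := memorySigner.NewFactory()
	for i := 0; i < 3; i++ {
		signer, err := msf.Generate(signature.SignerEntity, rng)
		if err != nil {
			panic(err)
		}
		// Same seed, same run order => same staking addresses every time.
		fmt.Println(i, staking.NewAddress(signer.Public()))
	}
}
```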
+ signerDir, grr := cmdSigner.CLIDirOrPwd() + if grr != nil { + return fmt.Errorf("failed to retrieve signer dir: %w", grr) + } + signerFactory, grr := cmdSigner.NewFactory(cmdSigner.Backend(), signerDir, signature.SignerEntity) + if grr != nil { + return fmt.Errorf("failed to create signer factory: %w", grr) + } + fundingSigner, grr = signerFactory.Load(signature.SignerEntity) + if grr != nil { + return fmt.Errorf("failed to load signer: %w", grr) + } + } + } else { + // We won't need a signer, since we're not funding, fake it + // with the test entity instead. + _, fundingSigner, _ = entity.TestEntity() + } + + fundingAddr := staking.NewAddress(fundingSigner.Public()) + + if refundAndExit { + logger.Info("refunding money") + refundMultiple(ctx, cc, sc, account, fundingAddr) + logger.Info("money refunded") + return nil + } + + if !skipFunding { + fundingAcct := localAccount{ + signer: fundingSigner, + addr: fundingAddr, + cachedNonce: notYetCached, + cachedGas: notYetCached, + } + + // Check if funding account has enough funds. + logger.Debug("checking if funding account has enough funds") + var fundingAcctInfo *staking.Account + fundingAcctInfo, err = sc.Account(ctx, &staking.OwnerQuery{ + Height: consensus.HeightLatest, + Owner: fundingAddr, + }) + if err != nil { + return fmt.Errorf("unable to fetch funding account balance: %w", err) + } + // Estimate gas. + dummyXfer := &staking.Transfer{To: fundingAddr} + if err = dummyXfer.Amount.FromUint64(1); err != nil { + return fmt.Errorf("unable to convert uint64 to amount: %w", err) + } + var estGas transaction.Gas + estGas, err = cc.EstimateGas(ctx, &consensus.EstimateGasRequest{ + Signer: fundingSigner.Public(), + Transaction: staking.NewTransferTx(fundingAcctInfo.General.Nonce, nil, dummyXfer), + }) + if err != nil { + return fmt.Errorf("unable to estimate gas: %w", err) + } + // Each account needs additional tokens for fees. + // An additional fee is allocated for the refund at the end. + perAccountFunds := numSamples + (numSamples+1)*(viper.GetUint64(CfgGasPrice)*uint64(estGas)) + requiredFunds := quantity.NewFromUint64(numAccounts * perAccountFunds) + availableFunds := fundingAcctInfo.General.Balance + if availableFunds.Cmp(requiredFunds) < 0 { + return fmt.Errorf("funding account has insufficient funds (%s required, %s available)", requiredFunds.String(), availableFunds.String()) + } + logger.Debug("funding account has enough funds", + "required", requiredFunds.String(), + "available", availableFunds.String(), + ) + + // Fund all accounts from the funding account. + logger.Info("funding test accounts", + "num_accounts", numAccounts, + ) + for a := range account { + // Populate cached gas estimates. + account[a].cachedGas = uint64(estGas) + + // Each account gets perAccountFunds tokens. + if errr := transfer(ctx, cc, &fundingAcct, account[a].addr, perAccountFunds, true); errr != nil { + // An error has happened while funding, make sure to refund the + // funding account from the accounts funded until this point. 
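The perAccountFunds/requiredFunds computation above packs, per account, the numSamples transfer amounts of 1 base unit plus fees for those transfers and one extra fee for the final refund. A worked example with made-up numbers (the gas estimate and counts are purely illustrative):

```go
package main

import "fmt"

func main() {
	numAccounts := uint64(10)
	numSamples := uint64(30)
	gasPrice := uint64(1)
	estGas := uint64(1000) // hypothetical EstimateGas result

	// numSamples transfers of 1 unit + (numSamples+1) fees of gasPrice*estGas.
	perAccountFunds := numSamples + (numSamples+1)*(gasPrice*estGas)
	requiredFunds := numAccounts * perAccountFunds

	fmt.Println(perAccountFunds) // 30 + 31*1000 = 31030
	fmt.Println(requiredFunds)   // 310300
}
```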
+ logger.Error("error while funding, attempting to refund account") + refundMultiple(ctx, cc, sc, account[0:a], fundingAddr) + return fmt.Errorf("unable to fund account %d: %w", a, errr) + } + } + if fundAndExit { + return nil + } + } + + logger.Info("starting benchmark", "num_accounts", numAccounts) + startStatus, err := cc.GetStatus(ctx) + if err != nil { + if !skipFunding { + logger.Info("refunding money") + refundMultiple(ctx, cc, sc, account, fundingAddr) + logger.Info("money refunded") + } + return fmt.Errorf("unable to get status: %w", err) + } + benchmarkStartHeight := startStatus.LatestHeight + benchmarkStartT := time.Now() + + // Submit time is the time required to submit the transaction and + // wait for it to be included in a block. + var ( + totalSubmitTimeNs uint64 + numSubmitSamples uint64 + numSubmitErrors uint64 + ) + + // Perform benchmark in parallel, one goroutine per account. + doneCh := make(chan bool, numAccounts*numSamples) + for a := range account { + go func(idx uint64) { + var noCache bool + for s := uint64(0); s < numSamples; s++ { + fromIdx := idx + toIdx := idx + toAddr := account[toIdx].addr + + startT := time.Now() + if err = transfer(ctx, cc, &account[fromIdx], toAddr, 1, noCache); err != nil { + atomic.AddUint64(&numSubmitErrors, 1) + // Disable cache for the next sample, just in case + // we messed up the nonce or if the gas cost changed. + noCache = true + doneCh <- true + continue + } + atomic.AddUint64(&totalSubmitTimeNs, uint64(time.Since(startT).Nanoseconds())) + atomic.AddUint64(&numSubmitSamples, 1) + doneCh <- true + noCache = false + } + }(uint64(a)) + } + + // Wait for all goroutines to finish. + for i := uint64(0); i < numAccounts*numSamples; i++ { + <-doneCh + } + + benchmarkDuration := time.Since(benchmarkStartT) + stopStatus, err := cc.GetStatus(ctx) + if err != nil { + if !skipFunding { + logger.Info("refunding money") + refundMultiple(ctx, cc, sc, account, fundingAddr) + logger.Info("money refunded") + } + return fmt.Errorf("unable to get status: %w", err) + } + benchmarkStopHeight := stopStatus.LatestHeight + + // Go through all transactions from benchmarkStartHeight to + // benchmarkStopHeight and calculate the average number of + // transactions per second and other stats. + // Note that we count all transactions, not just the ones made + // by this benchmark. + var totalTxs uint64 + var maxTxs uint64 + minTxs := uint64(18446744073709551615) + txsPerBlock := make([]uint64, 0) + txBytesPerBlock := make([]uint64, 0) + blockDeltaT := make([]float64, 0) + var prevBlockT time.Time + for height := benchmarkStartHeight; height <= benchmarkStopHeight; height++ { + // Count number of transactions. + txs, grr := cc.GetTransactions(ctx, height) + if grr != nil { + logger.Error("GetTransactions failed", "err", grr, "height", height) + continue + } + lenTxs := uint64(len(txs)) + totalTxs += lenTxs + txsPerBlock = append(txsPerBlock, lenTxs) + if lenTxs > maxTxs { + maxTxs = lenTxs + } + if lenTxs < minTxs { + minTxs = lenTxs + } + + // Count size of transactions in bytes. + var blkSizeBytes uint64 + for _, tx := range txs { + blkSizeBytes += uint64(len(tx)) + } + txBytesPerBlock = append(txBytesPerBlock, blkSizeBytes) + + // Calculate time between blocks. 
+ blk, grr := cc.GetBlock(ctx, height) + if grr != nil { + logger.Error("GetBlock failed", "err", grr, "height", height) + continue + } + if prevBlockT.IsZero() { + prevBlockT = blk.Time + } + blockDeltaT = append(blockDeltaT, blk.Time.Sub(prevBlockT).Seconds()) + prevBlockT = blk.Time + } + + tps := float64(totalTxs) / benchmarkDuration.Seconds() + + // Calculate median number of transactions. + sort.Slice(txsPerBlock, func(i, j int) bool { return txsPerBlock[i] < txsPerBlock[j] }) + medianTxs := txsPerBlock[len(txsPerBlock)/2] + + avgSubmitTimeNs := float64(totalSubmitTimeNs) / float64(numSubmitSamples) + + logger.Info("benchmark finished", + // Number of accounts involved in benchmark (level of parallelism). + "num_accounts", numAccounts, + // Average time (in seconds) required to submit a transaction and wait + // for it to be included in a block. + "avg_submit_time_s", avgSubmitTimeNs/1.0e9, + // Transactions per second (this includes all transactions that + // appeared on the network during the time of the benchmark). + "transactions_per_second", tps, + // Number of successful SubmitTx calls (i.e. transfer transactions). + "submit_samples", numSubmitSamples, + // Number of unsuccessful SubmitTx calls. + "submit_errors", numSubmitErrors, + // Duration of the entire benchmark (in seconds). + "bench_duration_s", benchmarkDuration.Seconds(), + // Number of blocks seen on the network during the benchmark. + "num_blocks", len(txsPerBlock), + // Minimum number of transactions per block (during the benchmark). + "min_txs_per_block", minTxs, + // Maximum number of transactions per block (during the benchmark). + "max_txs_per_block", maxTxs, + // Average number of transactions per block (during the benchmark). + "avg_txs_per_block", float64(totalTxs)/float64(len(txsPerBlock)), + // Median number of transactions per block (during the benchmark). + "median_txs_per_block", medianTxs, + // Total number of transactions observed during the benchmark. + "total_txs", totalTxs, + // Number of transactions in each block (block size). + "block_sizes", strings.Trim(fmt.Sprint(txsPerBlock), "[]"), + // Size of all transactions in each block (in bytes). + "block_sizes_bytes", strings.Trim(fmt.Sprint(txBytesPerBlock), "[]"), + // Time delta between blocks (in seconds). + "block_delta_t_s", strings.Trim(fmt.Sprint(blockDeltaT), "[]"), + ) + + // Refund money into original funding account. + if !skipFunding { + logger.Info("refunding money") + refundMultiple(ctx, cc, sc, account, fundingAddr) + logger.Info("money refunded") + } + + return nil +} + +// Register registers the conbench sub-command. 
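The per-block statistics reported above (min, max, average, and median transactions per block) are computed over every block seen between the start and stop heights, not only the benchmark's own transactions. A small sketch of the same aggregation with arbitrary example counts:

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

func main() {
	txsPerBlock := []uint64{0, 12, 7, 25, 3} // example per-block counts

	var total, maxTxs uint64
	minTxs := uint64(math.MaxUint64) // same sentinel idea as the benchmark's minTxs
	for _, n := range txsPerBlock {
		total += n
		if n > maxTxs {
			maxTxs = n
		}
		if n < minTxs {
			minTxs = n
		}
	}

	// Median: sort and take the middle element (upper-middle for even counts).
	sort.Slice(txsPerBlock, func(i, j int) bool { return txsPerBlock[i] < txsPerBlock[j] })
	median := txsPerBlock[len(txsPerBlock)/2]

	fmt.Println(minTxs, maxTxs, float64(total)/float64(len(txsPerBlock)), median)
	// Output: 0 25 9.4 7
}
```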
+func RegisterConbenchCmd(parentCmd *cobra.Command) { + parentCmd.AddCommand(conbenchCmd) +} + +func init() { + fs := flag.NewFlagSet("", flag.ContinueOnError) + fs.Uint64(CfgNumAccounts, 10, "Number of accounts to create for benchmarking (also level of parallelism)") + fs.Uint64(CfgNumSamples, 30, "Number of samples (transfers) per account") + fs.Duration(CfgSubmitTxTimeout, 10*time.Second, "Timeout for SubmitTx (set this based on network parameters)") + fs.Bool(CfgUseTestEntity, false, "Use test entity for funding (only for testing)") + fs.Uint64(CfgGasPrice, 1, "Gas price (should be set to the minimum gas price of validators)") + fs.Bool(CfgFundAndExit, false, "Only fund accounts and exit") + fs.Bool(CfgRefundAndExit, false, "Only refund funding account and exit") + fs.Bool(CfgSkipFunding, false, "Skip funding accounts") + fs.String(CfgSeed, "consensus benchmark random seeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeed", "Seed to use for the DRBG (change this if you're running multiple benchmarks in parallel)") + _ = viper.BindPFlags(fs) + conbenchCmd.Flags().AddFlagSet(fs) + + conbenchCmd.Flags().AddFlagSet(cmdGrpc.ClientFlags) + conbenchCmd.Flags().AddFlagSet(cmdFlags.DebugTestEntityFlags) + conbenchCmd.Flags().AddFlagSet(cmdFlags.DebugDontBlameOasisFlag) + conbenchCmd.Flags().AddFlagSet(cmdSigner.CLIFlags) +} diff --git a/go/extra/conbench/cmd/root.go b/go/extra/conbench/cmd/root.go new file mode 100644 index 00000000000..299a7fa82a2 --- /dev/null +++ b/go/extra/conbench/cmd/root.go @@ -0,0 +1,53 @@ +// Root command for the consensus benchmark. +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" + flag "github.com/spf13/pflag" + "github.com/spf13/viper" + + "github.com/oasisprotocol/oasis-core/go/common/logging" + "github.com/oasisprotocol/oasis-core/go/common/version" + "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common" +) + +const cfgLogLevel = "log.level" + +var ( + rootCmd = &cobra.Command{ + Use: "conbench", + Short: "Consensus benchmark", + Version: version.SoftwareVersion, + } + + rootFlags = flag.NewFlagSet("", flag.ContinueOnError) +) + +// RootCommand returns the root (top level) cobra.Command. +func RootCommand() *cobra.Command { + return rootCmd +} + +// Execute spawns the main entry point after handling the command line arguments. +func Execute() { + var logLevel logging.Level + if err := logLevel.Set(viper.GetString(cfgLogLevel)); err != nil { + common.EarlyLogAndExit(fmt.Errorf("root: failed to set log level: %w", err)) + } + + if err := rootCmd.Execute(); err != nil { + common.EarlyLogAndExit(err) + } +} + +func init() { + logLevel := logging.LevelInfo + rootFlags.Var(&logLevel, cfgLogLevel, "log level") + _ = viper.BindPFlags(rootFlags) + rootCmd.PersistentFlags().AddFlagSet(rootFlags) + + // Register all of the sub-commands. + RegisterConbenchCmd(rootCmd) +} diff --git a/go/extra/conbench/conbench-plot.sh b/go/extra/conbench/conbench-plot.sh new file mode 100755 index 00000000000..11e1b2ab79a --- /dev/null +++ b/go/extra/conbench/conbench-plot.sh @@ -0,0 +1,312 @@ +#!/usr/bin/env bash +# +# Run consensus benchmark and plot results. 
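RegisterConbenchCmd and root.go above wire the benchmark into a small cobra command tree: the root command carries the persistent log flag, and the benchmark runs as a subcommand, so the binary is invoked as `conbench conbench [flags]`. A minimal stand-alone mirror of that wiring (RunE here is a stub, not the PR's doRun):

```go
package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

func main() {
	// Root command, as in root.go.
	rootCmd := &cobra.Command{Use: "conbench", Short: "Consensus benchmark"}

	// Benchmark subcommand, as registered by RegisterConbenchCmd.
	benchCmd := &cobra.Command{
		Use:   "conbench",
		Short: "benchmark consensus layer",
		RunE: func(cmd *cobra.Command, args []string) error {
			fmt.Println("benchmark would run here")
			return nil
		},
	}
	rootCmd.AddCommand(benchCmd)

	if err := rootCmd.Execute(); err != nil {
		fmt.Println(err)
	}
}
```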
+# +# Outputs are saved to the current working directory: +# conbench-data.TS.txt -- raw output with benchmark results from conbench +# conbench-tps.TS.png -- transactions per second +# conbench-avg-submit-time.TS.png -- average time required to submit txn +# conbench-both.TS.png -- both TPS and avg time on same graph +# conbench-block-size.TS.png -- block sizes (min/avg/max) +# conbench-block-sizes.TS.png -- 3D graph of block sizes per num of accounts +# conbench-block-sizes-bytes.TS.png -- 3D graph of block sizes in bytes +# conbench-block-times.TS.png -- 3D graph of time between blocks per #accts +# conbench-N.prof -- if profiling enabled, profile output for N accounts +# conbench-N.block.prof -- if profiling enabled, blocking profile output +# conbench-N.mutex.prof -- if profiling enabled, mutex contention profile +# +# The TS in filenames above represents the timestamp when the script was run. +# +# Most interesting output files are probably conbench-both.TS.png, +# conbench-block-size.TS.png, and conbench-block-sizes.TS.png. +# +# The environment variable OASIS_NODE_GRPC_ADDR should be set to the node's +# GRPC address, e.g. "unix:/tmp/foo/net-runner/network/client-0/internal.sock". +# +# If you want to run this script with a net runner network: +# conbench-plot.sh --use_test_entity +# Alternatively, you can run the "conbench-test.sh" script, which also sets up +# a default net runner network for you. +# +# If you want to run this script on a real network: +# conbench-plot.sh --signer.dir /path/to/your/entity/files/dir +# +# If you want to profile runs, edit the PROF variable below and make sure the +# node is run with the additional argument `--pprof.bind 127.0.0.1:10101`. +# If using the test runner and the conbench-test.sh, add the argument to the +# list in consensusValidator() in go/oasis-test-runner/oasis/args.go. +# + +set -o errexit -o nounset -o pipefail +trap "exit 1" INT + +# Output file names. +NOW=`date +%Y%m%d-%H%M%S` +RAW_DATA="conbench-data.${NOW}.txt" +TPS_PLOT="conbench-tps.${NOW}.png" +ST_PLOT="conbench-avg-submit-time.${NOW}.png" +BOTH_PLOT="conbench-both.${NOW}.png" +BS_PLOT="conbench-block-size.${NOW}.png" +BSS_PLOT="conbench-block-sizes.${NOW}.png" +BSSB_PLOT="conbench-block-sizes-bytes.${NOW}.png" +BTS_PLOT="conbench-block-times.${NOW}.png" + +# Get the root directory of the repository. +ROOT="$(cd $(dirname $0)/../../../; pwd -P)" + +# ANSI escape codes to brighten up the output. +RED=$'\e[31;1m' +GRN=$'\e[32;1m' +OFF=$'\e[0m' + + +CONBENCH="${ROOT}/go/extra/conbench/conbench" + +# Check if we have all the tools we need. +if [[ "$(which gnuplot)" == "" ]]; then + printf "${RED}ERROR: gnuplot not installed. Install it and try again.${OFF}\n" + exit 1 +fi +if [[ ! -x "${CONBENCH}" ]]; then + printf "${RED}ERROR: conbench command isn't built. 
Run 'make' in '${ROOT}/go' and try again.${OFF}\n" +fi + +TPS_DATA_FILE="$(mktemp -t oasis-conbench-tps-plot-XXXXXXXXXX)" +ST_DATA_FILE="$(mktemp -t oasis-conbench-st-plot-XXXXXXXXXX)" +BS_DATA_FILE="$(mktemp -t oasis-conbench-bs-plot-XXXXXXXXXX)" +BSS_DATA_FILE="$(mktemp -t oasis-conbench-bss-plot-XXXXXXXXXX)" +BSSB_DATA_FILE="$(mktemp -t oasis-conbench-bssb-plot-XXXXXXXXXX)" +BTS_DATA_FILE="$(mktemp -t oasis-conbench-bts-plot-XXXXXXXXXX)" + +ARGS="$@" + +rm -f "${RAW_DATA}" +touch "${RAW_DATA}" + +conbench() { + ${CONBENCH} conbench conbench \ + --address ${OASIS_NODE_GRPC_ADDR} \ + ${ARGS} \ + --log.level INFO \ + --num_samples 30 \ + $@ +} + +run_bench() { + local num_accounts=$1 + local no_plot=$2 + local output="$(mktemp -t oasis-conbench-output-${num_accounts}-XXXXXXXXXX)" + + # Run benchmark. + printf "${GRN}*** Running benchmark for ${num_accounts} accounts...${OFF}\n" + conbench --skip_funding --num_accounts ${num_accounts} > "${output}" + + local results=$(fgrep 'msg="benchmark finished"' "${output}") + echo "${results}" | tee -a "${RAW_DATA}" + + local tps=$(echo "${results}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="transactions_per_second"{print $2}') + + local st=$(echo "${results}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="avg_submit_time_s"{print $2}') + + local min_bs=$(echo "${results}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="min_txs_per_block"{print $2}') + local avg_bs=$(echo "${results}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="avg_txs_per_block"{print $2}') + local max_bs=$(echo "${results}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="max_txs_per_block"{print $2}') + + local bss=$(echo "${results}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="block_sizes"{print $2}' | tr -d '"') + + local bssb=$(echo "${results}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="block_sizes_bytes"{print $2}' | tr -d '"') + + local bts=$(echo "${results}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="block_delta_t_s"{print $2}' | tr -d '"') + + rm "${output}" + + if [[ "${no_plot}" == "no_plot" ]]; then + return + fi + + echo "${num_accounts} ${tps}" >> "${TPS_DATA_FILE}" + echo "${num_accounts} ${st}" >> "${ST_DATA_FILE}" + echo "${num_accounts} ${min_bs} ${avg_bs} ${max_bs}" >> "${BS_DATA_FILE}" + + local blk=0 + for bs in ${bss} + do + echo "${num_accounts} ${blk} ${bs}" >> "${BSS_DATA_FILE}" + blk=$((blk+1)) + done + + blk=0 + for bsb in ${bssb} + do + echo "${num_accounts} ${blk} ${bsb}" >> "${BSSB_DATA_FILE}" + blk=$((blk+1)) + done + + blk=0 + for bt in ${bts} + do + echo "${num_accounts} ${blk} ${bt}" >> "${BTS_DATA_FILE}" + blk=$((blk+1)) + done +} + +ACCT="10, 50, 100, 175, 250, 325, 425, 500, 650, 800, 900" + +ACCTN=$(echo "${ACCT}" | tr -d ',') +MAX_ACCTS=$(echo "${ACCTN}" | tr ' ' '\n' | sort -nr | head -1) +NUM_ACCT_RUNS=$(echo "${ACCTN}" | wc -w) + +# Set this to a list of runs you wish to profile (e.g. "175 500 800") +# or to the string "no" to disable. +PROF="no" +if [[ "${PROF}" != "no" ]]; then + NUM_PROF_RUNS=$(echo "${PROF}" | wc -w) +else + NUM_PROF_RUNS=0 +fi + +NUM_RUNS=$(( NUM_ACCT_RUNS + NUM_PROF_RUNS )) + +printf "${GRN}*** Funding ${MAX_ACCTS} accounts for ${NUM_RUNS} runs...${OFF}\n" +# The gas price is artificially inflated to provide enough tokens to pay fees +# for all the runs. 
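run_bench above pulls individual fields out of the `msg="benchmark finished"` logfmt line with sed/awk. A rough Go equivalent of that extraction is sketched below; it is deliberately simplified to scalar key=value fields, so the quoted list fields (block_sizes and friends) would still need a proper logfmt parser. The log line shown is an invented example.

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	line := `level=info msg="benchmark finished" num_accounts=10 ` +
		`transactions_per_second=182.4 avg_submit_time_s=0.71` // example values

	// Split on whitespace and collect key=value pairs.
	fields := map[string]string{}
	for _, tok := range strings.Fields(line) {
		if i := strings.IndexByte(tok, '='); i > 0 {
			fields[tok[:i]] = strings.Trim(tok[i+1:], `"`)
		}
	}

	fmt.Println(fields["transactions_per_second"]) // 182.4
	fmt.Println(fields["avg_submit_time_s"])       // 0.71
}
```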
+conbench --num_accounts ${MAX_ACCTS} --gas_price ${NUM_RUNS} --fund_and_exit + +for a in ${ACCTN} +do + run_bench $a plot + if grep -Fow "$a" <<< "${PROF}"; then + # Go's pprof server doesn't seem to have a way to start/stop profiling, + # it can only do a N second run, which is unfortunate. + # + # To work around this, we do a normal run first, then parse how many + # seconds it needed, round it up to the nearest integer, and finally + # do a second run, profiling for that number of seconds. Ugh. + + seconds=$(fgrep 'msg="benchmark finished" num_accounts='$a "${RAW_DATA}" | sed -r 's/[[:alnum:]_]+=/\n&/g' | awk -F= '$1=="bench_duration_s"{print $2}' | awk '{print int($1+0.5)}') + + printf "${GRN}*** Re-running benchmark for ${a} accounts with profiling (${seconds} s)...${OFF}\n" + + curl -so conbench-$a.prof 'http://127.0.0.1:10101/debug/pprof/profile?seconds='${seconds} & + run_bench $a no_plot + curl -so conbench-$a.block.prof 'http://127.0.0.1:10101/debug/pprof/block' + go tool pprof -png -lines -noinlines conbench-$a.block.prof > conbench-$a.block.png + curl -so conbench-$a.mutex.prof 'http://127.0.0.1:10101/debug/pprof/mutex' + go tool pprof -png -lines -noinlines conbench-$a.mutex.prof > conbench-$a.mutex.png + fi +done + + +# Plot TPS graph. +gnuplot <<- EOF +set title "Transactions per second" +set xlabel "Number of parallel accounts" +set xtics (${ACCT}) +set ylabel "transactions/s" textcolor lt 1 +set autoscale y +set grid +set term png +set output "${TPS_PLOT}" +plot '${TPS_DATA_FILE}' using 1:2 with linespoint notitle +EOF + +# Plot avg submit time graph. +gnuplot <<- EOF +set title "Average SubmitTx time" +set xlabel "Number of parallel accounts" +set xtics (${ACCT}) +set ylabel "SubmitTx time [s]" textcolor lt 1 +set autoscale y +set grid +set term png +set output "${ST_PLOT}" +plot '${ST_DATA_FILE}' using 1:2 with linespoint notitle +EOF + +# Plot both on a single graph. +gnuplot <<- EOF +set title "Transactions per second and average SubmitTx time" +set xlabel "Number of parallel accounts" +set xtics (${ACCT}) +set ylabel "transactions/s" textcolor lt 1 +set y2label "SubmitTx time [s]" textcolor lt 2 +set y2tics nomirror +set autoscale y +set autoscale y2 +set grid +set term png +set output "${BOTH_PLOT}" +plot '${TPS_DATA_FILE}' using 1:2 axes x1y1 with linespoint notitle, '${ST_DATA_FILE}' using 1:2 axes x1y2 with linespoint notitle +EOF + +# Plot block sizes (min/avg/max). +gnuplot <<- EOF +set title "Block size" +set xlabel "Number of parallel accounts" +set xtics (${ACCT}) +set ylabel "Block size [number of transactions]" +set autoscale y +set grid +set key left top +set term png +set output "${BS_PLOT}" +plot '${BS_DATA_FILE}' using 1:2 with linespoint title "min", '${BS_DATA_FILE}' using 1:3 with linespoint title "avg", '${BS_DATA_FILE}' using 1:4 with linespoint title "max" +EOF + +# Plot block sizes (number of transactions per block per number of accounts). +gnuplot <<- EOF +set title "Block size" +set xlabel "Number of parallel accounts" rotate parallel offset 0,-2,0 +set xtics (${ACCT}) offset 0,-1,0 +set ylabel "Block number" rotate parallel offset 0,-1,0 +set zlabel "Block size [number of transactions]" rotate parallel offset 1,0,0 +set ticslevel 0 +set autoscale y +set autoscale z +set grid +set term png +set output "${BSS_PLOT}" +set palette defined (0 "red", 1 "yellow", 2 "cyan", 3 "blue", 4 "dark-violet") +splot '${BSS_DATA_FILE}' with impulses lw 2 lc palette notitle +EOF + +# Plot block sizes in bytes. 
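The curl calls in the profiling branch above hit a node-side pprof HTTP endpoint, which the script's header says is enabled with `--pprof.bind 127.0.0.1:10101` and which the pprof.go change at the end of this diff extends with block and mutex profiling. The sketch below shows, in isolation, what such an endpoint looks like; it is an illustration, not the node's actual pprof service (which uses its own mux).

```go
package main

import (
	"log"
	"net/http"
	_ "net/http/pprof" // registers /debug/pprof/* handlers on the default mux
	"runtime"
)

func main() {
	runtime.SetBlockProfileRate(1)     // record every blocking event
	runtime.SetMutexProfileFraction(1) // record every mutex contention event

	// With this running, the script's curl commands against
	// /debug/pprof/profile, /debug/pprof/block and /debug/pprof/mutex return data.
	log.Fatal(http.ListenAndServe("127.0.0.1:10101", nil))
}
```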
+gnuplot <<- EOF +set title "Block size in bytes" +set xlabel "Number of parallel accounts" rotate parallel offset 0,-2,0 +set xtics (${ACCT}) offset 0,-1,0 +set ylabel "Block number" rotate parallel offset 0,-1,0 +set zlabel "Block size [bytes]" rotate parallel +set ticslevel 0 +set autoscale y +set autoscale z +set grid +set term png +set output "${BSSB_PLOT}" +set palette defined (0 "red", 1 "yellow", 2 "cyan", 3 "blue", 4 "dark-violet") +splot '${BSSB_DATA_FILE}' with impulses lw 2 lc palette notitle +EOF + +# Plot time between blocks. +gnuplot <<- EOF +set title "Time between blocks" +set xlabel "Number of parallel accounts" rotate parallel offset 0,-2,0 +set xtics (${ACCT}) offset 0,-1,0 +set ylabel "Block number" rotate parallel offset 0,-1,0 +set zlabel "Time delta to previous block [s]" rotate parallel offset 1,0,0 +set ticslevel 0 +set autoscale y +set autoscale z +set grid +set term png +set output "${BTS_PLOT}" +set palette defined (0 "dark-violet", 1 "blue", 2 "cyan", 3 "yellow", 4 "red") +splot '${BTS_DATA_FILE}' with impulses lw 2 lc palette notitle +EOF + +rm "${TPS_DATA_FILE}" "${ST_DATA_FILE}" "${BS_DATA_FILE}" "${BSS_DATA_FILE}" "${BSSB_DATA_FILE}" "${BTS_DATA_FILE}" + +printf "${GRN}*** Refunding original funding account...${OFF}\n" +conbench --num_accounts ${MAX_ACCTS} --refund_and_exit + +printf "${GRN}*** Benchmarks completed.${OFF}\n" diff --git a/go/extra/conbench/conbench-test.sh b/go/extra/conbench/conbench-test.sh new file mode 100755 index 00000000000..270bdffced9 --- /dev/null +++ b/go/extra/conbench/conbench-test.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# +# Set-up the default oasis-net-runner network and run conbench-plot on it. +# +# You might want to set the cpufreq governor to 'performance' before running +# any benchmarks: +# +# sudo sh -c 'for cpu in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do echo "performance" > $cpu; done' +# ./conbench-test.sh +# sudo sh -c 'for cpu in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do echo "powersave" > $cpu; done' +# + +set -o nounset -o pipefail -o errexit +trap "exit 1" INT + +# Get the root directory of the repository. +ROOT="$(cd $(dirname $0)/../../../; pwd -P)" + +# ANSI escape codes to brighten up the output. +GRN=$'\e[32;1m' +OFF=$'\e[0m' + +# Paths to various binaries and config files that we need. +OASIS_NET_RUNNER="${ROOT}/go/oasis-net-runner/oasis-net-runner" +OASIS_NODE="${ROOT}/go/oasis-node/oasis-node" + +# Kill all dangling processes on exit. +cleanup() { + printf "${OFF}" + pkill -P $$ || true + wait || true +} +trap "cleanup" EXIT + +# The base directory for all the node and test env cruft. +# Note: We don't make this under /tmp to prevent running out of RAM. +# Note2: There's a dumb limit to the path length for the UNIX socket, so run +# this script somewhere close to the root of the filesystem. +TEST_BASE_DIR=$(cd `mktemp -p . -d oasis-conbench-XXXXXXXXXX`; pwd -P) + +# The oasis-node binary must be in the path for the oasis-net-runner to find it. +export PATH="${PATH}:${ROOT}/go/oasis-node" + +# Make sure the open file limit is big enough. 
+ulimit -n 10240 + +printf "${GRN}### Starting the test network...${OFF}\n" +${OASIS_NET_RUNNER} \ + --fixture.default.setup_runtimes=false \ + --fixture.default.num_entities=1 \ + --fixture.default.disable_supplementary_sanity_checks=true \ + --fixture.default.timeout_commit=1ms \ + --basedir.no_temp_dir \ + --basedir "${TEST_BASE_DIR}" & + +export OASIS_NODE_GRPC_ADDR="unix:${TEST_BASE_DIR}/net-runner/network/validator-0/internal.sock" + +printf "${GRN}### Waiting for all nodes to register...${OFF}\n" +${OASIS_NODE} debug control wait-nodes \ + --address ${OASIS_NODE_GRPC_ADDR} \ + --nodes 1 \ + --wait + +printf "${GRN}### Running benchmark...${OFF}\n" +${ROOT}/go/extra/conbench/conbench-plot.sh --use_test_entity + +# Clean up after a successful run. +cleanup +rm -rf "${TEST_BASE_DIR}" + +printf "${GRN}### Tests finished.${OFF}\n" diff --git a/go/extra/conbench/main.go b/go/extra/conbench/main.go new file mode 100644 index 00000000000..56404c82b37 --- /dev/null +++ b/go/extra/conbench/main.go @@ -0,0 +1,10 @@ +// Consensus benchmark. +package main + +import ( + "github.com/oasisprotocol/oasis-core/go/extra/conbench/cmd" +) + +func main() { + cmd.Execute() +} diff --git a/go/oasis-net-runner/fixtures/default.go b/go/oasis-net-runner/fixtures/default.go index 3764dc00564..8eab70aaef7 100644 --- a/go/oasis-net-runner/fixtures/default.go +++ b/go/oasis-net-runner/fixtures/default.go @@ -28,6 +28,8 @@ const ( cfgRuntimeLoader = "fixture.default.runtime.loader" cfgSetupRuntimes = "fixture.default.setup_runtimes" cfgTEEHardware = "fixture.default.tee_hardware" + cfgDisableSupSanityChecks = "fixture.default.disable_supplementary_sanity_checks" + cfgTimeoutCommit = "fixture.default.timeout_commit" ) var ( @@ -57,7 +59,7 @@ func newDefaultFixture() (*oasis.NetworkFixture, error) { RuntimeSGXLoaderBinary: viper.GetString(cfgRuntimeLoader), Consensus: consensusGenesis.Genesis{ Parameters: consensusGenesis.Parameters{ - TimeoutCommit: 1 * time.Second, + TimeoutCommit: viper.GetDuration(cfgTimeoutCommit), }, }, EpochtimeMock: viper.GetBool(cfgEpochtimeMock), @@ -73,7 +75,7 @@ func newDefaultFixture() (*oasis.NetworkFixture, error) { {IsDebugTestEntity: true}, }, Validators: []oasis.ValidatorFixture{ - {Entity: 1}, + {Entity: 1, Consensus: oasis.ConsensusFixture{DisableSupplementarySanityChecks: viper.GetBool(cfgDisableSupSanityChecks)}}, }, } @@ -151,6 +153,7 @@ func init() { DefaultFixtureFlags.Bool(cfgFundEntities, false, "fund all entities in genesis") DefaultFixtureFlags.Bool(cfgEpochtimeMock, false, "use mock epochtime") DefaultFixtureFlags.Bool(cfgSetupRuntimes, true, "initialize the network with runtimes and runtime nodes") + DefaultFixtureFlags.Bool(cfgDisableSupSanityChecks, false, "disable supplementary sanity checks") DefaultFixtureFlags.Int(cfgNumEntities, 1, "number of (non debug) entities in genesis") DefaultFixtureFlags.String(cfgKeymanagerBinary, "simple-keymanager", "path to the keymanager runtime") DefaultFixtureFlags.String(cfgNodeBinary, "oasis-node", "path to the oasis-node binary") @@ -159,6 +162,7 @@ func init() { DefaultFixtureFlags.String(cfgRuntimeLoader, "oasis-core-runtime-loader", "path to the runtime loader") DefaultFixtureFlags.String(cfgTEEHardware, "", "TEE hardware to use") DefaultFixtureFlags.Uint64(cfgHaltEpoch, math.MaxUint64, "halt epoch height") + DefaultFixtureFlags.Duration(cfgTimeoutCommit, 1*time.Second, "consensus timeout commit parameter") _ = viper.BindPFlags(DefaultFixtureFlags) diff --git a/go/oasis-node/cmd/common/pprof/pprof.go 
b/go/oasis-node/cmd/common/pprof/pprof.go index b863a7c0cbe..b2e9af85bb6 100644 --- a/go/oasis-node/cmd/common/pprof/pprof.go +++ b/go/oasis-node/cmd/common/pprof/pprof.go @@ -71,6 +71,9 @@ func (p *pprofService) Start() error { return err } + runtime.SetBlockProfileRate(1) + runtime.SetMutexProfileFraction(1) + // Create a new mux just for the pprof endpoints to avoid using the // global multiplexer where pprof's init function registers by default. mux := http.NewServeMux() diff --git a/go/oasis-test-runner/oasis/entity.go b/go/oasis-test-runner/oasis/entity.go index f610e75ac8d..f8ddff7416c 100644 --- a/go/oasis-test-runner/oasis/entity.go +++ b/go/oasis-test-runner/oasis/entity.go @@ -38,8 +38,8 @@ type Entity struct { // EntityCfg is the Oasis entity provisioning configuration. type EntityCfg struct { - IsDebugTestEntity bool - Restore bool + IsDebugTestEntity bool `json:"is_debug_test_entity"` + Restore bool `json:"restore"` } // Inner returns the actual Oasis entity and it's signer. diff --git a/go/oasis-test-runner/oasis/fixture.go b/go/oasis-test-runner/oasis/fixture.go index 4d7285b4e15..a8dfe5e94ed 100644 --- a/go/oasis-test-runner/oasis/fixture.go +++ b/go/oasis-test-runner/oasis/fixture.go @@ -136,6 +136,9 @@ type ConsensusFixture struct { // nolint: maligned // EnableConsensusRPCWorker enables the public consensus RPC services worker. EnableConsensusRPCWorker bool `json:"enable_consensusrpc_worker,omitempty"` + + // DisableSupplementarySanityChecks disables the supplementary sanity checks. + DisableSupplementarySanityChecks bool `json:"disable_supplementary_sanity_checks,omitempty"` } // TEEFixture is a TEE configuration fixture. diff --git a/go/oasis-test-runner/oasis/validator.go b/go/oasis-test-runner/oasis/validator.go index 67cdcee4e51..953213fc0fe 100644 --- a/go/oasis-test-runner/oasis/validator.go +++ b/go/oasis-test-runner/oasis/validator.go @@ -104,8 +104,10 @@ func (val *Validator) startNode() error { workerConsensusRPCEnabled() } - if len(val.net.validators) >= 1 && val == val.net.validators[0] { - args = args.tendermintSupplementarySanityEnabled() + if !val.consensus.DisableSupplementarySanityChecks { + if len(val.net.validators) >= 1 && val == val.net.validators[0] { + args = args.tendermintSupplementarySanityEnabled() + } } if err := val.net.startOasisNode(&val.Node, nil, args); err != nil {
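The entity.go hunk above adds explicit JSON tags to EntityCfg so fixture serialization uses snake_case keys. A tiny stand-alone illustration of what those tags produce; the struct here is a local mirror for demonstration, not the oasis-test-runner type itself.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Local mirror of EntityCfg with the tags added in the diff above.
type entityCfg struct {
	IsDebugTestEntity bool `json:"is_debug_test_entity"`
	Restore           bool `json:"restore"`
}

func main() {
	b, _ := json.Marshal(entityCfg{IsDebugTestEntity: true})
	fmt.Println(string(b)) // {"is_debug_test_entity":true,"restore":false}
}
```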