Add metric mo_log_message_count and check the count value in CI (matrixorigin#11023)

Printing too many log messages makes it hard to analyze what is going on.
- Add a new metric `mo_log_message_count` that shows the number of log messages output per node, per minute.
- Add a `Check Log Messages Count per second` CI step that pre-checks that the per-second value stays below a threshold (default: 1000).

modification:
- metric
1. add a new metric `mo_log_message_count` in new file `pkg/util/metric/m_mo.go`
2. add a new `InternalCollectors` array to hold all the internal metrics, i.e. those belonging to `SubSystemMO`
3. add a new `internalRegistry` and `internalExporter` to handle internal metrics with a different gather interval (default: 1 minute)
4. add a new config item `metric-internal-gather-interval` to control the internal metrics' gather interval
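For illustration, a hedged sketch of how the new knob might look in a TOML config file. The key names come from the diff below; the section header and the duration syntax are assumptions, not something this commit shows:

```toml
# observability settings (section name is an assumption, not from this diff)
[observability]
metric-gather-interval = 15              # ordinary metrics, gathered every 15 seconds
metric-internal-gather-interval = "1m"   # new: SubSystemMO metrics, default 1 minute
```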

- CI
1. Add `optools/check_log_count.sh`, which checks the metric value. It will:
   1. try to access mo-service; if that fails, exit with exit code 0.
   2. count log messages per second from the metric `mo_log_message_count`.
   3. show all cases where log_message_count > threshold; if the result is not empty, exit with exit code 1.
2. Add a `Check Log Messages Count per second` step for the standalone case with threshold **#nodes * 1000**.
3. Add a `Check Log Messages Count per second` step for the docker-compose case with threshold **1000**.
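To make the threshold arithmetic concrete, here is a small sketch (illustrative numbers only, not from an actual run) of the conversion the CI query performs: a raw `mo_log_message_count` sample counts messages per gather interval, so dividing by the interval gives messages per second, which is then compared against the threshold:

```shell
value=6167      # illustrative raw sample: log messages in one gather interval
interval=60     # gather interval in seconds
threshold=100   # per-second threshold under test

# messages per second, rounded to 2 decimals like the query's DECIMAL(38,2) cast
cnt_per_second=$(awk -v v="$value" -v i="$interval" 'BEGIN { printf "%.2f", v / i }')
echo "cnt_per_second=$cnt_per_second"   # prints cnt_per_second=102.78

# awk's exit status encodes the comparison: 0 (success) when over threshold
if awk -v c="$cnt_per_second" -v t="$threshold" 'BEGIN { exit !(c > t) }'; then
    echo "over threshold"
else
    echo "ok"
fi
```

Against the default threshold of 1000 the same sample would pass, which is why only truly noisy logging trips the check.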

Examples of `optools/check_log_count.sh` execution results:
- help info
```
# ./check_log_count.sh -h
Usage: ./check_log_count.sh [cnt_threshold [metric_interval]]
like:  ./check_log_count.sh
or   ./check_log_count.sh 1000
or   ./check_log_count.sh 1000 60

options
cnt_threshold   - int, log messages per second threshold
default: 1000
metric_interval - int, metric collected interval
default: 60
```
- Case: check ok.
```
# sh check_log_count.sh 1000 60
[2023-08-04 11:01:38] This script is inspired by matrixorigin#9835
[2023-08-04 11:01:38] arg count_threshold : 1000
[2023-08-04 11:01:38] arg metric_interval : 60
[2023-08-04 11:01:38] out_log_count file  : /tmp/log_count.pid63053.ts1691118098
[2023-08-04 11:01:38] try access mo 1 times.
[2023-08-04 11:01:38] successfully accessed mo-service through port 6001.
[2023-08-04 11:01:38] Query: select * from
(select collecttime, cast( value / 60 as DECIMAL(38,2)) as cnt_per_second, node, role, type as level from system_metrics.metric
where metric_name = 'mo_log_message_count' and type not in ('debug')) a
where a.cnt_per_second > 1000
order by collecttime
mysql: [Warning] Using a password on the command line interface can be insecure.
[2023-08-04 11:01:38] log messages spitting out per second < threshold(val: 1000): OK!
# echo $?
0
```

- Case: log_message_count per second > threshold
```
# sh check_log_count.sh 100 60
[2023-08-04 10:59:32] This script is inspired by matrixorigin#9835
[2023-08-04 10:59:32] arg count_threshold : 100
[2023-08-04 10:59:32] arg metric_interval : 60
[2023-08-04 10:59:32] out_log_count file  : /tmp/log_count.pid62999.ts1691117972
[2023-08-04 10:59:32] try access mo 1 times.
[2023-08-04 10:59:32] successfully accessed mo-service through port 6001.
[2023-08-04 10:59:32] Query: select * from
(select collecttime, cast( value / 60 as DECIMAL(38,2)) as cnt_per_second, node, role, type as level from system_metrics.metric
where metric_name = 'mo_log_message_count' and type not in ('debug')) a
where a.cnt_per_second > 100
order by collecttime
mysql: [Warning] Using a password on the command line interface can be insecure.
[2023-08-04 10:59:32] log messages spitting out per second > threshold(val: 100)
[2023-08-04 10:59:32] each row shows the last 60 secs status
[2023-08-04 10:59:32]
collecttime	cnt_per_second	node	role	level
2023-08-03 14:42:24.975355	102.78	7c4dccb4-4d3c-41f8-b482-5251dc7a41bf	ALL	error
2023-08-03 16:04:08.580857	129.03	7c4dccb4-4d3c-41f8-b482-5251dc7a41bf	ALL	info
2023-08-03 21:03:39.621690	152.13	7c4dccb4-4d3c-41f8-b482-5251dc7a41bf	ALL	info

[2023-08-04 10:59:32] log messages spitting out per second > threshold(val: 100): NOT ok!!!
# echo $?
1
```
- Case: access to mo-service failed (after 3 retries)
```
# sh check_log_count.sh 1000 60
[2023-08-04 11:05:23] This script is inspired by matrixorigin#9835
[2023-08-04 11:05:23] arg count_threshold : 1000
[2023-08-04 11:05:23] arg metric_interval : 60
[2023-08-04 11:05:23] out_log_count file  : /tmp/log_count.pid63736.ts1691118323
[2023-08-04 11:05:23] Try to access mo 1 times.
[2023-08-04 11:05:24] Try to access mo 2 times.
[2023-08-04 11:05:25] Try to access mo 3 times.
[2023-08-04 11:05:26] warning: failed to access mo-service through port 6001.
# echo $?
0
```

Approved by: @daviszhen, @aptend, @sukki37
xzxiong authored Aug 4, 2023
1 parent 4c28b29 commit 1ac255f
Showing 10 changed files with 318 additions and 18 deletions.
12 changes: 11 additions & 1 deletion .github/workflows/e2e-compose.yml
@@ -68,6 +68,10 @@ jobs:
curl http://localhost:12345/debug/pprof/goroutine\?debug=2 -o ${{ github.workspace }}/docker-compose-log/cn-0-dump-stacks.log
curl http://localhost:22345/debug/pprof/goroutine\?debug=2 -o ${{ github.workspace }}/docker-compose-log/cn-1-dump-stacks.log
- name: Check Log Messages Count per second
if: ${{ always() && !cancelled() }}
run: |
./optools/check_log_count.sh 1000 60 # {count threshold} {metric collected interval}
- name: shutdown containers
if: ${{ always() }}
@@ -146,6 +150,12 @@ jobs:
mv ${{ github.workspace }}/mo-tester/report ${{ github.workspace }}/docker-compose-log
curl http://localhost:12345/debug/pprof/goroutine\?debug=2 -o ${{ github.workspace }}/docker-compose-log/cn-0-dump-stacks.log
curl http://localhost:22345/debug/pprof/goroutine\?debug=2 -o ${{ github.workspace }}/docker-compose-log/cn-1-dump-stacks.log
- name: Check Log Messages Count per second
if: ${{ always() && !cancelled() }}
run: |
./optools/check_log_count.sh 1000 60 # {count threshold} {metric collected interval}
- name: shutdown containers
if: ${{ always() }}
run: |
@@ -160,4 +170,4 @@ jobs:
name: multi-cn-e2e-push-docker-compose-log
path: |
${{ github.workspace }}/docker-compose-log
retention-days: 7
retention-days: 7
21 changes: 20 additions & 1 deletion .github/workflows/e2e-standalone.yml
@@ -89,6 +89,12 @@ jobs:
if: ${{ always() && !cancelled() }}
run: |
if [ "$(ps -ef | grep 'mo-service' | grep -v "grep" | wc -l)" -gt 0 ]; then curl http://localhost:12345/debug/pprof/goroutine\?debug=2 -o ${{ github.workspace }}/head/restarted-dump-stacks.log; pkill -9 mo-service; else echo 'current mo-service has already crashed'; exit 1; fi
- name: Check Log Messages Count per second
if: ${{ always() && !cancelled() }}
run: |
cd $GITHUB_WORKSPACE/head
# 4 nodes in one Process
./optools/check_log_count.sh 4000 60 # {count threshold} {metric collected interval}
- name: generate upload files
if: ${{ always() }}
continue-on-error: true
@@ -160,6 +166,12 @@ jobs:
if: ${{ always() && !cancelled() }}
run: |
if [ "$(ps -ef | grep 'mo-service' | grep -v "grep" | wc -l)" -gt 0 ]; then curl http://localhost:12345/debug/pprof/goroutine\?debug=2 -o ${{ github.workspace }}/head/dump-stacks.log; else echo 'current mo-service has already crashed'; exit 1; fi
- name: Check Log Messages Count per second
if: ${{ always() && !cancelled() }}
run: |
cd $GITHUB_WORKSPACE/head
# 5 nodes in one Process
./optools/check_log_count.sh 5000 60 # {count threshold} {metric collected interval}
- name: generate upload files
if: ${{ always() }}
continue-on-error: true
@@ -378,6 +390,13 @@ jobs:
run: |
if [ "$(ps -ef | grep 'mo-service' | grep -v "grep" | wc -l)" -gt 0 ]; then curl http://localhost:12345/debug/pprof/goroutine\?debug=2 -o ${{ github.workspace }}/head/dump-stacks.log; else echo 'current mo-service has already crashed'; exit 1; fi
- name: Check Log Messages Count per second
if: ${{ always() && !cancelled() }}
run: |
cd $GITHUB_WORKSPACE/head
# 4 nodes in one Process
./optools/check_log_count.sh 4000 60 # {count threshold} {metric collected interval}
- name: Check mo-service Status
if: ${{ always() && !cancelled() }}
run: |
@@ -401,4 +420,4 @@ jobs:
name: 1-cn-e2e-linux-x86-reports
path: |
${{ github.workspace }}/upload
retention-days: 7
retention-days: 7
144 changes: 144 additions & 0 deletions optools/check_log_count.sh
@@ -0,0 +1,144 @@
#!/bin/bash

# Copyright 2023 Matrix Origin
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o nounset

## const
###############
pid=$$
ts=`date +%s`
out_log_count="/tmp/log_count.pid${pid}.ts${ts}"

max_try_conn=3
mod='mysql -h 127.0.0.1 -P 6001 -udump -p111 system -A'
metric_interval=60
count_threshold=1000

## function
###############

echo_proxy() {
    echo "[`date '+%F %T'`] $@"
}

show_env() {
    echo_proxy "This script is inspired by #9835"
    echo_proxy "arg count_threshold : $count_threshold"
    echo_proxy "arg metric_interval : $metric_interval"
    echo_proxy "out_log_count file  : $out_log_count"
}

check_mo_service_alive() {
    ## Action: try to access mo {max_try_conn} times.
    ## If all attempts fail, exit 0 with note: "failed to access mo-service ..."
    local ret=0
    for idx in `seq 1 $max_try_conn`;
    do
        echo_proxy "Try to access mo $idx times."
        $mod -Nse "select 1;" 1>/dev/null 2>&1
        ret=$?
        if [ $ret -eq 0 ]; then
            break
        fi
        # sleep 1 sec before retrying
        sleep 1
    done
    if [ $ret -ne 0 ]; then
        echo_proxy "warning: failed to access mo-service through port 6001."
        exit 0
    fi
    echo_proxy "successfully accessed mo-service through port 6001."
}

get_log_count() {
    # count log messages per second (excluding level=debug records)
    #
    ### table system_metrics.metric example:
    #   metric_name           collecttime                 value  node                                  role  account  type
    #   mo_log_message_count  2023-08-03 15:08:08.591955  77     7c4dccb4-4d3c-41f8-b482-5251dc7a41bf  ALL   sys      error
    #   mo_log_message_count  2023-08-03 15:08:08.591955  37     7c4dccb4-4d3c-41f8-b482-5251dc7a41bf  ALL   sys      info
    #
    ### calculation result example
    #   collecttime                 cnt_per_second  node                                  role  level
    #   2023-08-03 14:37:24.977181  35.78           7c4dccb4-4d3c-41f8-b482-5251dc7a41bf  ALL   error
    #   2023-08-03 14:37:24.977181  31.00           7c4dccb4-4d3c-41f8-b482-5251dc7a41bf  ALL   info
    #   2023-08-03 14:38:24.987134  21.02           7c4dccb4-4d3c-41f8-b482-5251dc7a41bf  ALL   error
    #
    local sql=`cat << EOF
select * from
(select collecttime, cast( value / $metric_interval as DECIMAL(38,2)) as cnt_per_second, node, role, type as level from system_metrics.metric
where metric_name = 'mo_log_message_count' and type not in ('debug')) a
where a.cnt_per_second > $count_threshold
order by collecttime
EOF`
    echo_proxy "Query: $sql"
    $mod -e "$sql" > $out_log_count
}
check_log_count() {
    local rows=`wc -l $out_log_count | awk '{print $1}'`
    if [ "$rows" == "0" ]; then
        echo_proxy "log messages spitting out per second < threshold(val: $count_threshold): OK!"
        return 0
    fi
    echo_proxy "log messages spitting out per second > threshold(val: $count_threshold)"
    echo_proxy "each row shows the last $metric_interval secs status"
    echo_proxy
    cat $out_log_count
    echo
    return 1
}
usage() {
    cat << EOF
Usage: $0 [cnt_threshold [metric_interval]]
like:  $0
or     $0 1000
or     $0 1000 60

options
cnt_threshold   - int, log messages per second threshold
                  default: $count_threshold
metric_interval - int, metric collected interval
                  default: $metric_interval
EOF
}
## main
################
if [ $# -eq 1 ]; then
    arg=$1
    if [ "$arg" == "-h" -o "$arg" == "--help" ]; then
        usage
        exit 1
    fi
    count_threshold=$arg
elif [ $# -eq 2 ]; then
    count_threshold=$1
    metric_interval=$2
fi

show_env
check_mo_service_alive
get_log_count
check_log_count
ret=$?
if [ "$ret" != "0" ]; then
    echo_proxy "log messages spitting out per second > threshold(val: $count_threshold): NOT ok!!!"
    exit 1
fi
14 changes: 11 additions & 3 deletions pkg/config/configuration.go
@@ -119,6 +119,9 @@ var (
// defaultMetricGatherInterval default: 15 sec.
defaultMetricGatherInterval = 15

// defaultMetricInternalGatherInterval default: 1 min.
defaultMetricInternalGatherInterval = time.Minute

// defaultMetricUpdateStorageUsageInterval default: 15 min.
defaultMetricUpdateStorageUsageInterval = 15 * time.Minute

@@ -548,12 +551,15 @@ type ObservabilityParameters struct {
LongQueryTime float64 `toml:"longQueryTime"`

// MetricExportInterval default is 15 sec.
MetricExportInterval int `toml:"metricExportInterval"`
MetricExportInterval int `toml:"metric-export-interval"`

// MetricGatherInterval default is 15 sec.
MetricGatherInterval int `toml:"metricGatherInterval"`
MetricGatherInterval int `toml:"metric-gather-interval"`

// MetricInternalGatherInterval default is 1 min, handle metric.SubSystemMO metric
MetricInternalGatherInterval toml.Duration `toml:"metric-internal-gather-interval"`

// MetricStorageUsageUpdateInterval, default: 30 min
// MetricStorageUsageUpdateInterval, default: 15 min
MetricStorageUsageUpdateInterval toml.Duration `toml:"metricStorageUsageUpdateInterval"`

// MetricStorageUsageCheckNewInterval, default: 1 min
@@ -604,6 +610,7 @@ func NewObservabilityParameters() *ObservabilityParameters {
LongQueryTime: defaultLongQueryTime,
MetricExportInterval: defaultMetricExportInterval,
MetricGatherInterval: defaultMetricGatherInterval,
MetricInternalGatherInterval: toml.Duration{},
MetricStorageUsageUpdateInterval: toml.Duration{},
MetricStorageUsageCheckNewInterval: toml.Duration{},
MergeCycle: toml.Duration{},
@@ -617,6 +624,7 @@ func NewObservabilityParameters() *ObservabilityParameters {
EnableStmtMerge: false,
OBCollectorConfig: *NewOBCollectorConfig(),
}
op.MetricInternalGatherInterval.Duration = defaultMetricInternalGatherInterval
op.MetricStorageUsageUpdateInterval.Duration = defaultMetricUpdateStorageUsageInterval
op.MetricStorageUsageCheckNewInterval.Duration = defaultMetricStorageUsageCheckNewInterval
op.MergeCycle.Duration = defaultMergeCycle
33 changes: 33 additions & 0 deletions pkg/util/metric/m_mo.go
@@ -0,0 +1,33 @@
// Copyright 2023 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metric

import "github.com/matrixorigin/matrixone/pkg/util/export/table"

var (
	MOLogMessageFactory = NewCounterVec(
		CounterOpts{
			Subsystem: SubSystemMO,
			Name:      "log_message_count",
			Help:      "Count of log message",
		},
		[]string{constTenantKey, "type"},
		false,
	)
)

func MOLogMessageCounter(level string) Counter {
	return MOLogMessageFactory.WithLabelValues(table.AccountSys, level)
}
30 changes: 26 additions & 4 deletions pkg/util/metric/m_register.go
@@ -21,6 +21,8 @@ const constTenantKey = "account"
// this constant label is used for the sys_* and process_* tables
var sysTenantID = prom.Labels{constTenantKey: "sys"}

// InitCollectors contains all the collectors that belong to SubSystemSql, SubSystemServer, SubSystemProcess and
// SubSystemSys.
var InitCollectors = []Collector{
// sql metric
StatementCounterFactory,
@@ -36,6 +38,12 @@ var InitCollectors = []Collector{
hardwareStatsCollector,
}

// InternalCollectors contains all the collectors that belong to SubSystemMO
var InternalCollectors = []Collector{
	// mo metric
	MOLogMessageFactory,
}

type SubSystem struct {
Name string
Comment string
@@ -48,9 +56,23 @@ func RegisterSubSystem(s *SubSystem) {
AllSubSystem[s.Name] = s
}

const (
	// SubSystemSql is the subsystem based on query actions.
	SubSystemSql = "sql"
	// SubSystemServer is the subsystem based on server status, triggered by client queries.
	SubSystemServer = "server"
	// SubSystemProcess is the subsystem based on process status.
	SubSystemProcess = "process"
	// SubSystemSys is the subsystem based on OS status.
	SubSystemSys = "sys"
	// SubSystemMO is the subsystem that shows mo internal status.
	SubSystemMO = "mo"
)

func init() {
RegisterSubSystem(&SubSystem{"sql", "base on query action", true})
RegisterSubSystem(&SubSystem{"server", "MO Server status, observe from inside", true})
RegisterSubSystem(&SubSystem{"process", "MO process status", false})
RegisterSubSystem(&SubSystem{"sys", "OS status", false})
RegisterSubSystem(&SubSystem{SubSystemSql, "base on query action", true})
RegisterSubSystem(&SubSystem{SubSystemServer, "MO Server status, observe from inside", true})
RegisterSubSystem(&SubSystem{SubSystemProcess, "MO process status", false})
RegisterSubSystem(&SubSystem{SubSystemSys, "OS status", false})
RegisterSubSystem(&SubSystem{SubSystemMO, "MO internal status", false})
}
22 changes: 19 additions & 3 deletions pkg/util/metric/mometric/cron_task.go
@@ -192,6 +192,8 @@ func checkNewAccountSize(ctx context.Context, logger *log.MOLogger, sqlExecutor
var err error
ctx, span := trace.Start(ctx, "checkNewAccountSize")
defer span.End()
logger = logger.WithContext(ctx)
logger.Info("started")
defer func() {
logger.Info("checkNewAccountSize exit", zap.Error(err))
}()
@@ -226,9 +228,16 @@ func checkNewAccountSize(ctx context.Context, logger *log.MOLogger, sqlExecutor
// more details in pkg/frontend/authenticate.go, function frontend.createTablesInMoCatalog
sql := fmt.Sprintf("select account_name, created_time from mo_catalog.mo_account where created_time >= %q;",
table.Time2DatetimeString(lastCheckTime.UTC()))
logger.Debug("query new account", zap.String("sql", sql))
getNewAccounts := func(ctx context.Context, sql string, lastCheck, now time.Time) ie.InternalExecResult {
ctx, spanQ := trace.Start(ctx, "QueryStorageStorage.getNewAccounts")
defer spanQ.End()
spanQ.AddExtraFields(zap.Time("last_check_time", lastCheckTime))
spanQ.AddExtraFields(zap.Time("now", now))
logger.Debug("query new account", zap.String("sql", sql))
return executor.Query(ctx, sql, opts)
}
result := getNewAccounts(ctx, sql, lastCheckTime, now)
lastCheckTime = now
result := executor.Query(ctx, sql, opts)
err = result.Error()
if err != nil {
logger.Error("failed to fetch new created account", zap.Error(err), zap.String("sql", sql))
@@ -255,7 +264,14 @@ func checkNewAccountSize(ctx context.Context, logger *log.MOLogger, sqlExecutor

// query single account's storage
showSql := fmt.Sprintf(ShowAccountSQL, account)
showRet := executor.Query(ctx, showSql, opts)
getOneAccount := func(ctx context.Context, sql string) ie.InternalExecResult {
ctx, spanQ := trace.Start(ctx, "QueryStorageStorage.getOneAccount")
defer spanQ.End()
spanQ.AddExtraFields(zap.String("account", account))
logger.Debug("query one account", zap.String("sql", sql))
return executor.Query(ctx, sql, opts)
}
showRet := getOneAccount(ctx, showSql)
err = showRet.Error()
if err != nil {
logger.Error("failed to exec query sql",