Skip to content

Commit

Permalink
Merge pull request #135 from opsre/fix-consumer
Browse files Browse the repository at this point in the history
Fix consumer knowns issue
  • Loading branch information
Cairry authored Feb 22, 2025
2 parents 96a5133 + 92d6b87 commit f902baf
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 75 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: CI
on:
push:
branches:
- '*/*'
- '*'

jobs:
build:
Expand Down
142 changes: 83 additions & 59 deletions alert/consumer/consumer.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,48 +54,39 @@ func (ag *AlertGroups) AddAlert(alert *models.AlertCurEvent, noticeGroup []map[s
ag.lock.Lock()
defer ag.lock.Unlock()

// 查找Rule位置
rulePos := sort.Search(len(ag.Rules), func(i int) bool {
return ag.Rules[i].RuleID >= alert.RuleId
})
// 查找 Rule 位置
rulePos := ag.getRuleNodePos(alert.RuleId)

// Rule存在时的处理,找到对应的规则组
// Rule 存在时的处理,找到对应的规则组
if rulePos < len(ag.Rules) && ag.Rules[rulePos].RuleID == alert.RuleId {
groups := &ag.Rules[rulePos].Groups
// 查找Group位置
groupPos := sort.Search(len(*groups), func(i int) bool {
return (*groups)[i].ID >= groupID
})
rule := &ag.Rules[rulePos]

if groupPos < len(*groups) && (*groups)[groupPos].ID == groupID {
// 查找 Group 位置
groupPos := ag.getGroupNodePos(rule, groupID)

if groupPos < len(rule.Groups) && (rule.Groups)[groupPos].ID == groupID {
// 追加事件
(*groups)[groupPos].Events = append((*groups)[groupPos].Events, alert)
(rule.Groups)[groupPos].Events = append((rule.Groups)[groupPos].Events, alert)
} else {
// 插入新Group,新增空元素扩容切片
*groups = append(*groups, EventsGroup{})
// 将插入位置之后的元素后移,
copy((*groups)[groupPos+1:], (*groups)[groupPos:])
// 将数据赋值到新的空元素上
(*groups)[groupPos] = EventsGroup{
// 插入新数据
rule.Groups = append(rule.Groups, EventsGroup{
ID: groupID,
Events: []*models.AlertCurEvent{alert},
}
})
}
return
}

// 插入新Rule
ag.Rules = append(ag.Rules, RulesGroup{})
copy(ag.Rules[rulePos+1:], ag.Rules[rulePos:])
ag.Rules[rulePos] = RulesGroup{
ag.Rules = append(ag.Rules, RulesGroup{
RuleID: alert.RuleId,
Groups: []EventsGroup{
{
ID: groupID,
Events: []*models.AlertCurEvent{alert},
},
},
}
})
}

// generateGroupID 生成分组ID,每个规则可能会有多个分组(其分组通知),默认为 default,如果有匹配的分组则根据 key/value 计算一个 HASH值作为 ID。
Expand All @@ -114,6 +105,31 @@ func (ag *AlertGroups) generateGroupID(alert *models.AlertCurEvent, noticeGroupM
return groupId
}

// getRuleNodePos 获取 Rule 点位
func (ag *AlertGroups) getRuleNodePos(ruleId string) int {
// Rules 切片排序
sort.Slice(ag.Rules, func(i, j int) bool {
return ag.Rules[i].RuleID < ag.Rules[j].RuleID
})

// 查找Rule位置
return sort.Search(len(ag.Rules), func(i int) bool {
return ag.Rules[i].RuleID >= ruleId
})
}

func (ag *AlertGroups) getGroupNodePos(rule *RulesGroup, groupId string) int {
// Groups 切片排序
sort.Slice(rule.Groups, func(i, j int) bool {
return rule.Groups[i].ID < rule.Groups[j].ID
})

// 查找Group位置
return sort.Search(len(rule.Groups), func(i int) bool {
return (rule.Groups)[i].ID >= groupId
})
}

func NewConsumerWork(ctx *ctx.Context) ConsumeInterface {
return &Consume{
ctx: ctx,
Expand Down Expand Up @@ -141,52 +157,52 @@ func (c *Consume) Stop(faultCenterId string) {

// Watch 启动 Consumer Watch 进程
func (c *Consume) Watch(ctx context.Context, faultCenter models.FaultCenter) {
taskChan := make(chan struct{}, 1)
timer := time.NewTicker(time.Second * time.Duration(1))
defer func() {
timer.Stop()
if r := recover(); r != nil {
logc.Error(c.ctx.Ctx, fmt.Sprintf("Recovered from consumer watch goroutine panic: %s, FaultCenterName: %s, Id: %s", r, faultCenter.Name, faultCenter.ID))
}
//if r := recover(); r != nil {
// logc.Error(c.ctx.Ctx, fmt.Sprintf("Recovered from consumer watch goroutine panic: %s, FaultCenterName: %s, Id: %s", r, faultCenter.Name, faultCenter.ID))
//}
}()

for {
select {
case <-timer.C:
c.processSilenceRule(faultCenter)
// 获取故障中心的所有告警事件
data, err := c.ctx.Redis.Redis().HGetAll(faultCenter.GetFaultCenterKey()).Result()
if err != nil {
logc.Error(c.ctx.Ctx, fmt.Sprintf("从 Redis 中获取事件信息错误, faultCenterKey: %s, err: %s", faultCenter.GetFaultCenterKey(), err.Error()))
return
}
// 事件过滤
filterEvents := c.filterAlertEvents(faultCenter, data)
// 事件分组
var alertGroups AlertGroups
c.alarmGrouping(faultCenter, &alertGroups, filterEvents)
// 事件聚合
aggEvents := c.alarmAggregation(faultCenter, &alertGroups)
// 发送事件
c.sendAlerts(faultCenter, aggEvents)
// 处理任务信号量
taskChan <- struct{}{}
c.executeTask(faultCenter, taskChan)
case <-ctx.Done():
return
}
}
}

func (c *Consume) wait(startAt, endAt int64, sem chan struct{}) {
timer := time.NewTicker(time.Second * time.Duration(1))
for {
select {
case <-timer.C:
if (endAt - startAt) > 60 {
time.Sleep(time.Millisecond * 200)
sem <- struct{}{}
} else {
return
}
}
// executeTask 执行具体的任务逻辑
func (c *Consume) executeTask(faultCenter models.FaultCenter, taskChan chan struct{}) {
defer func() {
// 释放任务信号量
<-taskChan
}()
// 处理静默规则
c.processSilenceRule(faultCenter)
// 获取故障中心的所有告警事件
data, err := c.ctx.Redis.Redis().HGetAll(faultCenter.GetFaultCenterKey()).Result()
if err != nil {
logc.Error(c.ctx.Ctx, fmt.Sprintf("从 Redis 中获取事件信息错误, faultCenterKey: %s, err: %s", faultCenter.GetFaultCenterKey(), err.Error()))
return
}

// 事件过滤
filterEvents := c.filterAlertEvents(faultCenter, data)
// 事件分组
var alertGroups AlertGroups
c.alarmGrouping(faultCenter, &alertGroups, filterEvents)
fmt.Println("alertGroups->", tools.JsonMarshal(alertGroups.Rules))
// 事件聚合
aggEvents := c.alarmAggregation(faultCenter, &alertGroups)
// 发送事件
c.sendAlerts(faultCenter, aggEvents)
}

// filterAlertEvents 过滤告警事件
Expand Down Expand Up @@ -226,23 +242,31 @@ func (c *Consume) isMutedEvent(event *models.AlertCurEvent, faultCenter models.F

// validateEvent 事件验证
func (c *Consume) validateEvent(event *models.AlertCurEvent, faultCenter models.FaultCenter) bool {
if event.State == "Pending" {
return false
}

return event.IsRecovered || event.LastSendTime == 0 ||
event.LastEvalTime >= event.LastSendTime+faultCenter.RepeatNoticeInterval*60
}

// alarmGrouping 告警分组
// 分组会进行 2 次分类
// 第一次是状态(用于区分事件是告警或恢复,用于后续聚合逻辑,避免告警和恢复聚合到一起)
// 第二次是规则(对隶属于相同规则的事件放再同一组,用于后续聚合逻辑,避免不同规则的告警或恢复聚合到一起)
func (c *Consume) alarmGrouping(faultCenter models.FaultCenter, alertGroups *AlertGroups, alerts []*models.AlertCurEvent) {
if len(alerts) == 0 {
return
}

for _, alert := range alerts {
// 状态分组
switch alert.IsRecovered {
case true:
alert.RuleId = "Recover_" + alert.RuleId
case false:
alert.RuleId = "Firing_" + alert.RuleId
default:
alert.RuleId = "Unknown_" + alert.RuleId
}

alertGroups.AddAlert(alert, faultCenter.NoticeGroup)
//c.addAlertToGroup(alert, faultCenter.NoticeGroup)
if alert.IsRecovered {
c.removeAlertFromCache(alert)
if err := process.RecordAlertHisEvent(c.ctx, *alert); err != nil {
Expand Down
23 changes: 19 additions & 4 deletions alert/eval/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ type (
Submit(rule models.AlertRule)
Stop(ruleId string)
Eval(ctx context.Context, rule models.AlertRule)
Recover(faultCenterKey string, faultCenterInfoKey string, curKeys []string)
Recover(ruleId, faultCenterKey string, faultCenterInfoKey string, curKeys []string)
GC(rule models.AlertRule, curFiringKeys []string)
RestartAllEvals()
}
Expand Down Expand Up @@ -108,8 +108,8 @@ func (t *AlertRule) Eval(ctx context.Context, rule models.AlertRule) {
// 追加当前数据源的指纹到总列表
curFingerprints = append(curFingerprints, fingerprints...)
}
//logc.Infof(t.ctx.Ctx, fmt.Sprintf("规则评估 -> %v", tools.JsonMarshal(rule)))
t.Recover(models.BuildCacheEventKey(rule.TenantId, rule.FaultCenterId), models.BuildCacheInfoKey(rule.TenantId, rule.FaultCenterId), curFingerprints)
logc.Infof(t.ctx.Ctx, fmt.Sprintf("规则评估 -> %v", tools.JsonMarshal(rule)))
t.Recover(rule.RuleId, models.BuildCacheEventKey(rule.TenantId, rule.FaultCenterId), models.BuildCacheInfoKey(rule.TenantId, rule.FaultCenterId), curFingerprints)
t.GC(rule, curFingerprints)

case <-ctx.Done():
Expand All @@ -120,13 +120,22 @@ func (t *AlertRule) Eval(ctx context.Context, rule models.AlertRule) {
}
}

func (t *AlertRule) Recover(faultCenterKey string, faultCenterInfoKey string, curFingerprints []string) {
func (t *AlertRule) Recover(RuleId, faultCenterKey string, faultCenterInfoKey string, curFingerprints []string) {
// 获取所有的故障中心的告警事件
events, err := t.ctx.Redis.Event().GetAllEventsForFaultCenter(faultCenterKey)
if err != nil {
return
}

// 只获取当前规则的事件
var currentRuleEvents = make(map[string]models.AlertCurEvent)
for fingerprint, event := range events {
if event.RuleId == RuleId {
currentRuleEvents[fingerprint] = event
}
}
events = currentRuleEvents

// 提取事件中的告警指纹
fingerprints := make([]string, 0)
for fingerprint := range events {
Expand All @@ -146,6 +155,10 @@ func (t *AlertRule) Recover(faultCenterKey string, faultCenterInfoKey string, cu
return
}

// 调整为待恢复状态
event.Status = 3
t.ctx.Redis.Event().PushEventToFaultCenter(&event)

// 判断是否在等待时间范围内
wTime, exists := t.alarmRecoverWaitStore.Get(key)
if !exists {
Expand All @@ -159,6 +172,8 @@ func (t *AlertRule) Recover(faultCenterKey string, faultCenterInfoKey string, cu
continue
}

// 已恢复状态
event.Status = 4
event.IsRecovered = true
event.RecoverTime = curTime
event.LastSendTime = 0
Expand Down
10 changes: 4 additions & 6 deletions alert/process/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,15 @@ func PushEventToFaultCenter(ctx *ctx.Context, event *models.AlertCurEvent) {
event.LastEvalTime = eventOpt.GetLastEvalTimeForFaultCenter()
event.LastSendTime = eventOpt.GetLastSendTimeForFaultCenter(event.TenantId, event.FaultCenterId, event.Fingerprint)

event.State = "Pending"
if event.IsArriveForDuration() {
event.State = "Firing"
event.Status = 1
}
if event.IsRecovered {
event.State = "Recover"
}

if isSilencedEvent(event) {
event.Status = 2
}
if event.IsRecovered {
event.Status = 3
}

eventOpt.PushEventToFaultCenter(event)
}
Expand Down
3 changes: 1 addition & 2 deletions internal/models/alert_current_event.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ type AlertCurEvent struct {
DatasourceType string `json:"datasource_type"`
DatasourceId string `json:"datasource_id" gorm:"datasource_id"`
Fingerprint string `json:"fingerprint"`
State string `json:"state"` // 事件状态,Pending / Firing
Severity string `json:"severity"`
Metric map[string]interface{} `json:"metric" gorm:"metric;serializer:json"`
Labels map[string]string `json:"labels" gorm:"labels;serializer:json"`
Expand All @@ -36,7 +35,7 @@ type AlertCurEvent struct {
FaultCenter FaultCenter `json:"faultCenter" gorm:"-"`
ResponseTime string `json:"response_time" gorm:"-"`
TimeRemaining int64 `json:"time_remaining" gorm:"-"`
Status int64 `json:"status" gorm:"-"` // 事件状态,告警中:1,静默中:2
Status int64 `json:"status" gorm:"-"` // 事件状态,告警中:1,静默中:2,待恢复:3,已恢复:4
}

type AlertCurEventQuery struct {
Expand Down
1 change: 1 addition & 0 deletions internal/models/fault_center.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type FaultCenter struct {
RecoverWaitTime int64 `json:"recoverWaitTime"`
CurrentAlertNumber int64 `json:"currentAlertNumber" gorm:"-"`
CurrentMuteNumber int64 `json:"currentMuteNumber" gorm:"-"`
CurrentRecoverNumber int64 `json:"currentRecoverNumber" gorm:"-"`
}

func (f *FaultCenter) TableName() string {
Expand Down
9 changes: 6 additions & 3 deletions internal/services/fault_center.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,17 @@ func (f faultCenterService) List(req interface{}) (data interface{}, err interfa
}

for _, event := range events {
if event.Status == 1 {
switch event.Status {
case 1:
faultCenters[index].CurrentAlertNumber++
} else {
case 2:
faultCenters[index].CurrentMuteNumber++
case 3:
faultCenters[index].CurrentRecoverNumber++
}
}

}

return faultCenters, nil
}

Expand Down

0 comments on commit f902baf

Please sign in to comment.