diff --git a/README.md b/README.md
index 9b3d59106..880cf2fcc 100644
--- a/README.md
+++ b/README.md
@@ -520,15 +520,16 @@ individual endpoints with configurable descriptions and thresholds.
 
 Alerts are configured at the endpoint level like so:
 
-| Parameter                    | Description                                                                     | Default       |
-|:-----------------------------|:--------------------------------------------------------------------------------|:--------------|
-| `alerts`                     | List of all alerts for a given endpoint.                                        | `[]`          |
-| `alerts[].type`              | Type of alert.<br />See table below for all valid types.                        | Required `""` |
-| `alerts[].enabled`           | Whether to enable the alert.                                                    | `true`        |
-| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert.                 | `3`           |
-| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved.  | `2`           |
-| `alerts[].send-on-resolved`  | Whether to send a notification once a triggered alert is marked as resolved.    | `false`       |
-| `alerts[].description`       | Description of the alert. Will be included in the alert sent.                   | `""`          |
+| Parameter                         | Description                                                                     | Default       |
+|:----------------------------------|:--------------------------------------------------------------------------------|:--------------|
+| `alerts`                          | List of all alerts for a given endpoint.                                        | `[]`          |
+| `alerts[].type`                   | Type of alert.<br />See table below for all valid types.                        | Required `""` |
+| `alerts[].enabled`                | Whether to enable the alert.                                                    | `true`        |
+| `alerts[].failure-threshold`      | Number of failures in a row needed before triggering the alert.                 | `3`           |
+| `alerts[].success-threshold`      | Number of successes in a row before an ongoing incident is marked as resolved.  | `2`           |
+| `alerts[].send-on-resolved`       | Whether to send a notification once a triggered alert is marked as resolved.    | `false`       |
+| `alerts[].description`            | Description of the alert. Will be included in the alert sent.                   | `""`          |
+| `alerts[].repeat-interval`        | Interval between reminders while the alert remains triggered.                   | `""`          |
 
 Here's an example of what an alert configuration might look like at the endpoint level:
 ```yaml
diff --git a/alerting/alert/alert.go b/alerting/alert/alert.go
index 2afcd8b50..e885551cd 100644
--- a/alerting/alert/alert.go
+++ b/alerting/alert/alert.go
@@ -6,6 +6,7 @@ import (
     "errors"
     "strconv"
     "strings"
+    "time"
 )
 
 var (
@@ -48,6 +49,9 @@ type Alert struct {
     // ongoing/triggered incidents
     ResolveKey string `yaml:"-"`
 
+    // RepeatInterval is the interval between reminders
+    RepeatInterval *time.Duration `yaml:"repeat-interval"`
+
     // Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value
     // should be set back to false. It is used to prevent the same alert from going out twice.
     //
diff --git a/alerting/provider/provider.go b/alerting/provider/provider.go
index 30e805e47..b66ad52b8 100644
--- a/alerting/provider/provider.go
+++ b/alerting/provider/provider.go
@@ -57,6 +57,9 @@ func ParseWithDefaultAlert(providerDefaultAlert, endpointAlert *alert.Alert) {
     if endpointAlert.SuccessThreshold == 0 {
         endpointAlert.SuccessThreshold = providerDefaultAlert.SuccessThreshold
     }
+    if endpointAlert.RepeatInterval == nil {
+        endpointAlert.RepeatInterval = providerDefaultAlert.RepeatInterval
+    }
 }
 
 var (
diff --git a/config/endpoint/endpoint.go b/config/endpoint/endpoint.go
index ac765c1a5..bfc57872e 100644
--- a/config/endpoint/endpoint.go
+++ b/config/endpoint/endpoint.go
@@ -121,6 +121,9 @@ type Endpoint struct {
 
     // NumberOfSuccessesInARow is the number of successful evaluations in a row
     NumberOfSuccessesInARow int `yaml:"-"`
+
+    // LastReminderSent is the time at which the last reminder was sent for this endpoint, per alert type
+    LastReminderSent map[alert.Type]time.Time `yaml:"-"`
 }
 
 // IsEnabled returns whether the endpoint is enabled or not
@@ -190,6 +193,9 @@ func (e *Endpoint) ValidateAndSetDefaults() error {
     if len(e.Headers) == 0 {
         e.Headers = make(map[string]string)
     }
+    if len(e.LastReminderSent) == 0 {
+        e.LastReminderSent = make(map[alert.Type]time.Time)
+    }
     // Automatically add user agent header if there isn't one specified in the endpoint configuration
     if _, userAgentHeaderExists := e.Headers[UserAgentHeader]; !userAgentHeaderExists {
         e.Headers[UserAgentHeader] = GatusUserAgent
diff --git a/config/endpoint/external_endpoint.go b/config/endpoint/external_endpoint.go
index 58f37fedd..8527187cb 100644
--- a/config/endpoint/external_endpoint.go
+++ b/config/endpoint/external_endpoint.go
@@ -2,6 +2,7 @@ package endpoint
 
 import (
     "errors"
+    "time"
 
     "github.com/TwiN/gatus/v5/alerting/alert"
 )
@@ -75,6 +76,8 @@ func (externalEndpoint *ExternalEndpoint) ToEndpoint() *Endpoint {
         Enabled:                 externalEndpoint.Enabled,
         Name:                    externalEndpoint.Name,
         Group:                   externalEndpoint.Group,
+        Headers:                 make(map[string]string),
+        LastReminderSent:        make(map[alert.Type]time.Time),
         Alerts:                  externalEndpoint.Alerts,
         NumberOfFailuresInARow:  externalEndpoint.NumberOfFailuresInARow,
         NumberOfSuccessesInARow: externalEndpoint.NumberOfSuccessesInARow,
diff --git a/watchdog/alerting.go b/watchdog/alerting.go
index 6fd7a2468..8a12d497d 100644
--- a/watchdog/alerting.go
+++ b/watchdog/alerting.go
@@ -4,6 +4,7 @@ import (
     "errors"
     "log"
     "os"
+    "time"
 
     "github.com/TwiN/gatus/v5/alerting"
     "github.com/TwiN/gatus/v5/config/endpoint"
@@ -30,16 +31,31 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
         if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold > ep.NumberOfFailuresInARow {
             continue
         }
-        if endpointAlert.Triggered {
+        // Determine if an initial alert should be sent
+        sendInitialAlert := !endpointAlert.Triggered
+        // Determine if a reminder should be sent
+        var lastReminder time.Duration
+        if lr, ok := ep.LastReminderSent[endpointAlert.Type]; ok && !lr.IsZero() {
+            lastReminder = time.Since(lr)
+        }
+        sendReminder := endpointAlert.Triggered && endpointAlert.RepeatInterval != nil &&
+            *endpointAlert.RepeatInterval > 0 && (lastReminder == 0 || lastReminder >= *endpointAlert.RepeatInterval)
+        // If neither an initial alert nor a reminder needs to be sent, skip to the next alert
+        if !sendInitialAlert && !sendReminder {
             if debug {
-                log.Printf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' has already been TRIGGERED, skipping", ep.Name, endpointAlert.GetDescription())
+                log.Printf("[watchdog.handleAlertsToTrigger] Alert %s for endpoint=%s with description='%s' is not due for triggering (interval: %s last: %s), skipping",
+                    endpointAlert.Type, ep.Name, endpointAlert.GetDescription(), endpointAlert.RepeatInterval, lastReminder)
             }
             continue
         }
         alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type)
         if alertProvider != nil {
-            log.Printf("[watchdog.handleAlertsToTrigger] Sending %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", endpointAlert.Type, ep.Name, endpointAlert.GetDescription())
             var err error
+            alertType := "reminder"
+            if sendInitialAlert {
+                alertType = "initial"
+            }
+            log.Printf("[watchdog.handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, ep.Name, endpointAlert.GetDescription())
             if os.Getenv("MOCK_ALERT_PROVIDER") == "true" {
if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" { err = errors.New("error") @@ -50,7 +66,11 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert if err != nil { log.Printf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", ep.Name, err.Error()) } else { - endpointAlert.Triggered = true + // Mark initial alert as triggered and update last reminder time + if sendInitialAlert { + endpointAlert.Triggered = true + } + ep.LastReminderSent[endpointAlert.Type] = time.Now() if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil { log.Printf("[watchdog.handleAlertsToTrigger] Failed to persist triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error()) } @@ -93,6 +113,7 @@ func handleAlertsToResolve(ep *endpoint.Endpoint, result *endpoint.Result, alert } else { log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being RESOLVED, because the provider wasn't configured properly", endpointAlert.Type) } + ep.LastReminderSent[endpointAlert.Type] = time.Now() } ep.NumberOfFailuresInARow = 0 } diff --git a/watchdog/alerting_test.go b/watchdog/alerting_test.go index 914355e46..8fc75eb46 100644 --- a/watchdog/alerting_test.go +++ b/watchdog/alerting_test.go @@ -3,6 +3,7 @@ package watchdog import ( "os" "testing" + "time" "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/alerting/alert" @@ -38,7 +39,8 @@ func TestHandleAlerting(t *testing.T) { } enabled := true ep := &endpoint.Endpoint{ - URL: "https://example.com", + URL: "https://example.com", + LastReminderSent: make(map[alert.Type]time.Time), Alerts: []*alert.Alert{ { Type: alert.TypeCustom, @@ -82,7 +84,8 @@ func TestHandleAlertingWithBadAlertProvider(t *testing.T) { enabled := true ep := &endpoint.Endpoint{ - URL: "http://example.com", + URL: "http://example.com", + LastReminderSent: make(map[alert.Type]time.Time), Alerts: []*alert.Alert{ { Type: alert.TypeCustom, @@ -117,7 +120,8 @@ func TestHandleAlertingWhenTriggeredAlertIsAlmostResolvedButendpointStartFailing } enabled := true ep := &endpoint.Endpoint{ - URL: "https://example.com", + URL: "https://example.com", + LastReminderSent: make(map[alert.Type]time.Time), Alerts: []*alert.Alert{ { Type: alert.TypeCustom, @@ -152,7 +156,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedButSendOnResolvedIsFalse(t *t enabled := true disabled := false ep := &endpoint.Endpoint{ - URL: "https://example.com", + URL: "https://example.com", + LastReminderSent: make(map[alert.Type]time.Time), Alerts: []*alert.Alert{ { Type: alert.TypeCustom, @@ -184,7 +189,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedPagerDuty(t *testing.T) { } enabled := true ep := &endpoint.Endpoint{ - URL: "https://example.com", + URL: "https://example.com", + LastReminderSent: make(map[alert.Type]time.Time), Alerts: []*alert.Alert{ { Type: alert.TypePagerDuty, @@ -220,7 +226,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedPushover(t *testing.T) { } enabled := true ep := &endpoint.Endpoint{ - URL: "https://example.com", + URL: "https://example.com", + LastReminderSent: make(map[alert.Type]time.Time), Alerts: []*alert.Alert{ { Type: alert.TypePushover, @@ -390,7 +397,8 @@ func TestHandleAlertingWithProviderThatReturnsAnError(t *testing.T) { for _, scenario := range scenarios { t.Run(scenario.Name, func(t *testing.T) { ep := &endpoint.Endpoint{ - URL: "https://example.com", + URL: "https://example.com", + LastReminderSent: make(map[alert.Type]time.Time), Alerts: []*alert.Alert{ 
                     {
                         Type: scenario.AlertType,
@@ -450,7 +458,8 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
     }
     enabled := true
     ep := &endpoint.Endpoint{
-        URL: "https://example.com",
+        URL:              "https://example.com",
+        LastReminderSent: make(map[alert.Type]time.Time),
         Alerts: []*alert.Alert{
             {
                 Type: alert.TypeCustom,
@@ -487,6 +496,49 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
     verify(t, ep, 0, 2, false, "")
 }
 
+func TestHandleAlertingWithRepeatInterval(t *testing.T) {
+    _ = os.Setenv("MOCK_ALERT_PROVIDER", "true")
+    defer os.Clearenv()
+
+    cfg := &config.Config{
+        Debug: true,
+        Alerting: &alerting.Config{
+            Custom: &custom.AlertProvider{
+                URL:    "https://twin.sh/health",
+                Method: "GET",
+            },
+        },
+    }
+    enabled := true
+    repeatInterval := 1 * time.Second
+    ep := &endpoint.Endpoint{
+        URL:              "https://example.com",
+        LastReminderSent: make(map[alert.Type]time.Time),
+        Alerts: []*alert.Alert{
+            {
+                Type:             alert.TypeCustom,
+                Enabled:          &enabled,
+                FailureThreshold: 2,
+                SuccessThreshold: 3,
+                SendOnResolved:   &enabled,
+                Triggered:        false,
+                RepeatInterval:   &repeatInterval,
+            },
+        },
+    }
+
+    verify(t, ep, 0, 0, false, "The alert shouldn't start triggered")
+    HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
+    verify(t, ep, 1, 0, false, "The alert shouldn't have triggered")
+    HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
+    verify(t, ep, 2, 0, true, "The alert should've triggered")
+    HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
+    verify(t, ep, 3, 0, true, "The alert should still be triggered")
+    HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
+    verify(t, ep, 4, 0, true, "The alert should still be triggered")
+    HandleAlerting(ep, &endpoint.Result{Success: true}, cfg.Alerting, cfg.Debug)
+}
+
 func verify(t *testing.T, ep *endpoint.Endpoint, expectedNumberOfFailuresInARow, expectedNumberOfSuccessInARow int, expectedTriggered bool, expectedTriggeredReason string) {
     if ep.NumberOfFailuresInARow != expectedNumberOfFailuresInARow {
         t.Errorf("endpoint.NumberOfFailuresInARow should've been %d, got %d", expectedNumberOfFailuresInARow, ep.NumberOfFailuresInARow)
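For reference, here is a minimal sketch of how the `repeat-interval` parameter introduced by this diff could be used in an endpoint configuration. It follows the endpoint-level alerting example referenced in the README hunk above; the endpoint name, URL, and the `slack` alert type are illustrative only, and it assumes duration strings such as `30m` are parsed the same way as the existing `interval` parameter:

```yaml
endpoints:
  - name: website                    # illustrative endpoint name
    url: "https://twin.sh/health"    # illustrative URL
    interval: 30s
    conditions:
      - "[STATUS] == 200"
    alerts:
      - type: slack                  # assumes a slack provider is configured under `alerting`
        description: "healthcheck failed"
        failure-threshold: 3
        success-threshold: 2
        send-on-resolved: true
        # New parameter from this change: send a reminder no more often than every
        # 30 minutes while the alert remains triggered. Omitting it keeps the previous
        # behaviour of sending a single alert when the failure threshold is reached.
        repeat-interval: 30m
```

Per the watchdog changes above, the initial alert is still sent once when `failure-threshold` is reached; `LastReminderSent` then spaces any follow-up notifications at least `repeat-interval` apart until the alert is resolved.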