diff --git a/README.md b/README.md
index 9b3d59106..880cf2fcc 100644
--- a/README.md
+++ b/README.md
@@ -520,15 +520,16 @@ individual endpoints with configurable descriptions and thresholds.
Alerts are configured at the endpoint level like so:
-| Parameter | Description | Default |
-|:-----------------------------|:-------------------------------------------------------------------------------|:--------------|
-| `alerts` | List of all alerts for a given endpoint. | `[]` |
-| `alerts[].type`              | Type of alert. See table below for all valid types.                             | Required `""` |
-| `alerts[].enabled` | Whether to enable the alert. | `true` |
-| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` |
-| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` |
-| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` |
-| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` |
+| Parameter | Description | Default |
+|:----------------------------------|:-------------------------------------------------------------------------------|:--------------|
+| `alerts` | List of all alerts for a given endpoint. | `[]` |
+| `alerts[].type`                   | Type of alert. See table below for all valid types.                             | Required `""` |
+| `alerts[].enabled` | Whether to enable the alert. | `true` |
+| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` |
+| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` |
+| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` |
+| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` |
+| `alerts[].repeat-interval`        | Interval between reminders while the alert remains triggered. Disabled if empty. | `""`          |
Here's an example of what an alert configuration might look like at the endpoint level:
```yaml
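# Note: illustrative sketch only; the endpoint, alert type, and values below are
# hypothetical rather than taken from this change. The new repeat-interval
# parameter re-sends a reminder at the given interval while the incident is ongoing.
endpoints:
  - name: website
    url: "https://example.org"
    interval: 30s
    conditions:
      - "[STATUS] == 200"
    alerts:
      - type: slack
        description: "healthcheck failed"
        send-on-resolved: true
        repeat-interval: 10m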
diff --git a/alerting/alert/alert.go b/alerting/alert/alert.go
index 2afcd8b50..e885551cd 100644
--- a/alerting/alert/alert.go
+++ b/alerting/alert/alert.go
@@ -6,6 +6,7 @@ import (
"errors"
"strconv"
"strings"
+ "time"
)
var (
@@ -48,6 +49,9 @@ type Alert struct {
// ongoing/triggered incidents
ResolveKey string `yaml:"-"`
+	// RepeatInterval is the interval between reminders sent while the alert remains in a triggered state
+ RepeatInterval *time.Duration `yaml:"repeat-interval"`
+
// Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value
// should be set back to false. It is used to prevent the same alert from going out twice.
//
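For context, here is a minimal, self-contained sketch (not part of this diff) of how a `repeat-interval` value written in YAML lands in a `*time.Duration` field. It assumes gopkg.in/yaml.v3, which parses duration strings such as `10m` into `time.Duration` fields, and uses a local stand-in struct rather than the real `alert.Alert`:

```go
package main

import (
	"fmt"
	"time"

	"gopkg.in/yaml.v3"
)

// alertConfig is a stand-in for the relevant slice of the Alert struct above;
// only the field introduced by this diff is reproduced here.
type alertConfig struct {
	RepeatInterval *time.Duration `yaml:"repeat-interval"`
}

func main() {
	var cfg alertConfig
	// yaml.v3 decodes duration strings into time.Duration fields via time.ParseDuration.
	if err := yaml.Unmarshal([]byte("repeat-interval: 10m"), &cfg); err != nil {
		panic(err)
	}
	fmt.Println(*cfg.RepeatInterval) // 10m0s
}
```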
diff --git a/alerting/provider/provider.go b/alerting/provider/provider.go
index 30e805e47..b66ad52b8 100644
--- a/alerting/provider/provider.go
+++ b/alerting/provider/provider.go
@@ -57,6 +57,9 @@ func ParseWithDefaultAlert(providerDefaultAlert, endpointAlert *alert.Alert) {
if endpointAlert.SuccessThreshold == 0 {
endpointAlert.SuccessThreshold = providerDefaultAlert.SuccessThreshold
}
+ if endpointAlert.RepeatInterval == nil {
+ endpointAlert.RepeatInterval = providerDefaultAlert.RepeatInterval
+ }
}
var (
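As a usage sketch of the inheritance added above (the interval value is hypothetical; the function and field are the ones shown in this diff), an endpoint alert that leaves `repeat-interval` unset inherits the provider's default-alert value, mirroring the existing threshold behaviour:

```go
package main

import (
	"fmt"
	"time"

	"github.com/TwiN/gatus/v5/alerting/alert"
	"github.com/TwiN/gatus/v5/alerting/provider"
)

func main() {
	defaultRepeatInterval := 15 * time.Minute // hypothetical provider-level default
	providerDefaultAlert := &alert.Alert{RepeatInterval: &defaultRepeatInterval}
	endpointAlert := &alert.Alert{} // repeat-interval not set on the endpoint alert
	// Unset fields on the endpoint alert are backfilled from the provider's default alert
	provider.ParseWithDefaultAlert(providerDefaultAlert, endpointAlert)
	fmt.Println(*endpointAlert.RepeatInterval) // 15m0s
}
```

Using a `*time.Duration` rather than a plain `time.Duration` is what lets "unset" be distinguished from an explicit zero here.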
diff --git a/config/endpoint/endpoint.go b/config/endpoint/endpoint.go
index ac765c1a5..bfc57872e 100644
--- a/config/endpoint/endpoint.go
+++ b/config/endpoint/endpoint.go
@@ -121,6 +121,9 @@ type Endpoint struct {
// NumberOfSuccessesInARow is the number of successful evaluations in a row
NumberOfSuccessesInARow int `yaml:"-"`
+
+	// LastReminderSent tracks, per alert type, the time at which the last alert or reminder was sent for this endpoint.
+ LastReminderSent map[alert.Type]time.Time `yaml:"-"`
}
// IsEnabled returns whether the endpoint is enabled or not
@@ -190,6 +193,9 @@ func (e *Endpoint) ValidateAndSetDefaults() error {
if len(e.Headers) == 0 {
e.Headers = make(map[string]string)
}
+ if len(e.LastReminderSent) == 0 {
+ e.LastReminderSent = make(map[alert.Type]time.Time)
+ }
// Automatically add user agent header if there isn't one specified in the endpoint configuration
if _, userAgentHeaderExists := e.Headers[UserAgentHeader]; !userAgentHeaderExists {
e.Headers[UserAgentHeader] = GatusUserAgent
diff --git a/config/endpoint/external_endpoint.go b/config/endpoint/external_endpoint.go
index 58f37fedd..8527187cb 100644
--- a/config/endpoint/external_endpoint.go
+++ b/config/endpoint/external_endpoint.go
@@ -2,6 +2,7 @@ package endpoint
import (
"errors"
+ "time"
"github.com/TwiN/gatus/v5/alerting/alert"
)
@@ -75,6 +76,8 @@ func (externalEndpoint *ExternalEndpoint) ToEndpoint() *Endpoint {
Enabled: externalEndpoint.Enabled,
Name: externalEndpoint.Name,
Group: externalEndpoint.Group,
+ Headers: make(map[string]string),
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: externalEndpoint.Alerts,
NumberOfFailuresInARow: externalEndpoint.NumberOfFailuresInARow,
NumberOfSuccessesInARow: externalEndpoint.NumberOfSuccessesInARow,
diff --git a/watchdog/alerting.go b/watchdog/alerting.go
index 6fd7a2468..8a12d497d 100644
--- a/watchdog/alerting.go
+++ b/watchdog/alerting.go
@@ -4,6 +4,7 @@ import (
"errors"
"log"
"os"
+ "time"
"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/config/endpoint"
@@ -30,16 +31,31 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold > ep.NumberOfFailuresInARow {
continue
}
- if endpointAlert.Triggered {
+ // Determine if an initial alert should be sent
+ sendInitialAlert := !endpointAlert.Triggered
+ // Determine if a reminder should be sent
+ var lastReminder time.Duration
+ if lr, ok := ep.LastReminderSent[endpointAlert.Type]; ok && !lr.IsZero() {
+ lastReminder = time.Since(lr)
+ }
+ sendReminder := endpointAlert.Triggered && endpointAlert.RepeatInterval != nil &&
+ *endpointAlert.RepeatInterval > 0 && (lastReminder == 0 || lastReminder >= *endpointAlert.RepeatInterval)
+ // If neither initial alert nor reminder needs to be sent, skip to the next alert
+ if !sendInitialAlert && !sendReminder {
if debug {
- log.Printf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' has already been TRIGGERED, skipping", ep.Name, endpointAlert.GetDescription())
+				log.Printf("[watchdog.handleAlertsToTrigger] Alert of type=%s for endpoint=%s with description='%s' has already been TRIGGERED and is not due for a reminder (repeat-interval: %s, time since last reminder: %s), skipping",
+					endpointAlert.Type, ep.Name, endpointAlert.GetDescription(), endpointAlert.RepeatInterval, lastReminder)
}
continue
}
alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type)
if alertProvider != nil {
- log.Printf("[watchdog.handleAlertsToTrigger] Sending %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", endpointAlert.Type, ep.Name, endpointAlert.GetDescription())
var err error
+ alertType := "reminder"
+ if sendInitialAlert {
+ alertType = "initial"
+ }
+ log.Printf("[watchdog.handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, ep.Name, endpointAlert.GetDescription())
if os.Getenv("MOCK_ALERT_PROVIDER") == "true" {
if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" {
err = errors.New("error")
@@ -50,7 +66,11 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
if err != nil {
log.Printf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", ep.Name, err.Error())
} else {
- endpointAlert.Triggered = true
+ // Mark initial alert as triggered and update last reminder time
+ if sendInitialAlert {
+ endpointAlert.Triggered = true
+ }
+ ep.LastReminderSent[endpointAlert.Type] = time.Now()
if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil {
log.Printf("[watchdog.handleAlertsToTrigger] Failed to persist triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error())
}
@@ -93,6 +113,7 @@ func handleAlertsToResolve(ep *endpoint.Endpoint, result *endpoint.Result, alert
} else {
log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being RESOLVED, because the provider wasn't configured properly", endpointAlert.Type)
}
+ ep.LastReminderSent[endpointAlert.Type] = time.Now()
}
ep.NumberOfFailuresInARow = 0
}
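Pulled out for readability, the reminder decision in `handleAlertsToTrigger` can be restated as a small standalone predicate. `shouldSendReminder` below is a hypothetical helper written purely for illustration; it is not a function added by this diff:

```go
package main

import (
	"fmt"
	"time"
)

// shouldSendReminder restates the condition used in handleAlertsToTrigger: a reminder
// is only due while the alert is already triggered, a positive repeat-interval is
// configured, and either no reminder has been recorded yet or the last one is at
// least repeat-interval old.
func shouldSendReminder(triggered bool, repeatInterval *time.Duration, lastReminderSent time.Time) bool {
	if !triggered || repeatInterval == nil || *repeatInterval <= 0 {
		return false
	}
	return lastReminderSent.IsZero() || time.Since(lastReminderSent) >= *repeatInterval
}

func main() {
	interval := 10 * time.Minute
	fmt.Println(shouldSendReminder(true, &interval, time.Now().Add(-15*time.Minute))) // true: last reminder is overdue
	fmt.Println(shouldSendReminder(true, &interval, time.Now().Add(-5*time.Minute)))  // false: too soon since last reminder
	fmt.Println(shouldSendReminder(false, &interval, time.Time{}))                    // false: alert not triggered yet
}
```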
diff --git a/watchdog/alerting_test.go b/watchdog/alerting_test.go
index 914355e46..8fc75eb46 100644
--- a/watchdog/alerting_test.go
+++ b/watchdog/alerting_test.go
@@ -3,6 +3,7 @@ package watchdog
import (
"os"
"testing"
+ "time"
"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/alerting/alert"
@@ -38,7 +39,8 @@ func TestHandleAlerting(t *testing.T) {
}
enabled := true
ep := &endpoint.Endpoint{
- URL: "https://example.com",
+ URL: "https://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -82,7 +84,8 @@ func TestHandleAlertingWithBadAlertProvider(t *testing.T) {
enabled := true
ep := &endpoint.Endpoint{
- URL: "http://example.com",
+ URL: "http://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -117,7 +120,8 @@ func TestHandleAlertingWhenTriggeredAlertIsAlmostResolvedButendpointStartFailing
}
enabled := true
ep := &endpoint.Endpoint{
- URL: "https://example.com",
+ URL: "https://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -152,7 +156,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedButSendOnResolvedIsFalse(t *t
enabled := true
disabled := false
ep := &endpoint.Endpoint{
- URL: "https://example.com",
+ URL: "https://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -184,7 +189,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedPagerDuty(t *testing.T) {
}
enabled := true
ep := &endpoint.Endpoint{
- URL: "https://example.com",
+ URL: "https://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypePagerDuty,
@@ -220,7 +226,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedPushover(t *testing.T) {
}
enabled := true
ep := &endpoint.Endpoint{
- URL: "https://example.com",
+ URL: "https://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypePushover,
@@ -390,7 +397,8 @@ func TestHandleAlertingWithProviderThatReturnsAnError(t *testing.T) {
for _, scenario := range scenarios {
t.Run(scenario.Name, func(t *testing.T) {
ep := &endpoint.Endpoint{
- URL: "https://example.com",
+ URL: "https://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: scenario.AlertType,
@@ -450,7 +458,8 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
}
enabled := true
ep := &endpoint.Endpoint{
- URL: "https://example.com",
+ URL: "https://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -487,6 +496,49 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
verify(t, ep, 0, 2, false, "")
}
+func TestHandleAlertingWithRepeatInterval(t *testing.T) {
+ _ = os.Setenv("MOCK_ALERT_PROVIDER", "true")
+ defer os.Clearenv()
+
+ cfg := &config.Config{
+ Debug: true,
+ Alerting: &alerting.Config{
+ Custom: &custom.AlertProvider{
+ URL: "https://twin.sh/health",
+ Method: "GET",
+ },
+ },
+ }
+ enabled := true
+ repeatInterval := 1 * time.Second
+ ep := &endpoint.Endpoint{
+ URL: "https://example.com",
+ LastReminderSent: make(map[alert.Type]time.Time),
+ Alerts: []*alert.Alert{
+ {
+ Type: alert.TypeCustom,
+ Enabled: &enabled,
+ FailureThreshold: 2,
+ SuccessThreshold: 3,
+ SendOnResolved: &enabled,
+ Triggered: false,
+ RepeatInterval: &repeatInterval,
+ },
+ },
+ }
+
+ verify(t, ep, 0, 0, false, "The alert shouldn't start triggered")
+ HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
+ verify(t, ep, 1, 0, false, "The alert shouldn't have triggered")
+ HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
+ verify(t, ep, 2, 0, true, "The alert should've triggered")
+ HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
+ verify(t, ep, 3, 0, true, "The alert should still be triggered")
+ HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
+ verify(t, ep, 4, 0, true, "The alert should still be triggered")
+ HandleAlerting(ep, &endpoint.Result{Success: true}, cfg.Alerting, cfg.Debug)
+}
+
func verify(t *testing.T, ep *endpoint.Endpoint, expectedNumberOfFailuresInARow, expectedNumberOfSuccessInARow int, expectedTriggered bool, expectedTriggeredReason string) {
if ep.NumberOfFailuresInARow != expectedNumberOfFailuresInARow {
t.Errorf("endpoint.NumberOfFailuresInARow should've been %d, got %d", expectedNumberOfFailuresInARow, ep.NumberOfFailuresInARow)