feat: repeat-interval #852

Open · wants to merge 8 commits into base: master
19 changes: 10 additions & 9 deletions README.md
@@ -520,15 +520,16 @@ individual endpoints with configurable descriptions and thresholds.

Alerts are configured at the endpoint level like so:

| Parameter | Description | Default |
|:----------------------------------|:-------------------------------------------------------------------------------|:--------------|
| `alerts` | List of all alerts for a given endpoint. | `[]` |
| `alerts[].type` | Type of alert. <br />See table below for all valid types. | Required `""` |
| `alerts[].enabled` | Whether to enable the alert. | `true` |
| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` |
| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` |
| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` |
| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` |
| `alerts[].repeat-interval`         | Duration to wait between reminder notifications while the alert remains triggered. <br />Reminders are disabled if not set. | `""`          |

Here's an example of what an alert configuration might look like at the endpoint level:
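As a minimal sketch, an endpoint alert using the new `repeat-interval` option could look like the following; the `discord` alert type, the endpoint itself, and the `10m` value are illustrative, and `repeat-interval` is assumed to accept a duration string in the same way as other duration fields such as `interval`:

```yaml
endpoints:
  - name: website
    url: "https://twin.sh/health"
    interval: 5m
    conditions:
      - "[STATUS] == 200"
    alerts:
      - type: discord
        failure-threshold: 3
        success-threshold: 2
        send-on-resolved: true
        description: "healthcheck failed"
        # Assumed duration-string syntax, parsed the same way as other
        # duration fields such as interval; a reminder is re-sent every
        # 10 minutes while the alert stays triggered.
        repeat-interval: 10m
```

With a configuration like this, the initial notification would go out after 3 consecutive failures, reminders would be re-sent roughly every 10 minutes for as long as the endpoint keeps failing, and the incident would be marked as resolved after 2 consecutive successes.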
4 changes: 4 additions & 0 deletions alerting/alert/alert.go
@@ -6,6 +6,7 @@ import (
"errors"
"strconv"
"strings"
"time"
)

var (
@@ -48,6 +49,9 @@ type Alert struct {
// ongoing/triggered incidents
ResolveKey string `yaml:"-"`

// RepeatInterval is the interval between reminders
RepeatInterval *time.Duration `yaml:"repeat-interval"`

// Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value
// should be set back to false. It is used to prevent the same alert from going out twice.
//
3 changes: 3 additions & 0 deletions alerting/provider/provider.go
@@ -57,6 +57,9 @@ func ParseWithDefaultAlert(providerDefaultAlert, endpointAlert *alert.Alert) {
if endpointAlert.SuccessThreshold == 0 {
endpointAlert.SuccessThreshold = providerDefaultAlert.SuccessThreshold
}
if endpointAlert.RepeatInterval == nil {
endpointAlert.RepeatInterval = providerDefaultAlert.RepeatInterval
}
}

var (
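Since `ParseWithDefaultAlert` copies `RepeatInterval` from the provider's default alert whenever the endpoint alert leaves it unset, a provider-wide default could presumably be configured through the existing `default-alert` mechanism. The sketch below assumes the `discord` provider and a `30m` value, both illustrative:

```yaml
alerting:
  discord:
    webhook-url: "https://discord.com/api/webhooks/..."
    default-alert:
      failure-threshold: 3
      send-on-resolved: true
      # Assumption: repeat-interval is inherited by endpoint alerts just like
      # the other default-alert fields handled by ParseWithDefaultAlert.
      repeat-interval: 30m

endpoints:
  - name: api
    url: "https://example.org/health"
    conditions:
      - "[STATUS] == 200"
    alerts:
      - type: discord # inherits repeat-interval: 30m from default-alert
```

Endpoint alerts that set their own `repeat-interval` would keep their value, since the default is only applied when the field is `nil`.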
6 changes: 6 additions & 0 deletions config/endpoint/endpoint.go
@@ -121,6 +121,9 @@ type Endpoint struct {

// NumberOfSuccessesInARow is the number of successful evaluations in a row
NumberOfSuccessesInARow int `yaml:"-"`

// LastReminderSent maps each alert type to the time at which the last reminder was sent for this endpoint.
LastReminderSent map[alert.Type]time.Time `yaml:"-"`
}

// IsEnabled returns whether the endpoint is enabled or not
@@ -190,6 +193,9 @@ func (e *Endpoint) ValidateAndSetDefaults() error {
if len(e.Headers) == 0 {
e.Headers = make(map[string]string)
}
if len(e.LastReminderSent) == 0 {
e.LastReminderSent = make(map[alert.Type]time.Time)
}
// Automatically add user agent header if there isn't one specified in the endpoint configuration
if _, userAgentHeaderExists := e.Headers[UserAgentHeader]; !userAgentHeaderExists {
e.Headers[UserAgentHeader] = GatusUserAgent
3 changes: 3 additions & 0 deletions config/endpoint/external_endpoint.go
@@ -2,6 +2,7 @@ package endpoint

import (
"errors"
"time"

"github.com/TwiN/gatus/v5/alerting/alert"
)
@@ -75,6 +76,8 @@ func (externalEndpoint *ExternalEndpoint) ToEndpoint() *Endpoint {
Enabled: externalEndpoint.Enabled,
Name: externalEndpoint.Name,
Group: externalEndpoint.Group,
Headers: make(map[string]string),
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: externalEndpoint.Alerts,
NumberOfFailuresInARow: externalEndpoint.NumberOfFailuresInARow,
NumberOfSuccessesInARow: externalEndpoint.NumberOfSuccessesInARow,
29 changes: 25 additions & 4 deletions watchdog/alerting.go
@@ -4,6 +4,7 @@ import (
"errors"
"log"
"os"
"time"

"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/config/endpoint"
@@ -30,16 +31,31 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold > ep.NumberOfFailuresInARow {
continue
}
if endpointAlert.Triggered {
// Determine if an initial alert should be sent
sendInitialAlert := !endpointAlert.Triggered
// Determine if a reminder should be sent
var lastReminder time.Duration
if lr, ok := ep.LastReminderSent[endpointAlert.Type]; ok && !lr.IsZero() {
lastReminder = time.Since(lr)
}
sendReminder := endpointAlert.Triggered && endpointAlert.RepeatInterval != nil &&
*endpointAlert.RepeatInterval > 0 && (lastReminder == 0 || lastReminder >= *endpointAlert.RepeatInterval)
// If neither initial alert nor reminder needs to be sent, skip to the next alert
if !sendInitialAlert && !sendReminder {
if debug {
log.Printf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' has already been TRIGGERED, skipping", ep.Name, endpointAlert.GetDescription())
log.Printf("[watchdog.handleAlertsToTrigger] Alert %s for endpoint=%s with description='%s' is not due for triggering (interval: %s last: %s), skipping",
endpointAlert.Type, ep.Name, endpointAlert.GetDescription(), endpointAlert.RepeatInterval, lastReminder)
}
continue
}
alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type)
if alertProvider != nil {
log.Printf("[watchdog.handleAlertsToTrigger] Sending %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", endpointAlert.Type, ep.Name, endpointAlert.GetDescription())
var err error
alertType := "reminder"
if sendInitialAlert {
alertType = "initial"
}
log.Printf("[watchdog.handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, ep.Name, endpointAlert.GetDescription())
if os.Getenv("MOCK_ALERT_PROVIDER") == "true" {
if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" {
err = errors.New("error")
@@ -50,7 +66,11 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
if err != nil {
log.Printf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", ep.Name, err.Error())
} else {
endpointAlert.Triggered = true
// Mark initial alert as triggered and update last reminder time
if sendInitialAlert {
endpointAlert.Triggered = true
}
ep.LastReminderSent[endpointAlert.Type] = time.Now()
if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil {
log.Printf("[watchdog.handleAlertsToTrigger] Failed to persist triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error())
}
@@ -93,6 +113,7 @@ func handleAlertsToResolve(ep *endpoint.Endpoint, result *endpoint.Result, alert
} else {
log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being RESOLVED, because the provider wasn't configured properly", endpointAlert.Type)
}
ep.LastReminderSent[endpointAlert.Type] = time.Now()
}
ep.NumberOfFailuresInARow = 0
}
68 changes: 60 additions & 8 deletions watchdog/alerting_test.go
@@ -3,6 +3,7 @@ package watchdog
import (
"os"
"testing"
"time"

"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/alerting/alert"
@@ -38,7 +39,8 @@ func TestHandleAlerting(t *testing.T) {
}
enabled := true
ep := &endpoint.Endpoint{
URL: "https://example.com",
URL: "https://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -82,7 +84,8 @@ func TestHandleAlertingWithBadAlertProvider(t *testing.T) {

enabled := true
ep := &endpoint.Endpoint{
URL: "http://example.com",
URL: "http://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -117,7 +120,8 @@ func TestHandleAlertingWhenTriggeredAlertIsAlmostResolvedButendpointStartFailing
}
enabled := true
ep := &endpoint.Endpoint{
URL: "https://example.com",
URL: "https://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -152,7 +156,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedButSendOnResolvedIsFalse(t *t
enabled := true
disabled := false
ep := &endpoint.Endpoint{
URL: "https://example.com",
URL: "https://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -184,7 +189,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedPagerDuty(t *testing.T) {
}
enabled := true
ep := &endpoint.Endpoint{
URL: "https://example.com",
URL: "https://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypePagerDuty,
@@ -220,7 +226,8 @@ func TestHandleAlertingWhenTriggeredAlertIsResolvedPushover(t *testing.T) {
}
enabled := true
ep := &endpoint.Endpoint{
URL: "https://example.com",
URL: "https://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypePushover,
@@ -390,7 +397,8 @@ func TestHandleAlertingWithProviderThatReturnsAnError(t *testing.T) {
for _, scenario := range scenarios {
t.Run(scenario.Name, func(t *testing.T) {
ep := &endpoint.Endpoint{
URL: "https://example.com",
URL: "https://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: scenario.AlertType,
@@ -450,7 +458,8 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
}
enabled := true
ep := &endpoint.Endpoint{
URL: "https://example.com",
URL: "https://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
@@ -487,6 +496,49 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
verify(t, ep, 0, 2, false, "")
}

func TestHandleAlertingWithRepeatInterval(t *testing.T) {
_ = os.Setenv("MOCK_ALERT_PROVIDER", "true")
defer os.Clearenv()

cfg := &config.Config{
Debug: true,
Alerting: &alerting.Config{
Custom: &custom.AlertProvider{
URL: "https://twin.sh/health",
Method: "GET",
},
},
}
enabled := true
repeatInterval := 1 * time.Second
ep := &endpoint.Endpoint{
URL: "https://example.com",
LastReminderSent: make(map[alert.Type]time.Time),
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
Enabled: &enabled,
FailureThreshold: 2,
SuccessThreshold: 3,
SendOnResolved: &enabled,
Triggered: false,
RepeatInterval: &repeatInterval,
},
},
}

verify(t, ep, 0, 0, false, "The alert shouldn't start triggered")
HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, ep, 1, 0, false, "The alert shouldn't have triggered")
HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, ep, 2, 0, true, "The alert should've triggered")
HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, ep, 3, 0, true, "The alert should still be triggered")
HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, ep, 4, 0, true, "The alert should still be triggered")
HandleAlerting(ep, &endpoint.Result{Success: true}, cfg.Alerting, cfg.Debug)
}

func verify(t *testing.T, ep *endpoint.Endpoint, expectedNumberOfFailuresInARow, expectedNumberOfSuccessInARow int, expectedTriggered bool, expectedTriggeredReason string) {
if ep.NumberOfFailuresInARow != expectedNumberOfFailuresInARow {
t.Errorf("endpoint.NumberOfFailuresInARow should've been %d, got %d", expectedNumberOfFailuresInARow, ep.NumberOfFailuresInARow)