fix(agent): ensure self-monitoring throttle resets at the end of interval (#173)
PierreDemailly authored Dec 17, 2023
1 parent e3df479 commit df26efd
Showing 6 changed files with 117 additions and 11 deletions.
1 change: 1 addition & 0 deletions src/agent/data/init-db.sql
@@ -94,6 +94,7 @@ CREATE TABLE IF NOT EXISTS agentFailures
message TEXT,
timestamp INTEGER,
count INTEGER DEFAULT 1,
+ processed INTEGER DEFAULT 0,
FOREIGN KEY(ruleId)
REFERENCES rules(id)
ON UPDATE CASCADE
2 changes: 2 additions & 0 deletions src/agent/src/database.ts
@@ -73,6 +73,8 @@ export interface DbAgentFailure {
ruleId: number;
message: string;
timestamp: number;
+ count: number;
+ processed: number;
}

export interface DbAgentFailureAlert {
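A minimal sketch of what the new processed flag expresses (the SELECT below is the same query the selfMonitoring.ts change introduces; everything around it is illustrative, not part of the commit): a failure row stays at processed = 0 until a throttle evaluation has seen it, and a row that is both processed and still inside the throttle interval keeps the throttle on.

import { DbAgentFailure, getDB } from "../database";

// Most recent failure, mirroring the query added in hasAgentFailureThrottle().
const lastAgentFailureAlert = getDB()
  .prepare("SELECT * FROM agentFailures ORDER BY count DESC LIMIT 1")
  .get() as DbAgentFailure | undefined;

if (lastAgentFailureAlert?.processed) {
  // this failure was already counted by a previous self-monitoring evaluation;
  // whether a new alert goes out now depends only on the throttle interval
}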
47 changes: 37 additions & 10 deletions src/agent/src/utils/selfMonitoring.ts
@@ -1,5 +1,6 @@
// Import Third-party Dependencies
import { SigynInitializedSelfMonitoring, getConfig } from "@sigyn/config";
+ import { Result, Ok, Err } from "@openally/result";

// Import Internal Dependencies
import { DbAgentFailure, getDB } from "../database";
@@ -18,9 +19,9 @@ export function getAgentFailureRules(alert: AgentFailureAlert): string {
return failures.map(({ name }) => name).join(", ");
}

- function hasAgentFailureThrottle(throttle: SigynInitializedSelfMonitoring["throttle"]) {
+ function hasAgentFailureThrottle(throttle: SigynInitializedSelfMonitoring["throttle"]): Result<string, string> {
if (!throttle) {
- return false;
+ return Err("no throttle for the given rule");
}

const { interval, count, activationThreshold } = throttle;
Expand All @@ -29,21 +30,46 @@ function hasAgentFailureThrottle(throttle: SigynInitializedSelfMonitoring["throt
const agentFailuresAlert = (getDB()
.prepare("SELECT count FROM agentFailures WHERE timestamp >= ? ORDER BY count DESC")
.get(intervalDate) as { count: number });
+ const lastAgentFailureAlert = (getDB()
+ .prepare("SELECT * FROM agentFailures ORDER BY count DESC LIMIT 1")
+ .get() as DbAgentFailure);
+ getDB().exec("UPDATE agentFailures SET processed = 1");

const agentFailuresAlertCount = agentFailuresAlert?.count ?? 0;
+ const countThresholdExceeded = count > 0 && agentFailuresAlertCount - activationThreshold > count;
+ const activationThresholdExceeded = activationThreshold > 0 && agentFailuresAlertCount <= activationThreshold;
+ const intervalExceeded = lastAgentFailureAlert.processed && lastAgentFailureAlert.timestamp > intervalDate;

+ function logMessage(throttle: boolean, details: string) {
+ // eslint-disable-next-line max-len
+ return `(throttle: ${throttle ? "on" : "off"}|details: ${details}|processed: ${lastAgentFailureAlert.processed}|lastAlertTime: ${lastAgentFailureAlert.timestamp}|activationThreshold: ${activationThreshold}|agentFailuresCount: ${agentFailuresAlertCount}|count: ${count})`;
+ }

+ if (!activationThresholdExceeded && intervalExceeded && !countThresholdExceeded) {
+ return Ok(logMessage(true, "within interval"));
+ }
+ else if (lastAgentFailureAlert.processed && activationThreshold === 0) {
+ return Err(logMessage(false, "interval exceeded"));
+ }

- if (activationThreshold > 0 && agentFailuresAlertCount <= activationThreshold) {
- return false;
+ if (activationThresholdExceeded) {
+ return Err(logMessage(false, "activation threshold exceeded"));
}

- if (count > 0 && agentFailuresAlertCount - activationThreshold > count) {
- return false;
+ if (countThresholdExceeded) {
+ return Err(logMessage(false, "count threshold exceeded"));
}

if (activationThreshold > 0 && agentFailuresAlertCount > activationThreshold) {
- return true;
+ return Ok(logMessage(true, "failures count > activationThreshold"));
}

- return agentFailuresAlertCount === 1 ? false : agentFailuresAlertCount - activationThreshold <= count;
+ const hasThrottle = agentFailuresAlertCount === 1 ? false : agentFailuresAlertCount - activationThreshold <= count;

+ return hasThrottle ?
+ Ok(logMessage(true, "failures count < activationThreshold + count")) :
+ Err(logMessage(false, "failures count > activationThreshold + count"));
}

export function handleAgentFailure(errorMessage: string, rule: Rule, logger: Logger) {
@@ -80,9 +106,10 @@ export function handleAgentFailure(errorMessage: string, rule: Rule, logger: Log

const agentFailures = getDB().prepare("SELECT * FROM agentFailures").all() as DbAgentFailure[];
if (agentFailures.length > minimumErrorCount) {
- if (hasAgentFailureThrottle(config.selfMonitoring.throttle)) {
- logger.info(`[SELF MONITORING](skip: throttle is activated)`);
+ const throttle = hasAgentFailureThrottle(config.selfMonitoring.throttle);
+ logger.info(`[SELF MONITORING]${throttle.val}`);

+ if (throttle.ok) {
return;
}

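For readers new to @openally/result, the sketch below is illustrative only: the threshold rule and numbers are made up, and only the Ok/Err/.ok/.val usage mirrors the commit. It shows how the caller now interprets the return value of hasAgentFailureThrottle: Ok(message) means the throttle is active and the self-monitoring alert is skipped, Err(message) means no throttle applies and the alert is sent, and .val carries the log details in both cases.

import { Result, Ok, Err } from "@openally/result";

// Hypothetical, heavily reduced stand-in for hasAgentFailureThrottle():
// throttle the alert once the failure count goes past the activation threshold.
function hasThrottleSketch(failuresCount: number, activationThreshold: number): Result<string, string> {
  if (activationThreshold > 0 && failuresCount > activationThreshold) {
    return Ok(`(throttle: on|failures: ${failuresCount})`);
  }

  return Err(`(throttle: off|failures: ${failuresCount})`);
}

const throttle = hasThrottleSketch(12, 10);
// the log line is emitted whether or not the throttle is active
console.log(`[SELF MONITORING]${throttle.val}`);

if (throttle.ok) {
  // throttle active: skip sending the self-monitoring alert, just like the
  // `return;` in handleAgentFailure() above
}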
3 changes: 2 additions & 1 deletion src/agent/test/FT/database.spec.ts
@@ -66,7 +66,8 @@ const kExpectedTablesColumns = {
{ name: "ruleId", dflt_value: null, type: "INTEGER", pk: 0, notnull: 0 },
{ name: "message", dflt_value: null, type: "TEXT", pk: 0, notnull: 0 },
{ name: "timestamp", dflt_value: null, type: "INTEGER", pk: 0, notnull: 0 },
- { name: "count", dflt_value: 1, type: "INTEGER", pk: 0, notnull: 0 }
+ { name: "count", dflt_value: 1, type: "INTEGER", pk: 0, notnull: 0 },
+ { name: "processed", dflt_value: 0, type: "INTEGER", pk: 0, notnull: 0 }
],
compositeRuleAlerts: [
{ name: "id", dflt_value: null, type: "INTEGER", pk: 1, notnull: 0 },
38 changes: 38 additions & 0 deletions src/agent/test/FT/fixtures/self-monitoring-interval/sigyn.config.json
@@ -0,0 +1,38 @@
{
"loki": {
"apiUrl": "http://localhost:3100"
},
"notifiers": {
"discord": {
"notifier": "../../test/FT/mocks/sigyn-test-notifier.js",
"webhookUrl": "https://discord.com/api/webhooks/aaa/bbb"
}
},
"rules": [
{
"name": "State KO >= 80%",
"logql": "{app=\"sigyn\"} |~ `state: (ok|ko)` | regexp `state: (?P<state>ok|ko)`",
"polling": "200ms",
"alert": {
"on": {
"label": "state",
"value": "ko",
"percentThreshold": 80,
"minimumLabelCount": 10
},
"template": {
"title": "Alert"
}
}
}
],
"selfMonitoring": {
"notifiers": ["discord"],
"template": {
"title": "foo"
},
"throttle": {
"interval": "5s"
}
}
}
37 changes: 37 additions & 0 deletions src/agent/test/FT/selfMonitoring.spec.ts
@@ -24,6 +24,7 @@ const kRuleMatchErrorFiltersConfigLocation = path.join(kFixturePath, "/no-self-m
const kRuleNoFiltersConfigLocation = path.join(kFixturePath, "/no-self-monitoring-filters/sigyn.config.json");
const kRuleThrottleConfigLocation = path.join(kFixturePath, "/self-monitoring-throttle/sigyn.config.json");
const kRuleActivationThresholdConfigLocation = path.join(kFixturePath, "/self-monitoring-activation-threshold/sigyn.config.json");
+ const kRuleIntervalThrottleConfigLocation = path.join(kFixturePath, "/self-monitoring-interval/sigyn.config.json");
const kLogger = new MockLogger();
const kMockLokiApi = {
Loki: {
@@ -244,4 +245,40 @@ describe("Self-monitoring", () => {
await setTimeout(kTimeout);
assert.equal(getCalls(), 4);
});

it("should disable throttle after interval", async() => {
const config = await initConfig(kRuleIntervalThrottleConfigLocation);
const rule = new Rule(config.rules[0], { logger: kLogger });
rule.init();

const task = asyncTask(
config.rules[0], {
logger: kLogger,
lokiApi: kMockLokiApi as any,
rule
}
);

task.execute();
await setTimeout(kTimeout);
// first alert, no throttle
assert.equal(getCalls(), 1);

task.execute();
await setTimeout(kTimeout);
// throttle activated, still 1 call
assert.equal(getCalls(), 1);

task.execute();
await setTimeout(kTimeout);
// throttle activated, still 1 call
assert.equal(getCalls(), 1);

// wait 5s (the interval value)
await setTimeout(5000);
task.execute();
await setTimeout(kTimeout);
// throttle deactivated, now 2 calls
assert.equal(getCalls(), 2);
});
});
