Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A more detailed list of clients #63

Merged
merged 7 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ receivers:
...
- name: cos-alerter
webhook_configs:
- url: http://<cos-alerter-address>:8080/alive?clientid=<clientid>
- url: http://<cos-alerter-address>:8080/alive?clientid=<clientid>&key=<clientkey>
dstathis marked this conversation as resolved.
Show resolved Hide resolved
# The above URL should be configured with the appropriate values for clientid and key.
# - clientid: Unique identifier for the Alertmanager instance.
# - key: Secret key for authenticating and authorizing communication with COS Alerter.
route:
...
routes:
Expand Down
39 changes: 28 additions & 11 deletions cos_alerter/alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
"""Main logic for COS Alerter."""

import datetime
import hashlib
import logging
import os
import sys
import textwrap
import threading
import time
Expand All @@ -14,7 +16,8 @@

import apprise
import durationpy
import yaml
from ruamel.yaml import YAML
from ruamel.yaml.constructor import DuplicateKeyError

logger = logging.getLogger(__name__)

Expand All @@ -30,15 +33,29 @@ def set_path(self, path: str):
"""Set the config file path."""
self.path = Path(path)

def _hash_keys(self, clients):
    """Replace each client's plaintext key with its SHA-256 hex digest.

    The ``clients`` mapping is modified in place. Entries whose "key" is
    missing or empty are left untouched.
    """
    for entry in clients.values():
        plaintext = entry.get("key", "")
        if not plaintext:
            continue
        entry["key"] = hashlib.sha256(plaintext.encode()).hexdigest()

IbraAoad marked this conversation as resolved.
Show resolved Hide resolved
def reload(self):
"""Reload config values from the disk."""
yaml = YAML(typ="rt")
with open(
os.path.join(os.path.dirname(os.path.realpath(__file__)), "config-defaults.yaml")
) as f:
self.data = yaml.safe_load(f)
self.data = yaml.load(f)
with open(self.path, "r") as f:
user_data = yaml.safe_load(f)
try:
user_data = yaml.load(f)
except DuplicateKeyError:
logger.error("Duplicate client IDs found in COS Alerter config. Exiting...")
IbraAoad marked this conversation as resolved.
Show resolved Hide resolved
sys.exit(1)
deep_update(self.data, user_data)
self._hash_keys(self.data["watch"]["clients"])
self.data["watch"]["down_interval"] = durationpy.from_str(
self.data["watch"]["down_interval"]
).total_seconds()
Expand All @@ -50,15 +67,15 @@ def reload(self):
def deep_update(base: dict, new: typing.Optional[dict]):
    """Deep dict update.

    Same as dict.update() except it recurses into subdicts.

    ``base`` is modified in place. When ``new`` is None (for example, an empty
    config file) ``base`` is left unchanged. Keys present only in ``new`` are
    added to ``base``.
    """
    if new is None:
        return
    for key, new_value in new.items():
        # Only merge when both sides are dicts; otherwise the new value wins
        # outright, so a scalar can replace a dict default and vice versa.
        if key in base and isinstance(base[key], dict) and isinstance(new_value, dict):
            deep_update(base[key], new_value)
        else:
            base[key] = new_value


config = Config()
Expand Down Expand Up @@ -120,9 +137,9 @@ def initialize():
# ...
# }
state["clients"] = {}
for client in config["watch"]["clients"]:
for client_id in config["watch"]["clients"]:
alert_time = None if config["watch"]["wait_for_first_connection"] else current_time
state["clients"][client] = {
state["clients"][client_id] = {
"lock": threading.Lock(),
"alert_time": alert_time,
"notify_time": None,
Expand Down
12 changes: 8 additions & 4 deletions cos_alerter/config-defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@ watch:
wait_for_first_connection: true

# The list of Alertmanager instances we are monitoring. Alertmanager instances should be
# configured with the clientid=<client> parameter.
# configured with the clientid=<clientid>&key=<clientkey> parameters.
IbraAoad marked this conversation as resolved.
Show resolved Hide resolved
# eg:
# clients:
# - "client0"
# - "client1"
clients: []
# clientid1:
# key: "clientkey1"
# name: "Instance Name 1"
# clientid2:
# key: "clientkey2"
# name: "Instance Name 2"
clients: {}

notify:

Expand Down
32 changes: 23 additions & 9 deletions cos_alerter/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""HTTP server for COS Alerter."""

import hashlib
import logging

import timeago
Expand All @@ -28,9 +29,10 @@ def dashboard():
status = "up" if not state.is_down() else "down"
if last_alert is None:
status = "unknown"
client_name = config["watch"]["clients"][clientid].get("name", "")
clients.append(
{
"clientid": clientid,
"client_name": client_name,
"status": status,
"alert_time": alert_time,
}
Expand All @@ -44,16 +46,28 @@ def alive():
# TODO Decide if we should validate the request.
params = request.args
clientid_list = params.getlist("clientid") # params is a werkzeug.datastructures.MultiDict
if len(clientid_list) < 1:
logger.warning("Request %s has no clientid.", request.url)
return 'Parameter "clientid" required.', 400
if len(clientid_list) > 1:
logger.warning("Request %s specified clientid more than once.", request.url)
return 'Parameter "clientid" provided more than once.', 400
key_list = params.getlist("key")

if len(clientid_list) < 1 or len(key_list) < 1:
logger.warning("Request %s is missing clientid or key.", request.url)
return 'Parameters "clientid" and "key" are required.', 400
if len(clientid_list) > 1 or len(key_list) > 1:
logger.warning("Request %s specified clientid or key more than once.", request.url)
return 'Parameters "clientid" and "key" should be provided exactly once.', 400
clientid = clientid_list[0]
if clientid not in config["watch"]["clients"]:
logger.warning("Request %s specified an unknown clientid.")
key = key_list[0]

# Find the client with the specified clientid
client_info = config["watch"]["clients"].get(clientid)
if not client_info:
logger.warning("Request %s specified an unknown clientid.", request.url)
return 'Clientid {params["clientid"]} not found. ', 404

# Hash the key and compare with the stored hashed key
hashed_key = hashlib.sha256(key.encode()).hexdigest()
if hashed_key != client_info.get("key", ""):
logger.warning("Request %s provided an incorrect key.", request.url)
return "Incorrect key for the specified clientid.", 401
logger.info("Received alert from Alertmanager clientid: %s.", clientid)
with AlerterState(clientid) as state:
state.reset_alert_timeout()
Expand Down
2 changes: 1 addition & 1 deletion cos_alerter/templates/dashboard.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ <h2>Clients</h2>
<tbody>
{% for client in clients %}
<tr>
<td>{{ client["clientid"] }}</td>
<td>{{ client["client_name"] }}</td>
{% if client["status"] == "up" %}
<td>✅ Up</td>
{% elif client["status"] == "down" %}
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"pyyaml~=6.0",
"timeago~=1.0",
"waitress~=2.1",
"ruamel.yaml~=0.18.0"
]

[project.urls]
Expand Down
7 changes: 6 additions & 1 deletion tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
"watch": {
"down_interval": "5m",
"wait_for_first_connection": False,
"clients": ["client0"],
"clients": {
"clientid1": {
"key": "clientkey1",
"name": "Instance Name 1",
},
},
},
"notify": {
"destinations": DESTINATIONS,
Expand Down
41 changes: 33 additions & 8 deletions tests/test_alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,31 @@ def test_config_default_empty_file(fake_fs):
assert config["watch"]["down_interval"] == 300


def test_duplicate_key_error(fake_fs):
    """Reloading a config that repeats a client ID must exit with status 1."""
    # The nesting matters: both "clientid1" entries must sit under "clients"
    # for the second one to be a duplicate mapping key.
    duplicate_config = """
    watch:
      down_interval: "5m"
      wait_for_first_connection: true
      clients:
        clientid1:
          key: "clientkey1"
          name: "Instance Name 1"
        clientid1:
          key: "clientkey1"
          name: "Instance Name 1"
    """
    with open("/etc/cos-alerter.yaml", "w") as f:
        f.write(duplicate_config)

    try:
        config.reload()
    except SystemExit as exc:
        assert exc.code == 1
    else:
        # Raise explicitly: a bare `assert False` is removed under `python -O`,
        # which would let this test pass without checking anything.
        raise AssertionError("config.reload() did not exit on duplicate client IDs")


def test_config_default_partial_file(fake_fs):
conf = yaml.dump({"log_level": "info"})
with open("/etc/cos-alerter.yaml", "w") as f:
Expand All @@ -50,7 +75,7 @@ def test_config_default_override(fake_fs):
def test_initialize(monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="client0")
state = AlerterState(clientid="clientid1")
with state:
assert state.start_date == 1672531200.0
assert state.start_time == 1000
Expand All @@ -72,7 +97,7 @@ def test_up_time(monotonic_mock, fake_fs):
def test_is_down_from_initialize(monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="client0")
state = AlerterState(clientid="clientid1")
with state:
monotonic_mock.return_value = 1180 # Three minutes have passed
assert state.is_down() is False
Expand All @@ -85,7 +110,7 @@ def test_is_down_from_initialize(monotonic_mock, fake_fs):
def test_is_down_with_reset_alert_timeout(monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="client0")
state = AlerterState(clientid="clientid1")
with state:
monotonic_mock.return_value = 2000
state.reset_alert_timeout()
Expand All @@ -106,7 +131,7 @@ def test_is_down_with_wait_for_first_connection(monotonic_mock, fake_fs):
config.reload()
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="client0")
state = AlerterState(clientid="clientid1")
with state:
monotonic_mock.return_value = 1500
assert state.is_down() is False # 6 minutes have passed but we have not started counting.
Expand All @@ -122,7 +147,7 @@ def test_is_down_with_wait_for_first_connection(monotonic_mock, fake_fs):
def test_is_down(monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="client0")
state = AlerterState(clientid="clientid1")
with state:
monotonic_mock.return_value = 2000
state.reset_alert_timeout()
Expand All @@ -137,7 +162,7 @@ def test_is_down(monotonic_mock, fake_fs):
def test_recently_notified(monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="client0")
state = AlerterState(clientid="clientid1")
with state:
state._set_notify_time()
monotonic_mock.return_value = 2800 # 30 minutes have passed
Expand All @@ -153,7 +178,7 @@ def test_recently_notified(monotonic_mock, fake_fs):
def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="client0")
state = AlerterState(clientid="clientid1")

with state:
state.notify()
Expand All @@ -166,7 +191,7 @@ def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs):
title="**Alertmanager is Down!**",
body=textwrap.dedent(
"""
Your Alertmanager instance: client0 seems to be down!
Your Alertmanager instance: clientid1 seems to be down!
It has not alerted COS-Alerter ever.
"""
),
Expand Down
16 changes: 14 additions & 2 deletions tests/test_daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@ def mock_fs(fake_fs):
"watch": {
"down_interval": "4s",
"wait_for_first_connection": False,
"clients": ["client0"],
"clients": {
"clientid1": {
"key": "clientkey1",
"name": "Instance Name 1",
},
},
},
"notify": {
"destinations": DESTINATIONS,
Expand All @@ -53,7 +58,14 @@ def test_main(notify_mock, add_mock, mock_fs):
main_thread.start()
time.sleep(2) # Should not be considered down yet.
notify_mock.assert_not_called()
subprocess.call(["curl", "-X", "POST", "http://localhost:8080/alive?clientid=client0"])
subprocess.call(
[
"curl",
"-X",
"POST",
"http://localhost:8080/alive?clientid=clientid1&key=clientkey1",
]
)
time.sleep(3) # Would be considered down but we just sent an alive call.
notify_mock.assert_not_called()
time.sleep(3) # It has been > 4 seconds since we last alerted so it should be down.
Expand Down
Loading
Loading