Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: First iteration of a prometheus exporter for ara #483

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
282 changes: 282 additions & 0 deletions ara/cli/prometheus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
# Copyright (c) 2023 The ARA Records Ansible authors
# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)

import logging
import sys
import time
from collections import defaultdict
from datetime import datetime, timedelta

from cliff.command import Command

import ara.cli.utils as cli_utils
from ara.cli.base import global_arguments
from ara.clients.utils import get_client

try:
from prometheus_client import Gauge, Summary, start_http_server

HAS_PROMETHEUS_CLIENT = True
except ImportError:
HAS_PROMETHEUS_CLIENT = False

# Labels attached to the metrics (where possible and relevant) so that prometheus
# queries can filter and aggregate on these properties.
# TODO: make configurable
DEFAULT_PLAYBOOK_LABELS = [
    "ansible_version",
    "client_version",
    "controller",
    "name",
    "path",
    "python_version",
    "server_version",
    "status",
    "updated",
    "user",
]
DEFAULT_TASK_LABELS = [
    "action",
    "name",
    "path",
    "playbook",
    "status",
    "updated",
]
DEFAULT_HOST_LABELS = [
    "name",
    "playbook",
    "updated",
]


# TODO: This could be made more flexible and live in a library
def get_search_results(client, kind, limit, created_after):
    """
    Return every result of an API search, following pagination until the end.

    client: API client exposing get(uri) and an "endpoint" attribute
    kind: string, one of ["playbooks", "hosts", "tasks"]
    limit: int, the number of items to return per page
    created_after: string or None, a date formatted as such: 2020-01-31T15:45:36.737000Z
                   (no date filter is added to the query when None)
    """
    uri = f"/api/v1/{kind}?order=-id&limit={limit}"
    if created_after is not None:
        uri = f"{uri}&created_after={created_after}"

    page = client.get(uri)
    collected = page["results"]
    next_page = page["next"]

    # "next" is an absolute URL, for example:
    # "next": "https://demo.recordsansible.org/api/v1/playbooks?limit=1000&offset=2000",
    # so the endpoint prefix is stripped before handing it back to the client.
    while next_page:
        page = client.get(next_page.replace(client.endpoint, ""))
        collected.extend(page["results"])
        next_page = page["next"]

    return collected


class AraPlaybookCollector(object):
    """Turns playbooks returned by the ara API into prometheus metrics."""

    def __init__(self, client, log, limit, labels=DEFAULT_PLAYBOOK_LABELS):
        """
        client: API client handed to get_search_results
        log: logger used to report progress
        limit: int, number of playbooks requested per page
        labels: playbook fields attached as labels to the "ara_playbooks" summary
        """
        self.client = client
        self.log = log
        self.limit = limit
        self.labels = labels

        range_gauge = Gauge("ara_playbooks_range", "Limit metric collection to the N most recent playbooks")
        total_gauge = Gauge("ara_playbooks_total", "Total number of playbooks recorded by ara")
        durations = Summary(
            "ara_playbooks", "Labels and duration (in seconds) of playbooks recorded by ara", labels
        )
        self.metrics = {"range": range_gauge, "total": total_gauge, "playbooks": durations}
        range_gauge.set(self.limit)

    def collect_metrics(self, created_after=None):
        """
        Query playbooks created after the given timestamp and record them.

        Returns the timestamp to use for the next query: the incremented "created"
        value of the first result, or created_after unchanged when nothing came back.
        """
        results = get_search_results(self.client, "playbooks", self.limit, created_after)
        if not results:
            return created_after

        # Results are ordered by descending id, so the first one carries the most
        # recent timestamp; remember it so we only scrape beyond it next time.
        newest = cli_utils.increment_timestamp(results[0]["created"])
        self.log.info(f"updating metrics for {len(results)} playbooks...")

        summary = self.metrics["playbooks"]
        total = self.metrics["total"]
        for item in results:
            # The API returns the duration as a string: convert it back to seconds
            # so it can be used as the observed value. A missing duration counts as 0.
            seconds = 0
            raw_duration = item["duration"]
            if raw_duration is not None:
                # TODO: parse_timedelta throws an exception for playbooks that last longer than a day
                # That was meant to be fixed in https://github.com/ansible-community/ara/commit/db8243c3af938ece12c9cd59dd7fe4d9a711b76d
                try:
                    seconds = cli_utils.parse_timedelta(raw_duration)
                except ValueError:
                    seconds = 0

            # Values of each configured label for this playbook
            values = {name: item[name] for name in self.labels}

            summary.labels(**values).observe(seconds)
            total.inc()

        return newest


class AraTaskCollector(object):
    """Turns tasks returned by the ara API into prometheus metrics."""

    def __init__(self, client, log, limit, labels=DEFAULT_TASK_LABELS):
        """
        client: API client handed to get_search_results
        log: logger used to report progress
        limit: int, number of tasks requested per page
        labels: task fields attached as labels to the "ara_tasks" summary
        """
        self.client = client
        self.log = log
        self.limit = limit
        self.labels = labels

        range_gauge = Gauge("ara_tasks_range", "Limit metric collection to the N most recent tasks")
        total_gauge = Gauge("ara_tasks_total", "Number of tasks recorded by ara in prometheus")
        durations = Summary("ara_tasks", "Labels and duration, in seconds, of playbook tasks recorded by ara", labels)
        self.metrics = {"range": range_gauge, "total": total_gauge, "tasks": durations}
        range_gauge.set(self.limit)

    def collect_metrics(self, created_after=None):
        """
        Query tasks created after the given timestamp and record them.

        Returns the timestamp to use for the next query: the incremented "created"
        value of the first result, or created_after unchanged when nothing came back.
        """
        results = get_search_results(self.client, "tasks", self.limit, created_after)
        if not results:
            return created_after

        # Results are ordered by descending id, so the first one carries the most
        # recent timestamp; remember it so we only scrape beyond it next time.
        newest = cli_utils.increment_timestamp(results[0]["created"])
        self.log.info(f"updating metrics for {len(results)} tasks...")

        summary = self.metrics["tasks"]
        total = self.metrics["total"]
        for item in results:
            # The API returns the duration as a string: convert it back to seconds
            # so it can be used as the observed value. A missing duration counts as 0.
            seconds = 0
            raw_duration = item["duration"]
            if raw_duration is not None:
                # TODO: parse_timedelta throws an exception for tasks that last longer than a day
                # That was meant to be fixed in https://github.com/ansible-community/ara/commit/db8243c3af938ece12c9cd59dd7fe4d9a711b76d
                try:
                    seconds = cli_utils.parse_timedelta(raw_duration)
                except ValueError:
                    seconds = 0

            # Values of each configured label for this task
            values = {name: item[name] for name in self.labels}

            summary.labels(**values).observe(seconds)
            total.inc()

        return newest


class AraHostCollector(object):
    """Turns hosts returned by the ara API into prometheus metrics."""

    def __init__(self, client, log, limit, labels=DEFAULT_HOST_LABELS):
        """
        client: API client handed to get_search_results
        log: logger used to report progress
        limit: int, number of hosts requested per page
        labels: host fields attached as labels to the per-status gauges
        """
        self.client = client
        self.log = log
        self.limit = limit
        self.labels = labels

        changed = Gauge("ara_hosts_changed", "Number of changes on a host", labels)
        failed = Gauge("ara_hosts_failed", "Number of failures on a host", labels)
        ok = Gauge("ara_hosts_ok", "Number of successful tasks without changes on a host", labels)
        range_gauge = Gauge("ara_hosts_range", "Limit metric collection to the N most recent hosts")
        skipped = Gauge("ara_hosts_skipped", "Number of skipped tasks on a host", labels)
        total = Gauge("ara_hosts_total", "Hosts recorded by ara")
        unreachable = Gauge("ara_hosts_unreachable", "Number of unreachable errors on a host", labels)

        self.metrics = {
            "changed": changed,
            "failed": failed,
            "ok": ok,
            "range": range_gauge,
            "skipped": skipped,
            "total": total,
            "unreachable": unreachable,
        }
        range_gauge.set(self.limit)

    def collect_metrics(self, created_after=None):
        """
        Query hosts created after the given timestamp and record them.

        Returns the timestamp to use for the next query: the incremented "created"
        value of the first result, or created_after unchanged when nothing came back.
        """
        results = get_search_results(self.client, "hosts", self.limit, created_after)
        if not results:
            return created_after

        # Results are ordered by descending id, so the first one carries the most
        # recent timestamp; remember it so we only scrape beyond it next time.
        newest = cli_utils.increment_timestamp(results[0]["created"])
        self.log.info(f"updating metrics for {len(results)} hosts...")

        for item in results:
            self.metrics["total"].inc()

            # Values of each configured label for this host
            values = {name: item[name] for name in self.labels}

            # "changed", "failed" and so on are used directly as gauge values;
            # a falsy value (such as 0) leaves the gauge untouched.
            for status in ("changed", "failed", "ok", "skipped", "unreachable"):
                count = item[status]
                if not count:
                    continue
                self.metrics[status].labels(**values).set(count)

        return newest


class PrometheusExporter(Command):
    """Exposes a prometheus exporter to provide metrics from an instance of ara"""

    log = logging.getLogger(__name__)

    def get_parser(self, prog_name):
        # Extend the base parser with the global ara arguments and the
        # exporter-specific integer options.
        parser = global_arguments(super().get_parser(prog_name))

        # (flag, help text, default) -- every option is parsed as an int
        int_options = (
            ("--playbook-limit", "Max number of playbooks to request at once (default: 1000)", 1000),
            ("--task-limit", "Max number of tasks to request at once (default: 2500)", 2500),
            ("--host-limit", "Max number of hosts to request at once (default: 2500)", 2500),
            ("--poll-frequency", "Seconds to wait until querying ara for new metrics (default: 60)", 60),
            ("--prometheus-port", "Port on which the prometheus exporter will listen (default: 8001)", 8001),
            ("--max-days", "Maximum number of days to backfill metrics for (default: 90)", 90),
        )
        for flag, help_text, default in int_options:
            parser.add_argument(flag, help=help_text, default=default, type=int)

        # NOTE(review): comment left on this pull request by its author --
        # it could be interesting for the exporter to be able to filter queries
        # like the general CLI commands do. For example, `ara playbook list` has:
        #   --ansible_version, --client_version, --server_version, --python_version,
        #   --user, --controller, --name, --path (each matching full or partial values)
        #   --status (a specific status: 'completed', 'running', 'failed')
        return parser

    def take_action(self, args):
        # Refuse to run without the optional prometheus_client dependency
        if not HAS_PROMETHEUS_CLIENT:
            self.log.error("The prometheus_client python package must be installed to run this command")
            sys.exit(2)

        # A CA bundle takes precedence; otherwise verification is on unless --insecure
        verify = args.ssl_ca if args.ssl_ca else not args.insecure
        client = get_client(
            client=args.client,
            endpoint=args.server,
            timeout=args.timeout,
            username=args.username,
            password=args.password,
            cert=args.ssl_cert,
            key=args.ssl_key,
            verify=verify,
            run_sql_migrations=False,
        )

        # Prepare collectors so we can gather various metrics
        playbooks = AraPlaybookCollector(client=client, log=self.log, limit=args.playbook_limit)
        hosts = AraHostCollector(client=client, log=self.log, limit=args.host_limit)
        tasks = AraTaskCollector(client=client, log=self.log, limit=args.task_limit)
        collectors = {"playbooks": playbooks, "hosts": hosts, "tasks": tasks}

        start_http_server(args.prometheus_port)
        self.log.info(f"ara prometheus exporter listening on http://0.0.0.0:{args.prometheus_port}/metrics")

        backfill_window = timedelta(days=args.max_days)
        created_after = (datetime.now() - backfill_window).isoformat()
        self.log.info(
            f"Backfilling metrics for the last {args.max_days} days since {created_after}... This can take a while."
        )

        # Every collector starts from the backfill timestamp and then tracks the
        # timestamp returned by its own previous run.
        latest = dict.fromkeys(collectors, created_after)
        while True:
            for kind, collector in collectors.items():
                latest[kind] = collector.collect_metrics(latest[kind])

            time.sleep(args.poll_frequency)
            self.log.info("Checking for updated metrics...")
8 changes: 8 additions & 0 deletions ara/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@ def avg_timedelta(delta: timedelta, count: int):
return str(delta / count)


def increment_timestamp(timestamp, pattern="%Y-%m-%dT%H:%M:%S.%fZ"):
    """
    API timestamps have this python isoformat: 2022-12-08T05:45:38.465607Z
    We want to increment timestamps by one microsecond so we can search for things created after them.

    timestamp: string, the timestamp to increment
    pattern: string, the strptime format used to parse the timestamp

    Returns the incremented timestamp as a naive isoformat string (no trailing "Z").
    Raises ValueError if the timestamp matches neither the pattern nor the pattern
    without its fractional seconds.
    """
    try:
        parsed = datetime.strptime(timestamp, pattern)
    except ValueError:
        # isoformat() leaves out the fractional part when microseconds are zero
        # (e.g. 2022-12-08T05:45:38Z), so retry without ".%f" before giving up.
        parsed = datetime.strptime(timestamp, pattern.replace(".%f", ""))
    return (parsed + timedelta(microseconds=1)).isoformat()


# Also see: ui.templatetags.truncatepath
def truncatepath(path, count):
"""
Expand Down
Loading