Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wip] prometheus monitoring support #26837

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions corehq/apps/api/odata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from corehq.apps.export.models import ExportInstance
from corehq.util.datadog.gauges import datadog_counter
from corehq.util.datadog.utils import bucket_value
from corehq.util.metrics import metrics

FieldMetadata = namedtuple('FieldMetadata', ['name', 'odata_type'])

Expand Down Expand Up @@ -54,6 +55,13 @@ def _get_dot_path(export_column):
return metadata


odata_feed_access_histogram = metrics.histogram(
'commcare.odata_feed.test_v3', 'Odata Feed Access',
bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
tag_names=['domain', 'feed_id', 'feed_type', 'username', 'row_count', 'column_count', 'size']
)


def record_feed_access_in_datadog(request, config_id, duration, response):
config = ExportInstance.get(config_id)
username = request.couch_user.username
Expand All @@ -64,14 +72,12 @@ def record_feed_access_in_datadog(request, config_id, duration, response):
column_count = len(rows[0])
except IndexError:
column_count = 0
datadog_counter('commcare.odata_feed.test_v3', tags=[
'domain:{}'.format(request.domain),
'feed_id:{}'.format(config_id),
'feed_type:{}'.format(config.type),
'username:{}'.format(username),
'row_count:{}'.format(row_count),
'column_count:{}'.format(column_count),
'size:{}'.format(len(response.content)),
'duration:{}'.format(duration),
'duration_bucket:{}'.format(bucket_value(duration, (1, 5, 20, 60, 120, 300, 600), 's')),
])
odata_feed_access_histogram.tag(
domain=request.domain,
feed_id=config_id,
feed_type=config.type,
username=username,
row_count=row_count,
column_count=column_count,
size=len(response.content),
).observe(duration)
71 changes: 44 additions & 27 deletions corehq/apps/receiverwrapper/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import couchforms
from casexml.apps.case.xform import get_case_updates, is_device_report
from corehq.util.metrics import metrics
from couchforms import openrosa_response
from couchforms.const import MAGIC_PROPERTY, BadRequest
from couchforms.getters import MultimediaBug
Expand Down Expand Up @@ -50,18 +51,38 @@
convert_xform_to_json,
should_use_sql_backend,
)
from corehq.util.datadog.gauges import datadog_counter, datadog_gauge
from corehq.util.datadog.metrics import (
MULTIMEDIA_SUBMISSION_ERROR_COUNT,
XFORM_LOCKED_COUNT,
)
from corehq.util.datadog.utils import bucket_value
from corehq.util.timer import TimingContext

PROFILE_PROBABILITY = float(os.getenv('COMMCARE_PROFILE_SUBMISSION_PROBABILITY', 0))
PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT')
PROFILE_LIMIT = int(PROFILE_LIMIT) if PROFILE_LIMIT is not None else 1

corrupt_multimedia_counter = metrics.counter(
'commcare.corrupt_multimedia_submissions', 'Count of requests with corrupt multimedia',
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This name is changed

- commcare.corrupt-multimedia-submission.error.count
+ commcare.corrupt_multimedia_submissions

I changed this because .error seemed redundant since it's implied by the fact that it's 'corrupt'. I dropped the 'count' suffix because Prometheus adds a _total prefix to the end of counter metric names. If we want to keep the suffix we should add it to all 'counter' metrics at the provider level.

tag_names=['domain', 'authenticated']
)

xform_locked_error_counter = metrics.counter(
'commcare.xformlocked.count', 'Count of locking errors',
tag_names=['domain', 'authenticated']
)

submission_counter = metrics.counter(
'commcare.xform_submissions.count', 'Count of form submissions',
tag_names=['domain', 'backend', 'submission_type', 'status_code']
)

submission_duration_histogram = metrics.histogram(
'commcare.xform_submissions.duration.seconds', 'Count of form submissions',
bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
tag_names=['domain', 'backend', 'submission_type', 'status_code']
)

submission_lag_histogram = metrics.histogram(
'commcare.xform_submissions.lag.days', 'Count of form submissions',
bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d',
tag_names=['domain', 'backend', 'submission_type', 'status_code']
)

@profile_prod('commcare_receiverwapper_process_form.prof', probability=PROFILE_PROBABILITY, limit=PROFILE_LIMIT)
def _process_form(request, domain, app_id, user_id, authenticated,
Expand All @@ -70,10 +91,10 @@ def _process_form(request, domain, app_id, user_id, authenticated,
if rate_limit_submission(domain):
return HttpTooManyRequests()

metric_tags = [
'backend:sql' if should_use_sql_backend(domain) else 'backend:couch',
'domain:{}'.format(domain),
]
metric_tags = {
'backend': 'sql' if should_use_sql_backend(domain) else 'couch',
'domain': domain
}

try:
instance, attachments = couchforms.get_instance_and_attachment(request)
Expand All @@ -85,9 +106,9 @@ def _process_form(request, domain, app_id, user_id, authenticated,
except:
meta = {}

corrupt_multimedia_counter.tag(domain=domain, authenticated=authenticated).inc()
return _submission_error(
request, "Received a submission with POST.keys()",
MULTIMEDIA_SUBMISSION_ERROR_COUNT, metric_tags,
request, "Received a submission with POST.keys()", metric_tags,
domain, app_id, user_id, authenticated, meta,
)

Expand Down Expand Up @@ -133,8 +154,9 @@ def _process_form(request, domain, app_id, user_id, authenticated,
try:
result = submission_post.run()
except XFormLockError as err:
xform_locked_error_counter.tag(domain=domain, authenticated=authenticated).inc()
return _submission_error(
request, "XFormLockError: %s" % err, XFORM_LOCKED_COUNT,
request, "XFormLockError: %s" % err,
metric_tags, domain, app_id, user_id, authenticated, status=423,
notify=False,
)
Expand All @@ -145,7 +167,7 @@ def _process_form(request, domain, app_id, user_id, authenticated,
return response


def _submission_error(request, message, count_metric, metric_tags,
def _submission_error(request, message, metric_tags,
domain, app_id, user_id, authenticated, meta=None, status=400,
notify=True):
"""Notify exception, datadog count, record metrics, construct response
Expand All @@ -157,7 +179,6 @@ def _submission_error(request, message, count_metric, metric_tags,
"domain:{}".format(domain),
"authenticated:{}".format(authenticated),
]
datadog_counter(count_metric, tags=details)
if notify:
details.extend([
"user_id:{}".format(user_id),
Expand All @@ -172,24 +193,20 @@ def _submission_error(request, message, count_metric, metric_tags,


def _record_metrics(tags, submission_type, response, timer=None, xform=None):
tags.update({
'submission_type': submission_type,
'status_code': response.status_code
})

if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on:
lag = xform.received_on - xform.metadata.timeEnd
lag_days = lag.total_seconds() / 86400
tags += [
'lag:%s' % bucket_value(lag_days, (1, 2, 4, 7, 14, 31, 90), 'd')
]

tags += [
'submission_type:{}'.format(submission_type),
'status_code:{}'.format(response.status_code)
]
submission_lag_histogram.tag(**tags).observe(lag_days)

if timer:
tags += [
'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's'),
]
submission_duration_histogram.tag(**tags).observe(timer.duration)

datadog_counter('commcare.xform_submissions.count', tags=tags)
submission_counter.tag(**tags).inc()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an example of a metric where we have attached multiple histogram bucket tags to a single metric. If we want to convert these histograms to the native histogram types then they must be separated out as is done here.



@location_safe
Expand Down
27 changes: 27 additions & 0 deletions corehq/util/datadog/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from django.apps import AppConfig
from django.conf import settings


class DatadogConfig(AppConfig):

name = 'corehq.util.datadog'
verbose_name = 'Datadog'

def ready(self):
if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY:
return

try:
from datadog import initialize
except ImportError:
pass
else:
initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY)

if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS:
try:
from ddtrace import tracer
tracer.enabled = False
except ImportError:
pass

snopoke marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 0 additions & 2 deletions corehq/util/datadog/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@
ERROR_COUNT = 'commcare.error.count'
REPEATER_ERROR_COUNT = 'commcare.repeaters.error'
REPEATER_SUCCESS_COUNT = 'commcare.repeaters.success'
MULTIMEDIA_SUBMISSION_ERROR_COUNT = 'commcare.corrupt-multimedia-submission.error.count'
DATE_OPENED_CASEBLOCK_ERROR_COUNT = 'commcare.date-opened-caseblock-bug.error.count'
XFORM_LOCKED_COUNT = 'commcare.xformlocked.count'
15 changes: 15 additions & 0 deletions corehq/util/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from django.utils.functional import SimpleLazyObject

from corehq.util.metrics.datadog import DatadogMetrics
from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics
from corehq.util.metrics.prometheus import PrometheusMetrics


def _get_metrics():
enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()]))
if not enabled:
return [DummyMetrics()]
return enabled


metrics = DelegatedMetrics(SimpleLazyObject(_get_metrics)) # singleton/global
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially I had this as a function get_metrics() but after making it lazy there wasn't a need for the function. This will only call the _get_metrics function when a metric class is actually used. This is needed make sure that Django is fully set up before determining if the metrics providers are enabled (since they rely on settings).

m = metrics.counter('name', 'description')  # still lazy
m.inc()  # now it's evaluated

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean that DelegatedMetrics is always used even when there is only a single "metrics" object to delegate to? That seems wasteful. Would it be possible to only delegate if there is more than one (I would expect that to be the exceptional case).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it does mean that. The alternative is to split DelegatedMetrics into two classes: DelegatedMetrics and LazyMetrics and then combine them as needed (the laziness is always needed).

87 changes: 87 additions & 0 deletions corehq/util/metrics/datadog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import logging

from django.conf import settings

from corehq.util.datadog.utils import bucket_value
from corehq.util.metrics.metrics import (
HqCounter,
HqGauge,
HqHistogram,
HqMetrics,
)
from datadog import api
from datadog.dogstatsd.base import DogStatsd

datadog_logger = logging.getLogger('datadog')

COMMON_TAGS = ['environment:{}'.format(settings.SERVER_ENVIRONMENT)]

statsd = DogStatsd(constant_tags=COMMON_TAGS)


def _format_tags(tag_values: dict):
if not tag_values:
return None

return [
f'{name}:{value}' for name, value in tag_values.items()
]


def _datadog_record(fn, name, value, tags=None):
try:
fn(name, value, tags=tags)
except Exception:
datadog_logger.exception('Unable to record Datadog stats')


class Counter(HqCounter):
def _record(self, amount: float):
tags = _format_tags(self.tag_values)
_datadog_record(statsd.increment, self.name, amount, tags)


class Gauge(HqGauge):
def _record(self, value):
tags = _format_tags(self.tag_values)
_datadog_record(statsd.gauge, self.name, value, tags)


class Histogram(HqHistogram):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As stated in the docstring this isn't a real histogram. It uses our workaround of adding bucketed tags.

"""This implementation of histogram uses tagging to record the buckets.
It does not use the Datadog Histogram metric type.

The metric itself will be incremented by 1 on each call to `observe`. The value
passed to `observe` will be used to create the bucket tag.

For example:

h = Histogram(
'commcare.request.duration', 'description',
bucket_tag='duration', buckets=[1,2,3], bucket_units='ms'
)
h.observe(1.4)

# resulting Datadog metric
# commcare.request.duration:1|c|#duration:lt_2ms

For more details see:
* https://github.com/dimagi/commcare-hq/pull/17080
* https://github.com/dimagi/commcare-hq/pull/17030#issuecomment-315794700
"""
def _record(self, value: float):
tags = _format_tags(self.tag_values)
if not tags:
tags = []
bucket = bucket_value(value, self._buckets, self._bucket_unit)
tags.append(f'{self._bucket_tag}:{bucket}')
_datadog_record(statsd.increment, self.name, 1, tags)


class DatadogMetrics(HqMetrics):
_counter_class = Counter
_gauge_class = Gauge
_histogram_class = Histogram

def enabled(self) -> bool:
return api._api_key and api._application_key
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't look like it will always return bool (it will probably return a string or None). If you want a bool you could do

return bool(api._api_key and api._application_key)

But that doesn't seem very Pythonic. I'd probably drop the type hint.

However, I'm not very familiar with the conventions around type hints. Does -> bool imply that returned object can be treated like a bool (most objects support that), or that the returned value is always True or False? This is one reason I'm not a fan of type hints in Python.

Loading