-
-
Notifications
You must be signed in to change notification settings - Fork 218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[wip] prometheus monitoring support #26837
Changes from 10 commits
90bdf7d
7ad93fa
0694926
c1f8270
5034156
172ef37
2be4e6a
eb8afe3
54e91c0
54f2d1b
eba69c4
01fbc07
450f211
2c3edef
8664161
0bcedea
9ba3d4a
904e358
d68e104
5c00052
e00b7e1
20e8a61
0ab0006
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
|
||
import couchforms | ||
from casexml.apps.case.xform import get_case_updates, is_device_report | ||
from corehq.util.metrics import metrics | ||
from couchforms import openrosa_response | ||
from couchforms.const import MAGIC_PROPERTY, BadRequest | ||
from couchforms.getters import MultimediaBug | ||
|
@@ -50,18 +51,38 @@ | |
convert_xform_to_json, | ||
should_use_sql_backend, | ||
) | ||
from corehq.util.datadog.gauges import datadog_counter, datadog_gauge | ||
from corehq.util.datadog.metrics import ( | ||
MULTIMEDIA_SUBMISSION_ERROR_COUNT, | ||
XFORM_LOCKED_COUNT, | ||
) | ||
from corehq.util.datadog.utils import bucket_value | ||
from corehq.util.timer import TimingContext | ||
|
||
PROFILE_PROBABILITY = float(os.getenv('COMMCARE_PROFILE_SUBMISSION_PROBABILITY', 0)) | ||
PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT') | ||
PROFILE_LIMIT = int(PROFILE_LIMIT) if PROFILE_LIMIT is not None else 1 | ||
|
||
corrupt_multimedia_counter = metrics.counter( | ||
'commcare.corrupt_multimedia_submissions', 'Count of requests with corrupt multimedia', | ||
tag_names=['domain', 'authenticated'] | ||
) | ||
|
||
xform_locked_error_counter = metrics.counter( | ||
'commcare.xformlocked.count', 'Count of locking errors', | ||
tag_names=['domain', 'authenticated'] | ||
) | ||
|
||
submission_counter = metrics.counter( | ||
'commcare.xform_submissions.count', 'Count of form submissions', | ||
tag_names=['domain', 'backend', 'submission_type', 'status_code'] | ||
) | ||
|
||
submission_duration_histogram = metrics.histogram( | ||
'commcare.xform_submissions.duration.seconds', 'Count of form submissions', | ||
bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', | ||
tag_names=['domain', 'backend', 'submission_type', 'status_code'] | ||
) | ||
|
||
submission_lag_histogram = metrics.histogram( | ||
'commcare.xform_submissions.lag.days', 'Count of form submissions', | ||
bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d', | ||
tag_names=['domain', 'backend', 'submission_type', 'status_code'] | ||
) | ||
|
||
@profile_prod('commcare_receiverwapper_process_form.prof', probability=PROFILE_PROBABILITY, limit=PROFILE_LIMIT) | ||
def _process_form(request, domain, app_id, user_id, authenticated, | ||
|
@@ -70,10 +91,10 @@ def _process_form(request, domain, app_id, user_id, authenticated, | |
if rate_limit_submission(domain): | ||
return HttpTooManyRequests() | ||
|
||
metric_tags = [ | ||
'backend:sql' if should_use_sql_backend(domain) else 'backend:couch', | ||
'domain:{}'.format(domain), | ||
] | ||
metric_tags = { | ||
'backend': 'sql' if should_use_sql_backend(domain) else 'couch', | ||
'domain': domain | ||
} | ||
|
||
try: | ||
instance, attachments = couchforms.get_instance_and_attachment(request) | ||
|
@@ -85,9 +106,9 @@ def _process_form(request, domain, app_id, user_id, authenticated, | |
except: | ||
meta = {} | ||
|
||
corrupt_multimedia_counter.tag(domain=domain, authenticated=authenticated).inc() | ||
return _submission_error( | ||
request, "Received a submission with POST.keys()", | ||
MULTIMEDIA_SUBMISSION_ERROR_COUNT, metric_tags, | ||
request, "Received a submission with POST.keys()", metric_tags, | ||
domain, app_id, user_id, authenticated, meta, | ||
) | ||
|
||
|
@@ -133,8 +154,9 @@ def _process_form(request, domain, app_id, user_id, authenticated, | |
try: | ||
result = submission_post.run() | ||
except XFormLockError as err: | ||
xform_locked_error_counter.tag(domain=domain, authenticated=authenticated).inc() | ||
return _submission_error( | ||
request, "XFormLockError: %s" % err, XFORM_LOCKED_COUNT, | ||
request, "XFormLockError: %s" % err, | ||
metric_tags, domain, app_id, user_id, authenticated, status=423, | ||
notify=False, | ||
) | ||
|
@@ -145,7 +167,7 @@ def _process_form(request, domain, app_id, user_id, authenticated, | |
return response | ||
|
||
|
||
def _submission_error(request, message, count_metric, metric_tags, | ||
def _submission_error(request, message, metric_tags, | ||
domain, app_id, user_id, authenticated, meta=None, status=400, | ||
notify=True): | ||
"""Notify exception, datadog count, record metrics, construct response | ||
|
@@ -157,7 +179,6 @@ def _submission_error(request, message, count_metric, metric_tags, | |
"domain:{}".format(domain), | ||
"authenticated:{}".format(authenticated), | ||
] | ||
datadog_counter(count_metric, tags=details) | ||
if notify: | ||
details.extend([ | ||
"user_id:{}".format(user_id), | ||
|
@@ -172,24 +193,20 @@ def _submission_error(request, message, count_metric, metric_tags, | |
|
||
|
||
def _record_metrics(tags, submission_type, response, timer=None, xform=None): | ||
if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: | ||
lag = xform.received_on - xform.metadata.timeEnd | ||
lag_days = lag.total_seconds() / 86400 | ||
tags += [ | ||
'lag:%s' % bucket_value(lag_days, (1, 2, 4, 7, 14, 31, 90), 'd') | ||
] | ||
|
||
tags += [ | ||
'submission_type:{}'.format(submission_type), | ||
'status_code:{}'.format(response.status_code) | ||
] | ||
|
||
if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: | ||
lag = xform.received_on - xform.metadata.timeEnd | ||
lag_days = lag.total_seconds() / 86400 | ||
submission_lag_histogram.tag(**tags).observe(lag_days) | ||
|
||
if timer: | ||
tags += [ | ||
'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's'), | ||
] | ||
submission_duration_histogram.tag(**tags).observe(timer.duration) | ||
|
||
datadog_counter('commcare.xform_submissions.count', tags=tags) | ||
submission_counter.tag(**tags).inc() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an example of a metric where we have attached multiple histogram bucket tags to a single metric. If we want to convert these histograms to the native histogram types then they must be separated out as is done here. |
||
|
||
|
||
@location_safe | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from django.apps import AppConfig | ||
from django.conf import settings | ||
|
||
|
||
class DatadogConfig(AppConfig): | ||
|
||
name = 'corehq.util.datadog' | ||
verbose_name = 'Datadog' | ||
|
||
def ready(self): | ||
if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: | ||
return | ||
|
||
try: | ||
from datadog import initialize | ||
except ImportError: | ||
pass | ||
else: | ||
initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY) | ||
|
||
if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS: | ||
try: | ||
from ddtrace import tracer | ||
tracer.enabled = False | ||
except ImportError: | ||
pass | ||
|
||
snopoke marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from django.utils.functional import SimpleLazyObject | ||
|
||
from corehq.util.metrics.datadog import DatadogMetrics | ||
from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, HqMetrics | ||
snopoke marked this conversation as resolved.
Show resolved
Hide resolved
|
||
from corehq.util.metrics.prometheus import PrometheusMetrics | ||
|
||
|
||
def _get_metrics(): | ||
enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()])) | ||
if not enabled: | ||
return [DummyMetrics()] | ||
return enabled | ||
|
||
|
||
metrics = DelegatedMetrics(SimpleLazyObject(_get_metrics)) # singleton/global | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Initially I had this as a function m = metrics.counter('name', 'description') # still lazy
m.inc() # now it's evaluated There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this mean that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes it does mean that. The alternative is to split |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import logging | ||
|
||
from django.conf import settings | ||
|
||
from corehq.util.datadog.utils import bucket_value | ||
from corehq.util.metrics.metrics import ( | ||
HqCounter, | ||
HqGauge, | ||
HqHistogram, | ||
HqMetrics, | ||
) | ||
from datadog import api | ||
from datadog.dogstatsd.base import DogStatsd | ||
|
||
datadog_logger = logging.getLogger('datadog') | ||
|
||
COMMON_TAGS = ['environment:{}'.format(settings.SERVER_ENVIRONMENT)] | ||
|
||
statsd = DogStatsd(constant_tags=COMMON_TAGS) | ||
|
||
|
||
def _format_tags(tag_names, tag_values): | ||
if not tag_names or not tag_values: | ||
return None | ||
|
||
return [ | ||
f'{name}:{value}' for name, value in zip(tag_names, tag_values) | ||
] | ||
|
||
|
||
def _datadog_record(fn, name, value, tags=None): | ||
try: | ||
fn(name, value, tags=tags) | ||
except Exception: | ||
datadog_logger.exception('Unable to record Datadog stats') | ||
|
||
|
||
class Counter(HqCounter): | ||
def _record(self, amount: float): | ||
tags = _format_tags(self.tag_names, self.tag_values) | ||
_datadog_record(statsd.increment, self.name, amount, tags) | ||
|
||
|
||
class Gauge(HqGauge): | ||
def _record(self, value): | ||
tags = _format_tags(self.tag_names, self.tag_values) | ||
_datadog_record(statsd.gauge, self.name, value, tags) | ||
|
||
|
||
class Histogram(HqHistogram): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As stated in the docstring this isn't a real histogram. It uses our workaround of adding bucketed tags. |
||
"""This implementation of histogram uses tagging to record the buckets. | ||
It does not use the Datadog Histogram metric type. | ||
|
||
The metric itself will be incremented by 1 on each call to `observe`. The value | ||
passed to `observe` will be used to create the bucket tag. | ||
|
||
For example: | ||
|
||
h = Histogram( | ||
'commcare.request.duration', 'description', | ||
bucket_tag='duration', buckets=[1,2,3], bucket_units='ms' | ||
) | ||
h.observe(1.4) | ||
|
||
# resulting Datadog metric | ||
# commcare.request.duration:1|c|#duration:lt_2ms | ||
|
||
More details: https://help.datadoghq.com/hc/en-us/articles/211545826 | ||
snopoke marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
def _record(self, value: float): | ||
tags = _format_tags(self.tag_names, self.tag_values) | ||
if not tags: | ||
tags = [] | ||
bucket = bucket_value(value, self._buckets, self._bucket_unit) | ||
tags.append(f'{self._bucket_tag}:{bucket}') | ||
_datadog_record(statsd.increment, self.name, 1, tags) | ||
|
||
|
||
class DatadogMetrics(HqMetrics): | ||
_counter_class = Counter | ||
_gauge_class = Gauge | ||
_histogram_class = Histogram | ||
|
||
def enabled(self) -> bool: | ||
return api._api_key and api._application_key | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this doesn't look like it will always return return bool(api._api_key and api._application_key) But that doesn't seem very Pythonic. I'd probably drop the type hint. However, I'm not very familiar with the conventions around type hints. Does |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This name is changed
I changed this because
.error
seemed redundant since it's implied by the fact that it's 'corrupt'. I dropped the 'count' suffix because Prometheus adds a_total
prefix to the end of counter metric names. If we want to keep the suffix we should add it to all 'counter' metrics at the provider level.