diff --git a/corehq/apps/api/odata/utils.py b/corehq/apps/api/odata/utils.py index f09608e12274..7628a03eb8c7 100644 --- a/corehq/apps/api/odata/utils.py +++ b/corehq/apps/api/odata/utils.py @@ -2,8 +2,7 @@ from collections import namedtuple from corehq.apps.export.models import ExportInstance -from corehq.util.datadog.gauges import datadog_counter -from corehq.util.datadog.utils import bucket_value +from corehq.util.metrics import metrics FieldMetadata = namedtuple('FieldMetadata', ['name', 'odata_type']) @@ -54,6 +53,13 @@ def _get_dot_path(export_column): return metadata +odata_feed_access_histogram = metrics.histogram( + 'commcare.odata_feed.test_v3', 'Odata Feed Access', + bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', + tag_names=['domain', 'feed_id', 'feed_type', 'username', 'row_count', 'column_count', 'size'] +) + + def record_feed_access_in_datadog(request, config_id, duration, response): config = ExportInstance.get(config_id) username = request.couch_user.username @@ -64,14 +70,13 @@ def record_feed_access_in_datadog(request, config_id, duration, response): column_count = len(rows[0]) except IndexError: column_count = 0 - datadog_counter('commcare.odata_feed.test_v3', tags=[ - 'domain:{}'.format(request.domain), - 'feed_id:{}'.format(config_id), - 'feed_type:{}'.format(config.type), - 'username:{}'.format(username), - 'row_count:{}'.format(row_count), - 'column_count:{}'.format(column_count), - 'size:{}'.format(len(response.content)), - 'duration:{}'.format(duration), - 'duration_bucket:{}'.format(bucket_value(duration, (1, 5, 20, 60, 120, 300, 600), 's')), - ]) + odata_feed_access_histogram.observe( + duration, + domain=request.domain, + feed_id=config_id, + feed_type=config.type, + username=username, + row_count=row_count, + column_count=column_count, + size=len(response.content) + ) diff --git a/corehq/apps/receiverwrapper/views.py b/corehq/apps/receiverwrapper/views.py index 10922e0b4e45..16d28bf5a38b 100644 --- 
a/corehq/apps/receiverwrapper/views.py +++ b/corehq/apps/receiverwrapper/views.py @@ -10,6 +10,7 @@ import couchforms from casexml.apps.case.xform import get_case_updates, is_device_report +from corehq.util.metrics import metrics from couchforms import openrosa_response from couchforms.const import MAGIC_PROPERTY, BadRequest from couchforms.getters import MultimediaBug @@ -50,18 +51,38 @@ convert_xform_to_json, should_use_sql_backend, ) -from corehq.util.datadog.gauges import datadog_counter, datadog_gauge -from corehq.util.datadog.metrics import ( - MULTIMEDIA_SUBMISSION_ERROR_COUNT, - XFORM_LOCKED_COUNT, -) -from corehq.util.datadog.utils import bucket_value from corehq.util.timer import TimingContext PROFILE_PROBABILITY = float(os.getenv('COMMCARE_PROFILE_SUBMISSION_PROBABILITY', 0)) PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT') PROFILE_LIMIT = int(PROFILE_LIMIT) if PROFILE_LIMIT is not None else 1 +corrupt_multimedia_counter = metrics.counter( + 'commcare.corrupt_multimedia_submissions', 'Count of requests with corrupt multimedia', + tag_names=['domain', 'authenticated'] +) + +xform_locked_error_counter = metrics.counter( + 'commcare.xformlocked.count', 'Count of locking errors', + tag_names=['domain', 'authenticated'] +) + +submission_counter = metrics.counter( + 'commcare.xform_submissions.count', 'Count of form submissions', + tag_names=['domain', 'backend', 'submission_type', 'status_code'] +) + +submission_duration_histogram = metrics.histogram( + 'commcare.xform_submissions.duration.seconds', 'Duration of form submission processing', + bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', + tag_names=['domain', 'backend', 'submission_type', 'status_code'] +) + +submission_lag_histogram = metrics.histogram( + 'commcare.xform_submissions.lag.days', 'Lag between form completion and form submission', + bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d', + tag_names=['domain', 'backend', 'submission_type', 'status_code'] +) 
@profile_prod('commcare_receiverwapper_process_form.prof', probability=PROFILE_PROBABILITY, limit=PROFILE_LIMIT) def _process_form(request, domain, app_id, user_id, authenticated, @@ -70,10 +91,10 @@ def _process_form(request, domain, app_id, user_id, authenticated, if rate_limit_submission(domain): return HttpTooManyRequests() - metric_tags = [ - 'backend:sql' if should_use_sql_backend(domain) else 'backend:couch', - 'domain:{}'.format(domain), - ] + metric_tags = { + 'backend': 'sql' if should_use_sql_backend(domain) else 'couch', + 'domain': domain + } try: instance, attachments = couchforms.get_instance_and_attachment(request) @@ -85,9 +106,9 @@ def _process_form(request, domain, app_id, user_id, authenticated, except: meta = {} + corrupt_multimedia_counter.inc(domain=domain, authenticated=authenticated) return _submission_error( - request, "Received a submission with POST.keys()", - MULTIMEDIA_SUBMISSION_ERROR_COUNT, metric_tags, + request, "Received a submission with POST.keys()", metric_tags, domain, app_id, user_id, authenticated, meta, ) @@ -133,8 +154,9 @@ def _process_form(request, domain, app_id, user_id, authenticated, try: result = submission_post.run() except XFormLockError as err: + xform_locked_error_counter.inc(domain=domain, authenticated=authenticated) return _submission_error( - request, "XFormLockError: %s" % err, XFORM_LOCKED_COUNT, + request, "XFormLockError: %s" % err, metric_tags, domain, app_id, user_id, authenticated, status=423, notify=False, ) @@ -145,7 +167,7 @@ def _process_form(request, domain, app_id, user_id, authenticated, return response -def _submission_error(request, message, count_metric, metric_tags, +def _submission_error(request, message, metric_tags, domain, app_id, user_id, authenticated, meta=None, status=400, notify=True): """Notify exception, datadog count, record metrics, construct response @@ -157,7 +179,6 @@ def _submission_error(request, message, count_metric, metric_tags, "domain:{}".format(domain), 
"authenticated:{}".format(authenticated), ] - datadog_counter(count_metric, tags=details) if notify: details.extend([ "user_id:{}".format(user_id), @@ -172,24 +193,20 @@ def _submission_error(request, message, count_metric, metric_tags, def _record_metrics(tags, submission_type, response, timer=None, xform=None): + tags.update({ + 'submission_type': submission_type, + 'status_code': response.status_code + }) + if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: lag = xform.received_on - xform.metadata.timeEnd lag_days = lag.total_seconds() / 86400 - tags += [ - 'lag:%s' % bucket_value(lag_days, (1, 2, 4, 7, 14, 31, 90), 'd') - ] - - tags += [ - 'submission_type:{}'.format(submission_type), - 'status_code:{}'.format(response.status_code) - ] + submission_lag_histogram.observe(lag_days, **tags) if timer: - tags += [ - 'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's'), - ] + submission_duration_histogram.observe(timer.duration, **tags) - datadog_counter('commcare.xform_submissions.count', tags=tags) + submission_counter.inc(**tags) @location_safe diff --git a/corehq/util/datadog/apps.py b/corehq/util/datadog/apps.py new file mode 100644 index 000000000000..032d76e978d5 --- /dev/null +++ b/corehq/util/datadog/apps.py @@ -0,0 +1,27 @@ +from django.apps import AppConfig +from django.conf import settings + + +class DatadogConfig(AppConfig): + + name = 'corehq.util.datadog' + verbose_name = 'Datadog' + + def ready(self): + if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: + return + + try: + from datadog import initialize + except ImportError: + pass + else: + initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY) + + if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS: + try: + from ddtrace import tracer + tracer.enabled = False + except ImportError: + pass + diff --git a/corehq/util/datadog/metrics.py b/corehq/util/datadog/metrics.py index 
545be07a5a85..d56dea5154db 100644 --- a/corehq/util/datadog/metrics.py +++ b/corehq/util/datadog/metrics.py @@ -2,6 +2,4 @@ ERROR_COUNT = 'commcare.error.count' REPEATER_ERROR_COUNT = 'commcare.repeaters.error' REPEATER_SUCCESS_COUNT = 'commcare.repeaters.success' -MULTIMEDIA_SUBMISSION_ERROR_COUNT = 'commcare.corrupt-multimedia-submission.error.count' DATE_OPENED_CASEBLOCK_ERROR_COUNT = 'commcare.date-opened-caseblock-bug.error.count' -XFORM_LOCKED_COUNT = 'commcare.xformlocked.count' diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py new file mode 100644 index 000000000000..1e6a6053ea49 --- /dev/null +++ b/corehq/util/metrics/__init__.py @@ -0,0 +1,15 @@ +from django.utils.functional import SimpleLazyObject + +from corehq.util.metrics.datadog import DatadogMetrics +from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics +from corehq.util.metrics.prometheus import PrometheusMetrics + + +def _get_metrics(): + enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()])) + if not enabled: + return [DummyMetrics()] + return enabled + + +metrics = DelegatedMetrics(SimpleLazyObject(_get_metrics)) # singleton/global diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py new file mode 100644 index 000000000000..0163e5696924 --- /dev/null +++ b/corehq/util/metrics/datadog.py @@ -0,0 +1,87 @@ +import logging + +from django.conf import settings + +from corehq.util.datadog.utils import bucket_value +from corehq.util.metrics.metrics import ( + HqCounter, + HqGauge, + HqHistogram, + HqMetrics, +) +from datadog import api +from datadog.dogstatsd.base import DogStatsd + +datadog_logger = logging.getLogger('datadog') + +COMMON_TAGS = ['environment:{}'.format(settings.SERVER_ENVIRONMENT)] + +statsd = DogStatsd(constant_tags=COMMON_TAGS) + + +def _format_tags(tag_values: dict): + if not tag_values: + return None + + return [ + f'{name}:{value}' for name, value in tag_values.items() + ] + + 
+def _datadog_record(fn, name, value, tags=None): + try: + fn(name, value, tags=tags) + except Exception: + datadog_logger.exception('Unable to record Datadog stats') + + +class Counter(HqCounter): + def _record(self, amount: float, tag_values: dict): + tags = _format_tags(tag_values) + _datadog_record(statsd.increment, self.name, amount, tags) + + +class Gauge(HqGauge): + def _record(self, value: float, tag_values: dict): + tags = _format_tags(tag_values) + _datadog_record(statsd.gauge, self.name, value, tags) + + +class Histogram(HqHistogram): + """This implementation of histogram uses tagging to record the buckets. + It does not use the Datadog Histogram metric type. + + The metric itself will be incremented by 1 on each call to `observe`. The value + passed to `observe` will be used to create the bucket tag. + + For example: + + h = Histogram( + 'commcare.request.duration', 'description', + bucket_tag='duration', buckets=[1,2,3], bucket_units='ms' + ) + h.observe(1.4) + + # resulting Datadog metric + # commcare.request.duration:1|c|#duration:lt_2ms + + For more details see: + * https://github.com/dimagi/commcare-hq/pull/17080 + * https://github.com/dimagi/commcare-hq/pull/17030#issuecomment-315794700 + """ + def _record(self, value: float, tag_values: dict): + tags = _format_tags(tag_values) + if not tags: + tags = [] + bucket = bucket_value(value, self._buckets, self._bucket_unit) + tags.append(f'{self._bucket_tag}:{bucket}') + _datadog_record(statsd.increment, self.name, 1, tags) + + +class DatadogMetrics(HqMetrics): + _counter_class = Counter + _gauge_class = Gauge + _histogram_class = Histogram + + def enabled(self) -> bool: + return api._api_key and api._application_key diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py new file mode 100644 index 000000000000..5abb6811f308 --- /dev/null +++ b/corehq/util/metrics/metrics.py @@ -0,0 +1,174 @@ +import abc +import re +from abc import abstractmethod +from typing import Iterable, List + 
+from django.utils.functional import SimpleLazyObject + +from corehq.util.soft_assert import soft_assert +from prometheus_client.utils import INF + +METRIC_NAME_RE = re.compile(r'^[a-zA-Z_:.][a-zA-Z0-9_:.]*$') +METRIC_TAG_NAME_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$') +RESERVED_METRIC_TAG_NAME_RE = re.compile(r'^__.*$') +RESERVED_METRIC_TAG_NAMES = ['quantile', 'le'] + + +def _enforce_prefix(name, prefix): + soft_assert(fail_if_debug=True).call( + not prefix or name.startswith(prefix), + "Did you mean to call your metric 'commcare.{}'? ".format(name)) + + +def _validate_tag_names(tag_names): + tag_names = set(tag_names) + for l in tag_names: + if not METRIC_TAG_NAME_RE.match(l): + raise ValueError('Invalid metric tag name: ' + l) + if RESERVED_METRIC_TAG_NAME_RE.match(l): + raise ValueError('Reserved metric tag name: ' + l) + if l in RESERVED_METRIC_TAG_NAMES: + raise ValueError('Reserved metric tag name: ' + l) + return tag_names + + +class MetricBase: + def __init__(self, name: str, documentation: str, tag_names: Iterable = ()): + self.name = name + if not METRIC_NAME_RE.match(name): + raise ValueError('Invalid metric name: ' + name) + _enforce_prefix(name, 'commcare') + self.documentation = documentation + self.tag_names = _validate_tag_names(tag_names) + self._init_metric() + + def _init_metric(self): + pass + + def _validate_tags(self, tag_values: dict): + if self.tag_names and not tag_values: + raise Exception('Metric has missing tag values.') + + if tag_values: + assert isinstance(tag_values, dict) + if tag_values.keys() != self.tag_names: + raise ValueError('Incorrect tag names') + + def _record(self, value: float, tags: dict): + raise NotImplementedError + + +class HqCounter(MetricBase): + def inc(self, amount: float = 1, **tags): + """Increment the counter by the given amount.""" + self._validate_tags(tags) + self._record(amount, tags) + + +class HqGauge(MetricBase): + def set(self, value: float, **tags): + """Set gauge to the given value.""" + 
self._validate_tags(tags) + self._record(value, tags) + + +DEFAULT_BUCKETS = (.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0, INF) + + +class HqHistogram(MetricBase): + + def __init__(self, name: str, documentation: str, + bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', + tag_names: Iterable = ()): + self._bucket_tag = bucket_tag + self._buckets = buckets + self._bucket_unit = bucket_unit + if self._bucket_tag in tag_names: + tag_names = tuple(name for name in tag_names if name != bucket_tag) + super().__init__(name, documentation, tag_names) + + def observe(self, value: float, **tags): + """Update histogram with the given value.""" + self._validate_tags(tags) + self._record(value, tags) + + +class HqMetrics(metaclass=abc.ABCMeta): + _counter_class = None + _gauge_class = None + _histogram_class = None + + @abstractmethod + def enabled(self) -> bool: + raise NotImplementedError + + def counter(self, name: str, documentation: str, tag_names: Iterable = ()) -> HqCounter: + return self._counter_class(name, documentation, tag_names) + + def gauge(self, name: str, documentation: str, tag_names: Iterable = ()) -> HqGauge: + return self._gauge_class(name, documentation, tag_names) + + def histogram(self, name: str, documentation: str, + bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', + tag_names: Iterable = ()) -> HqHistogram: + """Create a histogram metric. Histogram implementations differ between provider. See provider + implementations for details. 
+ """ + return self._histogram_class( + name, documentation, bucket_tag, buckets=buckets, bucket_unit=bucket_unit, tag_names=tag_names + ) + + +class DummyMetric: + def __init__(self, *args, **kwargs): + pass + + def __getattr__(self, item): + if item in ('inc', 'set', 'observe'): + return lambda *args, **kwargs: None + raise AttributeError + + +class DummyMetrics(HqMetrics): + _counter_class = DummyMetric + _gauge_class = DummyMetric + _histogram_class = DummyMetric + + def enabled(self) -> bool: + return True + + +class DelegatedMetrics: + """This class makes the metric class instantiation lazy and + also multiple metrics providers to be used.""" + def __init__(self, delegates): + self.delegates = delegates + self._types = { + 'counter': 'inc', + 'gauge': 'set', + 'histogram': 'observe', + } + + def __getattr__(self, item): + if item in self._types: + def _make_type(*args, **kwargs): + return SimpleLazyObject(lambda: DelegatingMetric([ + getattr(d, item)(*args, **kwargs) for d in self.delegates + ], self._types[item])) + return _make_type + raise AttributeError + + +class DelegatingMetric: + def __init__(self, delegates, record_fn_name): + self._delegates = delegates + self._record_fn_name = record_fn_name + + def __getattr__(self, item): + if item == self._record_fn_name: + def record(*args, **kwargs): + for metric in self._delegates: + getattr(metric, item)(*args, **kwargs) + return record + + raise AttributeError diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py new file mode 100644 index 000000000000..cce9a0c671ed --- /dev/null +++ b/corehq/util/metrics/prometheus.py @@ -0,0 +1,59 @@ +import settings +from corehq.util.metrics.metrics import ( + HqCounter, + HqGauge, + HqHistogram, + HqMetrics, + MetricBase, +) +from prometheus_client import Counter as PCounter +from prometheus_client import Gauge as PGauge +from prometheus_client import Histogram as PHistogram + + +class Counter(HqCounter): + 
"""https://prometheus.io/docs/concepts/metric_types/#counter""" + + def _init_metric(self): + self.name = self.name.replace('.', '_') + self._delegate = PCounter(self.name, self.documentation, self.tag_names) + + def _record(self, amount: float, tags): + _get_labeled(self._delegate, tags).inc(amount) + + +class Gauge(HqGauge): + """https://prometheus.io/docs/concepts/metric_types/#gauge""" + + def _init_metric(self): + self.name = self.name.replace('.', '_') + self._delegate = PGauge(self.name, self.documentation, self.tag_names) + + def _record(self, value: float, tags): + _get_labeled(self._delegate, tags).set(value) + + +class Histogram(HqHistogram): + """This metric class implements the native Prometheus Histogram type + + https://prometheus.io/docs/concepts/metric_types/#histogram + """ + def _init_metric(self): + self.name = self.name.replace('.', '_') + self._delegate = PHistogram(self.name, self.documentation, self.tag_names, buckets=self._buckets) + + def _record(self, value: float, tags: dict): + _get_labeled(self._delegate, tags).observe(value) + + +def _get_labeled(metric, labels): + return metric.labels(**labels) if labels else metric + + +class PrometheusMetrics(HqMetrics): + _counter_class = Counter + _gauge_class = Gauge + _histogram_class = Histogram + + def enabled(self) -> bool: + return settings.ENABLE_PROMETHEUS_METRICS diff --git a/corehq/util/metrics/tests/__init__.py b/corehq/util/metrics/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py new file mode 100644 index 000000000000..bb9c8232a00b --- /dev/null +++ b/corehq/util/metrics/tests/test_metrics.py @@ -0,0 +1,220 @@ +from typing import Dict, Tuple + +from django.test import SimpleTestCase +from django.utils.functional import SimpleLazyObject + +from corehq.util.metrics import DatadogMetrics, PrometheusMetrics +from corehq.util.metrics.metrics import ( + 
DelegatedMetrics, + HqCounter, + HqGauge, + HqHistogram, +) +from corehq.util.metrics.tests.utils import patch_datadog +from prometheus_client.samples import Sample +from prometheus_client.utils import INF +from testil import eq + + +class _TestMetrics(SimpleTestCase): + provider_class = None + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.provider = cls.provider_class() + + def test_counter(self): + counter = self.provider.counter('commcare.test.counter', 'Description', tag_names=['t1', 't2']) + counter.inc(t1='a', t2='b') + counter.inc(2, t1='c', t2='b') + counter.inc(t1='c', t2='b') + self.assertCounterMetric(counter, { + (('t1', 'a'), ('t2', 'b')): 1, + (('t1', 'c'), ('t2', 'b')): 3, + }) + + def test_gauge(self): + gauge = self.provider.gauge('commcare.test.gauge', 'Description', tag_names=['t1', 't2']) + gauge.set(4.2, t1='a', t2='b') + gauge.set(2, t1='c', t2='b') + gauge.set(5, t1='c', t2='b') + self.assertGaugeMetric(gauge, { + (('t1', 'a'), ('t2', 'b')): 4.2, + (('t1', 'c'), ('t2', 'b')): 5, + }) + + def assertCounterMetric(self, metric: HqCounter, expected: Dict[Tuple[Tuple[str, str], ...], float]): + """ + :param metric: metric class + :param expected: dict mapping tag tuples to metric values + """ + raise NotImplementedError + + def assertGaugeMetric(self, metric: HqGauge, expected: Dict[Tuple[Tuple[str, str], ...], float]): + """ + :param metric: metric class + :param expected: dict mapping tag tuples to metric values + """ + raise NotImplementedError + + +class TestDatadogMetrics(_TestMetrics): + provider_class = DatadogMetrics + + def setUp(self) -> None: + super().setUp() + self.patch = patch_datadog() + self.recorded_metrics = self.patch.__enter__() + + def tearDown(self) -> None: + self.patch.__exit__(None, None, None) + super().tearDown() + + def test_histogram(self): + histogram = self.provider.histogram( + 'commcare.test.histogram', 'Description', 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] + 
) + histogram.observe(0.2, t1='a', t2='b') + histogram.observe(0.7, t1='a', t2='b') + histogram.observe(2.5, t1='a', t2='b') + + histogram.observe(2, t1='c', t2='b') + histogram.observe(5, t1='c', t2='b') + self.assertHistogramMetric(histogram, { + (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, + (('t1', 'c'), ('t2', 'b')): {3: 1, INF: 1} + }) + + def assertCounterMetric(self, metric, expected): + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) + actual = { + key[1]: sum(val) for key, val in self.recorded_metrics.items() + } + self.assertDictEqual(actual, expected) + + def assertGaugeMetric(self, metric, expected): + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) + actual = { + key[1]: val[-1] for key, val in self.recorded_metrics.items() + } + self.assertDictEqual(actual, expected) + + def assertHistogramMetric(self, metric, expected): + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) + expected_samples = {} + for tags, buckets in expected.items(): + for bucket, val in buckets.items(): + prefix = 'lt' + if bucket == INF: + bucket = metric._buckets[-1] + prefix = 'over' + bucket_tag = (metric._bucket_tag, f'{prefix}_{bucket:03d}{metric._bucket_unit}') + expected_samples[tuple(sorted(tags + (bucket_tag,)))] = val + + actual = { + key[1]: sum(val) for key, val in self.recorded_metrics.items() + } + self.assertDictEqual(actual, expected_samples) + + +class TestPrometheusMetrics(_TestMetrics): + provider_class = PrometheusMetrics + + def test_histogram(self): + histogram = self.provider.histogram( + 'commcare.test.histogram', 'Description', 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] + ) + histogram.observe(0.2, t1='a', t2='b') + histogram.observe(0.7, t1='a', t2='b') + histogram.observe(2.5, t1='a', t2='b') + + histogram.observe(2, t1='c', t2='b') + histogram.observe(5, t1='c', t2='b') + self.assertHistogramMetric(histogram, { + (('t1', 'a'), ('t2', 'b')): {1: 2, 
3: 1}, + (('t1', 'c'), ('t2', 'b')): {2: 1, INF: 1} + }) + + def _samples_to_dics(self, samples, filter_name=None): + """Convert a Sample tuple into a dict((name, (labels tuple)) -> value)""" + return { + tuple(sorted(sample.labels.items())): sample.value + for sample in samples + if not filter_name or sample.name == filter_name + } + + def assertGaugeMetric(self, metric, expected): + [collected] = metric._delegate.collect() + actual = self._samples_to_dics(collected.samples) + self.assertDictEqual(actual, expected) + + def assertCounterMetric(self, metric, expected): + total_name = f'{metric.name}_total' + [collected] = metric._delegate.collect() + actual = self._samples_to_dics(collected.samples, total_name) + self.assertDictEqual(actual, expected) + + def assertHistogramMetric(self, metric, expected): + # Note that Prometheus histograms are cumulative so we must sum up the successive bucket values + # https://en.wikipedia.org/wiki/Histogram#Cumulative_histogram + [collected] = metric._delegate.collect() + + sample_name = f'{metric.name}_bucket' + expected_samples = [] + for key, value in expected.items(): + cumulative_value = 0 + for bucket in metric._buckets: + val = value.get(bucket, 0) + cumulative_value += val + labels = dict(key + (('le', str(float(bucket))),)) + expected_samples.append(Sample(sample_name, labels, float(cumulative_value), None, None)) + + labels = dict(key + (('le', '+Inf'),)) + cumulative_value += value.get(INF, 0) + expected_samples.append(Sample(sample_name, labels, float(cumulative_value), None, None)) + + actual = [ + s for s in collected.samples + if s.name.endswith('bucket') + ] + self.assertListEqual(actual, expected_samples) + + +def test_delegate_lazy(): + metrics = DelegatedMetrics([DatadogMetrics(), PrometheusMetrics()]) + + def _check(metric): + assert isinstance(metric, SimpleLazyObject), '' + + test_cases = [ + metrics.counter('commcare.name.1', ''), + metrics.gauge('commcare.name.2', ''), + 
metrics.histogram('commcare.name.3', '', 'duration'), + ] + for metric in test_cases: + yield _check, metric + + +def test_lazy_recording(): + metrics = DelegatedMetrics([DatadogMetrics(), PrometheusMetrics()]) + + def _check(metric, method_name): + with patch_datadog() as stats: + getattr(metric, method_name)(1) + + dd_metric, prom_metric = metric._delegates + [collected] = prom_metric._delegate.collect() + + eq(len(stats), 1, stats) + eq(len(collected.samples) >= 1, True, collected.samples) + + test_cases = [ + (metrics.counter('commcare.name.1', ''), 'inc'), + (metrics.gauge('commcare.name.2', ''), 'set'), + (metrics.histogram('commcare.name.3', '', 'duration'), 'observe'), + ] + for metric, method_name in test_cases: + yield _check, metric, method_name diff --git a/corehq/util/metrics/tests/utils.py b/corehq/util/metrics/tests/utils.py new file mode 100644 index 000000000000..3c012c0ff9d5 --- /dev/null +++ b/corehq/util/metrics/tests/utils.py @@ -0,0 +1,18 @@ +from collections import defaultdict +from contextlib import contextmanager + +import mock + + +@contextmanager +def patch_datadog(): + def record(fn, name, value, tags=None): + key = tuple([ + name, + tuple(sorted(tuple(t.split(':')) for t in (tags or []))), + ]) + stats[key].append(value) + + stats = defaultdict(list) + with mock.patch("corehq.util.metrics.datadog._datadog_record", new=record): + yield stats diff --git a/requirements-python3/dev-requirements.txt b/requirements-python3/dev-requirements.txt index e8ddb5311179..d8f390f9b4af 100644 --- a/requirements-python3/dev-requirements.txt +++ b/requirements-python3/dev-requirements.txt @@ -130,6 +130,7 @@ pillow==6.2.1 pip-tools==4.4.0 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 prompt-toolkit==1.0.18 # via ipython psutil==5.1.3 psycogreen==1.0.1 diff --git a/requirements-python3/prod-requirements.txt b/requirements-python3/prod-requirements.txt index 2215c01047ed..39794d0cf3f3 100644 --- 
a/requirements-python3/prod-requirements.txt +++ b/requirements-python3/prod-requirements.txt @@ -109,6 +109,7 @@ pickleshare==0.7.5 # via ipython pillow==6.2.1 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 prompt-toolkit==1.0.18 # via ipython psycogreen==1.0.1 psycopg2==2.7.7 diff --git a/requirements-python3/requirements.txt b/requirements-python3/requirements.txt index b7e4f2fb9c4b..313cdf9fe778 100644 --- a/requirements-python3/requirements.txt +++ b/requirements-python3/requirements.txt @@ -101,6 +101,7 @@ phonenumberslite==8.10.22 pillow==6.2.1 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 psycogreen==1.0.1 psycopg2==2.7.7 py-kissmetrics==1.0.1 diff --git a/requirements-python3/test-requirements.txt b/requirements-python3/test-requirements.txt index 7daa4ecef200..0c55274d867c 100644 --- a/requirements-python3/test-requirements.txt +++ b/requirements-python3/test-requirements.txt @@ -113,6 +113,7 @@ pillow==6.2.1 pip-tools==4.4.0 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 psycogreen==1.0.1 psycopg2==2.7.7 py-kissmetrics==1.0.1 diff --git a/requirements/dev-requirements.txt b/requirements/dev-requirements.txt index e8ddb5311179..d8f390f9b4af 100644 --- a/requirements/dev-requirements.txt +++ b/requirements/dev-requirements.txt @@ -130,6 +130,7 @@ pillow==6.2.1 pip-tools==4.4.0 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 prompt-toolkit==1.0.18 # via ipython psutil==5.1.3 psycogreen==1.0.1 diff --git a/requirements/prod-requirements.txt b/requirements/prod-requirements.txt index 2215c01047ed..39794d0cf3f3 100644 --- a/requirements/prod-requirements.txt +++ b/requirements/prod-requirements.txt @@ -109,6 +109,7 @@ pickleshare==0.7.5 # via ipython pillow==6.2.1 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 prompt-toolkit==1.0.18 # via ipython psycogreen==1.0.1 psycopg2==2.7.7 diff --git a/requirements/requirements.in 
b/requirements/requirements.in index 1e555dd475c3..96b3a9eb92db 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -114,3 +114,4 @@ werkzeug==0.11.15 CommcareTranslationChecker==0.9.3.5 WeasyPrint==0.42.3 architect==0.5.6 +prometheus-client==0.7.1 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index b7e4f2fb9c4b..313cdf9fe778 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -101,6 +101,7 @@ phonenumberslite==8.10.22 pillow==6.2.1 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 psycogreen==1.0.1 psycopg2==2.7.7 py-kissmetrics==1.0.1 diff --git a/requirements/test-requirements.txt b/requirements/test-requirements.txt index 7daa4ecef200..0c55274d867c 100644 --- a/requirements/test-requirements.txt +++ b/requirements/test-requirements.txt @@ -113,6 +113,7 @@ pillow==6.2.1 pip-tools==4.4.0 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 psycogreen==1.0.1 psycopg2==2.7.7 py-kissmetrics==1.0.1 diff --git a/settings.py b/settings.py index eaa6db1f5837..05431414ab78 100755 --- a/settings.py +++ b/settings.py @@ -340,6 +340,7 @@ 'corehq.motech.openmrs', 'corehq.motech.repeaters', 'corehq.util', + 'corehq.util.datadog.apps.DatadogConfig', 'dimagi.ext', 'corehq.blobs', 'corehq.apps.case_search', @@ -837,6 +838,7 @@ DATADOG_API_KEY = None DATADOG_APP_KEY = None +ENABLE_PROMETHEUS_METRICS = False SYNCLOGS_SQL_DB_ALIAS = 'default' @@ -2073,19 +2075,6 @@ if 'dummy' not in CACHES: CACHES['dummy'] = {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'} -try: - from datadog import initialize -except ImportError: - pass -else: - initialize(DATADOG_API_KEY, DATADOG_APP_KEY) - -if UNIT_TESTING or DEBUG or 'ddtrace.contrib.django' not in INSTALLED_APPS: - try: - from ddtrace import tracer - tracer.enabled = False - except ImportError: - pass REST_FRAMEWORK = { 'DATETIME_FORMAT': '%Y-%m-%dT%H:%M:%S.%fZ',