Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wip] prometheus monitoring support #26837

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions corehq/apps/api/odata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from corehq.apps.export.models import ExportInstance
from corehq.util.datadog.gauges import datadog_counter
from corehq.util.datadog.utils import bucket_value
from corehq.util.metrics import metrics

FieldMetadata = namedtuple('FieldMetadata', ['name', 'odata_type'])

Expand Down Expand Up @@ -54,6 +55,13 @@ def _get_dot_path(export_column):
return metadata


# Histogram of OData feed access, bucketed by request duration into tags
# (e.g. duration_bucket:lt_5s). Per-request tag values are bound via
# `.tag(...)` before `.observe(duration)` is called.
odata_feed_access_histogram = metrics.histogram(
    'commcare.odata_feed.test_v3', 'Odata Feed Access',
    bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
    tag_names=['domain', 'feed_id', 'feed_type', 'username', 'row_count', 'column_count', 'size']
)


def record_feed_access_in_datadog(request, config_id, duration, response):
config = ExportInstance.get(config_id)
username = request.couch_user.username
Expand All @@ -64,14 +72,12 @@ def record_feed_access_in_datadog(request, config_id, duration, response):
column_count = len(rows[0])
except IndexError:
column_count = 0
datadog_counter('commcare.odata_feed.test_v3', tags=[
'domain:{}'.format(request.domain),
'feed_id:{}'.format(config_id),
'feed_type:{}'.format(config.type),
'username:{}'.format(username),
'row_count:{}'.format(row_count),
'column_count:{}'.format(column_count),
'size:{}'.format(len(response.content)),
'duration:{}'.format(duration),
'duration_bucket:{}'.format(bucket_value(duration, (1, 5, 20, 60, 120, 300, 600), 's')),
])
odata_feed_access_histogram.tag(
domain=request.domain,
feed_id=config_id,
feed_type=config.type,
username=username,
row_count=row_count,
column_count=column_count,
size=len(response.content),
).observe(duration)
69 changes: 43 additions & 26 deletions corehq/apps/receiverwrapper/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import couchforms
from casexml.apps.case.xform import get_case_updates, is_device_report
from corehq.util.metrics import metrics
from couchforms import openrosa_response
from couchforms.const import MAGIC_PROPERTY, BadRequest
from couchforms.getters import MultimediaBug
Expand Down Expand Up @@ -50,18 +51,38 @@
convert_xform_to_json,
should_use_sql_backend,
)
from corehq.util.datadog.gauges import datadog_counter, datadog_gauge
from corehq.util.datadog.metrics import (
MULTIMEDIA_SUBMISSION_ERROR_COUNT,
XFORM_LOCKED_COUNT,
)
from corehq.util.datadog.utils import bucket_value
from corehq.util.timer import TimingContext

# Probability (0.0 - 1.0) that any single submission request gets profiled.
PROFILE_PROBABILITY = float(os.getenv('COMMCARE_PROFILE_SUBMISSION_PROBABILITY', 0))
# Maximum number of submissions to profile; defaults to 1 when the env var is unset.
PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT')
PROFILE_LIMIT = int(PROFILE_LIMIT) if PROFILE_LIMIT is not None else 1

corrupt_multimedia_counter = metrics.counter(
'commcare.corrupt_multimedia_submissions', 'Count of requests with corrupt multimedia',
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This name is changed

- commcare.corrupt-multimedia-submission.error.count
+ commcare.corrupt_multimedia_submissions

I changed this because .error seemed redundant since it's implied by the fact that it's 'corrupt'. I dropped the 'count' suffix because Prometheus adds a _total suffix to the end of counter metric names. If we want to keep the suffix we should add it to all 'counter' metrics at the provider level.

tag_names=['domain', 'authenticated']
)

xform_locked_error_counter = metrics.counter(
    'commcare.xformlocked.count', 'Count of locking errors',
    tag_names=['domain', 'authenticated']
)

submission_counter = metrics.counter(
    'commcare.xform_submissions.count', 'Count of form submissions',
    tag_names=['domain', 'backend', 'submission_type', 'status_code']
)

# NOTE(review): the two histogram descriptions below were copy-pasted as
# 'Count of form submissions'; corrected to describe what each one measures.
submission_duration_histogram = metrics.histogram(
    'commcare.xform_submissions.duration.seconds', 'Duration of form submissions',
    bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
    tag_names=['domain', 'backend', 'submission_type', 'status_code']
)

# Lag is computed elsewhere as received_on - metadata.timeEnd, in days.
submission_lag_histogram = metrics.histogram(
    'commcare.xform_submissions.lag.days',
    'Lag between form completion and receipt on the server',
    bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d',
    tag_names=['domain', 'backend', 'submission_type', 'status_code']
)

@profile_prod('commcare_receiverwapper_process_form.prof', probability=PROFILE_PROBABILITY, limit=PROFILE_LIMIT)
def _process_form(request, domain, app_id, user_id, authenticated,
Expand All @@ -70,10 +91,10 @@ def _process_form(request, domain, app_id, user_id, authenticated,
if rate_limit_submission(domain):
return HttpTooManyRequests()

metric_tags = [
'backend:sql' if should_use_sql_backend(domain) else 'backend:couch',
'domain:{}'.format(domain),
]
metric_tags = {
'backend': 'sql' if should_use_sql_backend(domain) else 'couch',
'domain': domain
}

try:
instance, attachments = couchforms.get_instance_and_attachment(request)
Expand All @@ -85,9 +106,9 @@ def _process_form(request, domain, app_id, user_id, authenticated,
except:
meta = {}

corrupt_multimedia_counter.tag(domain=domain, authenticated=authenticated).inc()
return _submission_error(
request, "Received a submission with POST.keys()",
MULTIMEDIA_SUBMISSION_ERROR_COUNT, metric_tags,
request, "Received a submission with POST.keys()", metric_tags,
domain, app_id, user_id, authenticated, meta,
)

Expand Down Expand Up @@ -133,8 +154,9 @@ def _process_form(request, domain, app_id, user_id, authenticated,
try:
result = submission_post.run()
except XFormLockError as err:
xform_locked_error_counter.tag(domain=domain, authenticated=authenticated).inc()
return _submission_error(
request, "XFormLockError: %s" % err, XFORM_LOCKED_COUNT,
request, "XFormLockError: %s" % err,
metric_tags, domain, app_id, user_id, authenticated, status=423,
notify=False,
)
Expand All @@ -145,7 +167,7 @@ def _process_form(request, domain, app_id, user_id, authenticated,
return response


def _submission_error(request, message, count_metric, metric_tags,
def _submission_error(request, message, metric_tags,
domain, app_id, user_id, authenticated, meta=None, status=400,
notify=True):
"""Notify exception, datadog count, record metrics, construct response
Expand All @@ -157,7 +179,6 @@ def _submission_error(request, message, count_metric, metric_tags,
"domain:{}".format(domain),
"authenticated:{}".format(authenticated),
]
datadog_counter(count_metric, tags=details)
if notify:
details.extend([
"user_id:{}".format(user_id),
Expand All @@ -172,24 +193,20 @@ def _submission_error(request, message, count_metric, metric_tags,


def _record_metrics(tags, submission_type, response, timer=None, xform=None):
if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on:
lag = xform.received_on - xform.metadata.timeEnd
lag_days = lag.total_seconds() / 86400
tags += [
'lag:%s' % bucket_value(lag_days, (1, 2, 4, 7, 14, 31, 90), 'd')
]

tags += [
'submission_type:{}'.format(submission_type),
'status_code:{}'.format(response.status_code)
]

if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on:
lag = xform.received_on - xform.metadata.timeEnd
lag_days = lag.total_seconds() / 86400
submission_lag_histogram.tag(**tags).observe(lag_days)

if timer:
tags += [
'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's'),
]
submission_duration_histogram.tag(**tags).observe(timer.duration)

datadog_counter('commcare.xform_submissions.count', tags=tags)
submission_counter.tag(**tags).inc()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an example of a metric where we have attached multiple histogram bucket tags to a single metric. If we want to convert these histograms to the native histogram types then they must be separated out as is done here.



@location_safe
Expand Down
27 changes: 27 additions & 0 deletions corehq/util/datadog/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from django.apps import AppConfig
from django.conf import settings


class DatadogConfig(AppConfig):
    """Django app config that initializes the Datadog client at startup.

    NOTE(review): the early return below (when either key is missing) also
    skips the ddtrace-disabling logic further down — confirm that tracing
    should stay enabled in environments without Datadog keys.
    """

    name = 'corehq.util.datadog'
    verbose_name = 'Datadog'

    def ready(self):
        # Without both keys the Datadog API cannot be initialized; do nothing.
        if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY:
            return

        try:
            from datadog import initialize
        except ImportError:
            # The datadog package is optional; skip initialization when absent.
            pass
        else:
            initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY)

        # Disable APM tracing during tests/debug, or when the ddtrace Django
        # integration is not listed in INSTALLED_APPS.
        if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS:
            try:
                from ddtrace import tracer
                tracer.enabled = False
            except ImportError:
                pass

snopoke marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 0 additions & 2 deletions corehq/util/datadog/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@
ERROR_COUNT = 'commcare.error.count'
REPEATER_ERROR_COUNT = 'commcare.repeaters.error'
REPEATER_SUCCESS_COUNT = 'commcare.repeaters.success'
MULTIMEDIA_SUBMISSION_ERROR_COUNT = 'commcare.corrupt-multimedia-submission.error.count'
DATE_OPENED_CASEBLOCK_ERROR_COUNT = 'commcare.date-opened-caseblock-bug.error.count'
XFORM_LOCKED_COUNT = 'commcare.xformlocked.count'
2 changes: 1 addition & 1 deletion corehq/util/datadog/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def bucket_value(value, buckets, unit=''):
lt_template = "lt_{:0%s}{}" % number_length
over_template = "over_{:0%s}{}" % number_length
for bucket in buckets:
if value < bucket:
if value <= bucket:
snopoke marked this conversation as resolved.
Show resolved Hide resolved
return lt_template.format(bucket, unit)
return over_template.format(buckets[-1], unit)

Expand Down
15 changes: 15 additions & 0 deletions corehq/util/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from django.utils.functional import SimpleLazyObject

from corehq.util.metrics.datadog import DatadogMetrics
from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, HqMetrics
snopoke marked this conversation as resolved.
Show resolved Hide resolved
from corehq.util.metrics.prometheus import PrometheusMetrics


def _get_metrics():
    """Return the list of enabled metrics providers.

    Falls back to a DummyMetrics instance when no provider is configured so
    callers can always record metrics unconditionally. Evaluated lazily (via
    SimpleLazyObject) so Django settings are fully loaded before `enabled()`
    is consulted.
    """
    providers = [DatadogMetrics(), PrometheusMetrics()]
    enabled = [provider for provider in providers if provider.enabled()]
    return enabled or [DummyMetrics()]


metrics = DelegatedMetrics(SimpleLazyObject(_get_metrics)) # singleton/global
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially I had this as a function get_metrics() but after making it lazy there wasn't a need for the function. This will only call the _get_metrics function when a metric class is actually used. This is needed to make sure that Django is fully set up before determining if the metrics providers are enabled (since they rely on settings).

m = metrics.counter('name', 'description')  # still lazy
m.inc()  # now it's evaluated

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean that DelegatedMetrics is always used even when there is only a single "metrics" object to delegate to? That seems wasteful. Would it be possible to only delegate if there is more than one (I would expect that to be the exceptional case).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it does mean that. The alternative is to split DelegatedMetrics into two classes: DelegatedMetrics and LazyMetrics and then combine them as needed (the laziness is always needed).

85 changes: 85 additions & 0 deletions corehq/util/metrics/datadog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import logging

from django.conf import settings

from corehq.util.datadog.utils import bucket_value
from corehq.util.metrics.metrics import (
HqCounter,
HqGauge,
HqHistogram,
HqMetrics,
)
from datadog import api
from datadog.dogstatsd.base import DogStatsd

datadog_logger = logging.getLogger('datadog')

# Tags attached to every metric emitted by this process.
COMMON_TAGS = ['environment:{}'.format(settings.SERVER_ENVIRONMENT)]

statsd = DogStatsd(constant_tags=COMMON_TAGS)


def _format_tags(tag_names, tag_values):
if not tag_names or not tag_values:
return None

return [
f'{name}:{value}' for name, value in zip(tag_names, tag_values)
]


def _datadog_record(fn, name, value, tags=None):
    """Invoke a dogstatsd recording function, logging (not raising) failures.

    Metric reporting must never break the calling request, so any exception
    is swallowed after being logged to the 'datadog' logger.
    """
    try:
        fn(name, value, tags=tags)
    except Exception:
        datadog_logger.exception('Unable to record Datadog stats')


class Counter(HqCounter):
def _record(self, amount: float):
tags = _format_tags(self.tag_names, self.tag_values)
_datadog_record(statsd.increment, self.name, amount, tags)


class Gauge(HqGauge):
def _record(self, value):
tags = _format_tags(self.tag_names, self.tag_values)
_datadog_record(statsd.gauge, self.name, value, tags)


class Histogram(HqHistogram):
    """Histogram emulated via Datadog counter metrics with bucketed tags.

    This does NOT use the native Datadog histogram metric type. Each call to
    `observe` increments the metric by 1 and attaches a tag naming the bucket
    the observed value falls into.

    For example:

        h = Histogram(
            'commcare.request.duration', 'description',
            bucket_tag='duration', buckets=[1,2,3], bucket_units='ms'
        )
        h.observe(1.4)

        # resulting Datadog metric
        # commcare.request.duration:1|c|#duration:lt_2ms

    More details: https://help.datadoghq.com/hc/en-us/articles/211545826
    """
    def _record(self, value: float):
        # Start from the regular tags (None when there are none), then add
        # the synthetic bucket tag derived from the observed value.
        tags = _format_tags(self.tag_names, self.tag_values) or []
        bucket = bucket_value(value, self._buckets, self._bucket_unit)
        tags.append(f'{self._bucket_tag}:{bucket}')
        _datadog_record(statsd.increment, self.name, 1, tags)


class DatadogMetrics(HqMetrics):
    """Metrics provider backed by Datadog (dogstatsd)."""
    _counter_class = Counter
    _gauge_class = Gauge
    _histogram_class = Histogram

    def enabled(self) -> bool:
        # api._api_key / api._application_key are strings or None; coerce to
        # bool so the return value matches the annotation instead of leaking
        # a key string (or None) to callers.
        return bool(api._api_key and api._application_key)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't look like it will always return bool (it will probably return a string or None). If you want a bool you could do

return bool(api._api_key and api._application_key)

But that doesn't seem very Pythonic. I'd probably drop the type hint.

However, I'm not very familiar with the conventions around type hints. Does -> bool imply that returned object can be treated like a bool (most objects support that), or that the returned value is always True or False? This is one reason I'm not a fan of type hints in Python.

Loading