mozilla-services · willkg · May 24, 2024 · May 24, 2024 · willkg · May 24, 2024
@@ -59,7 +59,7 @@
 
 
 def count_sentry_scrub_error(msg):
-    METRICS.incr("app.sentry_scrub_error", value=1)
+    METRICS.incr("collector.sentry_scrub_error", value=1)
 
 
 def configure_sentry(app_config):

@@ -143,7 +143,7 @@ def extract_payload(self, req):
 
         # Decompress payload if it's compressed
         if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
-            METRICS.incr("breakpad_resource.gzipped_crash")
+            METRICS.incr("collector.breakpad_resource.gzipped_crash")
             crash_report.payload_compressed = "1"
 
             # If the content is gzipped, we pull it out and decompress it. We
@@ -154,13 +154,13 @@ def extract_payload(self, req):
             try:
                 data = zlib.decompress(req.stream.read(content_length), gzip_header)
                 METRICS.histogram(
-                    "breakpad_resource.gzipped_crash_decompress",
+                    "collector.breakpad_resource.gzipped_crash_decompress",
                     value=(time.perf_counter() - start_time) * 1000.0,
                     tags=["result:success"],
                 )
             except zlib.error as exc:
                 METRICS.histogram(
-                    "breakpad_resource.gzipped_crash_decompress",
+                    "collector.breakpad_resource.gzipped_crash_decompress",
                     value=(time.perf_counter() - start_time) * 1000.0,
                     tags=["result:fail"],
                 )
@@ -178,15 +178,15 @@ def extract_payload(self, req):
 
             data = io.BytesIO(data)
             METRICS.histogram(
-                "breakpad_resource.crash_size",
+                "collector.breakpad_resource.crash_size",
                 value=content_length,
                 tags=["payload:compressed"],
             )
 
         else:
             data = req.bounded_stream
             METRICS.histogram(
-                "breakpad_resource.crash_size",
+                "collector.breakpad_resource.crash_size",
                 value=content_length,
                 tags=["payload:uncompressed"],
             )
@@ -261,7 +261,7 @@ def extract_payload(self, req):
 
         if has_json and has_kvpairs:
             # If the crash payload has both kvpairs and a JSON blob, then it's malformed
-            # so we add a note and log it.
+            # so we add a note and log it, but we don't reject it
             msg = "includes annotations in both json-encoded extra and formdata parts"
             LOGGER.info(msg)
             crash_report.notes.append(msg)
@@ -302,7 +302,7 @@ def cleanup_crash_report(self, raw_crash):
                 del raw_crash[bad_field]
                 notes.append("Removed %s from raw crash." % bad_field)
 
-    @METRICS.timer_decorator("breakpad_resource.on_post.time")
+    @METRICS.timer_decorator("collector.breakpad_resource.on_post.time")
     def on_post(self, req, resp):
         """Handle incoming HTTP POSTs.
 
@@ -324,12 +324,14 @@ def on_post(self, req, resp):
         except MalformedCrashReport as exc:
             # If this is malformed, then reject it with malformed error code.
             msg = str(exc)
-            METRICS.incr("breakpad_resource.malformed", tags=["reason:%s" % msg])
+            METRICS.incr(
+                "collector.breakpad_resource.malformed", tags=["reason:%s" % msg]
+            )
             resp.status = falcon.HTTP_400
             resp.text = "Discarded=malformed_%s" % msg
             return
 
-        METRICS.incr("breakpad_resource.incoming_crash")
+        METRICS.incr("collector.breakpad_resource.incoming_crash")
 
         raw_crash = crash_report.annotations
 
@@ -380,9 +382,11 @@ def on_post(self, req, resp):
             rule_name,
             RESULT_TO_TEXT[throttle_result],
         )
-        METRICS.incr("breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name])
         METRICS.incr(
-            "breakpad_resource.throttle",
+            "collector.breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name]
+        )
+        METRICS.incr(
+            "collector.breakpad_resource.throttle",
             tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()],
         )
         raw_crash["metadata"]["throttle_rule"] = rule_name

@@ -17,7 +17,7 @@
 def _incr_wait_generator(counter, attempts, sleep_seconds):
     def _generator_generator():
         for _ in range(attempts - 1):
-            METRICS.incr(f"crashmover.{counter}")
+            METRICS.incr(f"collector.crashmover.{counter}")
             yield sleep_seconds
 
     return _generator_generator
@@ -124,7 +124,7 @@ def check_health(self, state):
         if hasattr(self.crashpublish, "check_health"):
             self.crashpublish.check_health(state)
 
-    @METRICS.timer("crashmover.crash_handling.time")
+    @METRICS.timer("collector.crashmover.crash_handling.time")
     def handle_crashreport(self, raw_crash, dumps, crash_id):
         """Handle a new crash report synchronously and return whether that succeeded.
 
@@ -143,27 +143,27 @@ def handle_crashreport(self, raw_crash, dumps, crash_id):
         except MaxAttemptsError:
             # After max attempts, we give up on this crash
             LOGGER.error("%s: too many errors trying to save; dropped", crash_id)
-            METRICS.incr("crashmover.save_crash_dropped.count")
+            METRICS.incr("collector.crashmover.save_crash_dropped.count")
             return False
 
         try:
             self.crashmover_publish_with_retry(crash_report)
-            METRICS.incr("crashmover.save_crash.count")
+            METRICS.incr("collector.crashmover.save_crash.count")
         except MaxAttemptsError:
             LOGGER.error("%s: too many errors trying to publish; dropped", crash_id)
-            METRICS.incr("crashmover.publish_crash_dropped.count")
+            METRICS.incr("collector.crashmover.publish_crash_dropped.count")
             # return True even when publish fails because it will be automatically
             # published later via self-healing mechanisms
 
         return True
 
-    @METRICS.timer("crashmover.crash_save.time")
+    @METRICS.timer("collector.crashmover.crash_save.time")
     def crashmover_save(self, crash_report):
         """Save crash report to storage."""
         self.crashstorage.save_crash(crash_report)
         LOGGER.info("%s saved", crash_report.crash_id)
 
-    @METRICS.timer("crashmover.crash_publish.time")
+    @METRICS.timer("collector.crashmover.crash_publish.time")
     def crashmover_publish(self, crash_report):
         """Publish crash_id in publish queue."""
         self.crashpublish.publish_crash(crash_report)

@@ -17,7 +17,7 @@ class BrokenResource:
 
     def on_get(self, req, resp):
         """Implement GET HTTP request."""
-        METRICS.incr("health.broken.count")
+        METRICS.incr("collector.health.broken.count")
         # This is intentional breakage
         raise Exception("intentional exception")
 
@@ -30,7 +30,7 @@ def __init__(self, basedir):
 
     def on_get(self, req, resp):
         """Implement GET HTTP request."""
-        METRICS.incr("health.version.count")
+        METRICS.incr("collector.health.version.count")
         version_info = get_version_info(self.basedir)
         # FIXME(willkg): there's no cloud provider environment variable to use, so
         # we'll cheat and look at whether there's a "gcs" in
@@ -52,7 +52,7 @@ class LBHeartbeatResource:
 
     def on_get(self, req, resp):
         """Implement GET HTTP request."""
-        METRICS.incr("health.lbheartbeat.count")
+        METRICS.incr("collector.health.lbheartbeat.count")
         resp.content_type = "application/json; charset=utf-8"
         resp.status = falcon.HTTP_200
 
@@ -94,7 +94,7 @@ def __init__(self, app):
 
     def on_get(self, req, resp):
         """Implement GET HTTP request."""
-        METRICS.incr("health.heartbeat.count")
+        METRICS.incr("collector.health.heartbeat.count")
         state = HealthState()
 
         # So we're going to think of Antenna like a big object graph and
@@ -107,7 +107,7 @@ def on_get(self, req, resp):
 
         # Go through and call gauge for each statsd item.
         for key, value in state.statsd.items():
-            METRICS.gauge(f"health.{key}", value=value)
+            METRICS.gauge(f"collector.health.{key}", value=value)
 
         if state.is_healthy():
             resp.status = falcon.HTTP_200

@@ -13,7 +13,7 @@
 _IS_MARKUS_SETUP = False
 
 LOGGER = logging.getLogger(__name__)
-METRICS = markus.get_metrics()
+METRICS = markus.get_metrics("socorro")
 
 
 def setup_metrics(statsd_host, statsd_port, hostname, debug=False):

@@ -19,6 +19,34 @@
 from testlib.mini_poster import compress, multipart_encode
 
 
+class AnyTagValue:
+    """Matches a markus metrics tag with any value"""
+
+    def __init__(self, key):
+        self.key = key
+
+    def __repr__(self):
+        return f"<AnyTagValue {self.key}>"
+
+    def get_other_key(self, other):
+        # This is comparing against a tag string
+        if ":" in other:
+            other_key, _ = other.split(":")
+        else:
+            other_key = other
+        return other_key
+
+    def __eq__(self, other):
+        if isinstance(other, AnyTagValue):
+            return self.key == other.key
+        return self.key == self.get_other_key(other)
+
+    def __lt__(self, other):
+        if isinstance(other, AnyTagValue):
+            return self.key < other.key
+        return self.key < self.get_other_key(other)
+
+
 class FakeCrashMover:
     """Fake crash mover that raises an error when used"""
 
@@ -346,13 +374,14 @@ def test_extract_payload_invalid_json_not_dict(self, request_generator):
         with pytest.raises(MalformedCrashReport, match="invalid_json_value"):
             bsp.extract_payload(req)
 
-    def text_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
-        # If there's a JSON blob and also kv pairs, then that's a malformed
-        # crash
+    def test_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
+        # If there's a JSON blob and also kv pairs, use the annotations from "extra" and
+        # log a note
         data, headers = multipart_encode(
             {
                 "extra": '{"ProductName":"Firefox","Version":"1.0"}',
-                "BadKey": "BadValue",
+                # This annotation is dropped because it's not in "extra"
+                "IgnoredAnnotation": "someval",
                 "upload_file_minidump": ("fakecrash.dump", io.BytesIO(b"abcd1234")),
             }
         )
@@ -363,10 +392,20 @@ def text_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
         bsp = BreakpadSubmitterResource(
             config=EMPTY_CONFIG, crashmover=FakeCrashMover()
         )
-        with metricsmock as metrics:
-            result = bsp.extract_payload(req)
-            assert result == ({}, {})
-            assert metrics.has_record(stat="malformed", tags=["reason:has_json_and_kv"])
+        crash_report = CrashReport(
+            annotations={
+                "ProductName": "Firefox",
+                "Version": "1.0",
+            },
+            dumps={"upload_file_minidump": b"abcd1234"},
+            notes=[
+                "includes annotations in both json-encoded extra and formdata parts"
+            ],
+            payload="json",
+            payload_compressed="0",
+            payload_size=542,
+        )
+        assert bsp.extract_payload(req) == crash_report
 
 
 @pytest.mark.parametrize(
@@ -405,7 +444,7 @@ def test_get_throttle_result(client):
 
 
 class TestBreakpadSubmitterResourceIntegration:
-    def test_submit_crash_report(self, client):
+    def test_submit_crash_report(self, client, metricsmock):
         data, headers = multipart_encode(
             {
                 "ProductName": "Firefox",
@@ -415,7 +454,9 @@ def test_submit_crash_report(self, client):
             }
         )
 
-        result = client.simulate_post("/submit", headers=headers, body=data)
+        with metricsmock as mm:
+            result = client.simulate_post("/submit", headers=headers, body=data)
+
         assert result.status_code == 200
         assert result.headers["Content-Type"].startswith("text/plain")
         assert result.content.startswith(b"CrashID=bp")
@@ -446,6 +487,22 @@ def test_submit_crash_report(self, client):
             "version": 2,
         }
 
+        mm.assert_histogram("socorro.collector.breakpad_resource.crash_size", value=632)
+        mm.assert_incr("socorro.collector.breakpad_resource.incoming_crash")
+        mm.assert_incr(
+            "socorro.collector.breakpad_resource.throttle_rule",
+            tags=["rule:is_nightly", AnyTagValue("host")],
+        )
+        mm.assert_incr(
+            "socorro.collector.breakpad_resource.throttle",
+            tags=["result:accept", AnyTagValue("host")],
+        )
+        mm.assert_timing("socorro.collector.crashmover.crash_save.time")
+        mm.assert_timing("socorro.collector.crashmover.crash_publish.time")
+        mm.assert_incr("socorro.collector.crashmover.save_crash.count")
+        mm.assert_timing("socorro.collector.crashmover.crash_handling.time")
+        mm.assert_timing("socorro.collector.breakpad_resource.on_post.time")
+
     def test_existing_uuid(self, client):
         """Verify if the crash report has a uuid already, it's reused."""
         crash_id = "de1bb258-cbbf-4589-a673-34f800160918"

@@ -146,4 +146,4 @@ def test_count_sentry_scrub_error():
     with MetricsMock() as metricsmock:
         metricsmock.clear_records()
         count_sentry_scrub_error("foo")
-        metricsmock.assert_incr("app.sentry_scrub_error", value=1)
+        metricsmock.assert_incr("socorro.collector.sentry_scrub_error", value=1)