Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bug-1898345: fix metrics key prefix #1020

Merged
merged 1 commit into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion antenna/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@


def count_sentry_scrub_error(msg):
METRICS.incr("app.sentry_scrub_error", value=1)
METRICS.incr("collector.sentry_scrub_error", value=1)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This changes the key to socorro.collector.sentry_scrub_error which is more like the other things we're doing. I'll update the dashboard once this lands.



def configure_sentry(app_config):
Expand Down
26 changes: 15 additions & 11 deletions antenna/breakpad_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def extract_payload(self, req):

# Decompress payload if it's compressed
if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
METRICS.incr("breakpad_resource.gzipped_crash")
METRICS.incr("collector.breakpad_resource.gzipped_crash")
crash_report.payload_compressed = "1"

# If the content is gzipped, we pull it out and decompress it. We
Expand All @@ -154,13 +154,13 @@ def extract_payload(self, req):
try:
data = zlib.decompress(req.stream.read(content_length), gzip_header)
METRICS.histogram(
"breakpad_resource.gzipped_crash_decompress",
"collector.breakpad_resource.gzipped_crash_decompress",
value=(time.perf_counter() - start_time) * 1000.0,
tags=["result:success"],
)
except zlib.error as exc:
METRICS.histogram(
"breakpad_resource.gzipped_crash_decompress",
"collector.breakpad_resource.gzipped_crash_decompress",
value=(time.perf_counter() - start_time) * 1000.0,
tags=["result:fail"],
)
Expand All @@ -178,15 +178,15 @@ def extract_payload(self, req):

data = io.BytesIO(data)
METRICS.histogram(
"breakpad_resource.crash_size",
"collector.breakpad_resource.crash_size",
value=content_length,
tags=["payload:compressed"],
)

else:
data = req.bounded_stream
METRICS.histogram(
"breakpad_resource.crash_size",
"collector.breakpad_resource.crash_size",
value=content_length,
tags=["payload:uncompressed"],
)
Expand Down Expand Up @@ -261,7 +261,7 @@ def extract_payload(self, req):

if has_json and has_kvpairs:
# If the crash payload has both kvpairs and a JSON blob, then it's malformed
# so we add a note and log it.
# so we add a note and log it, but we don't reject it
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I clarified this here because it used to be that we rejected these crash reports, then we changed the behavior and current me wants future me to be less confused if I have to look at this again.

msg = "includes annotations in both json-encoded extra and formdata parts"
LOGGER.info(msg)
crash_report.notes.append(msg)
Expand Down Expand Up @@ -302,7 +302,7 @@ def cleanup_crash_report(self, raw_crash):
del raw_crash[bad_field]
notes.append("Removed %s from raw crash." % bad_field)

@METRICS.timer_decorator("breakpad_resource.on_post.time")
@METRICS.timer_decorator("collector.breakpad_resource.on_post.time")
def on_post(self, req, resp):
"""Handle incoming HTTP POSTs.

Expand All @@ -324,12 +324,14 @@ def on_post(self, req, resp):
except MalformedCrashReport as exc:
# If this is malformed, then reject it with malformed error code.
msg = str(exc)
METRICS.incr("breakpad_resource.malformed", tags=["reason:%s" % msg])
METRICS.incr(
"collector.breakpad_resource.malformed", tags=["reason:%s" % msg]
)
resp.status = falcon.HTTP_400
resp.text = "Discarded=malformed_%s" % msg
return

METRICS.incr("breakpad_resource.incoming_crash")
METRICS.incr("collector.breakpad_resource.incoming_crash")

raw_crash = crash_report.annotations

Expand Down Expand Up @@ -380,9 +382,11 @@ def on_post(self, req, resp):
rule_name,
RESULT_TO_TEXT[throttle_result],
)
METRICS.incr("breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name])
METRICS.incr(
"breakpad_resource.throttle",
"collector.breakpad_resource.throttle_rule", tags=["rule:%s" % rule_name]
)
METRICS.incr(
"collector.breakpad_resource.throttle",
tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()],
)
raw_crash["metadata"]["throttle_rule"] = rule_name
Expand Down
14 changes: 7 additions & 7 deletions antenna/crashmover.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
def _incr_wait_generator(counter, attempts, sleep_seconds):
def _generator_generator():
for _ in range(attempts - 1):
METRICS.incr(f"crashmover.{counter}")
METRICS.incr(f"collector.crashmover.{counter}")
yield sleep_seconds

return _generator_generator
Expand Down Expand Up @@ -124,7 +124,7 @@ def check_health(self, state):
if hasattr(self.crashpublish, "check_health"):
self.crashpublish.check_health(state)

@METRICS.timer("crashmover.crash_handling.time")
@METRICS.timer("collector.crashmover.crash_handling.time")
def handle_crashreport(self, raw_crash, dumps, crash_id):
"""Handle a new crash report synchronously and return whether that succeeded.

Expand All @@ -143,27 +143,27 @@ def handle_crashreport(self, raw_crash, dumps, crash_id):
except MaxAttemptsError:
# After max attempts, we give up on this crash
LOGGER.error("%s: too many errors trying to save; dropped", crash_id)
METRICS.incr("crashmover.save_crash_dropped.count")
METRICS.incr("collector.crashmover.save_crash_dropped.count")
return False

try:
self.crashmover_publish_with_retry(crash_report)
METRICS.incr("crashmover.save_crash.count")
METRICS.incr("collector.crashmover.save_crash.count")
except MaxAttemptsError:
LOGGER.error("%s: too many errors trying to publish; dropped", crash_id)
METRICS.incr("crashmover.publish_crash_dropped.count")
METRICS.incr("collector.crashmover.publish_crash_dropped.count")
# return True even when publish fails because it will be automatically
# published later via self-healing mechanisms

return True

@METRICS.timer("crashmover.crash_save.time")
@METRICS.timer("collector.crashmover.crash_save.time")
def crashmover_save(self, crash_report):
"""Save crash report to storage."""
self.crashstorage.save_crash(crash_report)
LOGGER.info("%s saved", crash_report.crash_id)

@METRICS.timer("crashmover.crash_publish.time")
@METRICS.timer("collector.crashmover.crash_publish.time")
def crashmover_publish(self, crash_report):
"""Publish crash_id in publish queue."""
self.crashpublish.publish_crash(crash_report)
Expand Down
10 changes: 5 additions & 5 deletions antenna/health_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class BrokenResource:

def on_get(self, req, resp):
"""Implement GET HTTP request."""
METRICS.incr("health.broken.count")
METRICS.incr("collector.health.broken.count")
# This is intentional breakage
raise Exception("intentional exception")

Expand All @@ -30,7 +30,7 @@ def __init__(self, basedir):

def on_get(self, req, resp):
"""Implement GET HTTP request."""
METRICS.incr("health.version.count")
METRICS.incr("collector.health.version.count")
version_info = get_version_info(self.basedir)
# FIXME(willkg): there's no cloud provider environment variable to use, so
# we'll cheat and look at whether there's a "gcs" in
Expand All @@ -52,7 +52,7 @@ class LBHeartbeatResource:

def on_get(self, req, resp):
"""Implement GET HTTP request."""
METRICS.incr("health.lbheartbeat.count")
METRICS.incr("collector.health.lbheartbeat.count")
resp.content_type = "application/json; charset=utf-8"
resp.status = falcon.HTTP_200

Expand Down Expand Up @@ -94,7 +94,7 @@ def __init__(self, app):

def on_get(self, req, resp):
"""Implement GET HTTP request."""
METRICS.incr("health.heartbeat.count")
METRICS.incr("collector.health.heartbeat.count")
state = HealthState()

# So we're going to think of Antenna like a big object graph and
Expand All @@ -107,7 +107,7 @@ def on_get(self, req, resp):

# Go through and call gauge for each statsd item.
for key, value in state.statsd.items():
METRICS.gauge(f"health.{key}", value=value)
METRICS.gauge(f"collector.health.{key}", value=value)

if state.is_healthy():
resp.status = falcon.HTTP_200
Expand Down
2 changes: 1 addition & 1 deletion antenna/libmarkus.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
_IS_MARKUS_SETUP = False

LOGGER = logging.getLogger(__name__)
METRICS = markus.get_metrics()
METRICS = markus.get_metrics("socorro")


def setup_metrics(statsd_host, statsd_port, hostname, debug=False):
Expand Down
77 changes: 67 additions & 10 deletions tests/unittest/test_breakpad_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,34 @@
from testlib.mini_poster import compress, multipart_encode


class AnyTagValue:
Copy link
Contributor Author

@willkg willkg May 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the other PR, I did a ridiculous thing to solve the same problem this solves because I thought I couldn't solve it this way on account of how Markus filter_records works. But then I decided to try it out and it works super. I'll fix Markus and then we can remove this and the ridiculous thing I did in that other PR.

The thing this does is two fold:

  1. AnyTagValue("host") == "host:somevalue"
  2. AnyTagValue("host") will sort correctly with other strings; for example these will both sort such that the "host" tag ends up in the same place:
    foo = ["host:12345", "environment:stage"]
    foo = [AnyTagValue("host"), "environment:stage"]
    

"""Matches a markus metrics tag with any value"""

def __init__(self, key):
self.key = key

def __repr__(self):
return f"<AnyTagValue {self.key}>"

def get_other_key(self, other):
# This is comparing against a tag string
if ":" in other:
other_key, _ = other.split(":")
else:
other_key = other
return other_key

def __eq__(self, other):
if isinstance(other, AnyTagValue):
return self.key == other.key
return self.key == self.get_other_key(other)

def __lt__(self, other):
if isinstance(other, AnyTagValue):
return self.key < other.key
return self.key < self.get_other_key(other)


class FakeCrashMover:
"""Fake crash mover that raises an error when used"""

Expand Down Expand Up @@ -346,13 +374,14 @@ def test_extract_payload_invalid_json_not_dict(self, request_generator):
with pytest.raises(MalformedCrashReport, match="invalid_json_value"):
bsp.extract_payload(req)

def text_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
# If there's a JSON blob and also kv pairs, then that's a malformed
# crash
def test_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test never ran because it started with text. This tests what happens if a crash report includes annotations in multipart format and annotations in the extra field in a JSON-encoded value.

Once I fixed the test method name, it ran and failed because it didn't account for the change in how we handle this situation. I updated this test.

# If there's a JSON blob and also kv pairs, use the annotations from "extra" and
# log a note
data, headers = multipart_encode(
{
"extra": '{"ProductName":"Firefox","Version":"1.0"}',
"BadKey": "BadValue",
# This annotation is dropped because it's not in "extra"
"IgnoredAnnotation": "someval",
"upload_file_minidump": ("fakecrash.dump", io.BytesIO(b"abcd1234")),
}
)
Expand All @@ -363,10 +392,20 @@ def text_extract_payload_kvpairs_and_json(self, request_generator, metricsmock):
bsp = BreakpadSubmitterResource(
config=EMPTY_CONFIG, crashmover=FakeCrashMover()
)
with metricsmock as metrics:
result = bsp.extract_payload(req)
assert result == ({}, {})
assert metrics.has_record(stat="malformed", tags=["reason:has_json_and_kv"])
crash_report = CrashReport(
annotations={
"ProductName": "Firefox",
"Version": "1.0",
},
dumps={"upload_file_minidump": b"abcd1234"},
notes=[
"includes annotations in both json-encoded extra and formdata parts"
],
payload="json",
payload_compressed="0",
payload_size=542,
)
assert bsp.extract_payload(req) == crash_report


@pytest.mark.parametrize(
Expand Down Expand Up @@ -405,7 +444,7 @@ def test_get_throttle_result(client):


class TestBreakpadSubmitterResourceIntegration:
def test_submit_crash_report(self, client):
def test_submit_crash_report(self, client, metricsmock):
data, headers = multipart_encode(
{
"ProductName": "Firefox",
Expand All @@ -415,7 +454,9 @@ def test_submit_crash_report(self, client):
}
)

result = client.simulate_post("/submit", headers=headers, body=data)
with metricsmock as mm:
result = client.simulate_post("/submit", headers=headers, body=data)

assert result.status_code == 200
assert result.headers["Content-Type"].startswith("text/plain")
assert result.content.startswith(b"CrashID=bp")
Expand Down Expand Up @@ -446,6 +487,22 @@ def test_submit_crash_report(self, client):
"version": 2,
}

mm.assert_histogram("socorro.collector.breakpad_resource.crash_size", value=632)
Copy link
Contributor Author

@willkg willkg May 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

None of the tests assert anything about metrics which is bad. I added this section here to this test to assert what metrics should get emitted.

mm.assert_incr("socorro.collector.breakpad_resource.incoming_crash")
mm.assert_incr(
"socorro.collector.breakpad_resource.throttle_rule",
tags=["rule:is_nightly", AnyTagValue("host")],
)
mm.assert_incr(
"socorro.collector.breakpad_resource.throttle",
tags=["result:accept", AnyTagValue("host")],
)
mm.assert_timing("socorro.collector.crashmover.crash_save.time")
mm.assert_timing("socorro.collector.crashmover.crash_publish.time")
mm.assert_incr("socorro.collector.crashmover.save_crash.count")
mm.assert_timing("socorro.collector.crashmover.crash_handling.time")
mm.assert_timing("socorro.collector.breakpad_resource.on_post.time")

def test_existing_uuid(self, client):
"""Verify if the crash report has a uuid already, it's reused."""
crash_id = "de1bb258-cbbf-4589-a673-34f800160918"
Expand Down
2 changes: 1 addition & 1 deletion tests/unittest/test_sentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,4 +146,4 @@ def test_count_sentry_scrub_error():
with MetricsMock() as metricsmock:
metricsmock.clear_records()
count_sentry_scrub_error("foo")
metricsmock.assert_incr("app.sentry_scrub_error", value=1)
metricsmock.assert_incr("socorro.collector.sentry_scrub_error", value=1)