From 33b2499d7f615f40fcea098933dda4ca47d30c4b Mon Sep 17 00:00:00 2001 From: Munir Abdinur Date: Wed, 8 Jan 2025 12:36:11 -0500 Subject: [PATCH 01/19] chore(telemetry): make logs less noisy and more clear (#11853) Partially Resolves: https://github.com/DataDog/dd-trace-py/issues/10842 - Removes exception traceback from telemetry client logs. We should not generate a traceback every time we fail to send telemetry payloads to the agent. This traceback is noisy and not actionable. - Updates telemetry client log message to clearly state that instrumentation telemetry failed to send and not user telemetry (e.g. traces, logs, metrics). ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/internal/telemetry/writer.py | 14 ++++++++++---- tests/telemetry/test_writer.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/ddtrace/internal/telemetry/writer.py b/ddtrace/internal/telemetry/writer.py index 71de6b03907..2be240c06fd 100644 --- a/ddtrace/internal/telemetry/writer.py +++ b/ddtrace/internal/telemetry/writer.py @@ -118,11 +118,17 @@ def send_event(self, request: Dict) -> Optional[httplib.HTTPResponse]: conn.request("POST", self._endpoint, rb_json, headers) resp = get_connection_response(conn) if resp.status < 300: - log.debug("sent %d in %.5fs to %s. response: %s", len(rb_json), sw.elapsed(), self.url, resp.status) + log.debug( + "Instrumentation Telemetry sent %d in %.5fs to %s. response: %s", + len(rb_json), + sw.elapsed(), + self.url, + resp.status, + ) else: - log.debug("failed to send telemetry to %s. response: %s", self.url, resp.status) - except Exception as e: + log.debug("Failed to send Instrumentation Telemetry to %s. response: %s", self.url, resp.status) + except Exception as e: + log.debug("Failed to send Instrumentation Telemetry to %s. 
Error: %s", self.url, str(e)) finally: if conn is not None: conn.close() diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py index bcc3be9e38c..3b5ec7226af 100644 --- a/tests/telemetry/test_writer.py +++ b/tests/telemetry/test_writer.py @@ -638,7 +638,7 @@ def test_send_failing_request(mock_status, telemetry_writer): telemetry_writer.periodic(force_flush=True) # asserts unsuccessful status code was logged log.debug.assert_called_with( - "failed to send telemetry to %s. response: %s", + "Failed to send Instrumentation Telemetry to %s. response: %s", telemetry_writer._client.url, mock_status, ) From 86a367a4112eddce00591169056046136af3785e Mon Sep 17 00:00:00 2001 From: Munir Abdinur Date: Wed, 8 Jan 2025 12:46:26 -0500 Subject: [PATCH 02/19] chore: skip failing urlib3 ASM test (#11861) - There seems to be a bug in the ASM Standalone logic where the `_dd.p.appsec` tag is not being set on `urllib3` spans. Since this tag is not set x-datadog distributed tracing headers are not generated. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- tests/contrib/urllib3/test_urllib3.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/contrib/urllib3/test_urllib3.py b/tests/contrib/urllib3/test_urllib3.py index 2f0c447ee65..841e2c826ab 100644 --- a/tests/contrib/urllib3/test_urllib3.py +++ b/tests/contrib/urllib3/test_urllib3.py @@ -12,6 +12,7 @@ from ddtrace.ext import http from ddtrace.internal.schema import DEFAULT_SPAN_SERVICE_NAME from ddtrace.pin import Pin +from ddtrace.settings.asm import config as asm_config from tests.contrib.config import HTTPBIN_CONFIG from tests.opentracer.utils import init_tracer from tests.utils import TracerTestCase @@ -527,12 +528,16 @@ def test_distributed_tracing_disabled(self): timeout=mock.ANY, ) + @pytest.mark.skip(reason="urlib3 does not set the ASM Manual keep tag so x-datadog headers are not propagated") def test_distributed_tracing_apm_opt_out_true(self): """Tests distributed tracing headers are passed by default""" # Check that distributed tracing headers are passed down; raise an error rather than make the # request since we don't care about the response at all 
config.urllib3["distributed_tracing"] = True self.tracer.enabled = False + # Ensure the ASM SpanProcessor is set + self.tracer.configure(appsec_standalone_enabled=True, appsec_enabled=True) + assert asm_config._apm_opt_out with mock.patch( "urllib3.connectionpool.HTTPConnectionPool._make_request", side_effect=ValueError ) as m_make_request: @@ -580,6 +585,9 @@ def test_distributed_tracing_apm_opt_out_false(self): """Test with distributed tracing disabled does not propagate the headers""" config.urllib3["distributed_tracing"] = True self.tracer.enabled = False + # Ensure the ASM SpanProcessor is set. + self.tracer.configure(appsec_standalone_enabled=False, appsec_enabled=True) + assert not asm_config._apm_opt_out with mock.patch( "urllib3.connectionpool.HTTPConnectionPool._make_request", side_effect=ValueError ) as m_make_request: From 6c61f40f63afdda1bdb2fff72e9b98f8d4f35956 Mon Sep 17 00:00:00 2001 From: Brett Langdon Date: Wed, 8 Jan 2025 15:07:23 -0500 Subject: [PATCH 03/19] ci: fix flaky es search query (#11877) --- .../elasticsearch/test_elasticsearch.py | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tests/contrib/elasticsearch/test_elasticsearch.py b/tests/contrib/elasticsearch/test_elasticsearch.py index b80b4486e71..4a480c550c8 100644 --- a/tests/contrib/elasticsearch/test_elasticsearch.py +++ b/tests/contrib/elasticsearch/test_elasticsearch.py @@ -1,6 +1,7 @@ import datetime from http.client import HTTPConnection from importlib import import_module +import json import time import pytest @@ -167,7 +168,12 @@ def test_elasticsearch(self): es.index(id=10, body={"name": "ten", "created": datetime.date(2016, 1, 1)}, **args) es.index(id=11, body={"name": "eleven", "created": datetime.date(2016, 2, 1)}, **args) es.index(id=12, body={"name": "twelve", "created": datetime.date(2016, 3, 1)}, **args) - result = es.search(sort=["name:desc"], size=100, body={"query": {"match_all": {}}}, **args) + result = es.search( + sort={"name": {"order": "desc", "unmapped_type": "keyword"}}, + size=100, + body={"query": {"match_all": {}}}, + **args, + ) assert len(result["hits"]["hits"]) == 3, result spans = self.get_spans() @@ -183,13 +189,25 @@ def test_elasticsearch(self): assert url.endswith("/_search") assert url == span.get_tag("elasticsearch.url") if elasticsearch.__version__ >= (8, 0, 0): - assert span.get_tag("elasticsearch.body").replace(" ", "") == '{"query":{"match_all":{}},"size":100}' - assert set(span.get_tag("elasticsearch.params").split("&")) == {"sort=name%3Adesc"} - assert set(span.get_tag(http.QUERY_STRING).split("&")) == {"sort=name%3Adesc"} + # Key order is not consistent, parse into dict to compare + body = json.loads(span.get_tag("elasticsearch.body")) + assert body == { + "query": {"match_all": {}}, + "sort": {"name": {"order": "desc", "unmapped_type": "keyword"}}, + "size": 100, + } + assert not span.get_tag("elasticsearch.params") + assert not span.get_tag(http.QUERY_STRING) else: assert span.get_tag("elasticsearch.body").replace(" ", "") == '{"query":{"match_all":{}}}' - assert set(span.get_tag("elasticsearch.params").split("&")) == {"sort=name%3Adesc", "size=100"} - assert set(span.get_tag(http.QUERY_STRING).split("&")) == {"sort=name%3Adesc", "size=100"} + assert set(span.get_tag("elasticsearch.params").split("&")) == { + "sort=%7B%27name%27%3A+%7B%27order%27%3A+%27desc%27%2C+%27unmapped_type%27%3A+%27keyword%27%7D%7D", + "size=100", + } + assert set(span.get_tag(http.QUERY_STRING).split("&")) == { + 
"sort=%7B%27name%27%3A+%7B%27order%27%3A+%27desc%27%2C+%27unmapped_type%27%3A+%27keyword%27%7D%7D", + "size=100", + } assert span.get_tag("component") == "elasticsearch" assert span.get_tag("span.kind") == "client" From 75bed24c32dd865e83353455d1c61cb5f6265f56 Mon Sep 17 00:00:00 2001 From: wantsui Date: Wed, 8 Jan 2025 16:45:49 -0500 Subject: [PATCH 04/19] chore: clarify usage and wording for current_root_span (#11764) Following up on https://github.com/DataDog/dd-trace-py/issues/9758, our current definition of current_root_span is a bit misleading. This attempts to clarify the definition since in a distributed trace, you can't actually use this to grab the root span. (The only thing it consistently returns is "local root"). ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/_trace/tracer.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/ddtrace/_trace/tracer.py b/ddtrace/_trace/tracer.py index f815e0f184e..02d5fed7626 100644 --- a/ddtrace/_trace/tracer.py +++ b/ddtrace/_trace/tracer.py @@ -940,18 +940,23 @@ def trace( ) def current_root_span(self) -> Optional[Span]: - """Returns the root span of the current execution. + """Returns the local root span of the current execution/process. - This is useful for attaching information related to the trace as a - whole without needing to add to child spans. + Note: This cannot be used to access the true root span of the trace + in a distributed tracing setup if the actual root span occurred in + another execution/process. + + This is useful for attaching information to the local root span + of the current execution/process, which is often also service + entry span. For example:: - # get the root span - root_span = tracer.current_root_span() + # get the local root span + local_root_span = tracer.current_root_span() # set the host just once on the root span - if root_span: - root_span.set_tag('host', '127.0.0.1') + if local_root_span: + local_root_span.set_tag('host', '127.0.0.1') """ span = self.current_span() if span is None: From 41fce6da11ae17ae8ef50ef337bb1586f851f7c4 Mon Sep 17 00:00:00 2001 From: "Gabriele N. 
Tornetta" Date: Thu, 9 Jan 2025 10:17:52 +0000 Subject: [PATCH 05/19] ci: fix CircleCI dynamic job selection (#11873) We fix a residual issue in #11668 whereby no suites where actually dynamically selected for CircleCI. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- scripts/gen_circleci_config.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/gen_circleci_config.py b/scripts/gen_circleci_config.py index bc51f2c5519..3a68a1a7975 100644 --- a/scripts/gen_circleci_config.py +++ b/scripts/gen_circleci_config.py @@ -17,10 +17,13 @@ def gen_required_suites(template: dict) -> None: required_suites = template["requires_tests"]["requires"] = [] for_each_testrun_needed( suites=sorted( - set(n.rpartition("::")[-1] for n, s in get_suites().items() if not s.get("skip", False)) - & set(template["jobs"].keys()) + set( + n + for n, s in get_suites().items() + if not s.get("skip", False) and n.rpartition("::")[-1] in template["jobs"] + ) ), - action=lambda suite: required_suites.append(suite), + action=lambda suite: required_suites.append(suite.rpartition("::")[-1]), git_selections=extract_git_commit_selections(os.getenv("GIT_COMMIT_DESC", "")), ) From 5581f73514ad93195158817d74f90550fba2acb3 Mon Sep 17 00:00:00 2001 From: Christophe Papazian <114495376+christophe-papazian@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:29:01 +0100 Subject: [PATCH 06/19] feat(asm): cmdi patch refactor + rasp support (#11870) - Add suport for CMDI for Exploit Prevention on `subprocess` module and `system.spawn*` functions - Increase support for SHI for Exploit Prevention on `subprocess` module (with `shell=True`) - Refactor patching process for subprocess and os related to those functions. Use only one patching process for both asm attacks and IAST. 
- Update Codeowners file - Add Exploit Prevention unit tests for CMDI on all supported frameworks - Add rule_variant tags for exploit prevention telemetry APPSEC-56275 ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- .github/CODEOWNERS | 1 + ddtrace/appsec/_capabilities.py | 3 +- ddtrace/appsec/_common_module_patches.py | 119 +++++++++++------- ddtrace/appsec/_constants.py | 6 +- ddtrace/appsec/_iast/_pytest_plugin.py | 2 + .../_iast/taint_sinks/command_injection.py | 51 ++------ ddtrace/appsec/_metrics.py | 15 ++- ddtrace/appsec/_processor.py | 4 + ddtrace/contrib/internal/subprocess/patch.py | 109 ++++++++++------ .../notes/rasp_cmdi-3c7819ee9e33e447.yaml | 6 + tests/appsec/appsec/rules-rasp-blocking.json | 49 ++++++++ tests/appsec/appsec/rules-rasp-disabled.json | 49 ++++++++ .../appsec/appsec/rules-rasp-redirecting.json | 49 ++++++++ tests/appsec/appsec/rules-rasp.json | 48 +++++++ .../appsec/appsec/test_remoteconfiguration.py | 4 +- .../appsec/contrib_appsec/django_app/urls.py | 25 +++- .../appsec/contrib_appsec/fastapi_app/app.py | 25 +++- tests/appsec/contrib_appsec/flask_app/app.py | 25 +++- tests/appsec/contrib_appsec/utils.py | 30 ++++- .../taint_sinks/test_command_injection.py | 10 +- 20 files changed, 484 insertions(+), 146 deletions(-) create mode 100644 releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d281fe80148..23a48b6f344 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -110,6 +110,7 @@ benchmarks/base/aspects_benchmarks_generate.py @DataDog/asm-python ddtrace/appsec/ @DataDog/asm-python ddtrace/settings/asm.py @DataDog/asm-python ddtrace/contrib/subprocess/ @DataDog/asm-python +ddtrace/contrib/internal/subprocess/ @DataDog/asm-python ddtrace/contrib/flask_login/ @DataDog/asm-python ddtrace/contrib/webbrowser @DataDog/asm-python ddtrace/contrib/urllib @DataDog/asm-python diff --git a/ddtrace/appsec/_capabilities.py b/ddtrace/appsec/_capabilities.py index c173f2d6471..c999b61cb97 100644 --- a/ddtrace/appsec/_capabilities.py +++ b/ddtrace/appsec/_capabilities.py @@ -31,6 +31,7 @@ class Flags(enum.IntFlag): ASM_SESSION_FINGERPRINT = 1 << 33 ASM_NETWORK_FINGERPRINT = 1 << 
34 ASM_HEADER_FINGERPRINT = 1 << 35 + ASM_RASP_CMDI = 1 << 37 _ALL_ASM_BLOCKING = ( @@ -49,7 +50,7 @@ class Flags(enum.IntFlag): | Flags.ASM_HEADER_FINGERPRINT ) -_ALL_RASP = Flags.ASM_RASP_SQLI | Flags.ASM_RASP_LFI | Flags.ASM_RASP_SSRF | Flags.ASM_RASP_SHI +_ALL_RASP = Flags.ASM_RASP_SQLI | Flags.ASM_RASP_LFI | Flags.ASM_RASP_SSRF | Flags.ASM_RASP_SHI | Flags.ASM_RASP_CMDI _FEATURE_REQUIRED = Flags.ASM_ACTIVATION | Flags.ASM_AUTO_USER diff --git a/ddtrace/appsec/_common_module_patches.py b/ddtrace/appsec/_common_module_patches.py index 215d8b05ee6..0b455dbba6b 100644 --- a/ddtrace/appsec/_common_module_patches.py +++ b/ddtrace/appsec/_common_module_patches.py @@ -7,16 +7,20 @@ from typing import Callable from typing import Dict from typing import Iterable +from typing import List +from typing import Union from wrapt import FunctionWrapper from wrapt import resolve_path import ddtrace from ddtrace.appsec._asm_request_context import get_blocked +from ddtrace.appsec._constants import EXPLOIT_PREVENTION from ddtrace.appsec._constants import WAF_ACTIONS from ddtrace.appsec._iast._iast_request_context import is_iast_request_enabled from ddtrace.appsec._iast._metrics import _set_metric_iast_instrumented_sink from ddtrace.appsec._iast.constants import VULN_PATH_TRAVERSAL +import ddtrace.contrib.internal.subprocess.patch as subprocess_patch from ddtrace.internal import core from ddtrace.internal._exceptions import BlockingException from ddtrace.internal._unpatched import _gc as gc @@ -30,6 +34,9 @@ _is_patched = False +_RASP_SYSTEM = "rasp_os.system" +_RASP_POPEN = "rasp_Popen" + def patch_common_modules(): global _is_patched @@ -39,7 +46,10 @@ def patch_common_modules(): try_wrap_function_wrapper("urllib.request", "OpenerDirector.open", wrapped_open_ED4CF71136E15EBF) try_wrap_function_wrapper("_io", "BytesIO.read", wrapped_read_F3E51D71B4EC16EF) try_wrap_function_wrapper("_io", "StringIO.read", wrapped_read_F3E51D71B4EC16EF) - try_wrap_function_wrapper("os", "system", wrapped_system_5542593D237084A7) + # ensure that the subprocess patch is applied even after one click activation + subprocess_patch.patch() + subprocess_patch.add_str_callback(_RASP_SYSTEM, wrapped_system_5542593D237084A7) + subprocess_patch.add_lst_callback(_RASP_POPEN, popen_FD233052260D8B4D) core.on("asm.block.dbapi.execute", execute_4C9BAC8E228EB347) if asm_config._iast_enabled: _set_metric_iast_instrumented_sink(VULN_PATH_TRAVERSAL) @@ -54,6 +64,8 @@ def unpatch_common_modules(): try_unwrap("urllib.request", "OpenerDirector.open") try_unwrap("_io", "BytesIO.read") try_unwrap("_io", "StringIO.read") + subprocess_patch.del_str_callback(_RASP_SYSTEM) + subprocess_patch.del_lst_callback(_RASP_POPEN) _is_patched = False @@ -106,7 +118,6 @@ def wrapped_open_CFDDB7ABBA9081B6(original_open_callable, instance, args, kwargs try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -124,7 +135,9 @@ def wrapped_open_CFDDB7ABBA9081B6(original_open_callable, instance, args, kwargs rule_type=EXPLOIT_PREVENTION.TYPE.LFI, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "lfi", filename) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.LFI, filename + ) try: return original_open_callable(*args, **kwargs) 
except Exception as e: @@ -151,7 +164,6 @@ def wrapped_open_ED4CF71136E15EBF(original_open_callable, instance, args, kwargs try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -168,7 +180,9 @@ def wrapped_open_ED4CF71136E15EBF(original_open_callable, instance, args, kwargs rule_type=EXPLOIT_PREVENTION.TYPE.SSRF, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "ssrf", url) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SSRF, url + ) return original_open_callable(*args, **kwargs) @@ -191,7 +205,6 @@ def wrapped_request_D8CB81E472AF98A2(original_request_callable, instance, args, try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -206,50 +219,67 @@ def wrapped_request_D8CB81E472AF98A2(original_request_callable, instance, args, rule_type=EXPLOIT_PREVENTION.TYPE.SSRF, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "ssrf", url) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SSRF, url + ) return original_request_callable(*args, **kwargs) -def wrapped_system_5542593D237084A7(original_command_callable, instance, args, kwargs): +def wrapped_system_5542593D237084A7(command: str) -> None: """ wrapper for os.system function """ - command = args[0] if args else kwargs.get("command", None) - if command is not None: - if asm_config._iast_enabled and is_iast_request_enabled(): - from ddtrace.appsec._iast.taint_sinks.command_injection import _iast_report_cmdi - - _iast_report_cmdi(command) - - if ( - asm_config._asm_enabled - and asm_config._ep_enabled - and ddtrace.tracer._appsec_processor is not None - and ddtrace.tracer._appsec_processor.rasp_cmdi_enabled - ): - try: - from ddtrace.appsec._asm_request_context import call_waf_callback - from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION - except ImportError: - return original_command_callable(*args, **kwargs) - - if in_asm_context(): - res = call_waf_callback( - {EXPLOIT_PREVENTION.ADDRESS.CMDI: command}, - crop_trace="wrapped_system_5542593D237084A7", - rule_type=EXPLOIT_PREVENTION.TYPE.CMDI, + if ( + asm_config._asm_enabled + and asm_config._ep_enabled + and ddtrace.tracer._appsec_processor is not None + and ddtrace.tracer._appsec_processor.rasp_shi_enabled + ): + try: + from ddtrace.appsec._asm_request_context import call_waf_callback + from ddtrace.appsec._asm_request_context import in_asm_context + except ImportError: + return + + if in_asm_context(): + res = call_waf_callback( + {EXPLOIT_PREVENTION.ADDRESS.SHI: command}, + crop_trace="wrapped_system_5542593D237084A7", + rule_type=EXPLOIT_PREVENTION.TYPE.SHI, + ) + if res and _must_block(res.actions): + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SHI, command + ) + + +def popen_FD233052260D8B4D(arg_list: Union[List[str], str]) -> None: + """ + listener for subprocess.Popen class + """ + if 
( + asm_config._asm_enabled + and asm_config._ep_enabled + and ddtrace.tracer._appsec_processor is not None + and ddtrace.tracer._appsec_processor.rasp_cmdi_enabled + ): + try: + from ddtrace.appsec._asm_request_context import call_waf_callback + from ddtrace.appsec._asm_request_context import in_asm_context + except ImportError: + return + + if in_asm_context(): + res = call_waf_callback( + {EXPLOIT_PREVENTION.ADDRESS.CMDI: arg_list if isinstance(arg_list, list) else [arg_list]}, + crop_trace="popen_FD233052260D8B4D", + rule_type=EXPLOIT_PREVENTION.TYPE.CMDI, + ) + if res and _must_block(res.actions): + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.CMDI, arg_list ) - if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "cmdi", command) - try: - return original_command_callable(*args, **kwargs) - except Exception as e: - previous_frame = e.__traceback__.tb_frame.f_back - raise e.with_traceback( - e.__traceback__.__class__(None, previous_frame, previous_frame.f_lasti, previous_frame.f_lineno) - ) _DB_DIALECTS = { @@ -279,7 +309,6 @@ def execute_4C9BAC8E228EB347(instrument_self, query, args, kwargs) -> None: try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # execute is used during module initialization # and shouldn't be changed at that time @@ -296,7 +325,9 @@ def execute_4C9BAC8E228EB347(instrument_self, query, args, kwargs) -> None: rule_type=EXPLOIT_PREVENTION.TYPE.SQLI, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "sqli", query) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SQLI, query + ) def try_unwrap(module, name): diff --git a/ddtrace/appsec/_constants.py b/ddtrace/appsec/_constants.py index 83cb53e78ff..45a96834cc1 100644 --- a/ddtrace/appsec/_constants.py +++ b/ddtrace/appsec/_constants.py @@ -202,7 +202,8 @@ class WAF_DATA_NAMES(metaclass=Constant_Class): # EPHEMERAL ADDRESSES PROCESSOR_SETTINGS: Literal["waf.context.processor"] = "waf.context.processor" - CMDI_ADDRESS: Literal["server.sys.shell.cmd"] = "server.sys.shell.cmd" + CMDI_ADDRESS: Literal["server.sys.exec.cmd"] = "server.sys.exec.cmd" + SHI_ADDRESS: Literal["server.sys.shell.cmd"] = "server.sys.shell.cmd" LFI_ADDRESS: Literal["server.io.fs.file"] = "server.io.fs.file" SSRF_ADDRESS: Literal["server.io.net.url"] = "server.io.net.url" SQLI_ADDRESS: Literal["server.db.statement"] = "server.db.statement" @@ -328,6 +329,7 @@ class DEFAULT(metaclass=Constant_Class): class EXPLOIT_PREVENTION(metaclass=Constant_Class): + BLOCKING: Literal["exploit_prevention"] = "exploit_prevention" STACK_TRACE_ID: Literal["stack_id"] = "stack_id" EP_ENABLED: Literal["DD_APPSEC_RASP_ENABLED"] = "DD_APPSEC_RASP_ENABLED" STACK_TRACE_ENABLED: Literal["DD_APPSEC_STACK_TRACE_ENABLED"] = "DD_APPSEC_STACK_TRACE_ENABLED" @@ -339,6 +341,7 @@ class EXPLOIT_PREVENTION(metaclass=Constant_Class): class TYPE(metaclass=Constant_Class): CMDI: Literal["command_injection"] = "command_injection" + SHI: Literal["shell_injection"] = "shell_injection" LFI: Literal["lfi"] = "lfi" SSRF: Literal["ssrf"] = "ssrf" SQLI: Literal["sql_injection"] = "sql_injection" @@ -346,6 +349,7 @@ class TYPE(metaclass=Constant_Class): class ADDRESS(metaclass=Constant_Class): CMDI: Literal["CMDI_ADDRESS"] = "CMDI_ADDRESS" 
LFI: Literal["LFI_ADDRESS"] = "LFI_ADDRESS" + SHI: Literal["SHI_ADDRESS"] = "SHI_ADDRESS" SSRF: Literal["SSRF_ADDRESS"] = "SSRF_ADDRESS" SQLI: Literal["SQLI_ADDRESS"] = "SQLI_ADDRESS" SQLI_TYPE: Literal["SQLI_SYSTEM_ADDRESS"] = "SQLI_SYSTEM_ADDRESS" diff --git a/ddtrace/appsec/_iast/_pytest_plugin.py b/ddtrace/appsec/_iast/_pytest_plugin.py index 672acc4a031..82c23c53174 100644 --- a/ddtrace/appsec/_iast/_pytest_plugin.py +++ b/ddtrace/appsec/_iast/_pytest_plugin.py @@ -27,6 +27,8 @@ def ddtrace_iast(request, ddspan): Optionally output the test as failed if vulnerabilities are found. """ yield + if ddspan is None: + return data = ddspan.get_tag(IAST.JSON) if not data: return diff --git a/ddtrace/appsec/_iast/taint_sinks/command_injection.py b/ddtrace/appsec/_iast/taint_sinks/command_injection.py index ee22b294bfc..2607c6c9447 100644 --- a/ddtrace/appsec/_iast/taint_sinks/command_injection.py +++ b/ddtrace/appsec/_iast/taint_sinks/command_injection.py @@ -1,18 +1,15 @@ -import os -import subprocess # nosec from typing import List from typing import Union -from ddtrace.appsec._common_module_patches import try_unwrap from ddtrace.appsec._constants import IAST_SPAN_TAGS from ddtrace.appsec._iast import oce from ddtrace.appsec._iast._iast_request_context import is_iast_request_enabled from ddtrace.appsec._iast._metrics import _set_metric_iast_executed_sink from ddtrace.appsec._iast._metrics import _set_metric_iast_instrumented_sink from ddtrace.appsec._iast._metrics import increment_iast_span_metric -from ddtrace.appsec._iast._patch import try_wrap_function_wrapper from ddtrace.appsec._iast._taint_tracking._taint_objects import is_pyobject_tainted from ddtrace.appsec._iast.constants import VULN_CMDI +import ddtrace.contrib.internal.subprocess.patch as subprocess_patch from ddtrace.internal.logger import get_logger from ddtrace.settings.asm import config as asm_config @@ -26,48 +23,20 @@ def get_version() -> str: return "" -def patch(): - if not asm_config._iast_enabled: - return - - if not getattr(os, "_datadog_cmdi_patch", False): - # all os.spawn* variants eventually use this one: - try_wrap_function_wrapper("os", "_spawnvef", _iast_cmdi_osspawn) - - if not getattr(subprocess, "_datadog_cmdi_patch", False): - try_wrap_function_wrapper("subprocess", "Popen.__init__", _iast_cmdi_subprocess_init) +_IAST_CMDI = "iast_cmdi" - os._datadog_cmdi_patch = True - subprocess._datadog_cmdi_patch = True - _set_metric_iast_instrumented_sink(VULN_CMDI) +def patch(): + if asm_config._iast_enabled: + subprocess_patch.patch() + subprocess_patch.add_str_callback(_IAST_CMDI, _iast_report_cmdi) + subprocess_patch.add_lst_callback(_IAST_CMDI, _iast_report_cmdi) + _set_metric_iast_instrumented_sink(VULN_CMDI) def unpatch() -> None: - try_unwrap("os", "system") - try_unwrap("os", "_spawnvef") - try_unwrap("subprocess", "Popen.__init__") - - os._datadog_cmdi_patch = False # type: ignore[attr-defined] - subprocess._datadog_cmdi_patch = False # type: ignore[attr-defined] - - -def _iast_cmdi_osspawn(wrapped, instance, args, kwargs): - mode, file, func_args, _, _ = args - _iast_report_cmdi(func_args) - - if hasattr(wrapped, "__func__"): - return wrapped.__func__(instance, *args, **kwargs) - return wrapped(*args, **kwargs) - - -def _iast_cmdi_subprocess_init(wrapped, instance, args, kwargs): - cmd_args = args[0] if len(args) else kwargs["args"] - _iast_report_cmdi(cmd_args) - - if hasattr(wrapped, "__func__"): - return wrapped.__func__(instance, *args, **kwargs) - return wrapped(*args, **kwargs) + 
subprocess_patch.del_str_callback(_IAST_CMDI) + subprocess_patch.del_lst_callback(_IAST_CMDI) @oce.register diff --git a/ddtrace/appsec/_metrics.py b/ddtrace/appsec/_metrics.py index f8713dc5ea7..cbe8490d717 100644 --- a/ddtrace/appsec/_metrics.py +++ b/ddtrace/appsec/_metrics.py @@ -1,4 +1,5 @@ from ddtrace.appsec import _asm_request_context +from ddtrace.appsec import _constants from ddtrace.appsec._ddwaf import version as _version from ddtrace.appsec._deduplications import deduplication from ddtrace.internal import telemetry @@ -64,6 +65,15 @@ def _set_waf_init_metric(info): log.warning("Error reporting ASM WAF init metrics", exc_info=True) +_TYPES_AND_TAGS = { + _constants.EXPLOIT_PREVENTION.TYPE.CMDI: (("rule_type", "command_injection"), ("rule_variant", "exec")), + _constants.EXPLOIT_PREVENTION.TYPE.SHI: (("rule_type", "command_injection"), ("rule_variant", "shell")), + _constants.EXPLOIT_PREVENTION.TYPE.LFI: (("rule_type", "lfi"),), + _constants.EXPLOIT_PREVENTION.TYPE.SSRF: (("rule_type", "ssrf"),), + _constants.EXPLOIT_PREVENTION.TYPE.SQLI: (("rule_type", "sql_injection"),), +} + + def _set_waf_request_metrics(*args): try: result = _asm_request_context.get_waf_telemetry_results() @@ -94,10 +104,7 @@ def _set_waf_request_metrics(*args): TELEMETRY_NAMESPACE_TAG_APPSEC, n, float(value), - tags=( - ("rule_type", rule_type), - ("waf_version", DDWAF_VERSION), - ), + tags=_TYPES_AND_TAGS.get(rule_type, ()) + (("waf_version", DDWAF_VERSION),), ) except Exception: diff --git a/ddtrace/appsec/_processor.py b/ddtrace/appsec/_processor.py index 06328d1201a..54a9f624afe 100644 --- a/ddtrace/appsec/_processor.py +++ b/ddtrace/appsec/_processor.py @@ -202,6 +202,10 @@ def _update_rules(self, new_rules: Dict[str, Any]) -> bool: def rasp_lfi_enabled(self) -> bool: return WAF_DATA_NAMES.LFI_ADDRESS in self._addresses_to_keep + @property + def rasp_shi_enabled(self) -> bool: + return WAF_DATA_NAMES.SHI_ADDRESS in self._addresses_to_keep + @property def rasp_cmdi_enabled(self) -> bool: return WAF_DATA_NAMES.CMDI_ADDRESS in self._addresses_to_keep diff --git a/ddtrace/contrib/internal/subprocess/patch.py b/ddtrace/contrib/internal/subprocess/patch.py index 7380e72fdaf..76530c195df 100644 --- a/ddtrace/contrib/internal/subprocess/patch.py +++ b/ddtrace/contrib/internal/subprocess/patch.py @@ -4,8 +4,8 @@ import os import re import shlex -import subprocess # nosec from threading import RLock +from typing import Callable # noqa:F401 from typing import Deque # noqa:F401 from typing import Dict # noqa:F401 from typing import List # noqa:F401 @@ -33,45 +33,71 @@ ) -def get_version(): - # type: () -> str +def get_version() -> str: return "" -def patch(): - # type: () -> List[str] - patched = [] # type: List[str] - if not asm_config._asm_enabled: - return patched +_STR_CALLBACKS: Dict[str, Callable[[str], None]] = {} +_LST_CALLBACKS: Dict[str, Callable[[Union[List[str], str]], None]] = {} - import os - if not getattr(os, "_datadog_patch", False): - Pin().onto(os) - trace_utils.wrap(os, "system", _traced_ossystem(os)) - trace_utils.wrap(os, "fork", _traced_fork(os)) +def add_str_callback(name: str, callback: Callable[[str], None]): + _STR_CALLBACKS[name] = callback + + +def del_str_callback(name: str): + _STR_CALLBACKS.pop(name, None) + + +def add_lst_callback(name: str, callback: Callable[[Union[List[str], str]], None]): + _LST_CALLBACKS[name] = callback + + +def del_lst_callback(name: str): + _LST_CALLBACKS.pop(name, None) + - # all os.spawn* variants eventually use this one: - trace_utils.wrap(os, 
"_spawnvef", _traced_osspawn(os)) +def patch() -> List[str]: + if not (asm_config._asm_enabled or asm_config._iast_enabled): + return [] + patched: List[str] = [] + import os # nosec + import subprocess # nosec + + should_patch_system = not trace_utils.iswrapped(os.system) + should_patch_fork = not trace_utils.iswrapped(os.fork) + spawnvef = getattr(os, "_spawnvef", None) + should_patch_spawnvef = spawnvef is not None and not trace_utils.iswrapped(spawnvef) + + if should_patch_system or should_patch_fork or should_patch_spawnvef: + Pin().onto(os) + if should_patch_system: + trace_utils.wrap(os, "system", _traced_ossystem(os)) + if should_patch_fork: + trace_utils.wrap(os, "fork", _traced_fork(os)) + if should_patch_spawnvef: + # all os.spawn* variants eventually use this one: + trace_utils.wrap(os, "_spawnvef", _traced_osspawn(os)) patched.append("os") - if not getattr(subprocess, "_datadog_patch", False): + should_patch_Popen_init = not trace_utils.iswrapped(subprocess.Popen.__init__) + should_patch_Popen_wait = not trace_utils.iswrapped(subprocess.Popen.wait) + if should_patch_Popen_init or should_patch_Popen_wait: Pin().onto(subprocess) # We store the parameters on __init__ in the context and set the tags on wait # (where all the Popen objects eventually arrive, unless killed before it) - trace_utils.wrap(subprocess, "Popen.__init__", _traced_subprocess_init(subprocess)) - trace_utils.wrap(subprocess, "Popen.wait", _traced_subprocess_wait(subprocess)) - - os._datadog_patch = True - subprocess._datadog_patch = True + if should_patch_Popen_init: + trace_utils.wrap(subprocess, "Popen.__init__", _traced_subprocess_init(subprocess)) + if should_patch_Popen_wait: + trace_utils.wrap(subprocess, "Popen.wait", _traced_subprocess_wait(subprocess)) patched.append("subprocess") return patched @dataclass(eq=False) -class SubprocessCmdLineCacheEntry(object): +class SubprocessCmdLineCacheEntry: binary: Optional[str] = None arguments: Optional[List] = None truncated: bool = False @@ -80,10 +106,10 @@ class SubprocessCmdLineCacheEntry(object): as_string: Optional[str] = None -class SubprocessCmdLine(object): +class SubprocessCmdLine: # This catches the computed values into a SubprocessCmdLineCacheEntry object - _CACHE = {} # type: Dict[str, SubprocessCmdLineCacheEntry] - _CACHE_DEQUE = collections.deque() # type: Deque[str] + _CACHE: Dict[str, SubprocessCmdLineCacheEntry] = {} + _CACHE_DEQUE: Deque[str] = collections.deque() _CACHE_MAXSIZE = 32 _CACHE_LOCK = RLock() @@ -138,8 +164,7 @@ def _clear_cache(cls): ] _COMPILED_ENV_VAR_REGEXP = re.compile(r"\b[A-Z_]+=\w+") - def __init__(self, shell_args, shell=False): - # type: (Union[str, List[str]], bool) -> None + def __init__(self, shell_args: Union[str, List[str]], shell: bool = False) -> None: cache_key = str(shell_args) + str(shell) self._cache_entry = SubprocessCmdLine._CACHE.get(cache_key) if self._cache_entry: @@ -250,8 +275,7 @@ def scrub_arguments(self): self.arguments = new_args - def truncate_string(self, str_): - # type: (str) -> str + def truncate_string(self, str_: str) -> str: oversize = len(str_) - self.TRUNCATE_LIMIT if oversize <= 0: @@ -263,9 +287,7 @@ def truncate_string(self, str_): msg = ' "4kB argument truncated by %d characters"' % oversize return str_[0 : -(oversize + len(msg))] + msg - def _as_list_and_string(self): - # type: () -> Tuple[list[str], str] - + def _as_list_and_string(self) -> Tuple[List[str], str]: total_list = self.env_vars + [self.binary] + self.arguments truncated_str = self.truncate_string(shjoin(total_list)) 
truncated_list = shlex.split(truncated_str) @@ -290,8 +312,10 @@ def as_string(self): return str_res -def unpatch(): - # type: () -> None +def unpatch() -> None: + import os # nosec + import subprocess # nosec + trace_utils.unwrap(os, "system") trace_utils.unwrap(os, "_spawnvef") trace_utils.unwrap(subprocess.Popen, "__init__") @@ -299,13 +323,13 @@ def unpatch(): SubprocessCmdLine._clear_cache() - os._datadog_patch = False - subprocess._datadog_patch = False - @trace_utils.with_traced_module def _traced_ossystem(module, pin, wrapped, instance, args, kwargs): try: + if isinstance(args[0], str): + for callback in _STR_CALLBACKS.values(): + callback(args[0]) shellcmd = SubprocessCmdLine(args[0], shell=True) # nosec with pin.tracer.trace(COMMANDS.SPAN_NAME, resource=shellcmd.binary, span_type=SpanTypes.SYSTEM) as span: @@ -342,6 +366,10 @@ def _traced_fork(module, pin, wrapped, instance, args, kwargs): def _traced_osspawn(module, pin, wrapped, instance, args, kwargs): try: mode, file, func_args, _, _ = args + if isinstance(func_args, (list, tuple, str)): + commands = [file] + list(func_args) + for callback in _LST_CALLBACKS.values(): + callback(commands) shellcmd = SubprocessCmdLine(func_args, shell=False) with pin.tracer.trace(COMMANDS.SPAN_NAME, resource=shellcmd.binary, span_type=SpanTypes.SYSTEM) as span: @@ -366,6 +394,13 @@ def _traced_osspawn(module, pin, wrapped, instance, args, kwargs): def _traced_subprocess_init(module, pin, wrapped, instance, args, kwargs): try: cmd_args = args[0] if len(args) else kwargs["args"] + if isinstance(cmd_args, (list, tuple, str)): + if kwargs.get("shell", False): + for callback in _STR_CALLBACKS.values(): + callback(cmd_args) + else: + for callback in _LST_CALLBACKS.values(): + callback(cmd_args) cmd_args_list = shlex.split(cmd_args) if isinstance(cmd_args, str) else cmd_args is_shell = kwargs.get("shell", False) shellcmd = SubprocessCmdLine(cmd_args_list, shell=is_shell) # nosec diff --git a/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml b/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml new file mode 100644 index 00000000000..89744bf9be2 --- /dev/null +++ b/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + ASM: This introduces the support for command injection for Exploit Prevention. With previous support of shell injection with os.system, + this provides automatic instrumentation for subprocess module functions and os.spawn* functions, + ensuring monitoring and blocking for Exploit Prevention on those endpoints. 
diff --git a/tests/appsec/appsec/rules-rasp-blocking.json b/tests/appsec/appsec/rules-rasp-blocking.json index f2f8c4d7955..e5038e4a7c2 100644 --- a/tests/appsec/appsec/rules-rasp-blocking.json +++ b/tests/appsec/appsec/rules-rasp-blocking.json @@ -201,6 +201,55 @@ "stack_trace", "block" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace", + "block" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp-disabled.json b/tests/appsec/appsec/rules-rasp-disabled.json index 4a0943a34fb..ec67b186732 100644 --- a/tests/appsec/appsec/rules-rasp-disabled.json +++ b/tests/appsec/appsec/rules-rasp-disabled.json @@ -201,6 +201,55 @@ "on_match": [ "stack_trace" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "enabled": false, + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp-redirecting.json b/tests/appsec/appsec/rules-rasp-redirecting.json index a7a53db6e3b..6e2080b2dbf 100644 --- a/tests/appsec/appsec/rules-rasp-redirecting.json +++ b/tests/appsec/appsec/rules-rasp-redirecting.json @@ -211,6 +211,55 @@ "stack_trace", "block" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace", + "block" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp.json b/tests/appsec/appsec/rules-rasp.json index c1a6822d261..d73672392af 100644 --- a/tests/appsec/appsec/rules-rasp.json +++ b/tests/appsec/appsec/rules-rasp.json @@ -197,6 +197,54 @@ 
"on_match": [ "stack_trace" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/test_remoteconfiguration.py b/tests/appsec/appsec/test_remoteconfiguration.py index f00167706dc..1d2c47bc190 100644 --- a/tests/appsec/appsec/test_remoteconfiguration.py +++ b/tests/appsec/appsec/test_remoteconfiguration.py @@ -117,7 +117,7 @@ def test_rc_activation_states_off(tracer, appsec_enabled, rc_value, remote_confi @pytest.mark.parametrize( "rc_enabled, appsec_enabled, capability", [ - (True, "true", "D4HkA/w="), # All capabilities except ASM_ACTIVATION + (True, "true", "L4HkA/w="), # All capabilities except ASM_ACTIVATION (False, "true", ""), (True, "false", "gAAAAA=="), (False, "false", ""), @@ -142,7 +142,7 @@ def test_rc_capabilities(rc_enabled, appsec_enabled, capability, tracer): @pytest.mark.parametrize( "env_rules, expected", [ - ({}, "D4HkA/4="), # All capabilities + ({}, "L4HkA/4="), # All capabilities ({"_asm_static_rule_file": DEFAULT.RULES}, "gAAAAg=="), # Only ASM_FEATURES ], ) diff --git a/tests/appsec/contrib_appsec/django_app/urls.py b/tests/appsec/contrib_appsec/django_app/urls.py index 77ad7a7f0a6..aaff69169b5 100644 --- a/tests/appsec/contrib_appsec/django_app/urls.py +++ b/tests/appsec/contrib_appsec/django_app/urls.py @@ -1,5 +1,6 @@ import os import sqlite3 +import subprocess import tempfile import django @@ -129,13 +130,33 @@ def rasp(request, endpoint: str): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return HttpResponse("<\\br>\n".join(res)) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return HttpResponse("<\\br>\n".join(res)) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/fastapi_app/app.py b/tests/appsec/contrib_appsec/fastapi_app/app.py index 
10b7b430543..c5b765c4bbb 100644 --- a/tests/appsec/contrib_appsec/fastapi_app/app.py +++ b/tests/appsec/contrib_appsec/fastapi_app/app.py @@ -1,6 +1,7 @@ import asyncio import os import sqlite3 +import subprocess from typing import Optional from fastapi import FastAPI @@ -178,13 +179,33 @@ async def rasp(endpoint: str, request: Request): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return HTMLResponse("<\\br>\n".join(res)) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return HTMLResponse("<\\br>\n".join(res)) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/flask_app/app.py b/tests/appsec/contrib_appsec/flask_app/app.py index 5270229d3e9..939a7cad678 100644 --- a/tests/appsec/contrib_appsec/flask_app/app.py +++ b/tests/appsec/contrib_appsec/flask_app/app.py @@ -1,5 +1,6 @@ import os import sqlite3 +import subprocess from typing import Optional from flask import Flask @@ -126,13 +127,33 @@ def rasp(endpoint: str): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return "<\\br>\n".join(res) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return "<\\br>\n".join(res) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/utils.py b/tests/appsec/contrib_appsec/utils.py index 315caa49a5d..d3691e2bea3 100644 --- a/tests/appsec/contrib_appsec/utils.py +++ b/tests/appsec/contrib_appsec/utils.py @@ -1308,11 +1308,19 @@ def test_stream_response( + [("sql_injection", "user_id_1=1 OR 
1=1&user_id_2=1 OR 1=1", "rasp-942-100", ("dispatch",))] + [ ( - "command_injection", - "cmd_1=$(cat /etc/passwd 1>%262 ; echo .)&cmd_2=$(uname -a 1>%262 ; echo .)", + "shell_injection", + "cmdsys_1=$(cat /etc/passwd 1>%262 ; echo .)&cmdrun_2=$(uname -a 1>%262 ; echo .)", "rasp-932-100", ("system", "rasp"), ) + ] + + [ + ( + "command_injection", + "cmda_1=/sbin/ping&cmds_2=/usr/bin/ls%20-la", + "rasp-932-110", + ("Popen", "rasp"), + ) ], ) @pytest.mark.parametrize( @@ -1381,11 +1389,23 @@ def validate_top_function(trace): trace ), f"unknown top function {trace['frames'][0]} {[t['function'] for t in trace['frames'][:4]]}" # assert mocked.call_args_list == [] + expected_rule_type = "command_injection" if endpoint == "shell_injection" else endpoint + expected_variant = ( + "exec" if endpoint == "command_injection" else "shell" if endpoint == "shell_injection" else None + ) matches = [t for c, n, t in telemetry_calls if c == "CountMetric" and n == "appsec.rasp.rule.match"] - assert matches == [(("rule_type", endpoint), ("waf_version", DDWAF_VERSION))], matches + if expected_variant: + expected_tags = ( + ("rule_type", expected_rule_type), + ("rule_variant", expected_variant), + ("waf_version", DDWAF_VERSION), + ) + else: + expected_tags = (("rule_type", expected_rule_type), ("waf_version", DDWAF_VERSION)) + assert matches == [expected_tags], matches evals = [t for c, n, t in telemetry_calls if c == "CountMetric" and n == "appsec.rasp.rule.eval"] # there may have been multiple evaluations of other rules too - assert (("rule_type", endpoint), ("waf_version", DDWAF_VERSION)) in evals + assert expected_tags in evals if action_level == 2: assert get_tag("rasp.request.done") is None, get_tag("rasp.request.done") else: @@ -1509,7 +1529,7 @@ def test_fingerprinting(self, interface, root_span, get_tag, asm_enabled, user_a def test_iast(self, interface, root_span, get_tag): from ddtrace.ext import http - url = "/rasp/command_injection/?cmd=." + url = "/rasp/command_injection/?cmds=." 
self.update_tracer(interface) response = interface.client.get(url) assert self.status(response) == 200 diff --git a/tests/appsec/iast/taint_sinks/test_command_injection.py b/tests/appsec/iast/taint_sinks/test_command_injection.py index b716f594e85..ab611c1969b 100644 --- a/tests/appsec/iast/taint_sinks/test_command_injection.py +++ b/tests/appsec/iast/taint_sinks/test_command_injection.py @@ -123,7 +123,7 @@ def test_popen_wait_shell_true(iast_context_defaults): _assert_vulnerability("test_popen_wait_shell_true", source_name=source_name) -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") @pytest.mark.parametrize( "function,mode,arguments,tag", [ @@ -156,11 +156,11 @@ def test_osspawn_variants(iast_context_defaults, function, mode, arguments, tag) if "spawnv" in cleaned_name: # label test_osspawn_variants2 - function(mode, copied_args[0], copied_args) + function(mode, copied_args[0], copied_args[1:]) label = "test_osspawn_variants2" else: # label test_osspawn_variants1 - function(mode, copied_args[0], *copied_args) + function(mode, copied_args[0], *copied_args[1:]) label = "test_osspawn_variants1" _assert_vulnerability( @@ -171,7 +171,7 @@ def test_osspawn_variants(iast_context_defaults, function, mode, arguments, tag) ) -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") def test_multiple_cmdi(iast_context_defaults): _BAD_DIR = taint_pyobject( pyobject=_BAD_DIR_DEFAULT, @@ -193,7 +193,7 @@ def test_multiple_cmdi(iast_context_defaults): assert len(list(data["vulnerabilities"])) == 2 -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") def test_string_cmdi(iast_context_defaults): cmd = taint_pyobject( pyobject="dir -l .", From 35fe7b5c876c38e597a782d2c4f34295f0bf3d62 Mon Sep 17 00:00:00 2001 From: Yun Kim <35776586+Yun-Kim@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:35:24 -0500 Subject: [PATCH 07/19] feat(openai): add token usage stream options to request (#11606) This PR adds special casing such that any user's openai streamed chat/completion requests, unless explicitly specified otherwise, will by default include the token usage as part of the streamed response. ### Motivation OpenAI streamed responses have historically not provided token usage details as part of the streamed response. However OpenAI earlier this year added a `stream_options: {"include_usage": True}` kwarg option to the chat/completions API to provide token usage details as part of an additional stream chunk at the end of the streamed response. If this kwarg option was not specified by the user, then token usage is not provided by OpenAI and our current behavior is to give our best effort to 1) use the `tiktoken` library to calculate token counts, or 2) use a very crude heuristic to estimate token counts. Both are not ideal as neither alternative takes into account function/tool calling. **It is simpler and more accurate to just request the token counts from OpenAI directly.** ### Proposed design There are 2 major components for this feature: 1. If a user does not specify `stream_options: {"include_usage": True}` as a kwarg on the chat/completions call, we need to manually insert that as part of the kwargs before the request is made. 2. 
If a user does not specify `stream_options: {"include_usage": True}` as a kwarg on the chat/completions call but we add that option on the integration side, the returned streamed response will include an additional chunk (with empty content) at the end containing token usage information. To avoid disrupting user applications with one more chunk (with different content/fields) than expected, the integration should automatically extract that last chunk under the hood. Note: if a user does explicitly specify `stream_options: {"include_usage": False}`, then we must respect their intent and avoid adding token usage into the kwargs. We'll note in our release note that we cannot guarantee 100% accurate token counts in this case. A rough sketch of both components is included at the end of this description.

### Streamed reading logic change

Additionally, we change the `__iter__()/__aiter__()` methods of our traced streamed responses. Previously we returned the traced streamed response (and relied on the underlying `__next__()/__anext__()` methods), but to ensure spans are finished even if the streamed response is not fully consumed, the `__iter__()/__aiter__()` methods now implement the stream consumption themselves using a try/except/finally. Notes:
1. This only applies when users iterate via `__iter__()/__aiter__()`, since directly calling `__next__()/__anext__()` individually does not tell us when the overall response has been fully consumed.
2. Users who use `__aiter__()` and break early are still responsible for calling `resp.close()`, since asynchronous generators do not close automatically when the context manager is exited (cleanup is deferred until `close()` is called, either manually or by the garbage collector).

### Testing

This PR simplifies the existing OpenAI streamed completion/chat completion tests (using snapshots where possible instead of large numbers of tedious assertions) and adds coverage for the token extraction behavior: existing tests drop the `include_usage: True` option to assert that the automatic extraction works, and a couple of new tests assert the original behavior when `include_usage: False` is explicitly set.
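For illustration, here is a rough sketch of the two design components and the new reading logic described above. This is not the integration's exact code: `record_token_usage` is a hypothetical helper, and the chunk shape (`.choices[0].finish_reason`, `.usage`) assumes OpenAI-style stream chunks.

```python
import sys


def inject_stream_options(kwargs):
    # Component 1: request token usage unless the user already expressed a preference.
    stream_options = kwargs.get("stream_options", {})
    if stream_options.get("include_usage") is not None:
        return False  # user explicitly opted in/out; respect their intent
    stream_options["include_usage"] = True
    kwargs["stream_options"] = stream_options
    return True  # we injected the option, so we must hide the extra chunk


def iter_stream(wrapped, span, injected):
    # Component 2 plus the reading-logic change: drive the stream from a
    # generator so the span always finishes, and swallow the usage chunk.
    it = iter(wrapped)
    try:
        for chunk in it:
            yield chunk
            choice = (getattr(chunk, "choices", None) or [None])[0]
            if injected and getattr(choice, "finish_reason", None):
                # The next chunk carries only token usage; consume it here so
                # the caller never sees one more chunk than they asked for.
                usage_chunk = next(it, None)
                if usage_chunk is not None:
                    record_token_usage(span, usage_chunk.usage)  # hypothetical helper
    except Exception:
        span.set_exc_info(*sys.exc_info())
        raise
    finally:
        span.finish()  # runs even if the caller stops iterating early
```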
## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- .../internal/openai/_endpoint_hooks.py | 8 + ddtrace/contrib/internal/openai/utils.py | 69 ++++- ...d-chunk-auto-extract-4cbaea8870b1df13.yaml | 6 + tests/contrib/openai/test_openai_llmobs.py | 27 +- tests/contrib/openai/test_openai_v1.py | 281 ++++-------------- ...st_openai.test_chat_completion_stream.json | 53 ++++ ...ai.test_openai.test_completion_stream.json | 49 +++ ..._v1.test_completion_stream_est_tokens.json | 49 +++ 8 files changed, 305 insertions(+), 237 deletions(-) create mode 100644 releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml create mode 100644 tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json create mode 100644 tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json create mode 100644 tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json diff --git a/ddtrace/contrib/internal/openai/_endpoint_hooks.py b/ddtrace/contrib/internal/openai/_endpoint_hooks.py index 73a2b2511c9..979e1774a8a 100644 --- a/ddtrace/contrib/internal/openai/_endpoint_hooks.py +++ b/ddtrace/contrib/internal/openai/_endpoint_hooks.py @@ -255,6 +255,14 @@ def _record_request(self, pin, integration, span, args, kwargs): span.set_tag_str("openai.request.messages.%d.content" % idx, integration.trunc(str(content))) span.set_tag_str("openai.request.messages.%d.role" % idx, str(role)) span.set_tag_str("openai.request.messages.%d.name" % idx, str(name)) + if parse_version(OPENAI_VERSION) >= (1, 26) and kwargs.get("stream"): + if kwargs.get("stream_options", {}).get("include_usage", None) is not None: + # Only perform token chunk auto-extraction if this option is not explicitly set + return + span._set_ctx_item("_dd.auto_extract_token_chunk", True) + stream_options = kwargs.get("stream_options", {}) + stream_options["include_usage"] = True + kwargs["stream_options"] = stream_options def _record_response(self, pin, integration, span, args, kwargs, resp, error): resp = super()._record_response(pin, integration, span, args, kwargs, resp, error) diff --git a/ddtrace/contrib/internal/openai/utils.py 
b/ddtrace/contrib/internal/openai/utils.py index d967383e366..f5dfc10efef 100644 --- a/ddtrace/contrib/internal/openai/utils.py +++ b/ddtrace/contrib/internal/openai/utils.py @@ -48,11 +48,28 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.__wrapped__.__exit__(exc_type, exc_val, exc_tb) def __iter__(self): - return self + exception_raised = False + try: + for chunk in self.__wrapped__: + self._extract_token_chunk(chunk) + yield chunk + _loop_handler(self._dd_span, chunk, self._streamed_chunks) + except Exception: + self._dd_span.set_exc_info(*sys.exc_info()) + exception_raised = True + raise + finally: + if not exception_raised: + _process_finished_stream( + self._dd_integration, self._dd_span, self._kwargs, self._streamed_chunks, self._is_completion + ) + self._dd_span.finish() + self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) def __next__(self): try: chunk = self.__wrapped__.__next__() + self._extract_token_chunk(chunk) _loop_handler(self._dd_span, chunk, self._streamed_chunks) return chunk except StopIteration: @@ -68,6 +85,22 @@ def __next__(self): self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) raise + def _extract_token_chunk(self, chunk): + """Attempt to extract the token chunk (last chunk in the stream) from the streamed response.""" + if not self._dd_span._get_ctx_item("_dd.auto_extract_token_chunk"): + return + choice = getattr(chunk, "choices", [None])[0] + if not getattr(choice, "finish_reason", None): + # Only the second-last chunk in the stream with token usage enabled will have finish_reason set + return + try: + # User isn't expecting last token chunk to be present since it's not part of the default streamed response, + # so we consume it and extract the token usage metadata before it reaches the user. 
+ usage_chunk = self.__wrapped__.__next__() + self._streamed_chunks[0].insert(0, usage_chunk) + except (StopIteration, GeneratorExit): + return + class TracedOpenAIAsyncStream(BaseTracedOpenAIStream): async def __aenter__(self): @@ -77,12 +110,29 @@ async def __aenter__(self): async def __aexit__(self, exc_type, exc_val, exc_tb): await self.__wrapped__.__aexit__(exc_type, exc_val, exc_tb) - def __aiter__(self): - return self + async def __aiter__(self): + exception_raised = False + try: + async for chunk in self.__wrapped__: + await self._extract_token_chunk(chunk) + yield chunk + _loop_handler(self._dd_span, chunk, self._streamed_chunks) + except Exception: + self._dd_span.set_exc_info(*sys.exc_info()) + exception_raised = True + raise + finally: + if not exception_raised: + _process_finished_stream( + self._dd_integration, self._dd_span, self._kwargs, self._streamed_chunks, self._is_completion + ) + self._dd_span.finish() + self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) async def __anext__(self): try: chunk = await self.__wrapped__.__anext__() + await self._extract_token_chunk(chunk) _loop_handler(self._dd_span, chunk, self._streamed_chunks) return chunk except StopAsyncIteration: @@ -98,6 +148,19 @@ async def __anext__(self): self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) raise + async def _extract_token_chunk(self, chunk): + """Attempt to extract the token chunk (last chunk in the stream) from the streamed response.""" + if not self._dd_span._get_ctx_item("_dd.auto_extract_token_chunk"): + return + choice = getattr(chunk, "choices", [None])[0] + if not getattr(choice, "finish_reason", None): + return + try: + usage_chunk = await self.__wrapped__.__anext__() + self._streamed_chunks[0].insert(0, usage_chunk) + except (StopAsyncIteration, GeneratorExit): + return + def _compute_token_count(content, model): # type: (Union[str, List[int]], Optional[str]) -> Tuple[bool, int] diff --git a/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml b/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml new file mode 100644 index 00000000000..afaf95876d5 --- /dev/null +++ b/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + openai: Introduces automatic extraction of token usage from streamed chat completions. + Unless ``stream_options: {"include_usage": False}`` is explicitly set on your streamed chat completion request, + the OpenAI integration will add ``stream_options: {"include_usage": True}`` to your request and automatically extract the token usage chunk from the streamed response. diff --git a/tests/contrib/openai/test_openai_llmobs.py b/tests/contrib/openai/test_openai_llmobs.py index a1a2b93a5ca..a145877c8c8 100644 --- a/tests/contrib/openai/test_openai_llmobs.py +++ b/tests/contrib/openai/test_openai_llmobs.py @@ -518,11 +518,17 @@ async def test_chat_completion_azure_async( ) ) - def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer): + @pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" + ) + def test_chat_completion_stream_explicit_no_tokens( + self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer + ): """Ensure llmobs records are emitted for chat completion endpoints when configured. 
Also ensure the llmobs records have the correct tagging including trace/span ID for trace correlation. """ + with get_openai_vcr(subdirectory_name="v1").use_cassette("chat_completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: with mock.patch("ddtrace.contrib.internal.openai.utils._est_tokens") as mock_est: @@ -534,7 +540,11 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs expected_completion = "The Los Angeles Dodgers won the World Series in 2020." client = openai.OpenAI() resp = client.chat.completions.create( - model=model, messages=input_messages, stream=True, user="ddtrace-test" + model=model, + messages=input_messages, + stream=True, + user="ddtrace-test", + stream_options={"include_usage": False}, ) for chunk in resp: resp_model = chunk.model @@ -547,7 +557,7 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs model_provider="openai", input_messages=input_messages, output_messages=[{"content": expected_completion, "role": "assistant"}], - metadata={"stream": True, "user": "ddtrace-test"}, + metadata={"stream": True, "stream_options": {"include_usage": False}, "user": "ddtrace-test"}, token_metrics={"input_tokens": 8, "output_tokens": 8, "total_tokens": 16}, tags={"ml_app": "", "service": "tests.contrib.openai"}, ) @@ -557,20 +567,14 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available in 1.26.0+" ) def test_chat_completion_stream_tokens(self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer): - """ - Ensure llmobs records are emitted for chat completion endpoints when configured - with the `stream_options={"include_usage": True}`. - Also ensure the llmobs records have the correct tagging including trace/span ID for trace correlation. - """ + """Assert that streamed token chunk extraction logic works when options are not explicitly passed from user.""" with get_openai_vcr(subdirectory_name="v1").use_cassette("chat_completion_streamed_tokens.yaml"): model = "gpt-3.5-turbo" resp_model = model input_messages = [{"role": "user", "content": "Who won the world series in 2020?"}] expected_completion = "The Los Angeles Dodgers won the World Series in 2020." 
client = openai.OpenAI() - resp = client.chat.completions.create( - model=model, messages=input_messages, stream=True, stream_options={"include_usage": True} - ) + resp = client.chat.completions.create(model=model, messages=input_messages, stream=True) for chunk in resp: resp_model = chunk.model span = mock_tracer.pop_traces()[0][0] @@ -671,7 +675,6 @@ def test_chat_completion_tool_call_stream(self, openai, ddtrace_global_config, m messages=[{"role": "user", "content": chat_completion_input_description}], user="ddtrace-test", stream=True, - stream_options={"include_usage": True}, ) for chunk in resp: resp_model = chunk.model diff --git a/tests/contrib/openai/test_openai_v1.py b/tests/contrib/openai/test_openai_v1.py index f13de144fc5..91737d9e5eb 100644 --- a/tests/contrib/openai/test_openai_v1.py +++ b/tests/contrib/openai/test_openai_v1.py @@ -921,128 +921,78 @@ def test_span_finish_on_stream_error(openai, openai_vcr, snapshot_tracer): ) -def test_completion_stream(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.snapshot +@pytest.mark.skipif(TIKTOKEN_AVAILABLE, reason="This test estimates token counts") +def test_completion_stream_est_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. The "Tools" is' client = openai.OpenAI() resp = client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) - chunks = [c for c in resp] - - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion + _ = [c for c in resp] - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="This test computes token counts using tiktoken") +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +def test_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): + with openai_vcr.use_cassette("completion_streamed.yaml"): + with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: + mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] + client = openai.OpenAI() + resp = client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) + _ = [c for c in resp] -async def test_completion_async_stream(openai, openai_vcr, mock_metrics, mock_tracer): 
+@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="This test computes token counts using tiktoken") +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +async def test_completion_async_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. The "Tools" is' client = openai.AsyncOpenAI() - resp = await client.completions.create(model="ada", prompt="Hello world", stream=True) - chunks = [c async for c in resp] - - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion - - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls + resp = await client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) + _ = [c async for c in resp] @pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 6, 0), + parse_version(openai_module.version.VERSION) < (1, 6, 0) or not TIKTOKEN_AVAILABLE, reason="Streamed response context managers are only available v1.6.0+", ) -def test_completion_stream_context_manager(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +def test_completion_stream_context_manager(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.OpenAI() with client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) as resp: - chunks = [c for c in resp] + _ = [c for c in resp] - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion - - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" +) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") +def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): + """Assert that streamed token chunk extraction logic works automatically.""" + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): + with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: + mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] + client = openai.OpenAI() + resp = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Who won the world series in 2020?"}], + stream=True, + user="ddtrace-test", + n=None, + ) + _ = [c for c in resp] -def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" +) +def test_chat_completion_stream_explicit_no_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): + """Assert that streamed token chunk extraction logic is avoided if explicitly set to False by the user.""" with openai_vcr.use_cassette("chat_completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] @@ -1054,20 +1004,16 @@ def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_trace {"role": "user", "content": "Who won the world series in 2020?"}, ], stream=True, + stream_options={"include_usage": False}, user="ddtrace-test", n=None, ) - prompt_tokens = 8 span = snapshot_tracer.current_span() chunks = [c for c in resp] assert len(chunks) == 15 completion = "".join([c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None]) assert completion == expected_completion - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") 
== "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - expected_tags = [ "version:", "env:", @@ -1087,16 +1033,19 @@ def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_trace expected_tags += ["openai.estimated:true"] if TIKTOKEN_AVAILABLE: expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls + assert mock.call.distribution("tokens.prompt", 8, tags=expected_tags) in mock_metrics.mock_calls assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available in 1.26.0+" +) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") async def test_chat_completion_async_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed.yaml"): + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." client = openai.AsyncOpenAI() resp = await client.chat.completions.create( model="gpt-3.5-turbo", @@ -1104,99 +1053,21 @@ async def test_chat_completion_async_stream(openai, openai_vcr, mock_metrics, sn {"role": "user", "content": "Who won the world series in 2020?"}, ], stream=True, + n=None, user="ddtrace-test", ) - prompt_tokens = 8 - span = snapshot_tracer.current_span() - chunks = [c async for c in resp] - assert len(chunks) == 15 - completion = "".join([c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None]) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - expected_tags += ["openai.estimated:true"] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - - -@pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available 
in 1.26.0+" -) -def test_chat_completion_stream_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." - client = openai.OpenAI() - resp = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Who won the world series in 2020?"}], - stream=True, - user="ddtrace-test", - n=None, - stream_options={"include_usage": True}, - ) - span = snapshot_tracer.current_span() - chunks = [c for c in resp] - completion = "".join( - [c.choices[0].delta.content for c in chunks if c.choices and c.choices[0].delta.content is not None] - ) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.prompt", 17, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", 19, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", 36, tags=expected_tags) in mock_metrics.mock_calls + _ = [c async for c in resp] @pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 6, 0), - reason="Streamed response context managers are only available v1.6.0+", + parse_version(openai_module.version.VERSION) < (1, 26, 0), + reason="Streamed response context managers are only available v1.6.0+, tokens available 1.26.0+", ) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") async def test_chat_completion_async_stream_context_manager(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed.yaml"): + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." 
client = openai.AsyncOpenAI() async with await client.chat.completions.create( model="gpt-3.5-turbo", @@ -1207,41 +1078,7 @@ async def test_chat_completion_async_stream_context_manager(openai, openai_vcr, user="ddtrace-test", n=None, ) as resp: - prompt_tokens = 8 - span = snapshot_tracer.current_span() - chunks = [c async for c in resp] - assert len(chunks) == 15 - completion = "".join( - [c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None] - ) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - expected_tags += ["openai.estimated:true"] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls + _ = [c async for c in resp] @pytest.mark.snapshot( diff --git a/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json b/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json new file mode 100644 index 00000000000..fe7c9e3b0f2 --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json @@ -0,0 +1,53 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createChatCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "67741fca00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/chat/completions", + "openai.request.messages.0.content": "Who won the world series in 2020?", + "openai.request.messages.0.name": "", + "openai.request.messages.0.role": "user", + "openai.request.method": "POST", + "openai.request.model": "gpt-3.5-turbo", + "openai.request.n": "None", + "openai.request.stream": "True", + "openai.request.user": "ddtrace-test", + "openai.response.choices.0.finish_reason": "stop", + "openai.response.choices.0.message.content": "The Los Angeles Dodgers won the World Series in 2020.", + "openai.response.choices.0.message.role": "assistant", + "openai.response.model": "gpt-3.5-turbo-0301", + "openai.user.api_key": "sk-...key>", + "runtime-id": "d174f65e33314f43ad1de8cf0a5ca4e0" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 
1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 0, + "openai.response.completion_tokens_estimated": 0, + "openai.response.usage.completion_tokens": 19, + "openai.response.usage.prompt_tokens": 17, + "openai.response.usage.total_tokens": 36, + "process_id": 22982 + }, + "duration": 29869000, + "start": 1735663562179157000 + }]] diff --git a/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json b/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json new file mode 100644 index 00000000000..7cf644cfb3d --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json @@ -0,0 +1,49 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6774231f00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/completions", + "openai.request.method": "POST", + "openai.request.model": "ada", + "openai.request.n": "None", + "openai.request.prompt.0": "Hello world", + "openai.request.stream": "True", + "openai.response.choices.0.finish_reason": "length", + "openai.response.choices.0.text": "! ... A page layouts page drawer? ... Interesting. The \"Tools\" is", + "openai.response.model": "ada", + "openai.user.api_key": "sk-...key>", + "runtime-id": "11872c9ca653441db861b108a4f795eb" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 0, + "openai.response.completion_tokens_estimated": 0, + "openai.response.usage.completion_tokens": 2, + "openai.response.usage.prompt_tokens": 2, + "openai.response.usage.total_tokens": 4, + "process_id": 27488 + }, + "duration": 28739000, + "start": 1735664415266386000 + }]] diff --git a/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json b/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json new file mode 100644 index 00000000000..445dc39db98 --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json @@ -0,0 +1,49 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "677c221c00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/completions", + "openai.request.method": "POST", + "openai.request.model": "ada", + "openai.request.n": "None", + "openai.request.prompt.0": "Hello world", + 
"openai.request.stream": "True", + "openai.response.choices.0.finish_reason": "length", + "openai.response.choices.0.text": "! ... A page layouts page drawer? ... Interesting. The \"Tools\" is", + "openai.response.model": "ada", + "openai.user.api_key": "sk-...key>", + "runtime-id": "24f8e851c87e4f758c73d6acd0aaf82b" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 1, + "openai.response.completion_tokens_estimated": 1, + "openai.response.usage.completion_tokens": 16, + "openai.response.usage.prompt_tokens": 2, + "openai.response.usage.total_tokens": 18, + "process_id": 47101 + }, + "duration": 37957000, + "start": 1736188444222291000 + }]] From bfa3b821d6ac147ccc9a9f6e3f4fcfa5632df7c2 Mon Sep 17 00:00:00 2001 From: wantsui Date: Thu, 9 Jan 2025 12:37:18 -0500 Subject: [PATCH 08/19] chore: swap out deprecated datetime's utcnow and utcfromtimestamp (#11850) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/DataDog/dd-trace-py/issues/11608 and https://github.com/DataDog/dd-trace-py/pull/11497. Starting with [Python 3.12](https://docs.python.org/3/whatsnew/3.12.html), there were changes to datetime: > [datetime](https://docs.python.org/3/library/datetime.html#module-datetime): [datetime.datetime](https://docs.python.org/3/library/datetime.html#datetime.datetime)’s [utcnow()](https://docs.python.org/3/library/datetime.html#datetime.datetime.utcnow) and [utcfromtimestamp()](https://docs.python.org/3/library/datetime.html#datetime.datetime.utcfromtimestamp) are deprecated and will be removed in a future version. Instead, use timezone-aware objects to represent datetimes in UTC: respectively, call [now()](https://docs.python.org/3/library/datetime.html#datetime.datetime.now) and [fromtimestamp()](https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp) with the tz parameter set to [datetime.UTC](https://docs.python.org/3/library/datetime.html#datetime.UTC). (Contributed by Paul Ganssle in [gh-103857](https://github.com/python/cpython/issues/103857).) The result is that the usage of **utcnow** and **utcfromtimestamp** now throw deprecation warnings when used, ie: > DeprecationWarning: datetime.datetime.utcfromtimestamp() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.fromtimestamp(timestamp, datetime.UTC). There's a difference of `+00:00` between the old version and the new format. **For utcnow -> now** - `datetime.datetime.utcnow().isoformat()` | `'2025-01-02T19:51:32.579733'` - `datetime.datetime.now(datetime.timezone.utc).isoformat()` | `'2025-01-02T19:51:02.275232+00:00'` **For utcfromtimestamp -> fromtimestamp** Assume that `end_time_ns=1735848645000000000`: - `(datetime.datetime.fromtimestamp(end_time_ns / 1e9, tz=datetime.timezone.utc).replace(microsecond=0).isoformat() + "Z")` - returns `'2025-01-02T20:10:45+00:00Z'` - `(datetime.datetime.utcfromtimestamp(end_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z")` - returns `'2025-01-02T20:10:45Z'` As a result, I attempted remove the trailing ones to be consistent with the old format, but can bring it back. 
## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/internal/debug.py | 4 ++-- ddtrace/profiling/exporter/http.py | 14 ++++++++++++-- tests/appsec/iast_packages/packages/pkg_pyjwt.py | 6 +++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/ddtrace/internal/debug.py b/ddtrace/internal/debug.py index 4d533b604b6..c33ff5ad46d 100644 --- a/ddtrace/internal/debug.py +++ b/ddtrace/internal/debug.py @@ -117,8 +117,8 @@ def collect(tracer): from ddtrace._trace.tracer import log return dict( - # Timestamp UTC ISO 8601 - date=datetime.datetime.utcnow().isoformat(), + # Timestamp UTC ISO 8601 with the trailing +00:00 removed + date=datetime.datetime.now(datetime.timezone.utc).isoformat()[0:-6], # eg. "Linux", "Darwin" os_name=platform.system(), # eg. 
12.5.0 diff --git a/ddtrace/profiling/exporter/http.py b/ddtrace/profiling/exporter/http.py index 6700e584ade..b4ec6994d72 100644 --- a/ddtrace/profiling/exporter/http.py +++ b/ddtrace/profiling/exporter/http.py @@ -220,8 +220,18 @@ def export( "family": "python", "attachments": [item["filename"].decode("utf-8") for item in data], "tags_profiler": self._get_tags(service), - "start": (datetime.datetime.utcfromtimestamp(start_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"), - "end": (datetime.datetime.utcfromtimestamp(end_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"), + "start": ( + datetime.datetime.fromtimestamp(start_time_ns / 1e9, tz=datetime.timezone.utc) + .replace(microsecond=0) + .isoformat()[0:-6] # removes the trailing +00:00 portion of the time + + "Z" + ), + "end": ( + datetime.datetime.fromtimestamp(end_time_ns / 1e9, tz=datetime.timezone.utc) + .replace(microsecond=0) + .isoformat()[0:-6] # removes the trailing +00:00 portion of the time + + "Z" + ), } # type: Dict[str, Any] if self.endpoint_call_counter_span_processor is not None: diff --git a/tests/appsec/iast_packages/packages/pkg_pyjwt.py b/tests/appsec/iast_packages/packages/pkg_pyjwt.py index 4712f6cee0f..ec43d8a17d2 100644 --- a/tests/appsec/iast_packages/packages/pkg_pyjwt.py +++ b/tests/appsec/iast_packages/packages/pkg_pyjwt.py @@ -3,6 +3,7 @@ https://pypi.org/project/PyJWT/ """ + import datetime from flask import Blueprint @@ -25,7 +26,10 @@ def pkg_pyjwt_view(): secret_key = "your-256-bit-secret" user_payload = request.args.get("package_param", "default-user") - payload = {"user": user_payload, "exp": datetime.datetime.utcnow() + datetime.timedelta(seconds=30)} + payload = { + "user": user_payload, + "exp": datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(seconds=30), + } try: # Encode the payload to create a JWT From d67623327d1b4de78563ab98a9feb6229fdc8826 Mon Sep 17 00:00:00 2001 From: kyle Date: Thu, 9 Jan 2025 12:45:54 -0500 Subject: [PATCH 09/19] fix(llmobs): replace trace processor with event listener (#11781) The LLMObs service formerly depended on the TraceProcessor interface in the tracer. This was problematic due to sharing a dependency with the public API. As such, users could configure a trace filter (under the hood is a trace processor) and overwrite the LLMObs TraceProcessor. Instead, the tracer can emit span start and finish events which the LLMObs service listens to and acts on, as proposed here. The gotcha is that the LLMObs service no longer has a way to drop traces when run in agentless mode, which only LLMObs supports. Instead, we encourage users to explicitly turn off APM which carries the benefit of clarity since this was implicit before. 
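In sketch form, the new wiring looks like this. It uses the internal `ddtrace.internal.core` event hub that the diff below relies on (an internal, unstable API), with a dummy listener standing in for the real LLMObs callbacks:

```python
from ddtrace.internal import core

finished = []


def on_span_finish(span):
    # In the real service this is LLMObs._on_span_finish, which generates and
    # enqueues an LLMObs span event when span.span_type == "llm".
    finished.append(span)


# LLMObs side: subscribe once, instead of installing a TraceProcessor that a
# user-configured trace filter could overwrite.
core.on("trace.span_finish", on_span_finish)

# Tracer side: dispatched from _start_span() and _on_span_finish().
core.dispatch("trace.span_finish", ("dummy-span",))
assert finished == ["dummy-span"]
```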
Co-authored-by: Yun Kim --- .riot/requirements/16562eb.txt | 32 + ddtrace/_trace/tracer.py | 5 +- ddtrace/llmobs/_llmobs.py | 161 +- ddtrace/llmobs/_trace_processor.py | 177 --- ddtrace/llmobs/_utils.py | 18 +- ...fix-llmobs-processor-4afd715a84323d32.yaml | 5 + riotfile.py | 2 +- tests/llmobs/conftest.py | 53 +- tests/llmobs/test_llmobs.py | 21 +- ...est_llmobs_ragas_faithfulness_evaluator.py | 34 +- tests/llmobs/test_llmobs_service.py | 1313 +++++++---------- tests/llmobs/test_llmobs_span_agent_writer.py | 3 +- .../test_llmobs_span_agentless_writer.py | 15 +- tests/llmobs/test_llmobs_trace_processor.py | 36 - tests/llmobs/test_propagation.py | 2 - 15 files changed, 798 insertions(+), 1079 deletions(-) create mode 100644 .riot/requirements/16562eb.txt delete mode 100644 ddtrace/llmobs/_trace_processor.py create mode 100644 releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml delete mode 100644 tests/llmobs/test_llmobs_trace_processor.py diff --git a/.riot/requirements/16562eb.txt b/.riot/requirements/16562eb.txt new file mode 100644 index 00000000000..e2aac88c146 --- /dev/null +++ b/.riot/requirements/16562eb.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.7 +# by the following command: +# +# pip-compile --allow-unsafe --config=pyproject.toml --no-annotate --resolver=backtracking .riot/requirements/16562eb.in +# +attrs==24.2.0 +coverage[toml]==7.2.7 +exceptiongroup==1.2.2 +hypothesis==6.45.0 +idna==3.10 +importlib-metadata==6.7.0 +iniconfig==2.0.0 +mock==5.1.0 +multidict==6.0.5 +opentracing==2.4.0 +packaging==24.0 +pluggy==1.2.0 +pytest==7.4.4 +pytest-asyncio==0.21.1 +pytest-cov==4.1.0 +pytest-mock==3.11.1 +pyyaml==6.0.1 +six==1.17.0 +sortedcontainers==2.4.0 +tomli==2.0.1 +typing-extensions==4.7.1 +urllib3==1.26.20 +vcrpy==4.4.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.15.0 diff --git a/ddtrace/_trace/tracer.py b/ddtrace/_trace/tracer.py index 02d5fed7626..af9b09d3e02 100644 --- a/ddtrace/_trace/tracer.py +++ b/ddtrace/_trace/tracer.py @@ -41,6 +41,7 @@ from ddtrace.internal.atexit import register_on_exit_signal from ddtrace.internal.constants import SAMPLING_DECISION_TRACE_TAG_KEY from ddtrace.internal.constants import SPAN_API_DATADOG +from ddtrace.internal.core import dispatch from ddtrace.internal.dogstatsd import get_dogstatsd_client from ddtrace.internal.logger import get_logger from ddtrace.internal.peer_service.processor import PeerServiceProcessor @@ -849,7 +850,7 @@ def _start_span( for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors): p.on_span_start(span) self._hooks.emit(self.__class__.start_span, span) - + dispatch("trace.span_start", (span,)) return span start_span = _start_span @@ -866,6 +867,8 @@ def _on_span_finish(self, span: Span) -> None: for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors): p.on_span_finish(span) + dispatch("trace.span_finish", (span,)) + if log.isEnabledFor(logging.DEBUG): log.debug("finishing span %s (enabled:%s)", span._pprint(), self.enabled) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 49815151118..cd4069b4094 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -3,7 +3,9 @@ import time from typing import Any from typing import Dict +from typing import List from typing import Optional +from typing import Tuple from typing import Union import ddtrace @@ -11,8 +13,12 @@ from ddtrace import config from ddtrace import patch from ddtrace._trace.context import Context +from 
ddtrace.constants import ERROR_MSG +from ddtrace.constants import ERROR_STACK +from ddtrace.constants import ERROR_TYPE from ddtrace.ext import SpanTypes from ddtrace.internal import atexit +from ddtrace.internal import core from ddtrace.internal import forksafe from ddtrace.internal._rand import rand64bits from ddtrace.internal.compat import ensure_text @@ -24,6 +30,7 @@ from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT from ddtrace.internal.utils.formats import asbool from ddtrace.internal.utils.formats import parse_tags_str +from ddtrace.llmobs import _constants as constants from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID from ddtrace.llmobs._constants import INPUT_DOCUMENTS from ddtrace.llmobs._constants import INPUT_MESSAGES @@ -45,11 +52,11 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._evaluators.runner import EvaluatorRunner -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor from ddtrace.llmobs._utils import AnnotationContext from ddtrace.llmobs._utils import _get_llmobs_parent_id from ddtrace.llmobs._utils import _get_ml_app from ddtrace.llmobs._utils import _get_session_id +from ddtrace.llmobs._utils import _get_span_name from ddtrace.llmobs._utils import _inject_llmobs_parent_id from ddtrace.llmobs._utils import safe_json from ddtrace.llmobs._utils import validate_prompt @@ -81,34 +88,157 @@ class LLMObs(Service): def __init__(self, tracer=None): super(LLMObs, self).__init__() self.tracer = tracer or ddtrace.tracer - self._llmobs_span_writer = None - self._llmobs_span_writer = LLMObsSpanWriter( is_agentless=config._llmobs_agentless_enabled, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._llmobs_eval_metric_writer = LLMObsEvalMetricWriter( site=config._dd_site, api_key=config._dd_api_key, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._evaluator_runner = EvaluatorRunner( interval=float(os.getenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 1.0)), llmobs_service=self, ) - self._trace_processor = LLMObsTraceProcessor(self._llmobs_span_writer, self._evaluator_runner) forksafe.register(self._child_after_fork) self._annotations = [] self._annotation_context_lock = forksafe.RLock() - self.tracer.on_start_span(self._do_annotations) - def _do_annotations(self, span): + # Register hooks for span events + core.on("trace.span_start", self._do_annotations) + core.on("trace.span_finish", self._on_span_finish) + + def _on_span_finish(self, span): + if self.enabled and span.span_type == SpanTypes.LLM: + self._submit_llmobs_span(span) + + def _submit_llmobs_span(self, span: Span) -> None: + """Generate and submit an LLMObs span event to be sent to LLMObs.""" + span_event = None + is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" + is_ragas_integration_span = False + try: + span_event, is_ragas_integration_span = self._llmobs_span_event(span) + self._llmobs_span_writer.enqueue(span_event) + except (KeyError, TypeError): + log.error( + "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True + ) + finally: + if not span_event or not is_llm_span or is_ragas_integration_span: + return + if self._evaluator_runner: + self._evaluator_runner.enqueue(span_event, span) + + @classmethod + def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: + """Span 
event object structure.""" + span_kind = span._get_ctx_item(SPAN_KIND) + if not span_kind: + raise KeyError("Span kind not found in span context") + meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} + if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: + meta["model_name"] = span._get_ctx_item(MODEL_NAME) + meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() + meta["metadata"] = span._get_ctx_item(METADATA) or {} + if span._get_ctx_item(INPUT_PARAMETERS): + meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) + if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: + meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) + if span._get_ctx_item(INPUT_VALUE) is not None: + meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) + if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: + meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) + if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: + meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) + if span._get_ctx_item(OUTPUT_VALUE) is not None: + meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) + if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: + meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) + if span._get_ctx_item(INPUT_PROMPT) is not None: + prompt_json_str = span._get_ctx_item(INPUT_PROMPT) + if span_kind != "llm": + log.warning( + "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." + ) + else: + meta["input"]["prompt"] = prompt_json_str + if span.error: + meta.update( + { + ERROR_MSG: span.get_tag(ERROR_MSG), + ERROR_STACK: span.get_tag(ERROR_STACK), + ERROR_TYPE: span.get_tag(ERROR_TYPE), + } + ) + if not meta["input"]: + meta.pop("input") + if not meta["output"]: + meta.pop("output") + metrics = span._get_ctx_item(METRICS) or {} + ml_app = _get_ml_app(span) + + is_ragas_integration_span = False + + if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX): + is_ragas_integration_span = True + + span._set_ctx_item(ML_APP, ml_app) + parent_id = str(_get_llmobs_parent_id(span) or "undefined") + + llmobs_span_event = { + "trace_id": "{:x}".format(span.trace_id), + "span_id": str(span.span_id), + "parent_id": parent_id, + "name": _get_span_name(span), + "start_ns": span.start_ns, + "duration": span.duration_ns, + "status": "error" if span.error else "ok", + "meta": meta, + "metrics": metrics, + } + session_id = _get_session_id(span) + if session_id is not None: + span._set_ctx_item(SESSION_ID, session_id) + llmobs_span_event["session_id"] = session_id + + llmobs_span_event["tags"] = cls._llmobs_tags( + span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span + ) + return llmobs_span_event, is_ragas_integration_span + + @staticmethod + def _llmobs_tags( + span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False + ) -> List[str]: + tags = { + "version": config.version or "", + "env": config.env or "", + "service": span.service or "", + "source": "integration", + "ml_app": ml_app, + "ddtrace.version": ddtrace.__version__, + "language": "python", + "error": span.error, + } + err_type = span.get_tag(ERROR_TYPE) + if err_type: + tags["error_type"] = err_type + if session_id: + tags["session_id"] = session_id + if is_ragas_integration_span: + 
tags[constants.RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" + existing_tags = span._get_ctx_item(TAGS) + if existing_tags is not None: + tags.update(existing_tags) + return ["{}:{}".format(k, v) for k, v in tags.items()] + + def _do_annotations(self, span: Span) -> None: # get the current span context # only do the annotations if it matches the context if span.span_type != SpanTypes.LLM: # do this check to avoid the warning log in `annotate` @@ -120,20 +250,14 @@ def _do_annotations(self, span): if current_context_id == context_id: self.annotate(span, **annotation_kwargs) - def _child_after_fork(self): + def _child_after_fork(self) -> None: self._llmobs_span_writer = self._llmobs_span_writer.recreate() self._llmobs_eval_metric_writer = self._llmobs_eval_metric_writer.recreate() self._evaluator_runner = self._evaluator_runner.recreate() - self._trace_processor._span_writer = self._llmobs_span_writer - self._trace_processor._evaluator_runner = self._evaluator_runner if self.enabled: self._start_service() def _start_service(self) -> None: - tracer_filters = self.tracer._filters - if not any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in tracer_filters): - tracer_filters += [self._trace_processor] - self.tracer.configure(settings={"FILTERS": tracer_filters}) try: self._llmobs_span_writer.start() self._llmobs_eval_metric_writer.start() @@ -160,11 +284,7 @@ def _stop_service(self) -> None: except ServiceStatusError: log.debug("Error stopping LLMObs writers") - try: - forksafe.unregister(self._child_after_fork) - self.tracer.shutdown() - except Exception: - log.warning("Failed to shutdown tracer", exc_info=True) + forksafe.unregister(self._child_after_fork) @classmethod def enable( @@ -265,7 +385,6 @@ def disable(cls) -> None: cls._instance.stop() cls.enabled = False - cls._instance.tracer.deregister_on_start_span(cls._instance._do_annotations) telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, False) log.debug("%s disabled", cls.__name__) diff --git a/ddtrace/llmobs/_trace_processor.py b/ddtrace/llmobs/_trace_processor.py deleted file mode 100644 index 231d53d7626..00000000000 --- a/ddtrace/llmobs/_trace_processor.py +++ /dev/null @@ -1,177 +0,0 @@ -from typing import Any -from typing import Dict -from typing import List -from typing import Optional -from typing import Tuple - -import ddtrace -from ddtrace import Span -from ddtrace import config -from ddtrace._trace.processor import TraceProcessor -from ddtrace.constants import ERROR_MSG -from ddtrace.constants import ERROR_STACK -from ddtrace.constants import ERROR_TYPE -from ddtrace.ext import SpanTypes -from ddtrace.internal.logger import get_logger -from ddtrace.llmobs._constants import INPUT_DOCUMENTS -from ddtrace.llmobs._constants import INPUT_MESSAGES -from ddtrace.llmobs._constants import INPUT_PARAMETERS -from ddtrace.llmobs._constants import INPUT_PROMPT -from ddtrace.llmobs._constants import INPUT_VALUE -from ddtrace.llmobs._constants import METADATA -from ddtrace.llmobs._constants import METRICS -from ddtrace.llmobs._constants import ML_APP -from ddtrace.llmobs._constants import MODEL_NAME -from ddtrace.llmobs._constants import MODEL_PROVIDER -from ddtrace.llmobs._constants import OUTPUT_DOCUMENTS -from ddtrace.llmobs._constants import OUTPUT_MESSAGES -from ddtrace.llmobs._constants import OUTPUT_VALUE -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX -from ddtrace.llmobs._constants import RUNNER_IS_INTEGRATION_SPAN_TAG -from ddtrace.llmobs._constants import SESSION_ID -from 
ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._constants import TAGS -from ddtrace.llmobs._utils import _get_llmobs_parent_id -from ddtrace.llmobs._utils import _get_ml_app -from ddtrace.llmobs._utils import _get_session_id -from ddtrace.llmobs._utils import _get_span_name -from ddtrace.llmobs._utils import safe_json - - -log = get_logger(__name__) - - -class LLMObsTraceProcessor(TraceProcessor): - """ - Processor that extracts LLM-type spans in a trace to submit as separate LLMObs span events to LLM Observability. - """ - - def __init__(self, llmobs_span_writer, evaluator_runner=None): - self._span_writer = llmobs_span_writer - self._evaluator_runner = evaluator_runner - - def process_trace(self, trace: List[Span]) -> Optional[List[Span]]: - if not trace: - return None - for span in trace: - if span.span_type == SpanTypes.LLM: - self.submit_llmobs_span(span) - return None if config._llmobs_agentless_enabled else trace - - def submit_llmobs_span(self, span: Span) -> None: - """Generate and submit an LLMObs span event to be sent to LLMObs.""" - span_event = None - is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" - is_ragas_integration_span = False - try: - span_event, is_ragas_integration_span = self._llmobs_span_event(span) - self._span_writer.enqueue(span_event) - except (KeyError, TypeError): - log.error("Error generating LLMObs span event for span %s, likely due to malformed span", span) - finally: - if not span_event or not is_llm_span or is_ragas_integration_span: - return - if self._evaluator_runner: - self._evaluator_runner.enqueue(span_event, span) - - def _llmobs_span_event(self, span: Span) -> Tuple[Dict[str, Any], bool]: - """Span event object structure.""" - span_kind = span._get_ctx_item(SPAN_KIND) - if not span_kind: - raise KeyError("Span kind not found in span context") - meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} - if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: - meta["model_name"] = span._get_ctx_item(MODEL_NAME) - meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() - meta["metadata"] = span._get_ctx_item(METADATA) or {} - if span._get_ctx_item(INPUT_PARAMETERS): - meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) - if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: - meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) - if span._get_ctx_item(INPUT_VALUE) is not None: - meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) - if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: - meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) - if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: - meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) - if span._get_ctx_item(OUTPUT_VALUE) is not None: - meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) - if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: - meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) - if span._get_ctx_item(INPUT_PROMPT) is not None: - prompt_json_str = span._get_ctx_item(INPUT_PROMPT) - if span_kind != "llm": - log.warning( - "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." 
- ) - else: - meta["input"]["prompt"] = prompt_json_str - if span.error: - meta.update( - { - ERROR_MSG: span.get_tag(ERROR_MSG), - ERROR_STACK: span.get_tag(ERROR_STACK), - ERROR_TYPE: span.get_tag(ERROR_TYPE), - } - ) - if not meta["input"]: - meta.pop("input") - if not meta["output"]: - meta.pop("output") - metrics = span._get_ctx_item(METRICS) or {} - ml_app = _get_ml_app(span) - - is_ragas_integration_span = False - - if ml_app.startswith(RAGAS_ML_APP_PREFIX): - is_ragas_integration_span = True - - span._set_ctx_item(ML_APP, ml_app) - parent_id = str(_get_llmobs_parent_id(span) or "undefined") - - llmobs_span_event = { - "trace_id": "{:x}".format(span.trace_id), - "span_id": str(span.span_id), - "parent_id": parent_id, - "name": _get_span_name(span), - "start_ns": span.start_ns, - "duration": span.duration_ns, - "status": "error" if span.error else "ok", - "meta": meta, - "metrics": metrics, - } - session_id = _get_session_id(span) - if session_id is not None: - span._set_ctx_item(SESSION_ID, session_id) - llmobs_span_event["session_id"] = session_id - - llmobs_span_event["tags"] = self._llmobs_tags( - span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span - ) - return llmobs_span_event, is_ragas_integration_span - - @staticmethod - def _llmobs_tags( - span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False - ) -> List[str]: - tags = { - "version": config.version or "", - "env": config.env or "", - "service": span.service or "", - "source": "integration", - "ml_app": ml_app, - "ddtrace.version": ddtrace.__version__, - "language": "python", - "error": span.error, - } - err_type = span.get_tag(ERROR_TYPE) - if err_type: - tags["error_type"] = err_type - if session_id: - tags["session_id"] = session_id - if is_ragas_integration_span: - tags[RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" - existing_tags = span._get_ctx_item(TAGS) - if existing_tags is not None: - tags.update(existing_tags) - return ["{}:{}".format(k, v) for k, v in tags.items()] diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index c1b1c4a776c..dd616db8bef 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -135,9 +135,12 @@ def _get_ml_app(span: Span) -> str: ml_app = span._get_ctx_item(ML_APP) if ml_app: return ml_app - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - ml_app = nearest_llmobs_ancestor._get_ctx_item(ML_APP) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + ml_app = llmobs_parent._get_ctx_item(ML_APP) + if ml_app is not None: + return ml_app + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return ml_app or config._llmobs_ml_app or "unknown-ml-app" @@ -149,9 +152,12 @@ def _get_session_id(span: Span) -> Optional[str]: session_id = span._get_ctx_item(SESSION_ID) if session_id: return session_id - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - session_id = nearest_llmobs_ancestor._get_ctx_item(SESSION_ID) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + session_id = llmobs_parent._get_ctx_item(SESSION_ID) + if session_id is not None: + return session_id + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return session_id diff --git a/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml new file mode 100644 index 00000000000..5912a415022 --- /dev/null +++ 
b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    LLM Observability: Resolves an issue where configuring custom trace filters/processors on the tracer would disable LLM Observability.
+    Note that if LLM Observability is enabled in agentless mode, writing APM traces must be explicitly disabled by setting `DD_TRACE_ENABLED=0`.
diff --git a/riotfile.py b/riotfile.py
index 0d9f66ca925..0398175d930 100644
--- a/riotfile.py
+++ b/riotfile.py
@@ -2958,8 +2958,8 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT
             name="llmobs",
             command="pytest {cmdargs} tests/llmobs",
             pkgs={"vcrpy": latest, "pytest-asyncio": "==0.21.1"},
-            pys=select_pys(min_version="3.7"),
             venvs=[
+                Venv(pys="3.7"),
                 Venv(pys=select_pys(min_version="3.8"), pkgs={"ragas": "==0.1.21", "langchain": latest}),
             ],
         ),
diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py
index a7d467b3985..15cffe5faa9 100644
--- a/tests/llmobs/conftest.py
+++ b/tests/llmobs/conftest.py
@@ -41,16 +41,6 @@ def mock_llmobs_span_writer():
         patcher.stop()


-@pytest.fixture
-def mock_llmobs_span_agentless_writer():
-    patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter")
-    LLMObsSpanWriterMock = patcher.start()
-    m = mock.MagicMock()
-    LLMObsSpanWriterMock.return_value = m
-    yield m
-    patcher.stop()
-
-
 @pytest.fixture
 def mock_llmobs_eval_metric_writer():
     patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsEvalMetricWriter")
@@ -85,10 +75,7 @@ def mock_llmobs_submit_evaluation():
 def mock_http_writer_send_payload_response():
     with mock.patch(
         "ddtrace.internal.writer.HTTPWriter._send_payload",
-        return_value=Response(
-            status=200,
-            body="{}",
-        ),
+        return_value=Response(status=200, body="{}"),
     ):
         yield
@@ -124,9 +111,10 @@ def mock_evaluator_sampler_logs():
 @pytest.fixture
-def mock_http_writer_logs():
-    with mock.patch("ddtrace.internal.writer.writer.log") as m:
+def mock_llmobs_logs():
+    with mock.patch("ddtrace.llmobs._llmobs.log") as m:
         yield m
+        m.reset_mock()


 @pytest.fixture
@@ -154,7 +142,7 @@ def LLMObs(
 @pytest.fixture
 def AgentlessLLMObs(
-    mock_llmobs_span_agentless_writer,
+    mock_llmobs_span_writer,
     mock_llmobs_eval_metric_writer,
     mock_llmobs_evaluator_runner,
     ddtrace_global_config,
@@ -191,16 +179,20 @@ def mock_ragas_dependencies_not_present():
 @pytest.fixture
 def ragas(mock_llmobs_span_writer, mock_llmobs_eval_metric_writer):
     with override_global_config(dict(_dd_api_key="")):
-        import ragas
-
+        try:
+            import ragas
+        except ImportError:
+            pytest.skip("Ragas not installed")
         with override_env(dict(OPENAI_API_KEY=os.getenv("OPENAI_API_KEY", ""))):
             yield ragas


 @pytest.fixture
 def reset_ragas_faithfulness_llm():
-    import ragas
-
+    try:
+        import ragas
+    except ImportError:
+        pytest.skip("Ragas not installed")
     previous_llm = ragas.metrics.faithfulness.llm
     yield
     ragas.metrics.faithfulness.llm = previous_llm
@@ -243,16 +235,25 @@ def llmobs_span_writer():


 @pytest.fixture
-def llmobs(monkeypatch, tracer, llmobs_env, llmobs_span_writer):
+def llmobs(
+    ddtrace_global_config,
+    monkeypatch,
+    tracer,
+    llmobs_env,
+    llmobs_span_writer,
+    mock_llmobs_eval_metric_writer,
+    mock_llmobs_evaluator_runner,
+):
     for env, val in llmobs_env.items():
         monkeypatch.setenv(env, val)
-
+    global_config = default_global_config()
+    global_config.update(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP")))
+    global_config.update(ddtrace_global_config)
     # TODO: remove once rest of tests are moved off of global config tampering
-    with override_global_config(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))):
+    with override_global_config(global_config):
         llmobs_service.enable(_tracer=tracer)
         llmobs_service._instance._llmobs_span_writer = llmobs_span_writer
-        llmobs_service._instance._trace_processor._span_writer = llmobs_span_writer
-    yield llmobs
+    yield llmobs_service

     llmobs_service.disable()
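
A minimal sketch of the agentless setup referenced in the release note above; the ml_app and workflow names are illustrative, and a valid DD_API_KEY (plus DD_SITE, if not the default) is assumed to be set in the environment:

    import os

    # Disable APM trace writing before ddtrace is imported; LLM Observability
    # runs agentless and submits its span events directly to Datadog.
    os.environ["DD_TRACE_ENABLED"] = "0"

    from ddtrace.llmobs import LLMObs

    LLMObs.enable(ml_app="example-ml-app", agentless_enabled=True)
    with LLMObs.workflow(name="example-workflow"):
        pass  # the finished span is submitted as an LLMObs span event
    LLMObs.disable()
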
diff --git a/tests/llmobs/test_llmobs.py b/tests/llmobs/test_llmobs.py
index 1bae7efe9ed..6cf19fc3e2c 100644
--- a/tests/llmobs/test_llmobs.py
+++ b/tests/llmobs/test_llmobs.py
@@ -1,4 +1,3 @@
-import mock
 import pytest

 from ddtrace.ext import SpanTypes
@@ -8,12 +7,6 @@
 from tests.llmobs._utils import _expected_llmobs_llm_span_event


-@pytest.fixture
-def mock_logs():
-    with mock.patch("ddtrace.llmobs._trace_processor.log") as mock_logs:
-        yield mock_logs
-
-
 class TestMLApp:
     @pytest.mark.parametrize("llmobs_env", [{"DD_LLMOBS_ML_APP": ""}])
     def test_tag_defaults_to_env_var(self, tracer, llmobs_env, llmobs_events):
@@ -228,19 +221,19 @@ def test_model_and_provider_are_set(tracer, llmobs_events):
     assert span_event["meta"]["model_provider"] == "model_provider"


-def test_malformed_span_logs_error_instead_of_raising(mock_logs, tracer, llmobs_events):
+def test_malformed_span_logs_error_instead_of_raising(tracer, llmobs_events, mock_llmobs_logs):
     """Test that trying to create a span event from a malformed span will log an error instead of crashing."""
     with tracer.trace("root_llm_span", span_type=SpanTypes.LLM) as llm_span:
         # span does not have SPAN_KIND tag
         pass
-    mock_logs.error.assert_called_once_with(
-        "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span
+    mock_llmobs_logs.error.assert_called_with(
+        "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span, exc_info=True
     )
     assert len(llmobs_events) == 0


-def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events):
-    """Test that the LLMObsTraceProcessor only creates LLMObs span events for LLM span types."""
+def test_only_generate_span_events_from_llmobs_spans(tracer, llmobs_events):
+    """Test that we only generate LLMObs span events for LLM span types."""
     with tracer.trace("root_llm_span", service="tests.llmobs", span_type=SpanTypes.LLM) as root_span:
         root_span._set_ctx_item(const.SPAN_KIND, "llm")
         with tracer.trace("child_span"):
@@ -250,5 +243,5 @@ def test_only_generate_span_events_from_llmobs_spans(tracer, llmobs_events):
     expected_grandchild_llmobs_span["parent_id"] = str(root_span.span_id)

     assert len(llmobs_events) == 2
-    assert llmobs_events[0] == _expected_llmobs_llm_span_event(root_span, "llm")
-    assert llmobs_events[1] == expected_grandchild_llmobs_span
+    assert llmobs_events[1] == _expected_llmobs_llm_span_event(root_span, "llm")
+    assert llmobs_events[0] == expected_grandchild_llmobs_span
diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py
index 1f78b538f24..ec8e181e527 100644
--- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py
+++ b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py
@@ -11,6 +11,9 @@
 from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt


+pytest.importorskip("ragas", reason="Tests require ragas to be available in the user environment")
+
+
 def _llm_span_without_io():
     return _expected_llmobs_llm_span_event(Span("dummy"))

@@ -167,19 +170,17 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, L
 @pytest.mark.vcr_logs
-def test_ragas_faithfulness_emits_traces(ragas, LLMObs):
-    rf_evaluator = RagasFaithfulnessEvaluator(LLMObs)
+def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events):
+    rf_evaluator = RagasFaithfulnessEvaluator(llmobs)
     rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt())
-    assert rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_count == 7
-    calls = rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_args_list
-
-    spans = [call[0][0] for call in calls]
-
+    ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")]
+    ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"])
+    assert len(ragas_spans) == 7
     # check name, io, span kinds match
-    assert spans == _expected_ragas_spans()
+    assert ragas_spans == _expected_ragas_spans()

     # verify the trace structure
-    root_span = spans[0]
+    root_span = ragas_spans[0]
     root_span_id = root_span["span_id"]
     assert root_span["parent_id"] == "undefined"
     assert root_span["meta"] is not None
@@ -187,16 +188,15 @@ def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events):
     assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list)
     assert isinstance(root_span["meta"]["metadata"]["statements"], list)
     root_span_trace_id = root_span["trace_id"]
-    for child_span in spans[1:]:
+    for child_span in ragas_spans[1:]:
         assert child_span["trace_id"] == root_span_trace_id

-    assert spans[1]["parent_id"] == root_span_id  # input extraction (task)
-    assert spans[2]["parent_id"] == root_span_id  # create statements (workflow)
-    assert spans[4]["parent_id"] == root_span_id  # create verdicts (workflow)
-    assert spans[6]["parent_id"] == root_span_id  # create score (task)
-
-    assert spans[3]["parent_id"] == spans[2]["span_id"]  # create statements prompt (task)
-    assert spans[5]["parent_id"] == spans[4]["span_id"]  # create verdicts prompt (task)
+    assert ragas_spans[1]["parent_id"] == root_span_id  # input extraction (task)
+    assert ragas_spans[2]["parent_id"] == root_span_id  # create statements (workflow)
+    assert ragas_spans[4]["parent_id"] == root_span_id  # create verdicts (workflow)
+    assert ragas_spans[6]["parent_id"] == root_span_id  # create score (task)
+    assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"]  # create statements prompt (task)
+    assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"]  # create verdicts prompt (task)


 def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess):
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index 98748250c3a..2ba5754019f 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -7,9 +7,7 @@
 import ddtrace
 from ddtrace._trace.context import Context
-from ddtrace._trace.span import Span
 from ddtrace.ext import SpanTypes
-from ddtrace.filters import TraceFilter
 from ddtrace.internal.service import ServiceStatus
 from ddtrace.llmobs import LLMObs as llmobs_service
 from ddtrace.llmobs._constants import INPUT_DOCUMENTS
@@ -31,7 +29,6 @@
 from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
 from ddtrace.llmobs._constants import TAGS
 from ddtrace.llmobs._llmobs import SUPPORTED_LLMOBS_INTEGRATIONS
-from ddtrace.llmobs._llmobs import LLMObsTraceProcessor
 from ddtrace.llmobs.utils import Prompt
 from tests.llmobs._utils import _expected_llmobs_eval_metric_event
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
@@ -41,20 +38,13 @@
 from tests.utils import override_global_config


-@pytest.fixture
-def mock_logs():
-    with mock.patch("ddtrace.llmobs._llmobs.log") as mock_logs:
-        yield mock_logs
+RAGAS_AVAILABLE = os.getenv("RAGAS_AVAILABLE", False)
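+# NOTE: os.getenv() returns the raw string whenever the variable is set, so
+# any non-empty value (including "0") leaves RAGAS_AVAILABLE truthy; it is
+# False only when the variable is unset.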


 def run_llmobs_trace_filter(dummy_tracer):
-    for trace_filter in dummy_tracer._filters:
-        if isinstance(trace_filter, LLMObsTraceProcessor):
-            root_llm_span = Span(name="span1", span_type=SpanTypes.LLM)
-            root_llm_span.set_tag_str(SPAN_KIND, "llm")
-            trace1 = [root_llm_span]
-            return trace_filter.process_trace(trace1)
-    raise ValueError("LLMObsTraceProcessor not found in tracer filters.")
+    with dummy_tracer.trace("span1", span_type=SpanTypes.LLM) as span:
+        span.set_tag_str(SPAN_KIND, "llm")
+    return dummy_tracer._writer.pop()


 def test_service_enable():
@@ -65,26 +55,11 @@
     assert llmobs_instance is not None
     assert llmobs_service.enabled
     assert llmobs_instance.tracer == dummy_tracer
-    assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters)
     assert run_llmobs_trace_filter(dummy_tracer) is not None

     llmobs_service.disable()


-def test_service_enable_with_apm_disabled(monkeypatch):
-    with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")):
-        dummy_tracer = DummyTracer()
-        llmobs_service.enable(_tracer=dummy_tracer, agentless_enabled=True)
-        llmobs_instance = llmobs_service._instance
-        assert llmobs_instance is not None
-        assert llmobs_service.enabled
-        assert llmobs_instance.tracer == dummy_tracer
-        assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters)
-        assert run_llmobs_trace_filter(dummy_tracer) is None
-
-        llmobs_service.disable()
-
-
 def test_service_disable():
     with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")):
         dummy_tracer = DummyTracer()
@@ -118,7 +93,7 @@
     assert llmobs_service._instance._evaluator_runner.status.value == "stopped"


-def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs):
+def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_llmobs_logs):
     with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")):
         dummy_tracer = DummyTracer()
         monkeypatch.setenv("DD_LLMOBS_APP_NAME", "test_ml_app")
@@ -126,11 +101,13 @@
         assert llmobs_service.enabled is True
         assert llmobs_service._instance._llmobs_eval_metric_writer.status.value == "running"
         assert llmobs_service._instance._llmobs_span_writer.status.value == "running"
-        mock_logs.warning.assert_called_once_with("`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead.")
+        mock_llmobs_logs.warning.assert_called_once_with(
+            "`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead."
+ ) llmobs_service.disable() -def test_service_enable_already_enabled(mock_logs): +def test_service_enable_already_enabled(mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -139,9 +116,8 @@ def test_service_enable_already_enabled(mock_logs): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) llmobs_service.disable() - mock_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) + mock_llmobs_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) @mock.patch("ddtrace.llmobs._llmobs.patch") @@ -203,107 +179,83 @@ def test_service_enable_does_not_override_global_patch_config(mock_tracer_patch, llmobs_service.disable() -def test_start_span_while_disabled_logs_warning(LLMObs, mock_logs): - LLMObs.disable() - _ = LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.tool(name="test_tool") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.task(name="test_task") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.workflow(name="test_workflow") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.agent(name="test_agent") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - - -def test_start_span_uses_kind_as_default_name(LLMObs): - with LLMObs.llm(model_name="test_model", model_provider="test_provider") as span: +def test_start_span_while_disabled_logs_warning(llmobs, mock_llmobs_logs): + llmobs.disable() + _ = llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.tool(name="test_tool") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.task(name="test_task") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.workflow(name="test_workflow") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.agent(name="test_agent") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + + +def test_start_span_uses_kind_as_default_name(llmobs): + with llmobs.llm(model_name="test_model", model_provider="test_provider") as span: assert span.name == "llm" - with LLMObs.tool() as span: + with llmobs.tool() as span: assert span.name == "tool" - with LLMObs.task() as span: + with llmobs.task() as span: assert span.name == "task" - with LLMObs.workflow() as span: + with llmobs.workflow() as span: assert span.name == "workflow" - with LLMObs.agent() as span: + with llmobs.agent() as span: assert span.name == "agent" -def test_start_span_with_session_id(LLMObs): - with LLMObs.llm(model_name="test_model", session_id="test_session_id") as span: +def test_start_span_with_session_id(llmobs): + with 
llmobs.llm(model_name="test_model", session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.tool(session_id="test_session_id") as span: + with llmobs.tool(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.task(session_id="test_session_id") as span: + with llmobs.task(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.workflow(session_id="test_session_id") as span: + with llmobs.workflow(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.agent(session_id="test_session_id") as span: + with llmobs.agent(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" -def test_session_id_becomes_top_level_field(LLMObs, mock_llmobs_span_writer): - session_id = "test_session_id" - with LLMObs.task(session_id=session_id) as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) - - -def test_session_id_becomes_top_level_field_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): +def test_session_id_becomes_top_level_field(llmobs, llmobs_events): session_id = "test_session_id" - with AgentlessLLMObs.task(session_id=session_id) as span: + with llmobs.task(session_id=session_id) as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) -def test_llm_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span(llmobs, llmobs_events): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "llm" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") - ) - - -def test_llm_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - assert span.name == "test_llm_call" - assert span.resource == "llm" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "llm" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider" ) -def test_llm_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.llm(name="test_llm_call", 
model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider" ) -def test_default_model_provider_set_to_custom(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call") as span: +def test_default_model_provider_set_to_custom(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" @@ -312,88 +264,57 @@ def test_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_tool_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.tool(name="test_tool") as span: +def test_tool_span(llmobs, llmobs_events): + with llmobs.tool(name="test_tool") as span: assert span.name == "test_tool" assert span.resource == "tool" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_tool_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.tool(name="test_tool") as span: - assert span.name == "test_tool" - assert span.resource == "tool" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_task_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task") as span: - assert span.name == "test_task" - assert span.resource == "task" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) - - -def test_task_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task") as span: +def test_task_span(llmobs, llmobs_events): + with llmobs.task(name="test_task") as span: assert span.name == "test_task" assert span.resource == "task" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_workflow_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.workflow(name="test_workflow") as span: +def test_workflow_span(llmobs, llmobs_events): + with llmobs.workflow(name="test_workflow") as span: assert span.name == "test_workflow" assert span.resource == "workflow" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) - - -def test_workflow_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.workflow(name="test_workflow") as span: - assert span.name == "test_workflow" - assert span.resource == "workflow" - assert span.span_type == "llm" - assert 
span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) - - -def test_agent_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.agent(name="test_agent") as span: - assert span.name == "test_agent" - assert span.resource == "agent" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_agent_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.agent(name="test_agent") as span: +def test_agent_span(llmobs, llmobs_events): + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span.resource == "agent" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_embedding_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(name="test_embedding", model_provider="test_provider") as span: +def test_embedding_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.embedding(name="test_embedding", model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider" ) -def test_embedding_default_model_provider_set_to_custom(LLMObs): - with LLMObs.embedding(model_name="test_model", name="test_embedding") as span: +def test_embedding_default_model_provider_set_to_custom(llmobs): + with llmobs.embedding(model_name="test_model", name="test_embedding") as span: assert span.name == "test_embedding" assert span.resource == "embedding" assert span.span_type == "llm" @@ -402,198 +323,182 @@ def test_embedding_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_embedding_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: - assert span.name == "test_embedding" - assert span.resource == "embedding" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "embedding" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") - ) - - -def test_embedding_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.embedding( - model_name="test_model", name="test_embedding", model_provider="test_provider" - ) as span: +def test_embedding_span(llmobs, llmobs_events): + with llmobs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: assert span.name == "test_embedding" assert span.resource == 
"embedding" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "embedding" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider" ) -def test_annotate_no_active_span_logs_warning(LLMObs, mock_logs): - LLMObs.annotate(parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_annotate_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + llmobs.annotate(parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_annotate_non_llm_span_logs_warning(LLMObs, mock_logs): +def test_annotate_non_llm_span_logs_warning(llmobs, mock_llmobs_logs): dummy_tracer = DummyTracer() with dummy_tracer.trace("root") as non_llmobs_span: - LLMObs.annotate(span=non_llmobs_span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.annotate(span=non_llmobs_span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_annotate_finished_span_does_nothing(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_annotate_finished_span_does_nothing(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: pass - LLMObs.annotate(span=span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Cannot annotate a finished span.") + llmobs.annotate(span=span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Cannot annotate a finished span.") -def test_annotate_parameters(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) +def test_annotate_parameters(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) assert span._get_ctx_item(INPUT_PARAMETERS) == {"temperature": 0.9, "max_tokens": 50} - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "Setting parameters is deprecated, please set parameters and other metadata as tags instead." 
) -def test_annotate_metadata(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) +def test_annotate_metadata(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) assert span._get_ctx_item(METADATA) == {"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3} -def test_annotate_metadata_wrong_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata="wrong_metadata") +def test_annotate_metadata_wrong_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata="wrong_metadata") assert span._get_ctx_item(METADATA) is None - mock_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() -def test_annotate_tag(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) +def test_annotate_tag(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) assert span._get_ctx_item(TAGS) == {"test_tag_name": "test_tag_value", "test_numeric_tag": 10} -def test_annotate_tag_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags=12345) +def test_annotate_tag_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags=12345) assert span._get_ctx_item(TAGS) is None - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_tags must be a dictionary of string key - primitive value pairs." 
) -def test_annotate_input_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, input_data="test_input") +def test_annotate_input_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, input_data="test_input") assert llm_span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input"}] - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data="test_input") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data="test_input") assert task_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data="test_input") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data="test_input") assert tool_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data="test_input") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data="test_input") assert workflow_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data="test_input") + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data="test_input") assert retrieval_span._get_ctx_item(INPUT_VALUE) == "test_input" -def test_annotate_numeric_io(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=0, output_data=0) +def test_annotate_numeric_io(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=0, output_data=0) assert task_span._get_ctx_item(INPUT_VALUE) == "0" assert task_span._get_ctx_item(OUTPUT_VALUE) == "0" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=1.23, output_data=1.23) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=1.23, output_data=1.23) assert task_span._get_ctx_item(INPUT_VALUE) == "1.23" assert task_span._get_ctx_item(OUTPUT_VALUE) == "1.23" -def test_annotate_input_serializable_value(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=["test_input"]) +def test_annotate_input_serializable_value(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=["test_input"]) assert task_span._get_ctx_item(INPUT_VALUE) == str(["test_input"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data={"test_input": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data={"test_input": "hello world"}) assert tool_span._get_ctx_item(INPUT_VALUE) == str({"test_input": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data=("asd", 123)) assert workflow_span._get_ctx_item(INPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert 
agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) assert retrieval_span._get_ctx_item(INPUT_VALUE) == str([0, 1, 2, 3, 4]) -def test_annotate_input_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) +def test_annotate_input_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) assert span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input", "role": "human"}] -def test_annotate_input_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": object()}]) +def test_annotate_input_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": object()}]) assert span._get_ctx_item(INPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) -def test_llmobs_annotate_incorrect_message_content_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) +def test_llmobs_annotate_incorrect_message_content_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_document_str(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data="test_document_text") +def test_annotate_document_str(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data="test_document_text") documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data="test_document_text") + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data="test_document_text") documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_dict(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": "test_document_text"}) +def test_annotate_document_dict(llmobs): + 
with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": "test_document_text"}) documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data={"text": "test_document_text"}) + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data={"text": "test_document_text"}) documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_list(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_document_list(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -605,8 +510,8 @@ def test_annotate_document_list(LLMObs): assert documents[1]["name"] == "name" assert documents[1]["id"] == "id" assert documents[1]["score"] == 0.9 - with LLMObs.retrieval() as span: - LLMObs.annotate( + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -620,129 +525,131 @@ def test_annotate_document_list(LLMObs): assert documents[1]["score"] == 0.9 -def test_annotate_incorrect_document_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": 123}) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_document_no_text_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_incorrect_document_field_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input 
documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_incorrect_document_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": 123}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_document_no_text_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_incorrect_document_field_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate( + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - 
mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) -def test_annotate_output_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data="test_output") +def test_annotate_output_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data="test_output") assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output"}] - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data="test_output") + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data="test_output") assert embedding_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data="test_output") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data="test_output") assert task_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data="test_output") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data="test_output") assert tool_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data="test_output") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data="test_output") assert workflow_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_serializable_value(LLMObs): - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) +def test_annotate_output_serializable_value(llmobs): + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) assert embedding_span._get_ctx_item(OUTPUT_VALUE) == str([[0, 1, 2, 3], [4, 5, 6, 7]]) - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data=["test_output"]) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data=["test_output"]) assert task_span._get_ctx_item(OUTPUT_VALUE) == str(["test_output"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data={"test_output": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data={"test_output": "hello world"}) assert tool_span._get_ctx_item(OUTPUT_VALUE) == str({"test_output": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data=("asd", 123)) assert workflow_span._get_ctx_item(OUTPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert 
agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) +def test_annotate_output_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output", "role": "human"}] -def test_annotate_output_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": object()}]) +def test_annotate_output_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": object()}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_metrics(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) +def test_annotate_metrics(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) assert span._get_ctx_item(METRICS) == {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} -def test_annotate_metrics_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, metrics=12345) +def test_annotate_metrics_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, metrics=12345) assert llm_span._get_ctx_item(METRICS) is None - mock_logs.warning.assert_called_once_with("metrics must be a dictionary of string key - numeric value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "metrics must be a dictionary of string key - numeric value pairs." 
+ ) + mock_llmobs_logs.reset_mock() -def test_annotate_prompt_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -761,9 +668,9 @@ def test_annotate_prompt_dict(LLMObs): } -def test_annotate_prompt_dict_with_context_var_keys(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict_with_context_var_keys(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -784,9 +691,9 @@ def test_annotate_prompt_dict_with_context_var_keys(LLMObs): } -def test_annotate_prompt_typed_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_typed_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt=Prompt( template="{var1} {var3}", @@ -807,47 +714,30 @@ def test_annotate_prompt_typed_dict(LLMObs): } -def test_annotate_prompt_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, prompt="prompt") +def test_annotate_prompt_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, prompt="prompt") assert span._get_ctx_item(INPUT_PROMPT) is None - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() - LLMObs.annotate(span=span, prompt={"template": 1}) - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() + llmobs.annotate(span=span, prompt={"template": 1}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() -def test_span_error_sets_error(LLMObs, mock_llmobs_span_writer): +def test_span_error_sets_error(llmobs, llmobs_events): with pytest.raises(ValueError): - with LLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: + with llmobs.llm(model_name="test_model", model_provider="test_model_provider") as span: raise ValueError("test error message") - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) - ) - - -def test_span_error_sets_error_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with pytest.raises(ValueError): - with AgentlessLLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: - raise ValueError("test error message") - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + model_name="test_model", + model_provider="test_model_provider", + error="builtins.ValueError", + 
error_message="test error message", + error_stack=span.get_tag("error.stack"), ) @@ -855,218 +745,152 @@ def test_span_error_sets_error_agentless(AgentlessLLMObs, mock_llmobs_span_agent "ddtrace_global_config", [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_tags(ddtrace_global_config, LLMObs, mock_llmobs_span_writer, monkeypatch): - with LLMObs.task(name="test_task") as span: +def test_tags(ddtrace_global_config, llmobs, llmobs_events, monkeypatch): + with llmobs.task(name="test_task") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "task", + tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, ) -@pytest.mark.parametrize( - "ddtrace_global_config", - [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], -) -def test_tags_agentless(ddtrace_global_config, AgentlessLLMObs, mock_llmobs_span_agentless_writer, monkeypatch): - with AgentlessLLMObs.task(name="test_task") as span: +def test_ml_app_override(llmobs, llmobs_events): + with llmobs.task(name="test_task", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) - ) - - -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task", ml_app="test_app") as span: + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) + with llmobs.tool(name="test_tool", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with LLMObs.tool(name="test_tool", ml_app="test_app") as span: + assert len(llmobs_events) == 2 + assert llmobs_events[1] == _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) + with llmobs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) + assert len(llmobs_events) == 3 + assert llmobs_events[2] == _expected_llmobs_llm_span_event( + span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with LLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: + with llmobs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with LLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 4 + assert llmobs_events[3] == 
_expected_llmobs_llm_span_event( + span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with LLMObs.workflow(name="test_workflow", ml_app="test_app") as span: + with llmobs.workflow(name="test_workflow", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with LLMObs.agent(name="test_agent", ml_app="test_app") as span: + assert len(llmobs_events) == 5 + assert llmobs_events[4] == _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) + with llmobs.agent(name="test_agent", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with LLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: + assert len(llmobs_events) == 6 + assert llmobs_events[5] == _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) + with llmobs.retrieval(name="test_retrieval", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) - ) + assert len(llmobs_events) == 7 + assert llmobs_events[6] == _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) -def test_ml_app_override_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.tool(name="test_tool", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with AgentlessLLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with AgentlessLLMObs.workflow(name="test_workflow", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.agent(name="test_agent", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) - ) - +def test_export_span_specified_span_is_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span(span="asd") + mock_llmobs_logs.warning.assert_called_once_with("Failed to export span. 
Span must be a valid Span object.") -def test_export_span_specified_span_is_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.export_span(span="asd") - mock_logs.warning.assert_called_once_with("Failed to export span. Span must be a valid Span object.") - -def test_export_span_specified_span_is_not_llmobs_span_raises_warning(LLMObs, mock_logs): +def test_export_span_specified_span_is_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): with DummyTracer().trace("non_llmobs_span") as span: - LLMObs.export_span(span=span) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.export_span(span=span) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_specified_span_returns_span_context(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span(span=span) +def test_export_span_specified_span_returns_span_context(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span(span=span) assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_export_span_no_specified_span_no_active_span_raises_warning(LLMObs, mock_logs): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_export_span_no_specified_span_no_active_span_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_export_span_active_span_not_llmobs_span_raises_warning(LLMObs, mock_logs): - with LLMObs._instance.tracer.trace("non_llmobs_span"): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") +def test_export_span_active_span_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): + with llmobs._instance.tracer.trace("non_llmobs_span"): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_no_specified_span_returns_exported_active_span(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span() +def test_export_span_no_specified_span_returns_exported_active_span(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span() assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_submit_evaluation_llmobs_disabled_raises_warning(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.submit_evaluation( +def test_submit_evaluation_llmobs_disabled_raises_warning(llmobs, mock_llmobs_logs): + llmobs.disable() + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." 
) -def test_submit_evaluation_no_api_key_raises_warning(AgentlessLLMObs, mock_logs): +def test_submit_evaluation_no_api_key_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="")): - AgentlessLLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_ml_app_raises_warning(LLMObs, mock_logs): +def test_submit_evaluation_ml_app_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_llmobs_ml_app="")): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_span_context_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( +def test_submit_evaluation_span_context_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_context must be a dictionary containing both span_id and trace_id keys. " "LLMObs.export_span() can be used to generate this dictionary from a given span." ) -def test_submit_evaluation_empty_span_or_trace_id_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_span_or_trace_id_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." ) - mock_logs.reset_mock() - LLMObs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." 
) -def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_timestamp_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", @@ -1074,35 +898,35 @@ def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): ml_app="dummy", timestamp_ms="invalid", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent" ) -def test_submit_evaluation_empty_label_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_label_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") + mock_llmobs_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") -def test_submit_evaluation_incorrect_metric_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_metric_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") - mock_logs.reset_mock() - LLMObs.submit_evaluation( + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") -def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_numerical_value_raises_unsupported_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call( "The evaluation metric type 'numerical' is unsupported. Use 'score' instead. 
" @@ -1112,44 +936,44 @@ def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mo ) -def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call("value must be an integer or float for a score metric."), ] ) -def test_submit_evaluation_incorrect_score_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_score_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="score", value="high" ) - mock_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") + mock_llmobs_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") -def test_submit_evaluation_invalid_tags_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_tags_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", tags=["invalid"], ) - mock_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") + mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") -def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_metadata_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", metadata=1, ) - mock_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") + mock_llmobs_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") @pytest.mark.parametrize( @@ -1157,9 +981,9 @@ def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): [dict(_llmobs_ml_app="test_app_name")], ) def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( - LLMObs, mock_logs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1167,8 +991,10 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( tags={1: 2, "foo": "bar"}, ml_app="dummy", ) - mock_logs.warning.assert_called_once_with("Failed to parse tags. Tags for evaluation metrics must be strings.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "Failed to parse tags. Tags for evaluation metrics must be strings." 
+ ) + mock_llmobs_logs.reset_mock() mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( ml_app="dummy", @@ -1186,8 +1012,8 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_tags(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1212,8 +1038,8 @@ def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_with_metadata_enqueues_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1235,7 +1061,7 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) ) mock_llmobs_eval_metric_writer.reset() - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1257,8 +1083,8 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) -def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_categorical_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1276,9 +1102,9 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="toxicity", metric_type="categorical", value="high", @@ -1296,8 +1122,8 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) -def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_score_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="sentiment", metric_type="score", @@ -1310,9 +1136,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" + with llmobs.llm(model_name="test_model", name="test_llm_call", 
model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" ) mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( @@ -1327,9 +1153,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metric( - LLMObs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", @@ -1342,9 +1168,9 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="token_count", metric_type="numerical", value=35, @@ -1363,143 +1189,143 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr def test_flush_calls_periodic_agentless( - AgentlessLLMObs, mock_llmobs_span_agentless_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner + AgentlessLLMObs, mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner ): AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_called_once() + mock_llmobs_span_writer.periodic.assert_called_once() mock_llmobs_eval_metric_writer.periodic.assert_called_once() mock_llmobs_evaluator_runner.periodic.assert_called_once() def test_flush_does_not_call_periodic_when_llmobs_is_disabled( - LLMObs, + llmobs, mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, - mock_logs, + mock_llmobs_logs, disabled_llmobs, ): - LLMObs.flush() + llmobs.flush() mock_llmobs_span_writer.periodic.assert_not_called() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] ) def test_flush_does_not_call_periodic_when_llmobs_is_disabled_agentless( AgentlessLLMObs, - mock_llmobs_span_agentless_writer, + mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, - mock_logs, + mock_llmobs_logs, disabled_llmobs, ): AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_not_called() + mock_llmobs_span_writer.periodic.assert_not_called() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [mock.call("flushing when LLMObs is disabled. 
No spans or evaluation metrics will be sent.")] ) -def test_inject_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with( +def test_inject_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.inject_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be injected." ) assert headers == {} -def test_inject_distributed_headers_not_dict_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers("not a dictionary", span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") +def test_inject_distributed_headers_not_dict_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers("not a dictionary", span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == "not a dictionary" - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(123, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(123, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == 123 - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(None, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(None, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers is None -def test_inject_distributed_headers_no_active_span_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with("No span provided and no currently active span found.") +def test_inject_distributed_headers_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no currently active span found.") assert headers == {} -def test_inject_distributed_headers_span_calls_httppropagator_inject(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_span_calls_httppropagator_inject(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.propagation.http.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=span) + llmobs.inject_distributed_headers({}, span=span) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_inject_distributed_headers_current_active_span_injected(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_current_active_span_injected(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.inject") as mock_inject: - 
LLMObs.inject_distributed_headers({}, span=None) + llmobs.inject_distributed_headers({}, span=None) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_activate_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.activate_distributed_headers({}) - mock_logs.warning.assert_called_once_with( +def test_activate_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + llmobs.activate_distributed_headers({}) + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.activate_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be activated." ) -def test_activate_distributed_headers_calls_httppropagator_extract(LLMObs, mock_logs): +def test_activate_distributed_headers_calls_httppropagator_extract(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_extract.assert_called_once_with({}) -def test_activate_distributed_headers_no_trace_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_trace_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(span_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_span_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_span_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(trace_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456") mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") mock_activate.assert_called_once_with(dummy_context) -def test_activate_distributed_headers_activates_context(LLMObs, mock_logs): +def test_activate_distributed_headers_activates_context(llmobs, mock_llmobs_logs): with 
mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456", meta={PROPAGATED_PARENT_ID_KEY: "789"}) mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_activate.assert_called_once_with(dummy_context) @@ -1514,16 +1340,10 @@ def test_llmobs_fork_recreates_and_restarts_span_writer(): if pid: # parent assert llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._llmobs_span_writer == original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._llmobs_span_writer != original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1569,18 +1389,10 @@ def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluato if pid: # parent assert llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._evaluator_runner == original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._evaluator_runner != original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1667,42 +1479,6 @@ def test_llmobs_fork_evaluator_runner_run(monkeypatch): llmobs_service.disable() -def test_llmobs_fork_custom_filter(monkeypatch): - """Test that forking a process correctly keeps any custom filters.""" - - class CustomFilter(TraceFilter): - def process_trace(self, trace): - return trace - - monkeypatch.setenv("_DD_LLMOBS_WRITER_INTERVAL", 5.0) - with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"): - tracer = DummyTracer() - custom_filter = CustomFilter() - tracer.configure(settings={"FILTERS": [custom_filter]}) - llmobs_service.enable(_tracer=tracer, ml_app="test_app") - assert custom_filter in llmobs_service._instance.tracer._filters - pid = os.fork() - if pid: # parent - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - else: # child - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - llmobs_service.disable() - os._exit(12) - - _, status = os.waitpid(pid, 0) - exit_code = os.WEXITSTATUS(status) - assert exit_code == 12 - llmobs_service.disable() - - def test_llmobs_fork_disabled(monkeypatch): """Test that after 
being disabled the service remains disabled when forking""" monkeypatch.setenv("DD_LLMOBS_ENABLED", "0") @@ -1746,46 +1522,46 @@ def test_llmobs_fork_disabled_then_enabled(monkeypatch): svc.disable() -def test_llmobs_with_evaluator_runner(LLMObs, mock_llmobs_evaluator_runner): - with LLMObs.llm(model_name="test_model"): +def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner): + with llmobs.llm(model_name="test_model"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 1 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 1 -def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): +def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.workflow(name="test"): +def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.workflow(name="test"): pass - with LLMObs.agent(name="test"): + with llmobs.agent(name="test"): pass - with LLMObs.task(name="test"): + with llmobs.task(name="test"): pass - with LLMObs.embedding(model_name="test"): + with llmobs.embedding(model_name="test"): pass - with LLMObs.retrieval(name="test"): + with llmobs.retrieval(name="test"): pass - with LLMObs.tool(name="test"): + with llmobs.tool(name="test"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_annotation_context_modifies_span_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_modifies_span_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -def test_annotation_context_modifies_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1793,80 +1569,80 @@ def test_annotation_context_modifies_prompt(LLMObs): } -def test_annotation_context_modifies_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -def test_annotation_context_finished_context_does_not_modify_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): +def 
test_annotation_context_finished_context_does_not_modify_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -def test_annotation_context_finished_context_does_not_modify_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): +def test_annotation_context_finished_context_does_not_modify_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -def test_annotation_context_finished_context_does_not_modify_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): +def test_annotation_context_finished_context_does_not_modify_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -def test_annotation_context_nested(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested(llmobs): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} -def test_annotation_context_nested_overrides_name(LLMObs): - with LLMObs.annotation_context(name="unexpected"): - with LLMObs.annotation_context(name="expected"): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested_overrides_name(llmobs): + with llmobs.annotation_context(name="unexpected"): + with llmobs.annotation_context(name="expected"): + with llmobs.agent(name="test_agent") as span: assert span.name == "expected" -def test_annotation_context_nested_maintains_trace_structure(LLMObs, mock_llmobs_span_writer): +def test_annotation_context_nested_maintains_trace_structure(llmobs, llmobs_events): """This test makes sure starting/stopping annotation contexts do not modify the llmobs trace structure""" - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span") as parent_span: - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.workflow(name="child_span") as child_span: + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span") as parent_span: + with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.workflow(name="child_span") as child_span: assert child_span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} assert parent_span._get_ctx_item(TAGS) == {"foo": "bar", "boo": "bar"} - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - parent_span, child_span = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] + assert len(llmobs_events) == 2 + parent_span, child_span = llmobs_events[1], llmobs_events[0] assert child_span["trace_id"] == parent_span["trace_id"] assert child_span["span_id"] != parent_span["span_id"] assert child_span["parent_id"] == parent_span["span_id"] assert parent_span["parent_id"] == "undefined" - mock_llmobs_span_writer.reset_mock() - with 
LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span"): +def test_annotation_context_separate_traces_maintained(llmobs, llmobs_events): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span"): pass - with LLMObs.workflow(name="child_span"): + with llmobs.workflow(name="child_span"): pass - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - trace_one, trace_two = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] - assert trace_one["trace_id"] != trace_two["trace_id"] - assert trace_one["span_id"] != trace_two["span_id"] - assert trace_two["parent_id"] == "undefined" - assert trace_one["parent_id"] == "undefined" + assert len(llmobs_events) == 2 + agent_span, workflow_span = llmobs_events[1], llmobs_events[0] + assert agent_span["trace_id"] != workflow_span["trace_id"] + assert agent_span["span_id"] != workflow_span["span_id"] + assert workflow_span["parent_id"] == "undefined" + assert agent_span["parent_id"] == "undefined" -def test_annotation_context_only_applies_to_local_context(LLMObs): +def test_annotation_context_only_applies_to_local_context(llmobs): """ tests that annotation contexts only apply to spans belonging to the same trace context and not globally to all spans. @@ -1882,8 +1658,8 @@ def test_annotation_context_only_applies_to_local_context(LLMObs): def context_one(): nonlocal agent_has_correct_name nonlocal agent_has_correct_tags - with LLMObs.annotation_context(name="expected_agent", tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: + with llmobs.annotation_context(name="expected_agent", tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: event.wait() agent_has_correct_tags = span._get_ctx_item(TAGS) == {"foo": "bar"} agent_has_correct_name = span.name == "expected_agent" @@ -1892,9 +1668,9 @@ def context_one(): def context_two(): nonlocal tool_has_correct_name nonlocal tool_does_not_have_tags - with LLMObs.agent(name="test_agent"): - with LLMObs.annotation_context(name="expected_tool"): - with LLMObs.tool(name="test_tool") as tool_span: + with llmobs.agent(name="test_agent"): + with llmobs.annotation_context(name="expected_tool"): + with llmobs.tool(name="test_tool") as tool_span: event.wait() tool_does_not_have_tags = tool_span._get_ctx_item(TAGS) is None tool_has_correct_name = tool_span.name == "expected_tool" @@ -1904,7 +1680,7 @@ def context_two(): thread_one.start() thread_two.start() - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span._get_ctx_item(TAGS) is None @@ -1920,15 +1696,15 @@ def context_two(): assert tool_does_not_have_tags is True -async def test_annotation_context_async_modifies_span_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_modifies_span_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -async def test_annotation_context_async_modifies_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): + with 
llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1936,41 +1712,42 @@ async def test_annotation_context_async_modifies_prompt(LLMObs): } -async def test_annotation_context_async_modifies_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -async def test_annotation_context_async_finished_context_does_not_modify_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): +async def test_annotation_context_async_finished_context_does_not_modify_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -async def test_annotation_context_async_finished_context_does_not_modify_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): +async def test_annotation_context_async_finished_context_does_not_modify_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -async def test_annotation_context_finished_context_async_does_not_modify_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): +async def test_annotation_context_finished_context_async_does_not_modify_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -async def test_annotation_context_async_nested(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - async with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_nested(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + async with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} def test_service_enable_starts_evaluator_runner_when_evaluators_exist(): + pytest.importorskip("ragas") with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")): dummy_tracer = DummyTracer() diff --git a/tests/llmobs/test_llmobs_span_agent_writer.py b/tests/llmobs/test_llmobs_span_agent_writer.py index 76fe0f21aef..d16bb9f0e2c 100644 --- a/tests/llmobs/test_llmobs_span_agent_writer.py +++ b/tests/llmobs/test_llmobs_span_agent_writer.py @@ -44,7 +44,8 @@ def test_flush_queue_when_event_cause_queue_to_exceed_payload_limit( [ mock.call("flushing queue because queuing next event will exceed EVP payload limit"), mock.call("encode %d LLMObs span events to be sent", 5), - ] + ], + any_order=True, ) diff --git a/tests/llmobs/test_llmobs_span_agentless_writer.py b/tests/llmobs/test_llmobs_span_agentless_writer.py index 
4882f3553d8..4a54faf130d 100644 --- a/tests/llmobs/test_llmobs_span_agentless_writer.py +++ b/tests/llmobs/test_llmobs_span_agentless_writer.py @@ -75,26 +75,25 @@ def test_truncating_oversized_events(mock_writer_logs, mock_http_writer_send_pay ) -def test_send_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_chat_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() +@mock.patch("ddtrace.internal.writer.writer.log") def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put_response_forbidden): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) @@ -109,7 +108,7 @@ def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put ) -def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_timed_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -122,10 +121,9 @@ def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_wr llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_multiple_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -135,10 +133,9 @@ def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 2)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): +def test_send_on_exit(run_python_code_in_subprocess): env = os.environ.copy() pypath = 
[os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] if "PYTHONPATH" in env: diff --git a/tests/llmobs/test_llmobs_trace_processor.py b/tests/llmobs/test_llmobs_trace_processor.py deleted file mode 100644 index b55286d49c8..00000000000 --- a/tests/llmobs/test_llmobs_trace_processor.py +++ /dev/null @@ -1,36 +0,0 @@ -import mock - -from ddtrace._trace.span import Span -from ddtrace.ext import SpanTypes -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor -from tests.utils import override_global_config - - -def test_processor_returns_all_traces_by_default(): - """Test that the LLMObsTraceProcessor returns all traces by default.""" - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_all_traces_if_not_agentless(): - """Test that the LLMObsTraceProcessor returns all traces if DD_LLMOBS_AGENTLESS_ENABLED is not set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=False)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_none_in_agentless_mode(): - """Test that the LLMObsTraceProcessor returns None if DD_LLMOBS_AGENTLESS_ENABLED is set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=True)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) is None diff --git a/tests/llmobs/test_propagation.py b/tests/llmobs/test_propagation.py index d892c6b98a2..d14b22d65d5 100644 --- a/tests/llmobs/test_propagation.py +++ b/tests/llmobs/test_propagation.py @@ -216,7 +216,6 @@ def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple( env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) LLMObs.activate_distributed_headers(headers) @@ -252,7 +251,6 @@ def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_pyt env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) LLMObs.activate_distributed_headers(headers) From 68fc8f0624b47eef91474fa560b3de2700a8a0b5 Mon Sep 17 00:00:00 2001 From: Zachary Groves <32471391+ZStriker19@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:46:29 -0500 Subject: [PATCH 10/19] fix(propagation): make baggage header extract case insensitive (#11859) Following the WSGI spec, headers shouldn't be case sensitive, therefore baggage, like the other headers we extract, should not be case sensitive. 
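For context: under WSGI, a request header such as `baggage` typically reaches the application as the upper-cased, underscored environ key `HTTP_BAGGAGE`, so a literal `headers.get("baggage")` lookup silently misses it. A minimal sketch of the case-insensitive lookup behavior this change targets (the helper below is illustrative only, not the library's internal API):

```python
def get_header_case_insensitive(headers, name):
    """Return the value of ``name`` from ``headers``, ignoring case and
    tolerating the WSGI-style ``HTTP_``-prefixed, underscored spelling."""
    wanted = name.lower().replace("-", "_")
    wsgi_wanted = "http_" + wanted
    for key, value in headers.items():
        normalized = key.lower().replace("-", "_")
        if normalized in (wanted, wsgi_wanted):
            return value
    return None


assert get_header_case_insensitive({"BAgGage": "key1=val1,key2=val2"}, "baggage") == "key1=val1,key2=val2"
assert get_header_case_insensitive({"HTTP_BAGGAGE": "key1=val1"}, "baggage") == "key1=val1"
assert get_header_case_insensitive({"Content-Type": "text/plain"}, "baggage") is None
```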
https://peps.python.org/pep-3333/#environ-variables: "(A reminder for server/gateway authors: HTTP header names are case-insensitive, so be sure to take that into consideration when examining application-supplied headers!)" ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/propagation/http.py | 3 ++- ...ge_header_extraction-63167c492474da6f.yaml | 6 +++++ tests/tracer/test_propagation.py | 24 ++++++++++++------- 3 files changed, 24 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml diff --git a/ddtrace/propagation/http.py b/ddtrace/propagation/http.py index a1664664ace..563ee838d84 100644 --- a/ddtrace/propagation/http.py +++ b/ddtrace/propagation/http.py @@ -101,6 +101,7 @@ def _possible_header(header): _POSSIBLE_HTTP_HEADER_B3_FLAGS = _possible_header(_HTTP_HEADER_B3_FLAGS) _POSSIBLE_HTTP_HEADER_TRACEPARENT = _possible_header(_HTTP_HEADER_TRACEPARENT) _POSSIBLE_HTTP_HEADER_TRACESTATE = _possible_header(_HTTP_HEADER_TRACESTATE) +_POSSIBLE_HTTP_BAGGAGE_HEADER = _possible_header(_HTTP_HEADER_BAGGAGE) # https://www.w3.org/TR/trace-context/#traceparent-header-field-values @@ -937,7 +938,7 @@ def _inject(span_context: Context, headers: Dict[str, str]) -> None: @staticmethod def _extract(headers: Dict[str, str]) -> Context: - header_value = headers.get(_HTTP_HEADER_BAGGAGE) + header_value = _extract_header_value(_POSSIBLE_HTTP_BAGGAGE_HEADER, headers) if not header_value: return Context(baggage={}) diff --git a/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml b/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml new file mode 100644 index 00000000000..ad0eacb28e8 --- /dev/null +++ b/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + tracer: This fix resolves an issue where baggage header extraction was case sensitive and didn't accept the header prepended with HTTP. + Now the baggage header will be extracted regardless of casing and the HTTP format. 
+ diff --git a/tests/tracer/test_propagation.py b/tests/tracer/test_propagation.py index 61fec650a70..0d4c5d7c01d 100644 --- a/tests/tracer/test_propagation.py +++ b/tests/tracer/test_propagation.py @@ -1888,6 +1888,14 @@ def test_extract_tracecontext(headers, expected_context): B3_SINGLE_HEADERS_VALID, CONTEXT_EMPTY, ), + ( + "baggage_case_insensitive", + None, + {"BAgGage": "key1=val1,key2=val2"}, + { + "baggage": {"key1": "val1", "key2": "val2"}, + }, + ), # All valid headers ( "valid_all_headers_default_style", @@ -2278,14 +2286,14 @@ def test_propagation_extract_w_config(name, styles, headers, expected_context, r overrides = {} if styles is not None: overrides["_propagation_style_extract"] = styles - with override_global_config(overrides): - context = HTTPPropagator.extract(headers) - if not expected_context.get("tracestate"): - assert context == Context(**expected_context) - else: - copied_expectation = expected_context.copy() - tracestate = copied_expectation.pop("tracestate") - assert context == Context(**copied_expectation, meta={"tracestate": tracestate}) + with override_global_config(overrides): + context = HTTPPropagator.extract(headers) + if not expected_context.get("tracestate"): + assert context == Context(**expected_context) + else: + copied_expectation = expected_context.copy() + tracestate = copied_expectation.pop("tracestate") + assert context == Context(**copied_expectation, meta={"tracestate": tracestate}) EXTRACT_OVERRIDE_FIXTURES = [ From 04ee68f4ba0261b145d8227d6a3fbe602ba3c9c4 Mon Sep 17 00:00:00 2001 From: Yun Kim <35776586+Yun-Kim@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:59:40 -0500 Subject: [PATCH 11/19] chore(llmobs): assert using span events instead of mocks (#11856) Follow up of #11781 to further clean up LLMObs tests, specifically replacing potentially flaky LLMObs span writer mocks and assertions with the test LLMObsSpanWriter dummy class. Also clean up the `tests/llmobs/conftest.py` file which previously contained a ton of rarely used and sometimes redundant fixtures. 
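The assertion pattern, roughly: a list-backed test writer captures enqueued span events so tests compare plain dicts instead of inspecting mock calls. The names below sketch the approach and are not the exact conftest implementation:

```python
import pytest


class RecordingLLMObsSpanWriter:
    # Test double: collects span events in memory instead of POSTing them,
    # so tests can assert `llmobs_events[0] == expected_event` directly.
    def __init__(self):
        self.events = []

    def enqueue(self, event):
        self.events.append(event)


@pytest.fixture
def llmobs_events():
    writer = RecordingLLMObsSpanWriter()
    # In the real conftest the writer is installed into the enabled LLMObs
    # service for the duration of the test; that wiring is elided here.
    yield writer.events
```

Comparing full event dicts yields readable pytest diffs on failure and avoids the ordering sensitivity of `assert_called_with`, which only checks the most recent call.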
## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- tests/llmobs/conftest.py | 50 +- tests/llmobs/test_llmobs_decorators.py | 492 ++++++++---------- tests/llmobs/test_llmobs_evaluator_runner.py | 12 +- ...est_llmobs_ragas_faithfulness_evaluator.py | 33 +- tests/llmobs/test_llmobs_service.py | 49 +- tests/llmobs/test_propagation.py | 35 +- 6 files changed, 277 insertions(+), 394 deletions(-) diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index 15cffe5faa9..5a63b7e2b8f 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -31,16 +31,6 @@ def pytest_configure(config): config.addinivalue_line("markers", "vcr_logs: mark test to use recorded request/responses") -@pytest.fixture -def mock_llmobs_span_writer(): - patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter") - LLMObsSpanWriterMock = patcher.start() - m = mock.MagicMock() - LLMObsSpanWriterMock.return_value = m - yield m - patcher.stop() - - @pytest.fixture def mock_llmobs_eval_metric_writer(): patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsEvalMetricWriter") @@ -127,44 +117,6 @@ def default_global_config(): return {"_dd_api_key": "", "_llmobs_ml_app": "unnamed-ml-app"} -@pytest.fixture -def LLMObs( - mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, ddtrace_global_config -): - global_config = default_global_config() - global_config.update(ddtrace_global_config) - with override_global_config(global_config): - dummy_tracer = DummyTracer() - llmobs_service.enable(_tracer=dummy_tracer) - yield llmobs_service - llmobs_service.disable() - - -@pytest.fixture -def AgentlessLLMObs( - mock_llmobs_span_writer, - mock_llmobs_eval_metric_writer, - mock_llmobs_evaluator_runner, - ddtrace_global_config, -): - global_config = default_global_config() - global_config.update(ddtrace_global_config) - global_config.update(dict(_llmobs_agentless_enabled=True)) - with override_global_config(global_config): - dummy_tracer = DummyTracer() - llmobs_service.enable(_tracer=dummy_tracer) - yield llmobs_service - llmobs_service.disable() - - -@pytest.fixture -def disabled_llmobs(): - prev = llmobs_service.enabled - llmobs_service.enabled = False - yield - 
llmobs_service.enabled = prev - - @pytest.fixture def mock_ragas_dependencies_not_present(): import ragas @@ -177,7 +129,7 @@ def mock_ragas_dependencies_not_present(): @pytest.fixture -def ragas(mock_llmobs_span_writer, mock_llmobs_eval_metric_writer): +def ragas(mock_llmobs_eval_metric_writer): with override_global_config(dict(_dd_api_key="")): try: import ragas diff --git a/tests/llmobs/test_llmobs_decorators.py b/tests/llmobs/test_llmobs_decorators.py index e94d72aec64..056de72ee96 100644 --- a/tests/llmobs/test_llmobs_decorators.py +++ b/tests/llmobs/test_llmobs_decorators.py @@ -19,7 +19,7 @@ def mock_logs(): yield mock_logs -def test_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): +def test_llm_decorator_with_llmobs_disabled_logs_warning(llmobs, mock_logs): for decorator_name, decorator in (("llm", llm), ("embedding", embedding)): @decorator( @@ -28,13 +28,13 @@ def test_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): def f(): pass - LLMObs.disable() + llmobs.disable() f() mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) mock_logs.reset_mock() -def test_non_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): +def test_non_llm_decorator_with_llmobs_disabled_logs_warning(llmobs, mock_logs): for decorator_name, decorator in ( ("task", task), ("workflow", workflow), @@ -47,53 +47,49 @@ def test_non_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): def f(): pass - LLMObs.disable() + llmobs.disable() f() mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) mock_logs.reset_mock() -def test_llm_decorator(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="test_model", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider", session_id="test_session_id" ) -def test_llm_decorator_no_model_name_sets_default(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_no_model_name_sets_default(llmobs, llmobs_events): @llm(model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="custom", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider", session_id="test_session_id" ) -def test_llm_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_default_kwargs(llmobs, llmobs_events): @llm def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="custom") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="custom" ) -def 
test_embedding_decorator(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator(llmobs, llmobs_events): @embedding( model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id" ) @@ -101,173 +97,157 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="test_model", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider", session_id="test_session_id" ) -def test_embedding_decorator_no_model_name_sets_default(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator_no_model_name_sets_default(llmobs, llmobs_events): @embedding(model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="custom", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider", session_id="test_session_id" ) -def test_embedding_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator_default_kwargs(llmobs, llmobs_events): @embedding def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="custom") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="custom" ) -def test_retrieval_decorator(LLMObs, mock_llmobs_span_writer): +def test_retrieval_decorator(llmobs, llmobs_events): @retrieval(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "retrieval", session_id="test_session_id") -def test_retrieval_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_retrieval_decorator_default_kwargs(llmobs, llmobs_events): @retrieval() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "retrieval")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "retrieval") -def test_task_decorator(LLMObs, mock_llmobs_span_writer): +def test_task_decorator(llmobs, llmobs_events): @task(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id="test_session_id") -def 
test_task_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_task_decorator_default_kwargs(llmobs, llmobs_events): @task() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_tool_decorator(LLMObs, mock_llmobs_span_writer): +def test_tool_decorator(llmobs, llmobs_events): @tool(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool", session_id="test_session_id") -def test_tool_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_tool_decorator_default_kwargs(llmobs, llmobs_events): @tool() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_workflow_decorator(LLMObs, mock_llmobs_span_writer): +def test_workflow_decorator(llmobs, llmobs_events): @workflow(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow", session_id="test_session_id") -def test_workflow_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_workflow_decorator_default_kwargs(llmobs, llmobs_events): @workflow() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_agent_decorator(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator(llmobs, llmobs_events): @agent(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent", session_id="test_session_id") -def test_agent_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator_default_kwargs(llmobs, llmobs_events): @agent() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_llm_decorator_with_error(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_with_error(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): 
raise ValueError("test_error") with pytest.raises(ValueError): f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - session_id="test_session_id", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + session_id="test_session_id", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_non_llm_decorators_with_error(LLMObs, mock_llmobs_span_writer): +def test_non_llm_decorators_with_error(llmobs, llmobs_events): for decorator_name, decorator in [("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)]: @decorator(name="test_function", session_id="test_session_id") @@ -276,23 +256,21 @@ def f(): with pytest.raises(ValueError): f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - session_id="test_session_id", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + session_id="test_session_id", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_llm_annotate(LLMObs, mock_llmobs_span_writer): +def test_llm_annotate(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): - LLMObs.annotate( + llmobs.annotate( parameters={"temperature": 0.9, "max_tokens": 50}, input_data=[{"content": "test_prompt"}], output_data=[{"content": "test_response"}], @@ -301,27 +279,25 @@ def f(): ) f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - input_messages=[{"content": "test_prompt"}], - output_messages=[{"content": "test_response"}], - parameters={"temperature": 0.9, "max_tokens": 50}, - token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, - tags={"custom_tag": "tag_value"}, - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + input_messages=[{"content": "test_prompt"}], + output_messages=[{"content": "test_response"}], + parameters={"temperature": 0.9, "max_tokens": 50}, + token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, + tags={"custom_tag": "tag_value"}, + session_id="test_session_id", ) -def test_llm_annotate_raw_string_io(LLMObs, mock_llmobs_span_writer): +def test_llm_annotate_raw_string_io(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): - LLMObs.annotate( + llmobs.annotate( parameters={"temperature": 0.9, "max_tokens": 50}, 
input_data="test_prompt", output_data="test_response", @@ -330,24 +306,22 @@ def f(): ) f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - input_messages=[{"content": "test_prompt"}], - output_messages=[{"content": "test_response"}], - parameters={"temperature": 0.9, "max_tokens": 50}, - token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, - tags={"custom_tag": "tag_value"}, - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + input_messages=[{"content": "test_prompt"}], + output_messages=[{"content": "test_response"}], + parameters={"temperature": 0.9, "max_tokens": 50}, + token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, + tags={"custom_tag": "tag_value"}, + session_id="test_session_id", ) -def test_non_llm_decorators_no_args(LLMObs, mock_llmobs_span_writer): +def test_non_llm_decorators_no_args(llmobs, llmobs_events): """Test that using the decorators without any arguments, i.e. @tool, works the same as @tool(...).""" for decorator_name, decorator in [ ("task", task), @@ -362,11 +336,11 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, decorator_name)) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event(span, decorator_name) -def test_agent_decorator_no_args(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator_no_args(llmobs, llmobs_events): """Test that using agent decorator without any arguments, i.e. 
@agent, works the same as @agent(...).""" @agent @@ -374,11 +348,11 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): +def test_ml_app_override(llmobs, llmobs_events): """Test that setting ml_app kwarg on the LLMObs decorators will override the DD_LLMOBS_ML_APP value.""" for decorator_name, decorator in [("task", task), ("workflow", workflow), ("tool", tool)]: @@ -387,9 +361,9 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, decorator_name, tags={"ml_app": "test_ml_app"}) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, decorator_name, tags={"ml_app": "test_ml_app"} ) @llm(model_name="test_model", ml_app="test_ml_app") @@ -397,11 +371,9 @@ def g(): pass g() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} ) @embedding(model_name="test_model", ml_app="test_ml_app") @@ -409,15 +381,13 @@ def h(): pass h() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} ) -async def test_non_llm_async_decorators(LLMObs, mock_llmobs_span_writer): +async def test_non_llm_async_decorators(llmobs, llmobs_events): """Test that decorators work with async functions.""" for decorator_name, decorator in [ ("task", task), @@ -432,11 +402,11 @@ async def f(): pass await f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, decorator_name)) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event(span, decorator_name) -async def test_llm_async_decorators(LLMObs, mock_llmobs_span_writer): +async def test_llm_async_decorators(llmobs, llmobs_events): """Test that decorators work with async functions.""" for decorator_name, decorator in [("llm", llm), ("embedding", embedding)]: @@ -445,15 +415,13 @@ async def f(): pass await f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, decorator_name, model_name="test_model", model_provider="test_provider" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, decorator_name, model_name="test_model", model_provider="test_provider" ) -def test_automatic_annotation_non_llm_decorators(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_non_llm_decorators(llmobs, 
llmobs_events): """Test that automatic input/output annotation works for non-LLM decorators.""" for decorator_name, decorator in (("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)): @@ -462,19 +430,17 @@ def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - input_value=str({"prompt": "test_prompt", "arg_2": "arg_2", "kwarg_2": 12345}), - output_value="test_prompt", - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + input_value=str({"prompt": "test_prompt", "arg_2": "arg_2", "kwarg_2": 12345}), + output_value="test_prompt", + session_id="test_session_id", ) -def test_automatic_annotation_retrieval_decorator(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_retrieval_decorator(llmobs, llmobs_events): """Test that automatic input annotation works for retrieval decorators.""" @retrieval(session_id="test_session_id") @@ -482,18 +448,16 @@ def test_retrieval(query, arg_2, kwarg_1=None, kwarg_2=None): return [{"name": "name", "id": "1234567890", "score": 0.9}] test_retrieval("test_query", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "retrieval", - input_value=str({"query": "test_query", "arg_2": "arg_2", "kwarg_2": 12345}), - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "retrieval", + input_value=str({"query": "test_query", "arg_2": "arg_2", "kwarg_2": 12345}), + session_id="test_session_id", ) -def test_automatic_annotation_off_non_llm_decorators(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_off_non_llm_decorators(llmobs, llmobs_events): """Test disabling automatic input/output annotation for non-LLM decorators.""" for decorator_name, decorator in ( ("task", task), @@ -508,35 +472,33 @@ def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, decorator_name, session_id="test_session_id") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, decorator_name, session_id="test_session_id" ) -def test_automatic_annotation_off_if_manually_annotated(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_off_if_manually_annotated(llmobs, llmobs_events): """Test disabling automatic input/output annotation for non-LLM decorators.""" for decorator_name, decorator in (("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)): @decorator(name="test_function", session_id="test_session_id") def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): - LLMObs.annotate(input_data="my custom input", output_data="my custom output") + llmobs.annotate(input_data="my custom input", output_data="my custom output") return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - 
session_id="test_session_id", - input_value="my custom input", - output_value="my custom output", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + session_id="test_session_id", + input_value="my custom input", + output_value="my custom output", ) -def test_generator_sync(LLMObs, mock_llmobs_span_writer): +def test_generator_sync(llmobs, llmobs_events): """ Test that decorators work with generator functions. The span should finish after the generator is exhausted. @@ -556,7 +518,7 @@ def f(): for i in range(3): yield i - LLMObs.annotate( + llmobs.annotate( input_data="hello", output_data="world", ) @@ -566,7 +528,7 @@ def f(): assert e == i i += 1 - span = LLMObs._instance.tracer.pop()[0] + span = llmobs._instance.tracer.pop()[0] if decorator_name == "llm": expected_span_event = _expected_llmobs_llm_span_event( span, @@ -594,10 +556,10 @@ def f(): span, decorator_name, input_value="hello", output_value="world" ) - mock_llmobs_span_writer.enqueue.assert_called_with(expected_span_event) + assert llmobs_events[-1] == expected_span_event -async def test_generator_async(LLMObs, mock_llmobs_span_writer): +async def test_generator_async(llmobs, llmobs_events): """ Test that decorators work with generator functions. The span should finish after the generator is exhausted. @@ -617,7 +579,7 @@ async def f(): for i in range(3): yield i - LLMObs.annotate( + llmobs.annotate( input_data="hello", output_data="world", ) @@ -627,7 +589,7 @@ async def f(): assert e == i i += 1 - span = LLMObs._instance.tracer.pop()[0] + span = llmobs._instance.tracer.pop()[0] if decorator_name == "llm": expected_span_event = _expected_llmobs_llm_span_event( span, @@ -655,11 +617,11 @@ async def f(): span, decorator_name, input_value="hello", output_value="world" ) - mock_llmobs_span_writer.enqueue.assert_called_with(expected_span_event) + assert llmobs_events[-1] == expected_span_event -def test_generator_sync_with_llmobs_disabled(LLMObs, mock_logs): - LLMObs.disable() +def test_generator_sync_with_llmobs_disabled(llmobs, mock_logs): + llmobs.disable() @workflow() def f(): @@ -684,10 +646,11 @@ def g(): i += 1 mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) + llmobs.enable() -async def test_generator_async_with_llmobs_disabled(LLMObs, mock_logs): - LLMObs.disable() +async def test_generator_async_with_llmobs_disabled(llmobs, mock_logs): + llmobs.disable() @workflow() async def f(): @@ -712,9 +675,10 @@ async def g(): i += 1 mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) + llmobs.enable() -def test_generator_sync_finishes_span_on_error(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_finishes_span_on_error(llmobs, llmobs_events): """Tests that""" @workflow() @@ -728,19 +692,17 @@ def f(): for _ in f(): pass - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -async def test_generator_async_finishes_span_on_error(LLMObs, mock_llmobs_span_writer): +async def 
test_generator_async_finishes_span_on_error(llmobs, llmobs_events): @workflow() async def f(): for i in range(3): @@ -752,19 +714,17 @@ async def f(): async for _ in f(): pass - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_generator_sync_send(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_send(llmobs, llmobs_events): @workflow() def f(): while True: @@ -780,16 +740,11 @@ def f(): assert gen.send(4) == 16 gen.close() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - ) - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -async def test_generator_async_send(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_send(llmobs, llmobs_events): @workflow() async def f(): while True: @@ -805,16 +760,11 @@ async def f(): await gen.aclose() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - ) - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_generator_sync_throw(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_throw(llmobs, llmobs_events): @workflow() def f(): for i in range(3): @@ -825,19 +775,17 @@ def f(): next(gen) gen.throw(ValueError("test_error")) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -async def test_generator_async_throw(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_throw(llmobs, llmobs_events): @workflow() async def f(): for i in range(3): @@ -848,19 +796,17 @@ async def f(): await gen.asend(None) await gen.athrow(ValueError("test_error")) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_generator_exit_exception_sync(LLMObs, mock_llmobs_span_writer): +def test_generator_exit_exception_sync(llmobs, llmobs_events): @workflow() def get_next_element(alist): for element in 
alist: @@ -873,14 +819,12 @@ def get_next_element(alist): if element == 5: break - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - input_value=str({"alist": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}), - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + input_value=str({"alist": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}), + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 7ee7d510276..128c4639946 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -34,9 +34,9 @@ def test_evaluator_runner_buffer_limit(mock_evaluator_logs): ) -def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): - evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) +def test_evaluator_runner_periodic_enqueues_eval_metric(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=llmobs)) evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) evaluator_runner.periodic() mock_llmobs_eval_metric_writer.enqueue.assert_called_once_with( @@ -45,9 +45,9 @@ def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval @pytest.mark.vcr_logs -def test_evaluator_runner_timed_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): - evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) +def test_evaluator_runner_timed_enqueues_eval_metric(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=llmobs)) evaluator_runner.start() evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py index ec8e181e527..39e315b37e4 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py @@ -18,27 +18,27 @@ def _llm_span_without_io(): return _expected_llmobs_llm_span_event(Span("dummy")) -def test_ragas_evaluator_init(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - assert rf_evaluator.llmobs_service == LLMObs +def test_ragas_evaluator_init(ragas, llmobs): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) + assert rf_evaluator.llmobs_service == llmobs assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() -def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_ragas_dependencies_not_present, ragas): +def test_ragas_faithfulness_throws_if_dependencies_not_present(llmobs, mock_ragas_dependencies_not_present, ragas): with 
pytest.raises(NotImplementedError, match="Failed to load dependencies for `ragas_faithfulness` evaluator"): - RagasFaithfulnessEvaluator(LLMObs) + RagasFaithfulnessEvaluator(llmobs) -def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) +def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, llmobs): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) assert failure_msg == "fail_extract_faithfulness_inputs" assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 def test_ragas_faithfulness_has_modified_faithfulness_instance( - ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs + ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, llmobs ): """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" from ragas.llms import BaseRagasLLM @@ -56,7 +56,7 @@ def agenerate_text(self) -> str: faithfulness.llm = FirstDummyLLM() - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" @@ -77,9 +77,9 @@ def agenerate_text(self, statements) -> str: @pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit_evaluation): +def test_ragas_faithfulness_submits_evaluation(ragas, llmobs, mock_llmobs_submit_evaluation): """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() rf_evaluator.run_and_submit_evaluation(llm_span) rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( @@ -104,10 +104,10 @@ def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit @pytest.mark.vcr_logs def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( - ragas, LLMObs, mock_llmobs_submit_evaluation + ragas, llmobs, mock_llmobs_submit_evaluation ): """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _llm_span_with_expected_ragas_inputs_in_messages() rf_evaluator.run_and_submit_evaluation(llm_span) rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( @@ -131,9 +131,9 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages @pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, LLMObs, mock_llmobs_submit_evaluation): +def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, llmobs, mock_llmobs_submit_evaluation): """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _expected_llmobs_llm_span_event( Span("dummy"), prompt={ @@ -178,7 +178,6 @@ def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): assert len(ragas_spans) == 7 # check name, io, span kinds match assert ragas_spans == _expected_ragas_spans() - # verify the trace 
structure root_span = ragas_spans[0] root_span_id = root_span["span_id"] @@ -241,7 +240,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log ): LLMObs.enable() LLMObs._instance._evaluator_runner.enqueue(_llm_span_with_expected_ragas_inputs_in_messages(), None) -""", + """, env=env, ) assert status == 0, err diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 2ba5754019f..69ebb216d7e 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -29,6 +29,8 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._llmobs import SUPPORTED_LLMOBS_INTEGRATIONS +from ddtrace.llmobs._writer import LLMObsAgentlessEventClient +from ddtrace.llmobs._writer import LLMObsProxiedEventClient from ddtrace.llmobs.utils import Prompt from tests.llmobs._utils import _expected_llmobs_eval_metric_event from tests.llmobs._utils import _expected_llmobs_llm_span_event @@ -47,7 +49,7 @@ def run_llmobs_trace_filter(dummy_tracer): return dummy_tracer._writer.pop() -def test_service_enable(): +def test_service_enable_proxy_default(): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -55,6 +57,21 @@ def test_service_enable(): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer + assert isinstance(llmobs_instance._llmobs_span_writer._clients[0], LLMObsProxiedEventClient) + assert run_llmobs_trace_filter(dummy_tracer) is not None + + llmobs_service.disable() + + +def test_enable_agentless(): + with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): + dummy_tracer = DummyTracer() + llmobs_service.enable(_tracer=dummy_tracer, agentless_enabled=True) + llmobs_instance = llmobs_service._instance + assert llmobs_instance is not None + assert llmobs_service.enabled + assert llmobs_instance.tracer == dummy_tracer + assert isinstance(llmobs_instance._llmobs_span_writer._clients[0], LLMObsAgentlessEventClient) assert run_llmobs_trace_filter(dummy_tracer) is not None llmobs_service.disable() @@ -1188,42 +1205,14 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) -def test_flush_calls_periodic_agentless( - AgentlessLLMObs, mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner -): - AgentlessLLMObs.flush() - mock_llmobs_span_writer.periodic.assert_called_once() - mock_llmobs_eval_metric_writer.periodic.assert_called_once() - mock_llmobs_evaluator_runner.periodic.assert_called_once() - - def test_flush_does_not_call_periodic_when_llmobs_is_disabled( llmobs, - mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, mock_llmobs_logs, - disabled_llmobs, ): + llmobs.enabled = False llmobs.flush() - mock_llmobs_span_writer.periodic.assert_not_called() - mock_llmobs_eval_metric_writer.periodic.assert_not_called() - mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_llmobs_logs.warning.assert_has_calls( - [mock.call("flushing when LLMObs is disabled. 
No spans or evaluation metrics will be sent.")] - ) - - -def test_flush_does_not_call_periodic_when_llmobs_is_disabled_agentless( - AgentlessLLMObs, - mock_llmobs_span_writer, - mock_llmobs_eval_metric_writer, - mock_llmobs_evaluator_runner, - mock_llmobs_logs, - disabled_llmobs, -): - AgentlessLLMObs.flush() - mock_llmobs_span_writer.periodic.assert_not_called() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() mock_llmobs_logs.warning.assert_has_calls( diff --git a/tests/llmobs/test_propagation.py b/tests/llmobs/test_propagation.py index d14b22d65d5..e3ab9c80d66 100644 --- a/tests/llmobs/test_propagation.py +++ b/tests/llmobs/test_propagation.py @@ -157,39 +157,39 @@ def test_no_llmobs_parent_id_propagated_if_no_llmobs_spans(run_python_code_in_su assert _get_llmobs_parent_id(span) == "undefined" -def test_inject_distributed_headers_simple(LLMObs): +def test_inject_distributed_headers_simple(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM) as root_span: - request_headers = LLMObs.inject_distributed_headers({}, span=root_span) + request_headers = llmobs.inject_distributed_headers({}, span=root_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_nested_llmobs_non_llmobs(LLMObs): +def test_inject_distributed_headers_nested_llmobs_non_llmobs(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM): with dummy_tracer.trace("Non-LLMObs span") as child_span: - request_headers = LLMObs.inject_distributed_headers({}, span=child_span) + request_headers = llmobs.inject_distributed_headers({}, span=child_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_non_llmobs_root_span(LLMObs): +def test_inject_distributed_headers_non_llmobs_root_span(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("Non-LLMObs span"): with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM) as child_span: - request_headers = LLMObs.inject_distributed_headers({}, span=child_span) + request_headers = llmobs.inject_distributed_headers({}, span=child_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_nested_llmobs_spans(LLMObs): +def test_inject_distributed_headers_nested_llmobs_spans(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM): with dummy_tracer.trace("LLMObs child span", span_type=SpanTypes.LLM): with dummy_tracer.trace("Last LLMObs child span", span_type=SpanTypes.LLM) as last_llmobs_span: - request_headers = LLMObs.inject_distributed_headers({}, span=last_llmobs_span) + request_headers = llmobs.inject_distributed_headers({}, span=last_llmobs_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID is propagated in the headers in a simple distributed scenario. Service A (subprocess) has a root LLMObs span and a non-LLMObs child span. Service B (outside subprocess) has a LLMObs span. 
@@ -218,13 +218,13 @@ def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple( assert status == 0, (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) - with LLMObs.workflow("LLMObs span") as span: + llmobs.activate_distributed_headers(headers) + with llmobs.workflow("LLMObs span") as span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == headers["_DD_LLMOBS_SPAN_ID"] -def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID is propagated in the headers in a more complex trace. Service A (subprocess) has a root LLMObs span and a non-LLMObs child span. Service B (outside subprocess) has a non-LLMObs local root span and a LLMObs child span. @@ -253,16 +253,16 @@ def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_pyt assert status == 0, (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) + llmobs.activate_distributed_headers(headers) dummy_tracer = DummyTracer() with dummy_tracer.trace("Non-LLMObs span") as span: - with LLMObs.llm(model_name="llm_model", name="LLMObs span") as llm_span: + with llmobs.llm(model_name="llm_model", name="LLMObs span") as llm_span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == headers["_DD_LLMOBS_SPAN_ID"] assert _get_llmobs_parent_id(llm_span) == headers["_DD_LLMOBS_SPAN_ID"] -def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID (None) is extracted from the headers in a simple distributed scenario. Service A (subprocess) has spans, but none are LLMObs spans. Service B (outside subprocess) has a LLMObs span. 
@@ -287,10 +287,9 @@ def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_ env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) - with LLMObs.task("LLMObs span") as span: + llmobs.activate_distributed_headers(headers) + with llmobs.task("LLMObs span") as span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == "undefined" From 629139727e33822e6305835dcb3a57f0c4008a8b Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 9 Jan 2025 18:17:21 -0500 Subject: [PATCH 12/19] fix(profiling): access lock acquired time only when the lock is held (#11881) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/profiling/collector/_lock.py | 114 +++++++++--------- ...ing-lock-acquired-at-e308547cffdca9f7.yaml | 6 + 2 files changed, 60 insertions(+), 60 deletions(-) create mode 100644 releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml diff --git a/ddtrace/profiling/collector/_lock.py b/ddtrace/profiling/collector/_lock.py index 6dedf3295f7..ec62c5c0eee 100644 --- a/ddtrace/profiling/collector/_lock.py +++ b/ddtrace/profiling/collector/_lock.py @@ -179,69 +179,63 @@ def acquire(self, *args, **kwargs): def _release(self, inner_func, *args, **kwargs): # type (typing.Any, typing.Any) -> None + + start = None + if hasattr(self, "_self_acquired_at"): + # _self_acquired_at is only set when the acquire was captured + # if it's not set, we're not capturing the release + start = self._self_acquired_at + try: return inner_func(*args, **kwargs) finally: - try: - if hasattr(self, "_self_acquired_at"): - try: - end = time.monotonic_ns() - thread_id, thread_name = _current_thread() - task_id, task_name, task_frame = _task.get_task(thread_id) - lock_name = ( - "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc - ) - - if task_frame is None: - # See the comments in _acquire - frame = sys._getframe(2) - else: - frame = task_frame - - frames, 
nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes) - - if self._self_export_libdd_enabled: - thread_native_id = _threading.get_thread_native_id(thread_id) - - handle = ddup.SampleHandle() - handle.push_monotonic_ns(end) - handle.push_lock_name(lock_name) - handle.push_release( - end - self._self_acquired_at, 1 - ) # AFAICT, capture_pct does not adjust anything here - handle.push_threadinfo(thread_id, thread_native_id, thread_name) - handle.push_task_id(task_id) - handle.push_task_name(task_name) - - if self._self_tracer is not None: - handle.push_span(self._self_tracer.current_span()) - for frame in frames: - handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno) - handle.flush_sample() - else: - event = self.RELEASE_EVENT_CLASS( - lock_name=lock_name, - frames=frames, - nframes=nframes, - thread_id=thread_id, - thread_name=thread_name, - task_id=task_id, - task_name=task_name, - locked_for_ns=end - self._self_acquired_at, - sampling_pct=self._self_capture_sampler.capture_pct, - ) - - if self._self_tracer is not None: - event.set_trace_info( - self._self_tracer.current_span(), self._self_endpoint_collection_enabled - ) - - self._self_recorder.push_event(event) - finally: - del self._self_acquired_at - except Exception as e: - LOG.warning("Error recording lock release event: %s", e) - pass # nosec + if start is not None: + end = time.monotonic_ns() + thread_id, thread_name = _current_thread() + task_id, task_name, task_frame = _task.get_task(thread_id) + lock_name = "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc + + if task_frame is None: + # See the comments in _acquire + frame = sys._getframe(2) + else: + frame = task_frame + + frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes) + + if self._self_export_libdd_enabled: + thread_native_id = _threading.get_thread_native_id(thread_id) + + handle = ddup.SampleHandle() + handle.push_monotonic_ns(end) + handle.push_lock_name(lock_name) + handle.push_release(end - start, 1) # AFAICT, capture_pct does not adjust anything here + handle.push_threadinfo(thread_id, thread_native_id, thread_name) + handle.push_task_id(task_id) + handle.push_task_name(task_name) + + if self._self_tracer is not None: + handle.push_span(self._self_tracer.current_span()) + for frame in frames: + handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno) + handle.flush_sample() + else: + event = self.RELEASE_EVENT_CLASS( + lock_name=lock_name, + frames=frames, + nframes=nframes, + thread_id=thread_id, + thread_name=thread_name, + task_id=task_id, + task_name=task_name, + locked_for_ns=end - start, + sampling_pct=self._self_capture_sampler.capture_pct, + ) + + if self._self_tracer is not None: + event.set_trace_info(self._self_tracer.current_span(), self._self_endpoint_collection_enabled) + + self._self_recorder.push_event(event) def release(self, *args, **kwargs): return self._release(self.__wrapped__.release, *args, **kwargs) diff --git a/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml new file mode 100644 index 00000000000..de86c8227b6 --- /dev/null +++ b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + profiling: This fix resolves a data race issue accessing lock's acquired + time, leading to an ``AttributeError``: ``_Profiled_ThreadingLock`` object + has no attribute ``self_acquired_at`` From 
afe439e311ee1b3dd23684a5df72020a0e95e77e Mon Sep 17 00:00:00 2001
From: Juanjo Alvarez Martinez
Date: Fri, 10 Jan 2025 10:14:31 +0100
Subject: [PATCH 13/19] chore(iast): optimize _should_iast_patch (#11885)

## Description

Optimize the `_should_iast_patch` function by:

- Using a trie for the prefix matching of module names against the DENY / ALLOW lists.
- Moving a call to `lower()` that was being done on each call to `_in_python_stdlib()` to the `_NOT_PATCH_MODULE_NAMES` static set creation.

These two changes (especially the second) make the function 14x faster.

Signed-off-by: Juanjo Alvarez

## Checklist
- [X] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Signed-off-by: Juanjo Alvarez
---
 ddtrace/appsec/_iast/_ast/ast_patching.py    | 52 ++++++++++-
 .../appsec/_python_info/stdlib/__init__.py   |  2 +-
 tests/appsec/iast/_ast/test_ast_patching.py  | 86 +++++++++++++++++++
 3 files changed, 135 insertions(+), 5 deletions(-)

diff --git a/ddtrace/appsec/_iast/_ast/ast_patching.py b/ddtrace/appsec/_iast/_ast/ast_patching.py
index 7e2258bd556..2c7e958d087 100644
--- a/ddtrace/appsec/_iast/_ast/ast_patching.py
+++ b/ddtrace/appsec/_iast/_ast/ast_patching.py
@@ -7,6 +7,7 @@
 from sys import version_info
 import textwrap
 from types import ModuleType
+from typing import Iterable
 from typing import Optional
 from typing import Text
 from typing import Tuple
@@ -327,6 +328,49 @@
 log = get_logger(__name__)
 
 
+class _TrieNode:
+    __slots__ = ("children", "is_end")
+
+    def __init__(self):
+        self.children = {}
+        self.is_end = False
+
+    def __iter__(self):
+        if self.is_end:
+            yield ("", None)
+        else:
+            for k, v in self.children.items():
+                yield (k, dict(v))
+
+
+def build_trie(words: Iterable[str]) -> _TrieNode:
+    root = _TrieNode()
+    for word in words:
+        node = root
+        for char in word:
+            if char not in node.children:
+                node.children[char] = _TrieNode()
+            node = node.children[char]
+        node.is_end = True
+    return root
+
+
+_TRIE_ALLOWLIST = build_trie(IAST_ALLOWLIST)
+_TRIE_DENYLIST = build_trie(IAST_DENYLIST)
+
+
+def _trie_has_prefix_for(trie: _TrieNode, string: str) -> bool:
+    node = trie
+    for char in string:
+        node = node.children.get(char)
+        if not node:
+            return False
+
+        if node.is_end:
return True + return node.is_end + + def get_encoding(module_path: Text) -> Text: """ First tries to detect the encoding for the file, @@ -341,11 +385,11 @@ def get_encoding(module_path: Text) -> Text: return ENCODING -_NOT_PATCH_MODULE_NAMES = _stdlib_for_python_version() | set(builtin_module_names) +_NOT_PATCH_MODULE_NAMES = {i.lower() for i in _stdlib_for_python_version() | set(builtin_module_names)} def _in_python_stdlib(module_name: str) -> bool: - return module_name.split(".")[0].lower() in [x.lower() for x in _NOT_PATCH_MODULE_NAMES] + return module_name.split(".")[0].lower() in _NOT_PATCH_MODULE_NAMES def _should_iast_patch(module_name: Text) -> bool: @@ -359,10 +403,10 @@ def _should_iast_patch(module_name: Text) -> bool: # diff = max_allow - max_deny # return diff > 0 or (diff == 0 and not _in_python_stdlib_or_third_party(module_name)) dotted_module_name = module_name.lower() + "." - if dotted_module_name.startswith(IAST_ALLOWLIST): + if _trie_has_prefix_for(_TRIE_ALLOWLIST, dotted_module_name): log.debug("IAST: allowing %s. it's in the IAST_ALLOWLIST", module_name) return True - if dotted_module_name.startswith(IAST_DENYLIST): + if _trie_has_prefix_for(_TRIE_DENYLIST, dotted_module_name): log.debug("IAST: denying %s. it's in the IAST_DENYLIST", module_name) return False if _in_python_stdlib(module_name): diff --git a/ddtrace/appsec/_python_info/stdlib/__init__.py b/ddtrace/appsec/_python_info/stdlib/__init__.py index a040e57f859..e745c392f55 100644 --- a/ddtrace/appsec/_python_info/stdlib/__init__.py +++ b/ddtrace/appsec/_python_info/stdlib/__init__.py @@ -19,5 +19,5 @@ from .module_names_py312 import STDLIB_MODULE_NAMES -def _stdlib_for_python_version(): # type: () -> set +def _stdlib_for_python_version(): # type: () -> set[str] return STDLIB_MODULE_NAMES diff --git a/tests/appsec/iast/_ast/test_ast_patching.py b/tests/appsec/iast/_ast/test_ast_patching.py index cf0fabd14e4..d014496942b 100644 --- a/tests/appsec/iast/_ast/test_ast_patching.py +++ b/tests/appsec/iast/_ast/test_ast_patching.py @@ -9,7 +9,9 @@ from ddtrace.appsec._constants import IAST from ddtrace.appsec._iast._ast.ast_patching import _in_python_stdlib from ddtrace.appsec._iast._ast.ast_patching import _should_iast_patch +from ddtrace.appsec._iast._ast.ast_patching import _trie_has_prefix_for from ddtrace.appsec._iast._ast.ast_patching import astpatch_module +from ddtrace.appsec._iast._ast.ast_patching import build_trie from ddtrace.appsec._iast._ast.ast_patching import visit_ast from ddtrace.internal.utils.formats import asbool from tests.utils import override_env @@ -308,3 +310,87 @@ def test_astpatch_dir_patched_with_or_without_custom_dir(module_name, expected_n # Check that all the symbols in the expected set are in the patched dir() result for name in expected_names: assert name in patched_dir + + +def test_build_trie(): + from ddtrace.appsec._iast._ast.ast_patching import build_trie + + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert dict(trie) == { + "a": { + "b": { + "c": {"": None}, + }, + }, + "d": { + "e": { + "f": {"": None}, + }, + }, + "g": { + "h": { + "i": {"": None}, + }, + }, + "j": { + "k": { + "l": {"": None}, + }, + }, + "m": { + "n": { + "o": {"": None}, + }, + }, + "p": { + "q": { + "r": {"": None}, + }, + }, + "s": { + "t": { + "u": {"": None}, + }, + }, + "v": { + "w": { + "x": {"": None}, + }, + }, + "y": { + "z": {"": None}, + }, + } + + +def test_trie_has_string_match(): + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", 
"vwx", "yz"]) + assert _trie_has_prefix_for(trie, "abc") + assert not _trie_has_prefix_for(trie, "ab") + assert _trie_has_prefix_for(trie, "abcd") + assert _trie_has_prefix_for(trie, "def") + assert not _trie_has_prefix_for(trie, "de") + assert _trie_has_prefix_for(trie, "defg") + assert _trie_has_prefix_for(trie, "ghi") + assert not _trie_has_prefix_for(trie, "gh") + assert _trie_has_prefix_for(trie, "ghij") + assert _trie_has_prefix_for(trie, "jkl") + assert not _trie_has_prefix_for(trie, "jk") + assert _trie_has_prefix_for(trie, "jklm") + assert _trie_has_prefix_for(trie, "mno") + assert not _trie_has_prefix_for(trie, "mn") + assert _trie_has_prefix_for(trie, "mnop") + assert _trie_has_prefix_for(trie, "pqr") + assert not _trie_has_prefix_for(trie, "pq") + assert _trie_has_prefix_for(trie, "pqrs") + assert _trie_has_prefix_for(trie, "stu") + assert not _trie_has_prefix_for(trie, "st") + assert _trie_has_prefix_for(trie, "stuv") + assert _trie_has_prefix_for(trie, "vwx") + assert not _trie_has_prefix_for(trie, "vw") + assert _trie_has_prefix_for(trie, "vwxy") + assert _trie_has_prefix_for(trie, "yz") + assert not _trie_has_prefix_for(trie, "y") + assert _trie_has_prefix_for(trie, "yza") + assert not _trie_has_prefix_for(trie, "z") + assert not _trie_has_prefix_for(trie, "zzz") From 1020545f6db0f4a89a378acd33e90cbcbae7fc1f Mon Sep 17 00:00:00 2001 From: Christophe Papazian <114495376+christophe-papazian@users.noreply.github.com> Date: Fri, 10 Jan 2025 14:30:41 +0100 Subject: [PATCH 14/19] chore(asm): update python version for threats tests (#11893) Update the python version for threats tests, to include python 3.13 when available. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- hatch.toml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hatch.toml b/hatch.toml index 0baca1fd235..1f073332547 100644 --- a/hatch.toml +++ b/hatch.toml @@ -214,11 +214,11 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] django = ["~=2.2"] 
[[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9", "3.10"] +python = ["3.8", "3.9", "3.10"] django = ["~=3.2"] [[envs.appsec_threats_django.matrix]] @@ -226,11 +226,11 @@ python = ["3.8", "3.10"] django = ["==4.0.10"] [[envs.appsec_threats_django.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.11", "3.13"] django = ["~=4.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.10", "3.12"] +python = ["3.10", "3.13"] django = ["~=5.1"] @@ -262,21 +262,21 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_flask.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] flask = ["~=1.1"] markupsafe = ["~=1.1"] [[envs.appsec_threats_flask.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] flask = ["==2.1.3"] werkzeug = ["<3.0"] [[envs.appsec_threats_flask.matrix]] -python = ["3.8", "3.9", "3.12"] +python = ["3.8", "3.10", "3.13"] flask = ["~=2.3"] [[envs.appsec_threats_flask.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.11", "3.13"] flask = ["~=3.0"] ## ASM Native IAST module @@ -327,16 +327,16 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_fastapi.matrix]] -python = ["3.7", "3.9", "3.11"] +python = ["3.8", "3.10", "3.13"] fastapi = ["==0.86.0"] anyio = ["==3.7.1"] [[envs.appsec_threats_fastapi.matrix]] -python = ["3.7", "3.9", "3.12"] +python = ["3.8", "3.10", "3.13"] fastapi = ["==0.94.1"] [[envs.appsec_threats_fastapi.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.10", "3.13"] fastapi = ["~=0.114.2"] From 5e688237d4baaee86a74461e0dd0162a9634907d Mon Sep 17 00:00:00 2001 From: lievan <42917263+lievan@users.noreply.github.com> Date: Fri, 10 Jan 2025 09:56:49 -0500 Subject: [PATCH 15/19] chore(llmobs): refactor out ragas base evaluator (#11846) Creates a `RagasBaseEvaluator` class that ragas evaluators can share. 
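In rough outline, the shared contract looks like the minimal sketch below (illustrative only; the real class in this diff also loads the ragas dependencies and emits telemetry on init and on each run):

```python
from typing import Optional, Tuple, Union


class BaseRagasEvaluator:
    """Sketch of the shared base: subclasses only implement `evaluate`."""

    LABEL = "ragas"
    METRIC_TYPE = "score"

    def __init__(self, llmobs_service):
        self.llmobs_service = llmobs_service

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        # Concrete evaluators (e.g. faithfulness) return a score or a failure reason.
        raise NotImplementedError

    def run_and_submit_evaluation(self, span_event: dict) -> None:
        # Shared logic: score the span event, then submit the result as an
        # evaluation metric joined on the span/trace IDs.
        score_or_failure, metadata = self.evaluate(span_event)
        if isinstance(score_or_failure, float):
            self.llmobs_service.submit_evaluation(
                span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")},
                label=self.LABEL,
                metric_type=self.METRIC_TYPE,
                value=score_or_failure,
                metadata=metadata,
            )
```

With this split, the faithfulness evaluator only supplies `evaluate` (plus its prompt and parsing plumbing).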
This is a split-out version of [this PR](https://github.com/DataDog/dd-trace-py/pull/11716) that has incorporated changes based on the first round of comments from @Yun-Kim and @Kyle-Verhoog.

This class contains shared logic for:
- checking that the ragas dependencies are present, and throwing otherwise
- a `run_and_submit_evaluation` function, which generally takes a span event, calls evaluate to generate a score, and then submits an evaluation
- extracting out the question, contexts, and answer from an LLM span

### Misc changes
- rename tests to specify they are faithfulness-specific tests
- throw when we parse an unsupported evaluator instead of logging a warning

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan
---
 ddtrace/llmobs/_evaluators/ragas/base.py      | 213 ++++++++++++++++
 .../llmobs/_evaluators/ragas/faithfulness.py  | 228 +++---------
 ddtrace/llmobs/_evaluators/runner.py          |   4 +-
 tests/llmobs/_utils.py                        |  43 +++-
 ...emits_traces_and_evaluations_on_exit.yaml} |   0
 ...test_ragas_faithfulness_emits_traces.yaml} |   0
 ...agas_faithfulness_submits_evaluation.yaml} |   0
 ..._evaluation_on_span_with_custom_keys.yaml} |   0
 ...on_on_span_with_question_in_messages.yaml} |   0
 tests/llmobs/test_llmobs_evaluator_runner.py  |   8 +-
 ...tor.py => test_llmobs_ragas_evaluators.py} |   9 +-
 11 files changed, 297 insertions(+), 208 deletions(-)
 create mode 100644 ddtrace/llmobs/_evaluators/ragas/base.py
 rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml => tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml} (100%)
 rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml} (100%)
 rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml} (100%)
 rename
tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml} (100%) rename tests/llmobs/{test_llmobs_ragas_faithfulness_evaluator.py => test_llmobs_ragas_evaluators.py} (97%) diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py new file mode 100644 index 00000000000..23aa4cd3caa --- /dev/null +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -0,0 +1,213 @@ +import traceback +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +from ddtrace.internal.logger import get_logger +from ddtrace.internal.telemetry import telemetry_writer +from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT +from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL +from ddtrace.internal.utils.version import parse_version +from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS +from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS +from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX + + +logger = get_logger(__name__) + + +class RagasDependencies: + """ + A helper class to store instances of ragas classes and functions + that may or may not exist in a user's environment. + """ + + def __init__(self): + import ragas + + self.ragas_version = parse_version(ragas.__version__) + if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): + raise NotImplementedError( + "Ragas version: {} is not supported".format(self.ragas_version), + ) + + from ragas.llms import llm_factory + + self.llm_factory = llm_factory + + from ragas.llms.output_parser import RagasoutputParser + + self.RagasoutputParser = RagasoutputParser + + from ragas.metrics import context_precision + + self.context_precision = context_precision + + from ragas.metrics.base import ensembler + + self.ensembler = ensembler + + from ragas.metrics import faithfulness + + self.faithfulness = faithfulness + + from ragas.metrics.base import get_segmenter + + self.get_segmenter = get_segmenter + + from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers + + self.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers + + from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers + + self.StatementsAnswers = StatementsAnswers + + +def _get_ml_app_for_ragas_trace(span_event: dict) -> str: + """ + The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-` + or `dd-ragas` if `ml_app` is not present in the span event. + """ + tags: List[str] = span_event.get("tags", []) + ml_app = None + for tag in tags: + if isinstance(tag, str) and tag.startswith("ml_app:"): + ml_app = tag.split(":")[1] + break + if not ml_app: + return RAGAS_ML_APP_PREFIX + return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) + + +class BaseRagasEvaluator: + """A class used by EvaluatorRunner to conduct ragas evaluations + on LLM Observability span events. The job of an Evaluator is to take a span and + submit evaluation metrics based on the span's attributes. 
+ + Extenders of this class should only need to implement the `evaluate` method. + """ + + LABEL = "ragas" + METRIC_TYPE = "score" + + def __init__(self, llmobs_service): + """ + Initialize an evaluator that uses the ragas library to generate a score on finished LLM spans. + + :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and + submitting evaluation metrics. + + Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. + """ + self.llmobs_service = llmobs_service + self.ragas_version = "unknown" + telemetry_state = "ok" + try: + self.ragas_dependencies = RagasDependencies() + self.ragas_version = self.ragas_dependencies.ragas_version + except ImportError as e: + telemetry_state = "fail_import_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except AttributeError as e: + telemetry_state = "fail_attribute_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except NotImplementedError as e: + telemetry_state = "fail_not_supported" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except Exception as e: + telemetry_state = "fail_unknown" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + finally: + telemetry_writer.add_count_metric( + namespace=TELEMETRY_APM_PRODUCT.LLMOBS, + name="evaluators.init", + value=1, + tags=( + ("evaluator_label", self.LABEL), + ("state", telemetry_state), + ("evaluator_version", self.ragas_version), + ), + ) + if telemetry_state != "ok": + telemetry_writer.add_log( + level=TELEMETRY_LOG_LEVEL.ERROR, + message="Failed to import Ragas dependencies", + stack_trace=traceback.format_exc(), + tags={"evaluator_version": self.ragas_version}, + ) + + def run_and_submit_evaluation(self, span_event: dict): + if not span_event: + return + score_result_or_failure, metric_metadata = self.evaluate(span_event) + telemetry_writer.add_count_metric( + TELEMETRY_APM_PRODUCT.LLMOBS, + "evaluators.run", + 1, + tags=( + ("evaluator_label", self.LABEL), + ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), + ("evaluator_version", self.ragas_version), + ), + ) + if isinstance(score_result_or_failure, float): + self.llmobs_service.submit_evaluation( + span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, + label=self.LABEL, + metric_type=self.METRIC_TYPE, + value=score_result_or_failure, + metadata=metric_metadata, + ) + + def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: + raise NotImplementedError("evaluate method must be implemented by individual evaluators") + + def _extract_evaluation_inputs_from_span(self, span_event: dict) -> Optional[dict]: + """ + Extracts the question, answer, and context used as inputs for a ragas evaluation on a span event. 
+ """ + with self.llmobs_service.workflow("dd-ragas.extract_evaluation_inputs_from_span") as extract_inputs_workflow: + self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) + question, answer, contexts = None, None, None + + meta_io = span_event.get("meta") + if meta_io is None: + return None + + meta_input = meta_io.get("input") + meta_output = meta_io.get("output") + + if not (meta_input and meta_output): + return None + + prompt = meta_input.get("prompt") + if prompt is None: + logger.debug("Failed to extract `prompt` from span for ragas evaluation") + return None + prompt_variables = prompt.get("variables") + + input_messages = meta_input.get("messages") + + messages = meta_output.get("messages") + if messages is not None and len(messages) > 0: + answer = messages[-1].get("content") + + if prompt_variables: + context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) + question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) + contexts = [prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)] + question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) + + if not question and input_messages is not None and len(input_messages) > 0: + question = input_messages[-1].get("content") + + self.llmobs_service.annotate( + span=extract_inputs_workflow, output_data={"question": question, "contexts": contexts, "answer": answer} + ) + if any(field is None for field in (question, contexts, answer)): + logger.debug("Failed to extract inputs required for ragas evaluation") + return None + + return {"question": question, "contexts": contexts, "answer": answer} diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py index d651c2443a4..98725b1f27e 100644 --- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py +++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py @@ -1,73 +1,22 @@ import json import math -import traceback from typing import List from typing import Optional from typing import Tuple from typing import Union from ddtrace.internal.logger import get_logger -from ddtrace.internal.telemetry import telemetry_writer -from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT -from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL -from ddtrace.internal.utils.version import parse_version from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA -from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS -from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX +from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator +from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace logger = get_logger(__name__) -class MiniRagas: - """ - A helper class to store instances of ragas classes and functions - that may or may not exist in a user's environment. - """ - - llm_factory = None - RagasoutputParser = None - faithfulness = None - ensembler = None - get_segmenter = None - StatementFaithfulnessAnswers = None - StatementsAnswers = None - - -def _get_ml_app_for_ragas_trace(span_event: dict) -> str: - """ - The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-` - or `dd-ragas` if `ml_app` is not present in the span event. 
- """ - tags = span_event.get("tags", []) # list[str] - ml_app = None - for tag in tags: - if isinstance(tag, str) and tag.startswith("ml_app:"): - ml_app = tag.split(":")[1] - break - if not ml_app: - return RAGAS_ML_APP_PREFIX - return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) - - -def _get_faithfulness_instance() -> Optional[object]: - """ - This helper function ensures the faithfulness instance used in - ragas evaluator is updated with the latest ragas faithfulness - instance AND has an non-null llm - """ - if MiniRagas.faithfulness is None: - return None - ragas_faithfulness_instance = MiniRagas.faithfulness - if not ragas_faithfulness_instance.llm: - ragas_faithfulness_instance.llm = MiniRagas.llm_factory() - return ragas_faithfulness_instance - - -class RagasFaithfulnessEvaluator: +class RagasFaithfulnessEvaluator(BaseRagasEvaluator): """A class used by EvaluatorRunner to conduct ragas faithfulness evaluations on LLM Observability span events. The job of an Evaluator is to take a span and submit evaluation metrics based on the span's attributes. @@ -95,98 +44,30 @@ def __init__(self, llmobs_service): Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. """ - self.llmobs_service = llmobs_service - self.ragas_version = "unknown" - telemetry_state = "ok" - try: - import ragas - - self.ragas_version = parse_version(ragas.__version__) - if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): - raise NotImplementedError( - "Ragas version: {} is not supported for `ragas_faithfulness` evaluator".format(self.ragas_version), - ) - - from ragas.llms import llm_factory - - MiniRagas.llm_factory = llm_factory - - from ragas.llms.output_parser import RagasoutputParser - - MiniRagas.RagasoutputParser = RagasoutputParser - - from ragas.metrics import faithfulness - - MiniRagas.faithfulness = faithfulness - - from ragas.metrics.base import ensembler - - MiniRagas.ensembler = ensembler - - from ragas.metrics.base import get_segmenter - - MiniRagas.get_segmenter = get_segmenter - - from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers - - MiniRagas.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers - - from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers - - MiniRagas.StatementsAnswers = StatementsAnswers - except Exception as e: - telemetry_state = "fail" - telemetry_writer.add_log( - level=TELEMETRY_LOG_LEVEL.ERROR, - message="Failed to import Ragas dependencies", - stack_trace=traceback.format_exc(), - tags={"ragas_version": self.ragas_version}, - ) - raise NotImplementedError("Failed to load dependencies for `ragas_faithfulness` evaluator") from e - finally: - telemetry_writer.add_count_metric( - namespace=TELEMETRY_APM_PRODUCT.LLMOBS, - name="evaluators.init", - value=1, - tags=( - ("evaluator_label", self.LABEL), - ("state", telemetry_state), - ("ragas_version", self.ragas_version), - ), - ) - - self.ragas_faithfulness_instance = _get_faithfulness_instance() - self.llm_output_parser_for_generated_statements = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementsAnswers + super().__init__(llmobs_service) + self.ragas_faithfulness_instance = self._get_faithfulness_instance() + self.llm_output_parser_for_generated_statements = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementsAnswers ) - self.llm_output_parser_for_faithfulness_score = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementFaithfulnessAnswers + 
self.llm_output_parser_for_faithfulness_score = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementFaithfulnessAnswers ) - self.split_answer_into_sentences = MiniRagas.get_segmenter( + self.split_answer_into_sentences = self.ragas_dependencies.get_segmenter( language=self.ragas_faithfulness_instance.nli_statements_message.language, clean=False ) - def run_and_submit_evaluation(self, span_event: dict): - if not span_event: - return - score_result_or_failure, metric_metadata = self.evaluate(span_event) - telemetry_writer.add_count_metric( - TELEMETRY_APM_PRODUCT.LLMOBS, - "evaluators.run", - 1, - tags=( - ("evaluator_label", self.LABEL), - ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), - ), - ) - if isinstance(score_result_or_failure, float): - self.llmobs_service.submit_evaluation( - span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=score_result_or_failure, - metadata=metric_metadata, - ) + def _get_faithfulness_instance(self) -> Optional[object]: + """ + This helper function ensures the faithfulness instance used in + ragas evaluator is updated with the latest ragas faithfulness + instance AND has an non-null llm + """ + if self.ragas_dependencies.faithfulness is None: + return None + ragas_faithfulness_instance = self.ragas_dependencies.faithfulness + if not ragas_faithfulness_instance.llm: + ragas_faithfulness_instance.llm = self.ragas_dependencies.llm_factory() + return ragas_faithfulness_instance def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: """ @@ -196,7 +77,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] If the ragas faithfulness instance does not have `llm` set, we set `llm` using the `llm_factory()` method from ragas which defaults to openai's gpt-4o-turbo. 
""" - self.ragas_faithfulness_instance = _get_faithfulness_instance() + self.ragas_faithfulness_instance = self._get_faithfulness_instance() if not self.ragas_faithfulness_instance: return "fail_faithfulness_is_none", {} @@ -220,16 +101,16 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] span=ragas_faithfulness_workflow ) - faithfulness_inputs = self._extract_faithfulness_inputs(span_event) + faithfulness_inputs = self._extract_evaluation_inputs_from_span(span_event) if faithfulness_inputs is None: logger.debug( - "Failed to extract question and context from span sampled for ragas_faithfulness evaluation" + "Failed to extract evaluation inputs from span sampled for `ragas_faithfulness` evaluation" ) return "fail_extract_faithfulness_inputs", evaluation_metadata question = faithfulness_inputs["question"] answer = faithfulness_inputs["answer"] - context = faithfulness_inputs["context"] + context = " ".join(faithfulness_inputs["contexts"]) statements = self._create_statements(question, answer) if statements is None: @@ -318,9 +199,9 @@ def _create_verdicts(self, context: str, statements: List[str]): return None # collapse multiple generations into a single faithfulness list - faithfulness_list = MiniRagas.ensembler.from_discrete(raw_faithfulness_list, "verdict") # type: ignore + faithfulness_list = self.ragas_dependencies.ensembler.from_discrete(raw_faithfulness_list, "verdict") try: - return MiniRagas.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) # type: ignore + return self.ragas_dependencies.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) except Exception as e: logger.debug("Failed to parse faithfulness_list", exc_info=e) return None @@ -330,59 +211,6 @@ def _create_verdicts(self, context: str, statements: List[str]): output_data=faithfulness_list, ) - def _extract_faithfulness_inputs(self, span_event: dict) -> Optional[dict]: - """ - Extracts the question, answer, and context used as inputs to faithfulness - evaluation from a span event. 
- - question - input.prompt.variables.question OR input.messages[-1].content - context - input.prompt.variables.context - answer - output.messages[-1].content - """ - with self.llmobs_service.workflow("dd-ragas.extract_faithfulness_inputs") as extract_inputs_workflow: - self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) - question, answer, context = None, None, None - - meta_io = span_event.get("meta") - if meta_io is None: - return None - - meta_input = meta_io.get("input") - meta_output = meta_io.get("output") - - if not (meta_input and meta_output): - return None - - prompt = meta_input.get("prompt") - if prompt is None: - logger.debug("Failed to extract `prompt` from span for `ragas_faithfulness` evaluation") - return None - prompt_variables = prompt.get("variables") - - input_messages = meta_input.get("messages") - - messages = meta_output.get("messages") - if messages is not None and len(messages) > 0: - answer = messages[-1].get("content") - - if prompt_variables: - context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) - question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) - context = " ".join([prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)]) - question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) - - if not question and input_messages is not None and len(input_messages) > 0: - question = input_messages[-1].get("content") - - self.llmobs_service.annotate( - span=extract_inputs_workflow, output_data={"question": question, "context": context, "answer": answer} - ) - if any(field is None for field in (question, context, answer)): - logger.debug("Failed to extract inputs required for faithfulness evaluation") - return None - - return {"question": question, "context": context, "answer": answer} - def _create_statements_prompt(self, answer, question): # Returns: `ragas.llms.PromptValue` object with self.llmobs_service.task("dd-ragas.create_statements_prompt"): diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index bf45e618e01..3d26998f1b4 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -64,13 +64,15 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None): ("state", evaluator_init_state), ), ) + else: + raise ValueError("Parsed unsupported evaluator: {}".format(evaluator)) def start(self, *args, **kwargs): if not self.evaluators: logger.debug("no evaluators configured, not starting %r", self.__class__.__name__) return super(EvaluatorRunner, self).start() - logger.debug("started %r to %r", self.__class__.__name__) + logger.debug("started %r", self.__class__.__name__) def _stop_service(self) -> None: """ diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 0ecdde36ee6..32bbce849db 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -553,7 +553,46 @@ def _dummy_evaluator_eval_metric_event(span_id, trace_id): ) -def _expected_ragas_spans(ragas_inputs=None): +def _expected_ragas_context_precision_spans(ragas_inputs=None): + if not ragas_inputs: + ragas_inputs = default_ragas_inputs + return [ + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": "undefined", + "name": "dd-ragas.context_precision", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": "1.0"}, + }, + "metrics": {}, + "tags": 
expected_ragas_trace_tags(), + }, + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": mock.ANY, + "name": "dd-ragas.extract_evaluation_inputs_from_span", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": mock.ANY}, + }, + "metrics": {}, + "tags": expected_ragas_trace_tags(), + }, + ] + + +def _expected_ragas_faithfulness_spans(ragas_inputs=None): if not ragas_inputs: ragas_inputs = default_ragas_inputs return [ @@ -581,7 +620,7 @@ def _expected_ragas_spans(ragas_inputs=None): "trace_id": mock.ANY, "span_id": mock.ANY, "parent_id": mock.ANY, - "name": "dd-ragas.extract_faithfulness_inputs", + "name": "dd-ragas.extract_evaluation_inputs_from_span", "start_ns": mock.ANY, "duration": mock.ANY, "status": "ok", diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml similarity index 100% rename from 
tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml
rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml
diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_evaluators.py
similarity index 97%
rename from tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py
rename to tests/llmobs/test_llmobs_ragas_evaluators.py
index 39e315b37e4..9df6c392470 100644
--- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py
+++ b/tests/llmobs/test_llmobs_ragas_evaluators.py
@@ -6,7 +6,7 @@
 from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
 from ddtrace.span import Span
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
-from tests.llmobs._utils import _expected_ragas_spans
+from tests.llmobs._utils import _expected_ragas_faithfulness_spans
 from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_messages
 from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt
 
@@ -177,7 +178,8 @@ def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events):
     ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"])
     assert len(ragas_spans) == 7
     # check name, io, span kinds match
-    assert ragas_spans == _expected_ragas_spans()
+    assert ragas_spans == _expected_ragas_faithfulness_spans()
+
     # verify the trace structure
     root_span = ragas_spans[0]
     root_span_id = root_span["span_id"]
@@ -212,7 +213,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
             "DD_LLMOBS_ML_APP": "unnamed-ml-app",
             "_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
             "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
-            "DD_LLMOBS_AGENTLESS_ENABLED": "true",
+            "DD_LLMOBS_AGENTLESS_ENABLED": "1",
         }
     )
     out, err, status, pid = run_python_code_in_subprocess(
@@ -227,7 +228,7 @@
 from tests.llmobs._utils import logs_vcr
 
 ctx = logs_vcr.use_cassette(
-    "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml"
+    "tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml"
 )
 ctx.__enter__()
 atexit.register(lambda: ctx.__exit__())

From 1b223aaf78123cdf1ae13ebc24e9387177e500a5 Mon Sep 17 00:00:00 2001
From: lievan <42917263+lievan@users.noreply.github.com>
Date: Fri, 10 Jan 2025 11:07:11 -0500
Subject: [PATCH 16/19] feat(llmobs): support joining custom evaluations via tags (#11535)

This PR implements the `LLMObs.submit_evaluation_for` method, which gives users two options for joining custom evaluations:
- by tag via the `span_with_tag_value` argument, which accepts a dictionary containing `tag_key` and `tag_value` string entries
- by span via the `span` argument, which accepts a dictionary containing `span_id` and `trace_id` keys

There are also a couple of behavior differences between `submit_evaluation_for` and `submit_evaluation`. In the new method, we
- throw whenever a required argument is the wrong value or type
- remove the `metadata` argument
- move the warning log for a missing API key to the eval metric writer's `periodic` method

Other changes:

#### Eval metric writer

Update the eval metric writer to write to the `v2` eval metric endpoint. The main difference with this endpoint is that it accepts a `join_with` field that holds joining information instead of top-level trace and span ID fields.

#### Deprecate `submit_evaluation`

Deprecates `submit_evaluation`. **I've set the removal version to be `3.0.0`.**

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan
---
 ddtrace/llmobs/_llmobs.py                     | 135 +++++++-
 ddtrace/llmobs/_writer.py                     |  12 +-
 ...ubmit-evaluation-for-01096d803d969e3e.yaml |  17 +
 tests/llmobs/_utils.py                        |  16 +-
 ..._eval_metric_writer.send_score_metric.yaml |  15 +-
 ...c_writer.test_send_categorical_metric.yaml |  15 +-
 ...c_writer.test_send_metric_bad_api_key.yaml |  11 +-
 ...tric_writer.test_send_multiple_events.yaml |  30 +-
 ..._metric_writer.test_send_score_metric.yaml |  15 +-
 ..._metric_writer.test_send_timed_events.yaml |  30 +-
 ...bs_evaluator_runner.send_score_metric.yaml |  16 +-
 ....emits_traces_and_evaluations_on_exit.yaml |  79 ++---
 ....test_ragas_faithfulness_emits_traces.yaml |  75 +++--
 .../llmobs/test_llmobs_eval_metric_writer.py  |  43 ++-
 tests/llmobs/test_llmobs_service.py           | 301 +++++++++++++++++-
 15 files changed, 648 insertions(+), 162 deletions(-)
 create mode 100644 releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml

diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index cd4069b4094..91c49306b35 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -28,6 +28,7 @@
 from ddtrace.internal.service import ServiceStatusError
 from ddtrace.internal.telemetry import telemetry_writer
 from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
+from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
 from ddtrace.internal.utils.formats import asbool
 from ddtrace.internal.utils.formats import parse_tags_str
 from ddtrace.llmobs import _constants as constants
@@ -66,6 +67,7 @@
 from ddtrace.llmobs.utils import ExportedLLMObsSpan
 from ddtrace.llmobs.utils import Messages
 from ddtrace.propagation.http import HTTPPropagator
+from ddtrace.vendor.debtcollector import deprecate


 log = get_logger(__name__)
@@ -904,6 +906,127 @@ def _tag_metrics(span: Span, metrics: Dict[str, Any]) -> None:
             return
         span._set_ctx_item(METRICS, metrics)

+    @classmethod
+    def submit_evaluation_for(
+        cls,
+        label: str,
+        metric_type: str,
+        value: Union[str, int, float],
+        span: Optional[dict] = None,
+        span_with_tag_value: Optional[Dict[str, str]] = None,
+        tags: Optional[Dict[str, str]] = None,
+        ml_app: Optional[str] = None,
+        timestamp_ms: Optional[int] = None,
+    ) -> None:
+        """
+        Submits a custom evaluation metric for a given span.
+
+        :param str label: The name of the evaluation metric.
+        :param str metric_type: The type of the evaluation metric. One of "categorical", "score".
+        :param value: The value of the evaluation metric.
+                      Must be a string (categorical), integer (score), or float (score).
+        :param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying
+                          the span associated with this evaluation.
+        :param dict span_with_tag_value: A dictionary with the format {'tag_key': str, 'tag_value': str}
+                                         uniquely identifying the span associated with this evaluation.
+        :param tags: A dictionary of string key-value pairs to tag the evaluation metric with.
+        :param str ml_app: The name of the ML application.
+        :param int timestamp_ms: The unix timestamp in milliseconds when the evaluation metric result was generated.
+                                 If not set, the current time will be used.
+        """
+        if cls.enabled is False:
+            log.debug(
+                "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. "
+                "Evaluation metric data will not be sent.",
+            )
+            return
+
+        has_exactly_one_joining_key = (span is not None) ^ (span_with_tag_value is not None)
+
+        if not has_exactly_one_joining_key:
+            raise ValueError(
+                "Exactly one of `span` or `span_with_tag_value` must be specified to submit an evaluation metric."
+            )
+
+        join_on = {}
+        if span is not None:
+            if (
+                not isinstance(span, dict)
+                or not isinstance(span.get("span_id"), str)
+                or not isinstance(span.get("trace_id"), str)
+            ):
+                raise TypeError(
+                    "`span` must be a dictionary containing both span_id and trace_id keys. "
+                    "LLMObs.export_span() can be used to generate this dictionary from a given span."
+ ) + join_on["span"] = span + elif span_with_tag_value is not None: + if ( + not isinstance(span_with_tag_value, dict) + or not isinstance(span_with_tag_value.get("tag_key"), str) + or not isinstance(span_with_tag_value.get("tag_value"), str) + ): + raise TypeError( + "`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values" + ) + join_on["tag"] = { + "key": span_with_tag_value.get("tag_key"), + "value": span_with_tag_value.get("tag_value"), + } + + timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000) + + if not isinstance(timestamp_ms, int) or timestamp_ms < 0: + raise ValueError("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent") + + if not label: + raise ValueError("label must be the specified name of the evaluation metric.") + + metric_type = metric_type.lower() + if metric_type not in ("categorical", "score"): + raise ValueError("metric_type must be one of 'categorical' or 'score'.") + + if metric_type == "categorical" and not isinstance(value, str): + raise TypeError("value must be a string for a categorical metric.") + if metric_type == "score" and not isinstance(value, (int, float)): + raise TypeError("value must be an integer or float for a score metric.") + + if tags is not None and not isinstance(tags, dict): + log.warning("tags must be a dictionary of string key-value pairs.") + tags = {} + + evaluation_tags = { + "ddtrace.version": ddtrace.__version__, + "ml_app": ml_app, + } + + if tags: + for k, v in tags.items(): + try: + evaluation_tags[ensure_text(k)] = ensure_text(v) + except TypeError: + log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.") + + ml_app = ml_app if ml_app else config._llmobs_ml_app + if not ml_app: + log.warning( + "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " + "Ensure this configuration is set before running your application." + ) + return + + evaluation_metric = { + "join_on": join_on, + "label": str(label), + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "{}_value".format(metric_type): value, + "ml_app": ml_app, + "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()], + } + + cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric) + @classmethod def submit_evaluation( cls, @@ -916,6 +1039,13 @@ def submit_evaluation( timestamp_ms: Optional[int] = None, metadata: Optional[Dict[str, object]] = None, ) -> None: + deprecate( + "Using `LLMObs.submit_evaluation` is deprecated", + message="Please use `LLMObs.submit_evaluation_for` instead.", + removal_version="3.0.0", + category=DDTraceDeprecationWarning, + ) + """ Submits a custom evaluation metric for a given span ID and trace ID. @@ -931,7 +1061,7 @@ def submit_evaluation( evaluation metric. """ if cls.enabled is False: - log.warning( + log.debug( "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." ) return @@ -1007,8 +1137,7 @@ def submit_evaluation( log.warning("Failed to parse tags. 
Tags for evaluation metrics must be strings.")

         evaluation_metric = {
-            "span_id": span_id,
-            "trace_id": trace_id,
+            "join_on": {"span": {"span_id": span_id, "trace_id": trace_id}},
             "label": str(label),
             "metric_type": metric_type.lower(),
             "timestamp_ms": timestamp_ms,
diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py
index 5a293f05c4e..5880019d67f 100644
--- a/ddtrace/llmobs/_writer.py
+++ b/ddtrace/llmobs/_writer.py
@@ -55,8 +55,7 @@ class LLMObsSpanEvent(TypedDict):


 class LLMObsEvaluationMetricEvent(TypedDict, total=False):
-    span_id: str
-    trace_id: str
+    join_on: Dict[str, Dict[str, str]]
     metric_type: str
     label: str
     categorical_value: str
@@ -107,6 +106,13 @@ def periodic(self) -> None:
         events = self._buffer
         self._buffer = []

+        if not self._headers.get("DD-API-KEY"):
+            logger.warning(
+                "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. "
+                "Ensure this configuration is set before running your application.",
+            )
+            return
+
         data = self._data(events)
         enc_llm_events = safe_json(data)
         conn = httplib.HTTPSConnection(self._intake, 443, timeout=self._timeout)
@@ -154,7 +160,7 @@ def __init__(self, site: str, api_key: str, interval: float, timeout: float) ->
         super(LLMObsEvalMetricWriter, self).__init__(site, api_key, interval, timeout)
         self._event_type = "evaluation_metric"
         self._buffer = []
-        self._endpoint = "/api/intake/llm-obs/v1/eval-metric"
+        self._endpoint = "/api/intake/llm-obs/v2/eval-metric"
         self._intake = "api.%s" % self._site  # type: str

     def enqueue(self, event: LLMObsEvaluationMetricEvent) -> None:
diff --git a/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml b/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml
new file mode 100644
index 00000000000..c2e4b25f255
--- /dev/null
+++ b/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml
@@ -0,0 +1,17 @@
+---
+features:
+  - |
+    LLM Observability: This introduces the `LLMObs.submit_evaluation_for` method, which provides the ability to join a custom evaluation
+    to a span using a tag key-value pair on the span. The tag key-value pair is expected to uniquely identify a single span.
+    Tag-based joining is an alternative to the existing method of joining evaluations to spans using trace and span IDs.
+    Example usage:
+    - Evaluation joined by tag: `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": "message_id", "tag_value": "dummy_message_id"}, label="rating", ...)`.
+    - Evaluation joined by trace/span ID: `LLMObs.submit_evaluation_for(span={"trace_id": "...", "span_id": "..."}, label="rating", ...)`.
+deprecations:
+  - |
+    LLM Observability: `LLMObs.submit_evaluation` is deprecated and will be removed in ddtrace 3.0.0.
+    As an alternative to `LLMObs.submit_evaluation`, you can use `LLMObs.submit_evaluation_for` instead.
+    To migrate, replace `LLMObs.submit_evaluation(span_context={"span_id": ..., "trace_id": ...}, ...)` with
+    `LLMObs.submit_evaluation_for(span={"span_id": ..., "trace_id": ...}, ...)`.
+    You may also join an evaluation to a span using a tag key-value pair like so:
+    `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": ..., "tag_value": ...}, ...)`.
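A minimal usage sketch of the two joining options introduced above; the span IDs, tag values, and `ml_app` name are placeholders, and the signature follows the `_llmobs.py` diff in this patch:

```python
from ddtrace.llmobs import LLMObs

# Join the evaluation to a span by span/trace ID; LLMObs.export_span() returns
# a dictionary of this shape for a given span.
LLMObs.submit_evaluation_for(
    span={"span_id": "123", "trace_id": "456"},  # placeholder IDs
    label="toxicity",
    metric_type="categorical",
    value="high",
    ml_app="placeholder-ml-app",  # optional; falls back to the configured ML app name
)

# Join the evaluation by a tag key/value pair expected to uniquely identify one span.
LLMObs.submit_evaluation_for(
    span_with_tag_value={"tag_key": "message_id", "tag_value": "dummy_message_id"},
    label="sentiment",
    metric_type="score",
    value=0.9,
    ml_app="placeholder-ml-app",
)
```

Per the validation logic above, exactly one of `span` or `span_with_tag_value` may be passed per call; passing both or neither raises a `ValueError`.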
diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 32bbce849db..4e60a8f3996 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -210,11 +210,13 @@ def _get_llmobs_parent_id(span: Span): def _expected_llmobs_eval_metric_event( - span_id, - trace_id, metric_type, label, ml_app, + tag_key=None, + tag_value=None, + span_id=None, + trace_id=None, timestamp_ms=None, categorical_value=None, score_value=None, @@ -223,8 +225,7 @@ def _expected_llmobs_eval_metric_event( metadata=None, ): eval_metric_event = { - "span_id": span_id, - "trace_id": trace_id, + "join_on": {}, "metric_type": metric_type, "label": label, "tags": [ @@ -232,6 +233,10 @@ def _expected_llmobs_eval_metric_event( "ml_app:{}".format(ml_app if ml_app is not None else "unnamed-ml-app"), ], } + if tag_key is not None and tag_value is not None: + eval_metric_event["join_on"]["tag"] = {"key": tag_key, "value": tag_value} + if span_id is not None and trace_id is not None: + eval_metric_event["join_on"]["span"] = {"span_id": span_id, "trace_id": trace_id} if categorical_value is not None: eval_metric_event["categorical_value"] = categorical_value if score_value is not None: @@ -542,8 +547,7 @@ def run_and_submit_evaluation(self, span): def _dummy_evaluator_eval_metric_event(span_id, trace_id): return LLMObsEvaluationMetricEvent( - span_id=span_id, - trace_id=trace_id, + join_on={"span": {"span_id": span_id, "trace_id": trace_id}}, score_value=1.0, ml_app="unnamed-ml-app", timestamp_ms=mock.ANY, diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml index 61c26ff7bf0..f767f5de303 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500942}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568298743}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"e66c93b9-ca0a-4f0a-9207-497e0a1b6eec","type":"evaluation_metric","attributes":{"metrics":[{"id":"5fb5ed5d-20c1-4f34-abf9-c0bdc09680e3","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500942,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: '{"data":{"id":"5b998846-53af-4b0e-a658-fd9e06726d6d","type":"evaluation_metric","attributes":{"metrics":[{"id":"jbGbAMC7Rk","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568298743,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri 
https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:41 GMT + - Mon, 25 Nov 2024 20:58:19 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml index 92498e86e9e..f4404b30832 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500339}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297450}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"36d88c24-d7d4-4d3e-853c-b695aff61344","type":"evaluation_metric","attributes":{"metrics":[{"id":"0c189d9c-a730-4c5d-bbc2-55ef3455900f","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249500339,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: '{"data":{"id":"49c5c927-76f1-4de4-ad97-e1a0a159229f","type":"evaluation_metric","attributes":{"metrics":[{"id":"okVf1U4XzA","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568297450,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: content-length: - - '330' + - '325' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:17 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml index 68fe0315870..ef6f4cf445e 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml @@ -1,15 +1,16 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": 
"98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500253}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297307}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}' @@ -21,7 +22,7 @@ interactions: content-type: - application/json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:17 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload x-content-type-options: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml index 61da12cd3fa..3638a1cf608 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml @@ -1,32 +1,30 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249589510}, - {"span_id": "12345678901", "trace_id": "98765432101", "metric_type": "categorical", - "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", - "timestamp_ms": 1724249589510}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568728793}, {"join_on": {"span": {"span_id": "12345678901", + "trace_id": "98765432101"}}, "metric_type": "categorical", "categorical_value": + "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1732568728793}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"2ccffdfc-024b-49e6-881c-4e4d1c5f450e","type":"evaluation_metric","attributes":{"metrics":[{"id":"ed072901-fd70-4417-9cab-1bad62b6ac09","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249589510,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9},{"id":"16175a34-7c25-43ca-8551-bd2f7242ab77","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249589510,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: 
'{"data":{"id":"844be0cd-9dd4-45d3-9763-8ccb20f4e7c8","type":"evaluation_metric","attributes":{"metrics":[{"id":"IZhAbBsXBJ","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568728793,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9},{"id":"ME868fTl0T","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568728793,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: - Connection: - - keep-alive - Content-Length: - - '538' - Content-Type: - - application/vnd.api+json - Date: - - Wed, 21 Aug 2024 14:13:09 GMT + content-length: + - '528' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com + content-type: + - application/vnd.api+json + date: + - Mon, 25 Nov 2024 21:05:29 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml index 1394f9fbb43..65bb0fa1562 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500471}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297772}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"5bd1b0b7-0acd-46e2-8ff6-3ee6a92457b6","type":"evaluation_metric","attributes":{"metrics":[{"id":"d8aa2a23-3137-4c49-b87b-d1eb1c3af04e","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500471,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: '{"data":{"id":"d1518236-84b1-4b47-9cbc-ffc24188b5cc","type":"evaluation_metric","attributes":{"metrics":[{"id":"jiKtwDKR0B","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568297772,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:18 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: 
diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml index c9797ace419..c31d610bd57 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249982978}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568764624}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"aba22157-cc3a-4601-a6a5-7afa99eee73e","type":"evaluation_metric","attributes":{"metrics":[{"id":"c2f6f63c-17ca-48c3-ad2d-676b2a35e726","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249982978,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: '{"data":{"id":"5352c11a-dcdd-449b-af72-2ae0b5dac3a1","type":"evaluation_metric","attributes":{"metrics":[{"id":"WmMD7E_fAD","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568764624,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:19:45 GMT + - Mon, 25 Nov 2024 21:06:04 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: @@ -34,28 +35,29 @@ interactions: code: 202 message: Accepted - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249983284}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568765127}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: 
'{"data":{"id":"0bc39c40-6c72-4b11-9eea-826248f9fe37","type":"evaluation_metric","attributes":{"metrics":[{"id":"7da7eb5b-32d2-43b3-adf5-208313f822c5","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249983284,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: '{"data":{"id":"d39e806e-40c5-4b3c-b539-440390afca85","type":"evaluation_metric","attributes":{"metrics":[{"id":"403hQLmrQW","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568765127,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: content-length: - - '330' + - '325' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:19:45 GMT + - Mon, 25 Nov 2024 21:06:05 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml index e2e17e715cf..f5deea8ef90 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml @@ -1,28 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "123", "trace_id": "1234", "label": "dummy", "metric_type": "score", "timestamp_ms": - 1729569649880, "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022", - "ml_app:unnamed-ml-app"]}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type": + "score", "timestamp_ms": 1732569321978, "score_value": 1.0, "ml_app": "unnamed-ml-app", + "tags": ["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122", "ml_app:unnamed-ml-app"]}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"2131dbc0-d085-401c-8b2d-8506a9ac8c13","type":"evaluation_metric","attributes":{"metrics":[{"id":"YutAyQc6F4","trace_id":"1234","span_id":"123","timestamp_ms":1729569649880,"ml_app":"unnamed-ml-app","metric_type":"score","label":"dummy","score_value":1,"tags":["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022","ml_app:unnamed-ml-app"]}]}}}' + string: '{"data":{"id":"06c00db0-1898-44be-ae0b-f0149f819c59","type":"evaluation_metric","attributes":{"metrics":[{"id":"1DrSMXmWcP","join_on":{"span":{"trace_id":"1234","span_id":"123"}},"timestamp_ms":1732569321978,"ml_app":"unnamed-ml-app","metric_type":"score","label":"dummy","score_value":1,"tags":["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122","ml_app:unnamed-ml-app"]}]}}}' headers: content-length: - - '357' + - '378' content-security-policy: - frame-ancestors 'self'; report-uri 
https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Tue, 22 Oct 2024 04:00:50 GMT + - Mon, 25 Nov 2024 21:15:22 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml index 757f875443f..367024a712d 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml @@ -73,19 +73,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA2yRW4vbMBCF3/0rxDzHi+2mySZvbUOhsPRGoSxxsBV5bKuVJaGZ9ELIfy9yvEnK - 7oseztH5NGd0TIQA3cBagOolq8Gb9M3D5tPi1zuz+r0qHjavv27a74+vPj5+sG8z8wVmMeH2P1Dx - U+pOucEbZO3s2VYBJWOk5stitchXy3w+GoNr0MRY5zmdu3TQVqdFVszTbJnm91O6d1ohwVpsEyGE - OI5nnNM2+AfWIps9KQMSyQ5hfbkkBARnogKSSBNLyzC7mspZRjuOXtf19lgCYVQUViO+HPmiBNKx - U6iIJeOAlila2xK+9SiU9JqlEa4V74O0CoUm8VkGTXcl7E67uq5vHw3YHkjG4vZgzKSfLi2M63xw - e5r8i95qq6mvAkpyNk5M7DyM7ikRYjdu6/DfAsAHN3iu2P1EG4GLLD/z4PpJV7dYTCY7luYmVSxn - L/CqBllqQzf7BiVVj801miU35Z4/+hLiXFDb7hklmUhAf4lxqFptOww+6PMPtr6a3xeqKORyryA5 - Jf8AAAD//wMAn6C7Cc8CAAA= + H4sIAAAAAAAAA4xSwYrbMBS8+yvEO8eL403iTW49bEtuWdpCIQ62Ij/bam1J6L1AS8i/FznZ2Mtu + oRcdZt6MZp50joQAXcFGgGolq9518acf0hyX6+02W26T3e7Ly/r5JTltv9rvNT/DLCjs8ScqflU9 + KNu7Dllbc6WVR8kYXOfZY7pcrVeLx4HobYVdkDWO44WNe210nCbpIk6yeP50U7dWKyTYiH0khBDn + 4Qw5TYW/YSOS2SvSI5FsEDb3ISHA2y4gIIk0sTQMs5FU1jCaIXpZlvtzDoQBUVgM9vngL3IgHTr5 + glgy9miYArXP4VuLQkmnWXbC1uKzl0ah0CR20mt6yOFwOZRlOb3UY30iGYqbU9fd8Mu9RWcb5+2R + bvwdr7XR1BYeJVkTEhNbBwN7iYQ4DNs6vVkAOG97xwXbX2iC4SqZX/1gfKSRTVc3ki3LbqJKs9kH + fkWFLHVHk32DkqrFapSOjyNPlbYTIpq0fp/mI+9rc22a/7EfCaXQMVaF81hp9bbxOOYx/OF/jd23 + PAQG+kOMfVFr06B3Xl9/UO2KJEuWx/opUwlEl+gvAAAA//8DABrEjBtPAwAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8d6b5b701f294367-EWR + - 8e84af2fba19c952-IAD Connection: - keep-alive Content-Encoding: @@ -93,14 +93,14 @@ interactions: Content-Type: - application/json Date: - - Tue, 22 Oct 2024 17:55:15 GMT + - Mon, 25 Nov 2024 21:20:43 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=iQaF937ylY7BvvBCyWYQoxiJwi1nBp5.LILrHLw1uno-1729619715-1.0.1.1-jS4Dz7yc_ud.hKZlJ_CAZkSQesqzVkfrA5F30zI7CtJsbEKyAiuVlpX0CPf816UtlhXQEW8T5nsc.UvnsCOzOw; - path=/; expires=Tue, 22-Oct-24 18:25:15 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=CVMxqIcHUHNjX56k1MKjj4MgiYNVAlg_B7yyVaP_z1o-1732569643-1.0.1.1-HOtZfXprHWr_DjtorQ_ZK6bbSmcOsBrphniRCaC9XQ2tTtO5JVpyDQK1HRFo3kUE9GEi9J.sR0_L6nBtXlGj8w; + path=/; expires=Mon, 25-Nov-24 21:50:43 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=wQzHCwLW6CPU768K_tlLklWp36I8zYCVJkKlAMtnMkk-1729619715162-0.0.1.1-604800000; + - _cfuvid=sqtPZaucqBJu1r4exJtYym3vbKmuuSO6o0np5VglPsw-1732569643935-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -113,7 +113,7 @@ interactions: openai-organization: - datadog-staging openai-processing-ms: - - '496' + - '370' openai-version: - '2020-10-01' strict-transport-security: @@ -131,7 +131,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_33b8cddecaab8b8bc36e90f58f844636 + - 
req_02ed729afc2d9083921e3fe5b7528550 status: code: 200 message: OK @@ -193,8 +193,8 @@ interactions: content-type: - application/json cookie: - - __cf_bm=iQaF937ylY7BvvBCyWYQoxiJwi1nBp5.LILrHLw1uno-1729619715-1.0.1.1-jS4Dz7yc_ud.hKZlJ_CAZkSQesqzVkfrA5F30zI7CtJsbEKyAiuVlpX0CPf816UtlhXQEW8T5nsc.UvnsCOzOw; - _cfuvid=wQzHCwLW6CPU768K_tlLklWp36I8zYCVJkKlAMtnMkk-1729619715162-0.0.1.1-604800000 + - __cf_bm=CVMxqIcHUHNjX56k1MKjj4MgiYNVAlg_B7yyVaP_z1o-1732569643-1.0.1.1-HOtZfXprHWr_DjtorQ_ZK6bbSmcOsBrphniRCaC9XQ2tTtO5JVpyDQK1HRFo3kUE9GEi9J.sR0_L6nBtXlGj8w; + _cfuvid=sqtPZaucqBJu1r4exJtYym3vbKmuuSO6o0np5VglPsw-1732569643935-0.0.1.1-604800000 host: - api.openai.com user-agent: @@ -220,19 +220,20 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA2xSQW7bMBC86xWLPVuBpTqR7VuAoGiBAmmLHhrEgUVTK2tdiSTIdZDA8N8Lyorl - ILnwMLMznB3ykAAgV7gE1I0S3bk2vf1xd19M/c3u4frhu/37Tet73qm7383r7fwXTqLCbnak5U11 - pW3nWhK25kRrT0ooumZFvrjJFkV23ROdraiNsq2TdGbTjg2n+TSfpdMizeaDurGsKeASHhMAgEN/ - xpymohdcwnTyhnQUgtoSLs9DAOhtGxFUIXAQZQQnI6mtETJ99LIsHw8rDKKEOjKywiWs8E9DoJVj - US3YGr56ZTQBB/ipPIerFU5ghZ5UsGYUnD3ioIKKPWkBT46EYy3RSRoCNrX1neoh5+0zV1QBm57r - k73IcMMz+Yp1nyk7PpVlebmEp3ofVCzS7Nt2wI/nVlq7dd5uwsCf8ZoNh2Z9Ch8bCGId9uwxAXjq - 29+/KxSdt52Ttdh/ZKJhUcxPfjg++sh+WQykWFHtiM+zYvKJ37oiUdyGi/dDrXRD1SidJhfLfbz0 - M4vTgmy2H1ySwQnDaxDq1jWbLXnn+fQjareezXOd56rYaEyOyX8AAAD//wMAUtzROh8DAAA= + H4sIAAAAAAAAA4xTwW6bQBC98xWjPZsIOzg43Fqp7a2y2iqqFEew3h1gWthd7Y6jpJb/vVrsGEdJ + pV44vDfv8eYN7BMAQVqUIFQnWQ2uTz/8lEZ9/NI9//mafVt9Wt61uxbvtt+3Oa3XYhYVdvsLFb+o + rpQdXI9M1hxp5VEyRtd5cb1Y3tze5PlIDFZjH2Wt4zS36UCG0kW2yNOsSOerk7qzpDCIEu4TAID9 + +Iw5jcYnUUI2e0EGDEG2KMrzEIDwto+IkCFQYGlYzCZSWcNoxuh1Xd/vNyKwZBzQ8EaUsBE/OgQl + HbHswTbw2UujECjAWnoKVxsxg43wKIM1k+DsEQclaPKoGDw6ZIq1RCfuEMg01g9yhJy3j6RRA5mR + G5M98ekNj+g1qTHT/PBQ1/XlEh6bXZCxSLPr+xN+OLfS29Z5uw0n/ow3ZCh01TF8bCCwdWJkDwnA + w9j+7lWhwnk7OK7Y/kYTDYtidfQT09En9vr2RLJl2U/4al7M3vGrNLKkPlzcTyipOtSTdDq23Gmy + F0RysfXbNO95Hzcn0/6P/UQohY5RV85jvMmrjacxj/Gf+NfYueUxsAjPgXGoGjIteufp+EU2rsqK + bLltVoXKRHJI/gIAAP//AwDgYzoinwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8d6b5b744e034367-EWR + - 8e84af32dc70c952-IAD Connection: - keep-alive Content-Encoding: @@ -240,7 +241,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 22 Oct 2024 17:55:16 GMT + - Mon, 25 Nov 2024 21:20:45 GMT Server: - cloudflare Transfer-Encoding: @@ -254,7 +255,7 @@ interactions: openai-organization: - datadog-staging openai-processing-ms: - - '749' + - '1168' openai-version: - '2020-10-01' strict-transport-security: @@ -272,35 +273,37 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_fbb01161a03eb6f478ff52314b72cfd6 + - req_702ebaa1edbab95fb42f52baa4b34661 status: code: 200 message: OK - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "6877142543397072040", "trace_id": "6717e70200000000a99ea8ad36f4f36d", "label": - "ragas_faithfulness", "metric_type": "score", "timestamp_ms": 1729619716093, - "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022", - "ml_app:unnamed-ml-app"]}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "7678809694384023494", "trace_id": "6744ea2b00000000995e7b2ceabfce01"}}, + "label": "ragas_faithfulness", "metric_type": "score", "timestamp_ms": 1732569645205, + "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122", + "ml_app:unnamed-ml-app"], 
"metadata": {"_dd.evaluation_kind": "faithfulness", + "_dd.evaluation_span": {"span_id": "5771061714047746387", "trace_id": "6744ea2b000000007099aeb477077763"}, + "_dd.faithfulness_disagreements": []}}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"99fa371c-457c-4d2b-8d4c-61657e0ffd48","type":"evaluation_metric","attributes":{"metrics":[{"id":"CbapxUnzcX","trace_id":"6717e70200000000a99ea8ad36f4f36d","span_id":"6877142543397072040","timestamp_ms":1729619716093,"ml_app":"unnamed-ml-app","metric_type":"score","label":"ragas_faithfulness","score_value":1,"tags":["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022","ml_app:unnamed-ml-app"]}]}}}' + string: '{"data":{"id":"f1470aa7-b97f-4809-825d-6932af26a81c","type":"evaluation_metric","attributes":{"metrics":[{"id":"EPRU-72kfP","join_on":{"span":{"trace_id":"6744ea2b00000000995e7b2ceabfce01","span_id":"7678809694384023494"}},"timestamp_ms":1732569645205,"ml_app":"unnamed-ml-app","metric_type":"score","label":"ragas_faithfulness","score_value":1,"tags":["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122","ml_app:unnamed-ml-app"],"metadata":{"_dd.evaluation_kind":"faithfulness","_dd.evaluation_span":{"span_id":"5771061714047746387","trace_id":"6744ea2b000000007099aeb477077763"},"_dd.faithfulness_disagreements":[]}}]}}}' headers: content-length: - - '414' + - '623' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Tue, 22 Oct 2024 17:55:17 GMT + - Mon, 25 Nov 2024 21:20:45 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml index 8efe7391c90..2100bb3d305 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml @@ -51,7 +51,7 @@ interactions: host: - api.openai.com user-agent: - - OpenAI/Python 1.47.1 + - OpenAI/Python 1.52.0 x-stainless-arch: - arm64 x-stainless-async: @@ -61,7 +61,9 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.47.1 + - 1.52.0 + x-stainless-retry-count: + - '0' x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -71,19 +73,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//dJHBbtswEETv+gpiz1YgCY7k+JbCKAoEQYKih7SWIdH0SmJDkQR3jbYw - /O8FZcd2D73wMI8znF0eEiFA72ApQA2S1ehN+vipfFytqq+rt7eHX4v9+3NY/aDnL5y9DE/fYRYd - bvsTFX+47pQbvUHWzp6wCigZY2peFVWRLcryfgKj26GJtt5zOnfpqK1Oi6yYp1mV5ouze3BaIcFS - rBMhhDhMZ+xpd/gbliKbfSgjEskeYXm5JAQEZ6ICkkgTS8swu0LlLKOdqrdtuz7UQBgVhc0UX0/5 - ogbScabQEEvGES1TROsavg0olPSapRGuE5+DtAqFJvEqg6a7GjbHTdu2t48G7PYk4+B2b8xZP16m - MK73wW3pzC96p62moQkoydnYmNh5mOgxEWIzbWv/zwLABzd6bti9o42BZZaf8uD6SVdalGfIjqW5 - cRXV/1zNDllqQzc7h1NDbftrQnapOc0J9IcYx6bTtsfggz59QeebfLudl3lZdQ+QHJO/AAAA//8D - AL2Ti/mQAgAA + 
H4sIAAAAAAAAA4xSwY7aMBS85yusdyarkAKh3Payh6qVVitUVQWUGOclcevYrt9D7Rbx75UDS1h1 + K/Xiw8yb8cyzj4kQoGtYCVCdZNV7k95/kaZtnpZr3++f159+Pk4/L35P5cevPz6YJ5hEhdt/Q8Uv + qjvlem+QtbNnWgWUjNF1WrzL54v383w2EL2r0URZ6zmdubTXVqd5ls/SrEiny4u6c1ohwUpsEiGE + OA5nzGlr/AUrkU1ekB6JZIuwug4JAcGZiIAk0sTSMkxGUjnLaIfoVVVtjlsgjIjCcrDfDv5iC6Rj + p1ASS8YeLVOkNltYdyiU9JqlEa4RD0FahUKTeJRB090WdqddVVW3lwZsDiRjcXsw5oKfri2Ma31w + e7rwV7zRVlNXBpTkbExM7DwM7CkRYjds6/BqAeCD6z2X7L6jjYaLbHr2g/GRRjZfXEh2LM2NKi8m + b/iVNbLUhm72DUqqDutROj6OPNTa3RDJTeu/07zlfW6ubfs/9iOhFHrGuvQBa61eNx7HAsY//K+x + 65aHwEDPxNiXjbYtBh/0+Qc1vsyKbL5vloXKIDklfwAAAP//AwB8IvReTwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c856bee184d42d1-EWR + - 8e84ac4858349c52-IAD Connection: - keep-alive Content-Encoding: @@ -91,14 +93,14 @@ interactions: Content-Type: - application/json Date: - - Tue, 24 Sep 2024 20:11:06 GMT + - Mon, 25 Nov 2024 21:18:45 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=nMe4XLsotHph1aKmM6xJotYxeBsTIpCG1ULeQ2oiKLc-1727208666-1.0.1.1-eM1elzOCEnpbPLkOO61HSBvaeQPYHEyO4Ba3P2NsxkYV23Fybb7E8tIipei4YDbhyDiLXybnT7H0ETvjbsV89g; - path=/; expires=Tue, 24-Sep-24 20:41:06 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=9IkhXEbzhF0QSjoZOW.EVqEQoEHTaAK7pnQ6K1m4EfY-1732569525-1.0.1.1-8FRhFy6jBsuonirbdG9jJ_IHnhXUakqpLsEg10YYrkhce9PwlXOKXNA2hiZwNqpM3D2TP2X4eFcZJjdEZt6.qQ; + path=/; expires=Mon, 25-Nov-24 21:48:45 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=lKBj.JPFMKz3LiJyz12GeZI73UndAQfhN.5aqiwYPHA-1727208666121-0.0.1.1-604800000; + - _cfuvid=YGxjg63ZVaAJESO.Ouzjnkmhsg2izo9JySj6zJ3MRuc-1732569525322-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -106,10 +108,12 @@ interactions: - nosniff access-control-expose-headers: - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 openai-organization: - datadog-staging openai-processing-ms: - - '576' + - '469' openai-version: - '2020-10-01' strict-transport-security: @@ -127,7 +131,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_ef3f2830eaf13bceea5db3a7369affda + - req_5d6c0d3f36d4cba76fbfea5b6c9f63fe status: code: 200 message: OK @@ -189,12 +193,12 @@ interactions: content-type: - application/json cookie: - - __cf_bm=nMe4XLsotHph1aKmM6xJotYxeBsTIpCG1ULeQ2oiKLc-1727208666-1.0.1.1-eM1elzOCEnpbPLkOO61HSBvaeQPYHEyO4Ba3P2NsxkYV23Fybb7E8tIipei4YDbhyDiLXybnT7H0ETvjbsV89g; - _cfuvid=lKBj.JPFMKz3LiJyz12GeZI73UndAQfhN.5aqiwYPHA-1727208666121-0.0.1.1-604800000 + - __cf_bm=9IkhXEbzhF0QSjoZOW.EVqEQoEHTaAK7pnQ6K1m4EfY-1732569525-1.0.1.1-8FRhFy6jBsuonirbdG9jJ_IHnhXUakqpLsEg10YYrkhce9PwlXOKXNA2hiZwNqpM3D2TP2X4eFcZJjdEZt6.qQ; + _cfuvid=YGxjg63ZVaAJESO.Ouzjnkmhsg2izo9JySj6zJ3MRuc-1732569525322-0.0.1.1-604800000 host: - api.openai.com user-agent: - - OpenAI/Python 1.47.1 + - OpenAI/Python 1.52.0 x-stainless-arch: - arm64 x-stainless-async: @@ -204,7 +208,9 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.47.1 + - 1.52.0 + x-stainless-retry-count: + - '0' x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -214,19 +220,20 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//dFFBbtswELz7FQuerUByDcnRLS3aS4umSBugSBRINLWSNpFIglwHLgz/ - vaCsSOmhFx5mdoazs6cVgKBa5CBUJ1kNto9uPqY3n+/U3e39/WH49vzQyObn7fevx98v2adErIPC - 7J9R8ZvqSpnB9shk9IVWDiVjcE2yTbaJd2majsRgauyDrLUcbU00kKZoE2+2UZxFyW5Sd4YUepHD - 4woA4DS+Iaeu8ShyiNdvyIDeyxZFPg8BCGf6gAjpPXmWmsV6IZXRjHqMXlXV46kQniXjgJoLkUMh - fnUISlpi2YNp4IuTWiGQhx/Skb8qxBoK4VB6oxfB7BEGJdTkUDE4tMgUaglO3CGQbowb5AhZZ16p - 
xhpIj9yY7MjTD6/oalJjpuT8VFXV+yUcNgcvQ5H60PcTfp5b6U1rndn7iZ/xhjT5rryEDw14NlaM - 7HkF8DS2f/inUGGdGSyXbF5QB8Ms2138xHL0hf1wPZFsWPYLvkuy/6nKGllS79/dcKqXdLs4xHPM - cU/h/3jGoWxIt+iso8tJG1sm+/02TdKsuRar8+ovAAAA//8DADp8axngAgAA + H4sIAAAAAAAAA4xTwWrbQBC96yuGPVtBVuzK8a0YcimhLQRSiIO03h1Zk652l91xSDD+97KyYzk0 + hV50eG/e05s30j4DEKTFEoTqJKvem/zrL2m6h3v78Ganqzv5c7XaVIv+rrTfv3UzMUkKt3lGxe+q + K+V6b5DJ2SOtAkrG5Dqtrsv5l5t5OR+I3mk0Sbb1nM9c3pOlvCzKWV5U+XRxUneOFEaxhMcMAGA/ + PFNOq/FVLKGYvCM9xii3KJbnIQARnEmIkDFSZGlZTEZSOctoh+hN0zzu1yKyZOzR8losYS3uOwQl + PbE04Fq4DdIqBIrwQwaKV2sxgbUIKKOzo+DskQYlaAqoGAJ6ZEq1JCfuEMi2LvRygHxwL6RRA9mB + G5K98ukNLxg0qSHT9PDUNM3lEgHbXZSpSLsz5oQfzq0Yt/XBbeKJP+MtWYpdfQyfGojsvBjYQwbw + NLS/+1Co8MH1nmt2v9Emw6paHP3EePSRvb45kexYmhFfTKvJJ361RpZk4sX9hJKqQz1Kx2PLnSZ3 + QWQXW/+d5jPv4+Zkt/9jPxJKoWfUtQ+YbvJh43EsYPon/jV2bnkILOJbZOzrluwWgw90/CJbXxdV + Md+0i0oVIjtkfwAAAP//AwD0sdbanwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c856bf38f3242d1-EWR + - 8e84ac4daf039c52-IAD Connection: - keep-alive Content-Encoding: @@ -234,7 +241,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 24 Sep 2024 20:11:06 GMT + - Mon, 25 Nov 2024 21:18:46 GMT Server: - cloudflare Transfer-Encoding: @@ -243,10 +250,12 @@ interactions: - nosniff access-control-expose-headers: - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 openai-organization: - datadog-staging openai-processing-ms: - - '523' + - '1256' openai-version: - '2020-10-01' strict-transport-security: @@ -264,7 +273,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_07733e2c20ff88f138f2ab4cd6a71cc6 + - req_a58af2c6e743ac15ac528fb6233d9436 status: code: 200 message: OK diff --git a/tests/llmobs/test_llmobs_eval_metric_writer.py b/tests/llmobs/test_llmobs_eval_metric_writer.py index 2b8341e1616..eb168ef5a00 100644 --- a/tests/llmobs/test_llmobs_eval_metric_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_writer.py @@ -7,15 +7,19 @@ from ddtrace.llmobs._writer import LLMObsEvalMetricWriter -INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric" +INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric" DD_SITE = "datad0g.com" dd_api_key = os.getenv("DD_API_KEY", default="") def _categorical_metric_event(): return { - "span_id": "12345678901", - "trace_id": "98765432101", + "join_on": { + "span": { + "span_id": "12345678901", + "trace_id": "98765432101", + }, + }, "metric_type": "categorical", "categorical_value": "very", "label": "toxicity", @@ -26,8 +30,12 @@ def _categorical_metric_event(): def _score_metric_event(): return { - "span_id": "12345678902", - "trace_id": "98765432102", + "join_on": { + "span": { + "span_id": "12345678902", + "trace_id": "98765432102", + }, + }, "metric_type": "score", "label": "sentiment", "score_value": 0.9, @@ -69,6 +77,18 @@ def test_send_metric_bad_api_key(mock_writer_logs): ) +@pytest.mark.vcr_logs +def test_send_metric_no_api_key(mock_writer_logs): + llmobs_eval_metric_writer = LLMObsEvalMetricWriter(site="datad0g.com", api_key="", interval=1000, timeout=1) + llmobs_eval_metric_writer.start() + llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.periodic() + mock_writer_logs.warning.assert_called_with( + "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. 
", + "Ensure this configuration is set before running your application.", + ) + + @pytest.mark.vcr_logs def test_send_categorical_metric(mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(site="datad0g.com", api_key=dd_api_key, interval=1000, timeout=1) @@ -125,6 +145,18 @@ def test_send_multiple_events(mock_writer_logs): def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): + env = os.environ.copy() + pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] + if "PYTHONPATH" in env: + pypath.append(env["PYTHONPATH"]) + env.update( + { + "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), + "DD_SITE": "datad0g.com", + "PYTHONPATH": ":".join(pypath), + "DD_LLMOBS_ML_APP": "unnamed-ml-app", + } + ) out, err, status, pid = run_python_code_in_subprocess( """ import atexit @@ -144,6 +176,7 @@ def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): llmobs_eval_metric_writer.start() llmobs_eval_metric_writer.enqueue(_score_metric_event()) """, + env=env, ) assert status == 0, err assert out == b"" diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 69ebb216d7e..4c7f207066e 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -1,4 +1,5 @@ import os +import re import threading import time @@ -846,16 +847,6 @@ def test_export_span_no_specified_span_returns_exported_active_span(llmobs): assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_submit_evaluation_llmobs_disabled_raises_warning(llmobs, mock_llmobs_logs): - llmobs.disable() - llmobs.submit_evaluation( - span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" - ) - mock_llmobs_logs.warning.assert_called_once_with( - "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." - ) - - def test_submit_evaluation_no_api_key_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="")): llmobs.submit_evaluation( @@ -1760,3 +1751,293 @@ def test_service_enable_does_not_start_evaluator_runner(): assert llmobs_service._instance._llmobs_span_writer.status.value == "running" assert llmobs_service._instance._evaluator_runner.status.value == "stopped" llmobs_service.disable() + + +def test_submit_evaluation_llmobs_disabled_raises_debug(llmobs, mock_llmobs_logs): + llmobs.disable() + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation( + span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" + ) + mock_llmobs_logs.debug.assert_called_once_with( + "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." + ) + + +def test_submit_evaluation_for_no_ml_app_raises_warning(llmobs, mock_llmobs_logs): + with override_global_config(dict(_llmobs_ml_app="")): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + ) + mock_llmobs_logs.warning.assert_called_once_with( + "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " + "Ensure this configuration is set before running your application." 
+ ) + + +def test_submit_evaluation_for_span_incorrect_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=re.escape( + ( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." + ) + ), + ): + llmobs.submit_evaluation_for(span="asd", label="toxicity", metric_type="categorical", value="high") + + +def test_submit_evaluation_for_span_with_tag_value_incorrect_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value="asd", label="toxicity", metric_type="categorical", value="high" + ) + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_key": "hi", "tag_value": 1}, + label="toxicity", + metric_type="categorical", + value="high", + ) + + +def test_submit_evaluation_for_empty_span_or_trace_id_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=re.escape( + ( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." + ) + ), + ): + llmobs.submit_evaluation_for( + span={"trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" + ) + with pytest.raises( + TypeError, + match=re.escape( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." + ), + ): + llmobs.submit_evaluation_for(span={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") + + +def test_submit_evaluation_for_span_with_tag_value_empty_key_or_val_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_value": "123"}, label="toxicity", metric_type="categorical", value="high" + ) + + +def test_submit_evaluation_for_invalid_timestamp_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + ValueError, match="timestamp_ms must be a non-negative integer. 
Evaluation metric data will not be sent" + ): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="", + metric_type="categorical", + value="high", + ml_app="dummy", + timestamp_ms="invalid", + ) + + +def test_submit_evaluation_for_empty_label_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(ValueError, match="label must be the specified name of the evaluation metric."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", value="high" + ) + + +def test_submit_evaluation_for_incorrect_metric_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(ValueError, match="metric_type must be one of 'categorical' or 'score'."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high" + ) + with pytest.raises(ValueError, match="metric_type must be one of 'categorical' or 'score'."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high" + ) + + +def test_submit_evaluation_for_incorrect_score_value_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(TypeError, match="value must be an integer or float for a score metric."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="score", value="high" + ) + + +def test_submit_evaluation_for_invalid_tags_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags=["invalid"], + ) + mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") + + +@pytest.mark.parametrize( + "ddtrace_global_config", + [dict(_llmobs_ml_app="test_app_name")], +) +def test_submit_evaluation_for_non_string_tags_raises_warning_but_still_submits( + llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer +): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags={1: 2, "foo": "bar"}, + ml_app="dummy", + ) + mock_llmobs_logs.warning.assert_called_once_with( + "Failed to parse tags. Tags for evaluation metrics must be strings." 
+ ) + mock_llmobs_logs.reset_mock() + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:dummy", "foo:bar"], + ) + ) + + +@pytest.mark.parametrize( + "ddtrace_global_config", + [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], +) +def test_submit_evaluation_for_metric_tags(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"}, + ml_app="ml_app_override", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="ml_app_override", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"], + ) + ) + + +def test_submit_evaluation_for_span_with_tag_value_enqueues_writer_with_categorical_metric( + llmobs, mock_llmobs_eval_metric_writer +): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_key": "tag_key", "tag_value": "tag_val"}, + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + tag_key="tag_key", + tag_value="tag_val", + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + + +def test_submit_evaluation_for_enqueues_writer_with_categorical_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + mock_llmobs_eval_metric_writer.reset_mock() + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation_for( + span=llmobs.export_span(span), + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id=str(span.span_id), + trace_id="{:x}".format(span.trace_id), + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + + +def test_submit_evaluation_for_enqueues_writer_with_score_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="sentiment", + metric_type="score", + value=0.9, + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + span_id="123", trace_id="456", label="sentiment", metric_type="score", score_value=0.9, ml_app="dummy" + ) + ) + mock_llmobs_eval_metric_writer.reset_mock() + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation_for( + 
span=llmobs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy"
+        )
+        mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
+            _expected_llmobs_eval_metric_event(
+                span_id=str(span.span_id),
+                trace_id="{:x}".format(span.trace_id),
+                label="sentiment",
+                metric_type="score",
+                score_value=0.9,
+                ml_app="dummy",
+            )
+        )

From 4beeccc975c0b00b490903eb0b5dd3f9ace5fdef Mon Sep 17 00:00:00 2001
From: Yun Kim <35776586+Yun-Kim@users.noreply.github.com>
Date: Fri, 10 Jan 2025 11:29:38 -0500
Subject: [PATCH 17/19] chore(llmobs): fix tests to be less noisy locally (#11890)

These tests rely on env vars that may already be present in dev environments, which results in noisy errors.

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
---
 tests/llmobs/test_llmobs_evaluator_runner.py | 12 ++----------
 tests/llmobs/test_llmobs_ragas_evaluators.py | 15 +++------------
 2 files changed, 5 insertions(+), 22 deletions(-)

diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py
index eb0be25c91b..40c9fb5bd2b 100644
--- a/tests/llmobs/test_llmobs_evaluator_runner.py
+++ b/tests/llmobs/test_llmobs_evaluator_runner.py
@@ -64,15 +64,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces
     pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))]
     if "PYTHONPATH" in env:
         pypath.append(env["PYTHONPATH"])
-    env.update(
-        {
-            "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"),
-            "DD_SITE": "datad0g.com",
-            "PYTHONPATH": ":".join(pypath),
-            "DD_LLMOBS_ML_APP": "unnamed-ml-app",
-            "_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
-        }
-    )
+    env.update({"PYTHONPATH": ":".join(pypath), "_DD_LLMOBS_EVALUATOR_INTERVAL": "5"})
     out, err, status, pid = run_python_code_in_subprocess(
         """
 import os
@@ -87,7 +79,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces
 ctx = logs_vcr.use_cassette("tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml")
 ctx.__enter__()
 atexit.register(lambda: ctx.__exit__())
-LLMObs.enable()
+LLMObs.enable(api_key="dummy-api-key", site="datad0g.com", ml_app="unnamed-ml-app")
LLMObs._instance._evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs))
 LLMObs._instance._evaluator_runner.start()
 LLMObs._instance._evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, None)
diff --git a/tests/llmobs/test_llmobs_ragas_evaluators.py b/tests/llmobs/test_llmobs_ragas_evaluators.py
index 9df6c392470..251b2642040 100644
--- a/tests/llmobs/test_llmobs_ragas_evaluators.py
+++ b/tests/llmobs/test_llmobs_ragas_evaluators.py
@@ -206,14 +206,11 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
         pypath.append(env["PYTHONPATH"])
     env.update(
         {
-            "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"),
-            "DD_SITE": "datad0g.com",
             "PYTHONPATH": ":".join(pypath),
             "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
-            "DD_LLMOBS_ML_APP": "unnamed-ml-app",
             "_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
             "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
-            "DD_LLMOBS_AGENTLESS_ENABLED": "1",
+            "DD_TRACE_ENABLED": "0",
         }
     )
     out, err, status, pid = run_python_code_in_subprocess(
@@ -232,14 +229,8 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
     )
 ctx.__enter__()
 atexit.register(lambda: ctx.__exit__())
-with mock.patch(
-    "ddtrace.internal.writer.HTTPWriter._send_payload",
-    return_value=Response(
-        status=200,
-        body="{}",
-    ),
-):
-    LLMObs.enable()
+with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload", return_value=Response(status=200, body="{}")):
+    LLMObs.enable(api_key="dummy-api-key", site="datad0g.com", ml_app="unnamed-ml-app", agentless_enabled=True)
 LLMObs._instance._evaluator_runner.enqueue(_llm_span_with_expected_ragas_inputs_in_messages(), None)
 """,
         env=env,

From 8702cab2464d12c6b1664a38213166ce31fdd63b Mon Sep 17 00:00:00 2001
From: Yun Kim <35776586+Yun-Kim@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:26:13 -0500
Subject: [PATCH 18/19] fix(llmobs): move listener hooks to enable instead of on init (#11889)

Follow up on #11781 to fix a duplicate span writing issue with the new listener hook logic. Since we were registering these hooks on `LLMObs.__init__()`, which also happens at startup (as we create a default LLMObs() instance), as well as on `LLMObs.enable()`, we were double-registering these hooks, and the default LLMObsSpanWriter was still saved and called each time the tracer finished a span. A symptom of this issue is that if a user were to manually enable agentless mode, they would see noisy logs indicating a failure to send spans to the agent proxy endpoint (which is the default writer mode) even though they also submitted spans to the agentless endpoint successfully.

This fix resolves the issue by moving hook registration to `LLMObs.enable()` and adding corresponding logic to deregister the hooks on `_stop_service()`. This way we should only ever have one set of hooks registered per process.
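For illustration, a minimal sketch (not part of the patch) of the lifecycle this change establishes. The `core.on`/`core.reset_listeners` calls mirror the internal event-hub API visible in the diff below; the class name and method bodies are placeholders:

```python
from ddtrace.internal import core  # internal event hub used by the patch


class LLMObsSketch:
    # Illustrative only: register hooks on enable, remove them on stop.

    def enable(self):
        # Registration happens here rather than in __init__, so the default
        # instance created at import time installs no hooks.
        core.on("trace.span_start", self._on_span_start)
        core.on("trace.span_finish", self._on_span_finish)

    def _stop_service(self):
        # Deregistering on stop guarantees at most one set of hooks per
        # process, even across repeated enable()/disable() cycles.
        core.reset_listeners("trace.span_start", self._on_span_start)
        core.reset_listeners("trace.span_finish", self._on_span_finish)

    def _on_span_start(self, span):
        pass  # placeholder: annotate LLM spans on start

    def _on_span_finish(self, span):
        pass  # placeholder: enqueue finished LLM spans to the active writer
```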
## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/llmobs/_llmobs.py | 14 ++++++++-- ...default-writer-hooks-5e456c2f7dfd4381.yaml | 4 +++ tests/llmobs/test_llmobs_service.py | 28 +++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 91c49306b35..b4f1dc1b2f6 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -111,9 +111,9 @@ def __init__(self, tracer=None): self._annotations = [] self._annotation_context_lock = forksafe.RLock() - # Register hooks for span events - core.on("trace.span_start", self._do_annotations) - core.on("trace.span_finish", self._on_span_finish) + def _on_span_start(self, span): + if self.enabled and span.span_type == SpanTypes.LLM: + self._do_annotations(span) def _on_span_finish(self, span): if self.enabled and span.span_type == SpanTypes.LLM: @@ -272,6 +272,10 @@ def _start_service(self) -> None: log.debug("Error starting evaluator runner") def _stop_service(self) -> None: + # Remove listener hooks for span events + core.reset_listeners("trace.span_start", self._on_span_start) + core.reset_listeners("trace.span_finish", self._on_span_finish) + try: self._evaluator_runner.stop() # flush remaining evaluation spans & evaluations @@ -366,6 +370,10 @@ def enable( cls.enabled = True cls._instance.start() + # Register hooks for span events + core.on("trace.span_start", cls._instance._on_span_start) + core.on("trace.span_finish", cls._instance._on_span_finish) + atexit.register(cls.disable) telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, True) diff --git a/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml b/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml new file mode 100644 index 00000000000..702e2538b99 --- /dev/null +++ b/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + LLM Observability: Resolves an issue where enabling LLM Observability in agentless mode would result in traces also being sent to the agent 
proxy endpoint.
\ No newline at end of file
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index 4c7f207066e..dad6accdcfb 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -1310,6 +1310,34 @@ def test_activate_distributed_headers_activates_context(llmobs, mock_llmobs_logs
         mock_activate.assert_called_once_with(dummy_context)
 
 
+def test_listener_hooks_enqueue_correct_writer(run_python_code_in_subprocess):
+    """
+    Regression test that ensures that listener hooks enqueue span events to the correct writer,
+    not the default writer created at startup.
+    """
+    env = os.environ.copy()
+    pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))]
+    if "PYTHONPATH" in env:
+        pypath.append(env["PYTHONPATH"])
+    env.update({"PYTHONPATH": ":".join(pypath), "DD_TRACE_ENABLED": "0"})
+    out, err, status, pid = run_python_code_in_subprocess(
+        """
+from ddtrace.llmobs import LLMObs
+
+LLMObs.enable(ml_app="repro-issue", agentless_enabled=True, api_key="foobar.baz", site="datad0g.com")
+with LLMObs.agent("dummy"):
+    pass
+""",
+        env=env,
+    )
+    assert status == 0, err
+    assert out == b""
+    agentless_writer_log = b"failed to send traces to intake at https://llmobs-intake.datad0g.com/api/v2/llmobs: HTTP error status 403, reason Forbidden\n"  # noqa: E501
+    agent_proxy_log = b"failed to send, dropping 1 traces to intake at http://localhost:8126/evp_proxy/v2/api/v2/llmobs after 5 retries"  # noqa: E501
+    assert err == agentless_writer_log
+    assert agent_proxy_log not in err
+
+
 def test_llmobs_fork_recreates_and_restarts_span_writer():
     """Test that forking a process correctly recreates and restarts the LLMObsSpanWriter."""
     with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"):

From d7927e6296679a3d7def5b31347262b7cce8b7dd Mon Sep 17 00:00:00 2001
From: "Gabriele N. Tornetta"
Date: Fri, 10 Jan 2025 18:45:32 +0000
Subject: [PATCH 19/19] fix(er): include nonlocals in snapshots (#11894)

We include nonlocal variables in snapshots to provide better visibility into exception occurrences.
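For context, a small standalone example (not part of the patch) of why closure variables need this treatment: they live in `co_freevars`/`co_cellvars`, not in the `co_varnames` slice that `get_locals` previously walked:

```python
import inspect


def outer():
    nonloc = 4  # cell variable of outer(), free variable of inner()

    def inner():
        code = inspect.currentframe().f_code
        # 'nonloc' is not in co_varnames, so a co_varnames-only walk of the
        # frame would never surface it in a snapshot.
        assert "nonloc" not in code.co_varnames
        # It is exposed through co_freevars, which the fix now iterates too.
        assert "nonloc" in code.co_freevars
        return nonloc

    assert "nonloc" in outer.__code__.co_cellvars
    return inner()


outer()
```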
## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [ ] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/debugging/_safety.py | 9 +++++---- ...er-include-nonlocals-bbeecfbbbde35496.yaml | 4 ++++ tests/debugging/exception/test_replay.py | 20 +++++++++++++++++++ tests/debugging/test_safety.py | 5 ++++- 4 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml diff --git a/ddtrace/debugging/_safety.py b/ddtrace/debugging/_safety.py index 118deddef40..92b38ff6bdc 100644 --- a/ddtrace/debugging/_safety.py +++ b/ddtrace/debugging/_safety.py @@ -1,5 +1,6 @@ from inspect import CO_VARARGS from inspect import CO_VARKEYWORDS +from itertools import chain from types import FrameType from typing import Any from typing import Dict @@ -23,11 +24,11 @@ def get_args(frame: FrameType) -> Iterator[Tuple[str, Any]]: def get_locals(frame: FrameType) -> Iterator[Tuple[str, Any]]: code = frame.f_code + _locals = frame.f_locals nargs = code.co_argcount + bool(code.co_flags & CO_VARARGS) + bool(code.co_flags & CO_VARKEYWORDS) - names = code.co_varnames[nargs:] - values = (frame.f_locals.get(name) for name in names) - - return zip(names, values) + return ( + (name, _locals.get(name)) for name in chain(code.co_varnames[nargs:], code.co_freevars, code.co_cellvars) + ) # include freevars and cellvars def get_globals(frame: FrameType) -> Iterator[Tuple[str, Any]]: diff --git a/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml b/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml new file mode 100644 index 00000000000..4d77fddb710 --- /dev/null +++ b/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + exception replay: include missing nonlocal variables in snapshot log messages. 
diff --git a/tests/debugging/exception/test_replay.py b/tests/debugging/exception/test_replay.py index 9aae75dae47..8261bfb5b47 100644 --- a/tests/debugging/exception/test_replay.py +++ b/tests/debugging/exception/test_replay.py @@ -294,3 +294,23 @@ def c(foo=42): self.assert_span_count(6) # no new snapshots assert len(uploader.collector.queue) == 3 + + def test_debugger_exception_in_closure(self): + def b(): + with self.trace("b"): + nonloc = 4 + + def a(v): + if nonloc: + raise ValueError("hello", v) + + a(nonloc) + + with exception_replay() as uploader: + with with_rate_limiter(RateLimiter(limit_rate=1, raise_on_exceed=False)): + with pytest.raises(ValueError): + b() + + assert all( + s.line_capture["locals"]["nonloc"] == {"type": "int", "value": "4"} for s in uploader.collector.queue + ) diff --git a/tests/debugging/test_safety.py b/tests/debugging/test_safety.py index 3acb0288924..cc44ca9ca12 100644 --- a/tests/debugging/test_safety.py +++ b/tests/debugging/test_safety.py @@ -15,7 +15,10 @@ def assert_args(args): assert set(dict(_safety.get_args(inspect.currentframe().f_back)).keys()) == args def assert_locals(_locals): - assert set(dict(_safety.get_locals(inspect.currentframe().f_back)).keys()) == _locals + assert set(dict(_safety.get_locals(inspect.currentframe().f_back)).keys()) == _locals | { + "assert_args", + "assert_locals", + } def assert_globals(_globals): assert set(dict(_safety.get_globals(inspect.currentframe().f_back)).keys()) == _globals
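As a quick sanity check of the new behavior, something along these lines should work against a dd-trace-py checkout with this patch applied (`_safety` is the internal module touched above, so this is illustrative rather than a supported API):

```python
import inspect

from ddtrace.debugging import _safety


def make_counter():
    count = 0  # free variable of bump(); previously absent from get_locals()

    def bump():
        # With the patch, get_locals() also walks co_freevars/co_cellvars,
        # so 'count' shows up alongside the frame's plain locals.
        return dict(_safety.get_locals(inspect.currentframe()))

    return bump


assert make_counter()()["count"] == 0
```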