diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d281fe80148..23a48b6f344 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -110,6 +110,7 @@ benchmarks/base/aspects_benchmarks_generate.py @DataDog/asm-python ddtrace/appsec/ @DataDog/asm-python ddtrace/settings/asm.py @DataDog/asm-python ddtrace/contrib/subprocess/ @DataDog/asm-python +ddtrace/contrib/internal/subprocess/ @DataDog/asm-python ddtrace/contrib/flask_login/ @DataDog/asm-python ddtrace/contrib/webbrowser @DataDog/asm-python ddtrace/contrib/urllib @DataDog/asm-python diff --git a/.riot/requirements/16562eb.txt b/.riot/requirements/16562eb.txt new file mode 100644 index 00000000000..e2aac88c146 --- /dev/null +++ b/.riot/requirements/16562eb.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.7 +# by the following command: +# +# pip-compile --allow-unsafe --config=pyproject.toml --no-annotate --resolver=backtracking .riot/requirements/16562eb.in +# +attrs==24.2.0 +coverage[toml]==7.2.7 +exceptiongroup==1.2.2 +hypothesis==6.45.0 +idna==3.10 +importlib-metadata==6.7.0 +iniconfig==2.0.0 +mock==5.1.0 +multidict==6.0.5 +opentracing==2.4.0 +packaging==24.0 +pluggy==1.2.0 +pytest==7.4.4 +pytest-asyncio==0.21.1 +pytest-cov==4.1.0 +pytest-mock==3.11.1 +pyyaml==6.0.1 +six==1.17.0 +sortedcontainers==2.4.0 +tomli==2.0.1 +typing-extensions==4.7.1 +urllib3==1.26.20 +vcrpy==4.4.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.15.0 diff --git a/ddtrace/_trace/tracer.py b/ddtrace/_trace/tracer.py index f815e0f184e..af9b09d3e02 100644 --- a/ddtrace/_trace/tracer.py +++ b/ddtrace/_trace/tracer.py @@ -41,6 +41,7 @@ from ddtrace.internal.atexit import register_on_exit_signal from ddtrace.internal.constants import SAMPLING_DECISION_TRACE_TAG_KEY from ddtrace.internal.constants import SPAN_API_DATADOG +from ddtrace.internal.core import dispatch from ddtrace.internal.dogstatsd import get_dogstatsd_client from ddtrace.internal.logger import get_logger from ddtrace.internal.peer_service.processor import PeerServiceProcessor @@ -849,7 +850,7 @@ def _start_span( for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors): p.on_span_start(span) self._hooks.emit(self.__class__.start_span, span) - + dispatch("trace.span_start", (span,)) return span start_span = _start_span @@ -866,6 +867,8 @@ def _on_span_finish(self, span: Span) -> None: for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors): p.on_span_finish(span) + dispatch("trace.span_finish", (span,)) + if log.isEnabledFor(logging.DEBUG): log.debug("finishing span %s (enabled:%s)", span._pprint(), self.enabled) @@ -940,18 +943,23 @@ def trace( ) def current_root_span(self) -> Optional[Span]: - """Returns the root span of the current execution. + """Returns the local root span of the current execution/process. + + Note: This cannot be used to access the true root span of the trace + in a distributed tracing setup if the actual root span occurred in + another execution/process. - This is useful for attaching information related to the trace as a - whole without needing to add to child spans. + This is useful for attaching information to the local root span + of the current execution/process, which is often also the service + entry span. 
For example:: - # get the root span - root_span = tracer.current_root_span() + # get the local root span + local_root_span = tracer.current_root_span() # set the host just once on the root span - if root_span: - root_span.set_tag('host', '127.0.0.1') + if local_root_span: + local_root_span.set_tag('host', '127.0.0.1') """ span = self.current_span() if span is None: diff --git a/ddtrace/appsec/_capabilities.py b/ddtrace/appsec/_capabilities.py index c173f2d6471..c999b61cb97 100644 --- a/ddtrace/appsec/_capabilities.py +++ b/ddtrace/appsec/_capabilities.py @@ -31,6 +31,7 @@ class Flags(enum.IntFlag): ASM_SESSION_FINGERPRINT = 1 << 33 ASM_NETWORK_FINGERPRINT = 1 << 34 ASM_HEADER_FINGERPRINT = 1 << 35 + ASM_RASP_CMDI = 1 << 37 _ALL_ASM_BLOCKING = ( @@ -49,7 +50,7 @@ class Flags(enum.IntFlag): | Flags.ASM_HEADER_FINGERPRINT ) -_ALL_RASP = Flags.ASM_RASP_SQLI | Flags.ASM_RASP_LFI | Flags.ASM_RASP_SSRF | Flags.ASM_RASP_SHI +_ALL_RASP = Flags.ASM_RASP_SQLI | Flags.ASM_RASP_LFI | Flags.ASM_RASP_SSRF | Flags.ASM_RASP_SHI | Flags.ASM_RASP_CMDI _FEATURE_REQUIRED = Flags.ASM_ACTIVATION | Flags.ASM_AUTO_USER diff --git a/ddtrace/appsec/_common_module_patches.py b/ddtrace/appsec/_common_module_patches.py index 215d8b05ee6..0b455dbba6b 100644 --- a/ddtrace/appsec/_common_module_patches.py +++ b/ddtrace/appsec/_common_module_patches.py @@ -7,16 +7,20 @@ from typing import Callable from typing import Dict from typing import Iterable +from typing import List +from typing import Union from wrapt import FunctionWrapper from wrapt import resolve_path import ddtrace from ddtrace.appsec._asm_request_context import get_blocked +from ddtrace.appsec._constants import EXPLOIT_PREVENTION from ddtrace.appsec._constants import WAF_ACTIONS from ddtrace.appsec._iast._iast_request_context import is_iast_request_enabled from ddtrace.appsec._iast._metrics import _set_metric_iast_instrumented_sink from ddtrace.appsec._iast.constants import VULN_PATH_TRAVERSAL +import ddtrace.contrib.internal.subprocess.patch as subprocess_patch from ddtrace.internal import core from ddtrace.internal._exceptions import BlockingException from ddtrace.internal._unpatched import _gc as gc @@ -30,6 +34,9 @@ _is_patched = False +_RASP_SYSTEM = "rasp_os.system" +_RASP_POPEN = "rasp_Popen" + def patch_common_modules(): global _is_patched @@ -39,7 +46,10 @@ def patch_common_modules(): try_wrap_function_wrapper("urllib.request", "OpenerDirector.open", wrapped_open_ED4CF71136E15EBF) try_wrap_function_wrapper("_io", "BytesIO.read", wrapped_read_F3E51D71B4EC16EF) try_wrap_function_wrapper("_io", "StringIO.read", wrapped_read_F3E51D71B4EC16EF) - try_wrap_function_wrapper("os", "system", wrapped_system_5542593D237084A7) + # ensure that the subprocess patch is applied even after one click activation + subprocess_patch.patch() + subprocess_patch.add_str_callback(_RASP_SYSTEM, wrapped_system_5542593D237084A7) + subprocess_patch.add_lst_callback(_RASP_POPEN, popen_FD233052260D8B4D) core.on("asm.block.dbapi.execute", execute_4C9BAC8E228EB347) if asm_config._iast_enabled: _set_metric_iast_instrumented_sink(VULN_PATH_TRAVERSAL) @@ -54,6 +64,8 @@ def unpatch_common_modules(): try_unwrap("urllib.request", "OpenerDirector.open") try_unwrap("_io", "BytesIO.read") try_unwrap("_io", "StringIO.read") + subprocess_patch.del_str_callback(_RASP_SYSTEM) + subprocess_patch.del_lst_callback(_RASP_POPEN) _is_patched = False @@ -106,7 +118,6 @@ def wrapped_open_CFDDB7ABBA9081B6(original_open_callable, instance, args, kwargs try: from ddtrace.appsec._asm_request_context import 
call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -124,7 +135,9 @@ def wrapped_open_CFDDB7ABBA9081B6(original_open_callable, instance, args, kwargs rule_type=EXPLOIT_PREVENTION.TYPE.LFI, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "lfi", filename) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.LFI, filename + ) try: return original_open_callable(*args, **kwargs) except Exception as e: @@ -151,7 +164,6 @@ def wrapped_open_ED4CF71136E15EBF(original_open_callable, instance, args, kwargs try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -168,7 +180,9 @@ def wrapped_open_ED4CF71136E15EBF(original_open_callable, instance, args, kwargs rule_type=EXPLOIT_PREVENTION.TYPE.SSRF, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "ssrf", url) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SSRF, url + ) return original_open_callable(*args, **kwargs) @@ -191,7 +205,6 @@ def wrapped_request_D8CB81E472AF98A2(original_request_callable, instance, args, try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -206,50 +219,67 @@ def wrapped_request_D8CB81E472AF98A2(original_request_callable, instance, args, rule_type=EXPLOIT_PREVENTION.TYPE.SSRF, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "ssrf", url) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SSRF, url + ) return original_request_callable(*args, **kwargs) -def wrapped_system_5542593D237084A7(original_command_callable, instance, args, kwargs): +def wrapped_system_5542593D237084A7(command: str) -> None: """ wrapper for os.system function """ - command = args[0] if args else kwargs.get("command", None) - if command is not None: - if asm_config._iast_enabled and is_iast_request_enabled(): - from ddtrace.appsec._iast.taint_sinks.command_injection import _iast_report_cmdi - - _iast_report_cmdi(command) - - if ( - asm_config._asm_enabled - and asm_config._ep_enabled - and ddtrace.tracer._appsec_processor is not None - and ddtrace.tracer._appsec_processor.rasp_cmdi_enabled - ): - try: - from ddtrace.appsec._asm_request_context import call_waf_callback - from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION - except ImportError: - return original_command_callable(*args, **kwargs) - - if in_asm_context(): - res = call_waf_callback( - {EXPLOIT_PREVENTION.ADDRESS.CMDI: command}, - crop_trace="wrapped_system_5542593D237084A7", - rule_type=EXPLOIT_PREVENTION.TYPE.CMDI, + if ( + asm_config._asm_enabled + and asm_config._ep_enabled + and ddtrace.tracer._appsec_processor is not None + and 
ddtrace.tracer._appsec_processor.rasp_shi_enabled + ): + try: + from ddtrace.appsec._asm_request_context import call_waf_callback + from ddtrace.appsec._asm_request_context import in_asm_context + except ImportError: + return + + if in_asm_context(): + res = call_waf_callback( + {EXPLOIT_PREVENTION.ADDRESS.SHI: command}, + crop_trace="wrapped_system_5542593D237084A7", + rule_type=EXPLOIT_PREVENTION.TYPE.SHI, + ) + if res and _must_block(res.actions): + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SHI, command + ) + + +def popen_FD233052260D8B4D(arg_list: Union[List[str], str]) -> None: + """ + listener for subprocess.Popen class + """ + if ( + asm_config._asm_enabled + and asm_config._ep_enabled + and ddtrace.tracer._appsec_processor is not None + and ddtrace.tracer._appsec_processor.rasp_cmdi_enabled + ): + try: + from ddtrace.appsec._asm_request_context import call_waf_callback + from ddtrace.appsec._asm_request_context import in_asm_context + except ImportError: + return + + if in_asm_context(): + res = call_waf_callback( + {EXPLOIT_PREVENTION.ADDRESS.CMDI: arg_list if isinstance(arg_list, list) else [arg_list]}, + crop_trace="popen_FD233052260D8B4D", + rule_type=EXPLOIT_PREVENTION.TYPE.CMDI, + ) + if res and _must_block(res.actions): + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.CMDI, arg_list ) - if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "cmdi", command) - try: - return original_command_callable(*args, **kwargs) - except Exception as e: - previous_frame = e.__traceback__.tb_frame.f_back - raise e.with_traceback( - e.__traceback__.__class__(None, previous_frame, previous_frame.f_lasti, previous_frame.f_lineno) - ) _DB_DIALECTS = { @@ -279,7 +309,6 @@ def execute_4C9BAC8E228EB347(instrument_self, query, args, kwargs) -> None: try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # execute is used during module initialization # and shouldn't be changed at that time @@ -296,7 +325,9 @@ def execute_4C9BAC8E228EB347(instrument_self, query, args, kwargs) -> None: rule_type=EXPLOIT_PREVENTION.TYPE.SQLI, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "sqli", query) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SQLI, query + ) def try_unwrap(module, name): diff --git a/ddtrace/appsec/_constants.py b/ddtrace/appsec/_constants.py index 83cb53e78ff..45a96834cc1 100644 --- a/ddtrace/appsec/_constants.py +++ b/ddtrace/appsec/_constants.py @@ -202,7 +202,8 @@ class WAF_DATA_NAMES(metaclass=Constant_Class): # EPHEMERAL ADDRESSES PROCESSOR_SETTINGS: Literal["waf.context.processor"] = "waf.context.processor" - CMDI_ADDRESS: Literal["server.sys.shell.cmd"] = "server.sys.shell.cmd" + CMDI_ADDRESS: Literal["server.sys.exec.cmd"] = "server.sys.exec.cmd" + SHI_ADDRESS: Literal["server.sys.shell.cmd"] = "server.sys.shell.cmd" LFI_ADDRESS: Literal["server.io.fs.file"] = "server.io.fs.file" SSRF_ADDRESS: Literal["server.io.net.url"] = "server.io.net.url" SQLI_ADDRESS: Literal["server.db.statement"] = "server.db.statement" @@ -328,6 +329,7 @@ class DEFAULT(metaclass=Constant_Class): class EXPLOIT_PREVENTION(metaclass=Constant_Class): + BLOCKING: Literal["exploit_prevention"] = 
"exploit_prevention" STACK_TRACE_ID: Literal["stack_id"] = "stack_id" EP_ENABLED: Literal["DD_APPSEC_RASP_ENABLED"] = "DD_APPSEC_RASP_ENABLED" STACK_TRACE_ENABLED: Literal["DD_APPSEC_STACK_TRACE_ENABLED"] = "DD_APPSEC_STACK_TRACE_ENABLED" @@ -339,6 +341,7 @@ class EXPLOIT_PREVENTION(metaclass=Constant_Class): class TYPE(metaclass=Constant_Class): CMDI: Literal["command_injection"] = "command_injection" + SHI: Literal["shell_injection"] = "shell_injection" LFI: Literal["lfi"] = "lfi" SSRF: Literal["ssrf"] = "ssrf" SQLI: Literal["sql_injection"] = "sql_injection" @@ -346,6 +349,7 @@ class TYPE(metaclass=Constant_Class): class ADDRESS(metaclass=Constant_Class): CMDI: Literal["CMDI_ADDRESS"] = "CMDI_ADDRESS" LFI: Literal["LFI_ADDRESS"] = "LFI_ADDRESS" + SHI: Literal["SHI_ADDRESS"] = "SHI_ADDRESS" SSRF: Literal["SSRF_ADDRESS"] = "SSRF_ADDRESS" SQLI: Literal["SQLI_ADDRESS"] = "SQLI_ADDRESS" SQLI_TYPE: Literal["SQLI_SYSTEM_ADDRESS"] = "SQLI_SYSTEM_ADDRESS" diff --git a/ddtrace/appsec/_iast/_ast/ast_patching.py b/ddtrace/appsec/_iast/_ast/ast_patching.py index 7e2258bd556..2c7e958d087 100644 --- a/ddtrace/appsec/_iast/_ast/ast_patching.py +++ b/ddtrace/appsec/_iast/_ast/ast_patching.py @@ -7,6 +7,7 @@ from sys import version_info import textwrap from types import ModuleType +from typing import Iterable from typing import Optional from typing import Text from typing import Tuple @@ -327,6 +328,49 @@ log = get_logger(__name__) +class _TrieNode: + __slots__ = ("children", "is_end") + + def __init__(self): + self.children = {} + self.is_end = False + + def __iter__(self): + if self.is_end: + yield ("", None) + else: + for k, v in self.children.items(): + yield (k, dict(v)) + + +def build_trie(words: Iterable[str]) -> _TrieNode: + root = _TrieNode() + for word in words: + node = root + for char in word: + if char not in node.children: + node.children[char] = _TrieNode() + node = node.children[char] + node.is_end = True + return root + + +_TRIE_ALLOWLIST = build_trie(IAST_ALLOWLIST) +_TRIE_DENYLIST = build_trie(IAST_DENYLIST) + + +def _trie_has_prefix_for(trie: _TrieNode, string: str) -> bool: + node = trie + for char in string: + node = node.children.get(char) + if not node: + return False + + if node.is_end: + return True + return node.is_end + + def get_encoding(module_path: Text) -> Text: """ First tries to detect the encoding for the file, @@ -341,11 +385,11 @@ def get_encoding(module_path: Text) -> Text: return ENCODING -_NOT_PATCH_MODULE_NAMES = _stdlib_for_python_version() | set(builtin_module_names) +_NOT_PATCH_MODULE_NAMES = {i.lower() for i in _stdlib_for_python_version() | set(builtin_module_names)} def _in_python_stdlib(module_name: str) -> bool: - return module_name.split(".")[0].lower() in [x.lower() for x in _NOT_PATCH_MODULE_NAMES] + return module_name.split(".")[0].lower() in _NOT_PATCH_MODULE_NAMES def _should_iast_patch(module_name: Text) -> bool: @@ -359,10 +403,10 @@ def _should_iast_patch(module_name: Text) -> bool: # diff = max_allow - max_deny # return diff > 0 or (diff == 0 and not _in_python_stdlib_or_third_party(module_name)) dotted_module_name = module_name.lower() + "." - if dotted_module_name.startswith(IAST_ALLOWLIST): + if _trie_has_prefix_for(_TRIE_ALLOWLIST, dotted_module_name): log.debug("IAST: allowing %s. it's in the IAST_ALLOWLIST", module_name) return True - if dotted_module_name.startswith(IAST_DENYLIST): + if _trie_has_prefix_for(_TRIE_DENYLIST, dotted_module_name): log.debug("IAST: denying %s. 
it's in the IAST_DENYLIST", module_name) return False if _in_python_stdlib(module_name): diff --git a/ddtrace/appsec/_iast/_pytest_plugin.py b/ddtrace/appsec/_iast/_pytest_plugin.py index 672acc4a031..82c23c53174 100644 --- a/ddtrace/appsec/_iast/_pytest_plugin.py +++ b/ddtrace/appsec/_iast/_pytest_plugin.py @@ -27,6 +27,8 @@ def ddtrace_iast(request, ddspan): Optionally output the test as failed if vulnerabilities are found. """ yield + if ddspan is None: + return data = ddspan.get_tag(IAST.JSON) if not data: return diff --git a/ddtrace/appsec/_iast/taint_sinks/command_injection.py b/ddtrace/appsec/_iast/taint_sinks/command_injection.py index ee22b294bfc..2607c6c9447 100644 --- a/ddtrace/appsec/_iast/taint_sinks/command_injection.py +++ b/ddtrace/appsec/_iast/taint_sinks/command_injection.py @@ -1,18 +1,15 @@ -import os -import subprocess # nosec from typing import List from typing import Union -from ddtrace.appsec._common_module_patches import try_unwrap from ddtrace.appsec._constants import IAST_SPAN_TAGS from ddtrace.appsec._iast import oce from ddtrace.appsec._iast._iast_request_context import is_iast_request_enabled from ddtrace.appsec._iast._metrics import _set_metric_iast_executed_sink from ddtrace.appsec._iast._metrics import _set_metric_iast_instrumented_sink from ddtrace.appsec._iast._metrics import increment_iast_span_metric -from ddtrace.appsec._iast._patch import try_wrap_function_wrapper from ddtrace.appsec._iast._taint_tracking._taint_objects import is_pyobject_tainted from ddtrace.appsec._iast.constants import VULN_CMDI +import ddtrace.contrib.internal.subprocess.patch as subprocess_patch from ddtrace.internal.logger import get_logger from ddtrace.settings.asm import config as asm_config @@ -26,48 +23,20 @@ def get_version() -> str: return "" -def patch(): - if not asm_config._iast_enabled: - return - - if not getattr(os, "_datadog_cmdi_patch", False): - # all os.spawn* variants eventually use this one: - try_wrap_function_wrapper("os", "_spawnvef", _iast_cmdi_osspawn) - - if not getattr(subprocess, "_datadog_cmdi_patch", False): - try_wrap_function_wrapper("subprocess", "Popen.__init__", _iast_cmdi_subprocess_init) +_IAST_CMDI = "iast_cmdi" - os._datadog_cmdi_patch = True - subprocess._datadog_cmdi_patch = True - _set_metric_iast_instrumented_sink(VULN_CMDI) +def patch(): + if asm_config._iast_enabled: + subprocess_patch.patch() + subprocess_patch.add_str_callback(_IAST_CMDI, _iast_report_cmdi) + subprocess_patch.add_lst_callback(_IAST_CMDI, _iast_report_cmdi) + _set_metric_iast_instrumented_sink(VULN_CMDI) def unpatch() -> None: - try_unwrap("os", "system") - try_unwrap("os", "_spawnvef") - try_unwrap("subprocess", "Popen.__init__") - - os._datadog_cmdi_patch = False # type: ignore[attr-defined] - subprocess._datadog_cmdi_patch = False # type: ignore[attr-defined] - - -def _iast_cmdi_osspawn(wrapped, instance, args, kwargs): - mode, file, func_args, _, _ = args - _iast_report_cmdi(func_args) - - if hasattr(wrapped, "__func__"): - return wrapped.__func__(instance, *args, **kwargs) - return wrapped(*args, **kwargs) - - -def _iast_cmdi_subprocess_init(wrapped, instance, args, kwargs): - cmd_args = args[0] if len(args) else kwargs["args"] - _iast_report_cmdi(cmd_args) - - if hasattr(wrapped, "__func__"): - return wrapped.__func__(instance, *args, **kwargs) - return wrapped(*args, **kwargs) + subprocess_patch.del_str_callback(_IAST_CMDI) + subprocess_patch.del_lst_callback(_IAST_CMDI) @oce.register diff --git a/ddtrace/appsec/_metrics.py b/ddtrace/appsec/_metrics.py index 
f8713dc5ea7..cbe8490d717 100644 --- a/ddtrace/appsec/_metrics.py +++ b/ddtrace/appsec/_metrics.py @@ -1,4 +1,5 @@ from ddtrace.appsec import _asm_request_context +from ddtrace.appsec import _constants from ddtrace.appsec._ddwaf import version as _version from ddtrace.appsec._deduplications import deduplication from ddtrace.internal import telemetry @@ -64,6 +65,15 @@ def _set_waf_init_metric(info): log.warning("Error reporting ASM WAF init metrics", exc_info=True) +_TYPES_AND_TAGS = { + _constants.EXPLOIT_PREVENTION.TYPE.CMDI: (("rule_type", "command_injection"), ("rule_variant", "exec")), + _constants.EXPLOIT_PREVENTION.TYPE.SHI: (("rule_type", "command_injection"), ("rule_variant", "shell")), + _constants.EXPLOIT_PREVENTION.TYPE.LFI: (("rule_type", "lfi"),), + _constants.EXPLOIT_PREVENTION.TYPE.SSRF: (("rule_type", "ssrf"),), + _constants.EXPLOIT_PREVENTION.TYPE.SQLI: (("rule_type", "sql_injection"),), +} + + def _set_waf_request_metrics(*args): try: result = _asm_request_context.get_waf_telemetry_results() @@ -94,10 +104,7 @@ def _set_waf_request_metrics(*args): TELEMETRY_NAMESPACE_TAG_APPSEC, n, float(value), - tags=( - ("rule_type", rule_type), - ("waf_version", DDWAF_VERSION), - ), + tags=_TYPES_AND_TAGS.get(rule_type, ()) + (("waf_version", DDWAF_VERSION),), ) except Exception: diff --git a/ddtrace/appsec/_processor.py b/ddtrace/appsec/_processor.py index 06328d1201a..54a9f624afe 100644 --- a/ddtrace/appsec/_processor.py +++ b/ddtrace/appsec/_processor.py @@ -202,6 +202,10 @@ def _update_rules(self, new_rules: Dict[str, Any]) -> bool: def rasp_lfi_enabled(self) -> bool: return WAF_DATA_NAMES.LFI_ADDRESS in self._addresses_to_keep + @property + def rasp_shi_enabled(self) -> bool: + return WAF_DATA_NAMES.SHI_ADDRESS in self._addresses_to_keep + @property def rasp_cmdi_enabled(self) -> bool: return WAF_DATA_NAMES.CMDI_ADDRESS in self._addresses_to_keep diff --git a/ddtrace/appsec/_python_info/stdlib/__init__.py b/ddtrace/appsec/_python_info/stdlib/__init__.py index a040e57f859..e745c392f55 100644 --- a/ddtrace/appsec/_python_info/stdlib/__init__.py +++ b/ddtrace/appsec/_python_info/stdlib/__init__.py @@ -19,5 +19,5 @@ from .module_names_py312 import STDLIB_MODULE_NAMES -def _stdlib_for_python_version(): # type: () -> set +def _stdlib_for_python_version(): # type: () -> set[str] return STDLIB_MODULE_NAMES diff --git a/ddtrace/contrib/internal/openai/_endpoint_hooks.py b/ddtrace/contrib/internal/openai/_endpoint_hooks.py index 73a2b2511c9..979e1774a8a 100644 --- a/ddtrace/contrib/internal/openai/_endpoint_hooks.py +++ b/ddtrace/contrib/internal/openai/_endpoint_hooks.py @@ -255,6 +255,14 @@ def _record_request(self, pin, integration, span, args, kwargs): span.set_tag_str("openai.request.messages.%d.content" % idx, integration.trunc(str(content))) span.set_tag_str("openai.request.messages.%d.role" % idx, str(role)) span.set_tag_str("openai.request.messages.%d.name" % idx, str(name)) + if parse_version(OPENAI_VERSION) >= (1, 26) and kwargs.get("stream"): + if kwargs.get("stream_options", {}).get("include_usage", None) is not None: + # Only perform token chunk auto-extraction if this option is not explicitly set + return + span._set_ctx_item("_dd.auto_extract_token_chunk", True) + stream_options = kwargs.get("stream_options", {}) + stream_options["include_usage"] = True + kwargs["stream_options"] = stream_options def _record_response(self, pin, integration, span, args, kwargs, resp, error): resp = super()._record_response(pin, integration, span, args, kwargs, resp, error) diff --git 
a/ddtrace/contrib/internal/openai/utils.py b/ddtrace/contrib/internal/openai/utils.py index d967383e366..f5dfc10efef 100644 --- a/ddtrace/contrib/internal/openai/utils.py +++ b/ddtrace/contrib/internal/openai/utils.py @@ -48,11 +48,28 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.__wrapped__.__exit__(exc_type, exc_val, exc_tb) def __iter__(self): - return self + exception_raised = False + try: + for chunk in self.__wrapped__: + self._extract_token_chunk(chunk) + yield chunk + _loop_handler(self._dd_span, chunk, self._streamed_chunks) + except Exception: + self._dd_span.set_exc_info(*sys.exc_info()) + exception_raised = True + raise + finally: + if not exception_raised: + _process_finished_stream( + self._dd_integration, self._dd_span, self._kwargs, self._streamed_chunks, self._is_completion + ) + self._dd_span.finish() + self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) def __next__(self): try: chunk = self.__wrapped__.__next__() + self._extract_token_chunk(chunk) _loop_handler(self._dd_span, chunk, self._streamed_chunks) return chunk except StopIteration: @@ -68,6 +85,22 @@ def __next__(self): self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) raise + def _extract_token_chunk(self, chunk): + """Attempt to extract the token chunk (last chunk in the stream) from the streamed response.""" + if not self._dd_span._get_ctx_item("_dd.auto_extract_token_chunk"): + return + choice = getattr(chunk, "choices", [None])[0] + if not getattr(choice, "finish_reason", None): + # Only the second-last chunk in the stream with token usage enabled will have finish_reason set + return + try: + # User isn't expecting last token chunk to be present since it's not part of the default streamed response, + # so we consume it and extract the token usage metadata before it reaches the user. 
+ usage_chunk = self.__wrapped__.__next__() + self._streamed_chunks[0].insert(0, usage_chunk) + except (StopIteration, GeneratorExit): + return + class TracedOpenAIAsyncStream(BaseTracedOpenAIStream): async def __aenter__(self): @@ -77,12 +110,29 @@ async def __aenter__(self): async def __aexit__(self, exc_type, exc_val, exc_tb): await self.__wrapped__.__aexit__(exc_type, exc_val, exc_tb) - def __aiter__(self): - return self + async def __aiter__(self): + exception_raised = False + try: + async for chunk in self.__wrapped__: + await self._extract_token_chunk(chunk) + yield chunk + _loop_handler(self._dd_span, chunk, self._streamed_chunks) + except Exception: + self._dd_span.set_exc_info(*sys.exc_info()) + exception_raised = True + raise + finally: + if not exception_raised: + _process_finished_stream( + self._dd_integration, self._dd_span, self._kwargs, self._streamed_chunks, self._is_completion + ) + self._dd_span.finish() + self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) async def __anext__(self): try: chunk = await self.__wrapped__.__anext__() + await self._extract_token_chunk(chunk) _loop_handler(self._dd_span, chunk, self._streamed_chunks) return chunk except StopAsyncIteration: @@ -98,6 +148,19 @@ async def __anext__(self): self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) raise + async def _extract_token_chunk(self, chunk): + """Attempt to extract the token chunk (last chunk in the stream) from the streamed response.""" + if not self._dd_span._get_ctx_item("_dd.auto_extract_token_chunk"): + return + choice = getattr(chunk, "choices", [None])[0] + if not getattr(choice, "finish_reason", None): + return + try: + usage_chunk = await self.__wrapped__.__anext__() + self._streamed_chunks[0].insert(0, usage_chunk) + except (StopAsyncIteration, GeneratorExit): + return + def _compute_token_count(content, model): # type: (Union[str, List[int]], Optional[str]) -> Tuple[bool, int] diff --git a/ddtrace/contrib/internal/subprocess/patch.py b/ddtrace/contrib/internal/subprocess/patch.py index 7380e72fdaf..76530c195df 100644 --- a/ddtrace/contrib/internal/subprocess/patch.py +++ b/ddtrace/contrib/internal/subprocess/patch.py @@ -4,8 +4,8 @@ import os import re import shlex -import subprocess # nosec from threading import RLock +from typing import Callable # noqa:F401 from typing import Deque # noqa:F401 from typing import Dict # noqa:F401 from typing import List # noqa:F401 @@ -33,45 +33,71 @@ ) -def get_version(): - # type: () -> str +def get_version() -> str: return "" -def patch(): - # type: () -> List[str] - patched = [] # type: List[str] - if not asm_config._asm_enabled: - return patched +_STR_CALLBACKS: Dict[str, Callable[[str], None]] = {} +_LST_CALLBACKS: Dict[str, Callable[[Union[List[str], str]], None]] = {} - import os - if not getattr(os, "_datadog_patch", False): - Pin().onto(os) - trace_utils.wrap(os, "system", _traced_ossystem(os)) - trace_utils.wrap(os, "fork", _traced_fork(os)) +def add_str_callback(name: str, callback: Callable[[str], None]): + _STR_CALLBACKS[name] = callback + + +def del_str_callback(name: str): + _STR_CALLBACKS.pop(name, None) + + +def add_lst_callback(name: str, callback: Callable[[Union[List[str], str]], None]): + _LST_CALLBACKS[name] = callback + + +def del_lst_callback(name: str): + _LST_CALLBACKS.pop(name, None) + - # all os.spawn* variants eventually use this one: - trace_utils.wrap(os, "_spawnvef", _traced_osspawn(os)) +def patch() -> List[str]: + if not 
(asm_config._asm_enabled or asm_config._iast_enabled): + return [] + patched: List[str] = [] + import os # nosec + import subprocess # nosec + + should_patch_system = not trace_utils.iswrapped(os.system) + should_patch_fork = not trace_utils.iswrapped(os.fork) + spawnvef = getattr(os, "_spawnvef", None) + should_patch_spawnvef = spawnvef is not None and not trace_utils.iswrapped(spawnvef) + + if should_patch_system or should_patch_fork or should_patch_spawnvef: + Pin().onto(os) + if should_patch_system: + trace_utils.wrap(os, "system", _traced_ossystem(os)) + if should_patch_fork: + trace_utils.wrap(os, "fork", _traced_fork(os)) + if should_patch_spawnvef: + # all os.spawn* variants eventually use this one: + trace_utils.wrap(os, "_spawnvef", _traced_osspawn(os)) patched.append("os") - if not getattr(subprocess, "_datadog_patch", False): + should_patch_Popen_init = not trace_utils.iswrapped(subprocess.Popen.__init__) + should_patch_Popen_wait = not trace_utils.iswrapped(subprocess.Popen.wait) + if should_patch_Popen_init or should_patch_Popen_wait: Pin().onto(subprocess) # We store the parameters on __init__ in the context and set the tags on wait # (where all the Popen objects eventually arrive, unless killed before it) - trace_utils.wrap(subprocess, "Popen.__init__", _traced_subprocess_init(subprocess)) - trace_utils.wrap(subprocess, "Popen.wait", _traced_subprocess_wait(subprocess)) - - os._datadog_patch = True - subprocess._datadog_patch = True + if should_patch_Popen_init: + trace_utils.wrap(subprocess, "Popen.__init__", _traced_subprocess_init(subprocess)) + if should_patch_Popen_wait: + trace_utils.wrap(subprocess, "Popen.wait", _traced_subprocess_wait(subprocess)) patched.append("subprocess") return patched @dataclass(eq=False) -class SubprocessCmdLineCacheEntry(object): +class SubprocessCmdLineCacheEntry: binary: Optional[str] = None arguments: Optional[List] = None truncated: bool = False @@ -80,10 +106,10 @@ class SubprocessCmdLineCacheEntry(object): as_string: Optional[str] = None -class SubprocessCmdLine(object): +class SubprocessCmdLine: # This catches the computed values into a SubprocessCmdLineCacheEntry object - _CACHE = {} # type: Dict[str, SubprocessCmdLineCacheEntry] - _CACHE_DEQUE = collections.deque() # type: Deque[str] + _CACHE: Dict[str, SubprocessCmdLineCacheEntry] = {} + _CACHE_DEQUE: Deque[str] = collections.deque() _CACHE_MAXSIZE = 32 _CACHE_LOCK = RLock() @@ -138,8 +164,7 @@ def _clear_cache(cls): ] _COMPILED_ENV_VAR_REGEXP = re.compile(r"\b[A-Z_]+=\w+") - def __init__(self, shell_args, shell=False): - # type: (Union[str, List[str]], bool) -> None + def __init__(self, shell_args: Union[str, List[str]], shell: bool = False) -> None: cache_key = str(shell_args) + str(shell) self._cache_entry = SubprocessCmdLine._CACHE.get(cache_key) if self._cache_entry: @@ -250,8 +275,7 @@ def scrub_arguments(self): self.arguments = new_args - def truncate_string(self, str_): - # type: (str) -> str + def truncate_string(self, str_: str) -> str: oversize = len(str_) - self.TRUNCATE_LIMIT if oversize <= 0: @@ -263,9 +287,7 @@ def truncate_string(self, str_): msg = ' "4kB argument truncated by %d characters"' % oversize return str_[0 : -(oversize + len(msg))] + msg - def _as_list_and_string(self): - # type: () -> Tuple[list[str], str] - + def _as_list_and_string(self) -> Tuple[List[str], str]: total_list = self.env_vars + [self.binary] + self.arguments truncated_str = self.truncate_string(shjoin(total_list)) truncated_list = shlex.split(truncated_str) @@ -290,8 +312,10 @@ def 
as_string(self): return str_res -def unpatch(): - # type: () -> None +def unpatch() -> None: + import os # nosec + import subprocess # nosec + trace_utils.unwrap(os, "system") trace_utils.unwrap(os, "_spawnvef") trace_utils.unwrap(subprocess.Popen, "__init__") @@ -299,13 +323,13 @@ def unpatch(): SubprocessCmdLine._clear_cache() - os._datadog_patch = False - subprocess._datadog_patch = False - @trace_utils.with_traced_module def _traced_ossystem(module, pin, wrapped, instance, args, kwargs): try: + if isinstance(args[0], str): + for callback in _STR_CALLBACKS.values(): + callback(args[0]) shellcmd = SubprocessCmdLine(args[0], shell=True) # nosec with pin.tracer.trace(COMMANDS.SPAN_NAME, resource=shellcmd.binary, span_type=SpanTypes.SYSTEM) as span: @@ -342,6 +366,10 @@ def _traced_fork(module, pin, wrapped, instance, args, kwargs): def _traced_osspawn(module, pin, wrapped, instance, args, kwargs): try: mode, file, func_args, _, _ = args + if isinstance(func_args, (list, tuple, str)): + commands = [file] + list(func_args) + for callback in _LST_CALLBACKS.values(): + callback(commands) shellcmd = SubprocessCmdLine(func_args, shell=False) with pin.tracer.trace(COMMANDS.SPAN_NAME, resource=shellcmd.binary, span_type=SpanTypes.SYSTEM) as span: @@ -366,6 +394,13 @@ def _traced_osspawn(module, pin, wrapped, instance, args, kwargs): def _traced_subprocess_init(module, pin, wrapped, instance, args, kwargs): try: cmd_args = args[0] if len(args) else kwargs["args"] + if isinstance(cmd_args, (list, tuple, str)): + if kwargs.get("shell", False): + for callback in _STR_CALLBACKS.values(): + callback(cmd_args) + else: + for callback in _LST_CALLBACKS.values(): + callback(cmd_args) cmd_args_list = shlex.split(cmd_args) if isinstance(cmd_args, str) else cmd_args is_shell = kwargs.get("shell", False) shellcmd = SubprocessCmdLine(cmd_args_list, shell=is_shell) # nosec diff --git a/ddtrace/debugging/_safety.py b/ddtrace/debugging/_safety.py index 118deddef40..92b38ff6bdc 100644 --- a/ddtrace/debugging/_safety.py +++ b/ddtrace/debugging/_safety.py @@ -1,5 +1,6 @@ from inspect import CO_VARARGS from inspect import CO_VARKEYWORDS +from itertools import chain from types import FrameType from typing import Any from typing import Dict @@ -23,11 +24,11 @@ def get_args(frame: FrameType) -> Iterator[Tuple[str, Any]]: def get_locals(frame: FrameType) -> Iterator[Tuple[str, Any]]: code = frame.f_code + _locals = frame.f_locals nargs = code.co_argcount + bool(code.co_flags & CO_VARARGS) + bool(code.co_flags & CO_VARKEYWORDS) - names = code.co_varnames[nargs:] - values = (frame.f_locals.get(name) for name in names) - - return zip(names, values) + return ( + (name, _locals.get(name)) for name in chain(code.co_varnames[nargs:], code.co_freevars, code.co_cellvars) + ) # include freevars and cellvars def get_globals(frame: FrameType) -> Iterator[Tuple[str, Any]]: diff --git a/ddtrace/internal/debug.py b/ddtrace/internal/debug.py index 4d533b604b6..c33ff5ad46d 100644 --- a/ddtrace/internal/debug.py +++ b/ddtrace/internal/debug.py @@ -117,8 +117,8 @@ def collect(tracer): from ddtrace._trace.tracer import log return dict( - # Timestamp UTC ISO 8601 - date=datetime.datetime.utcnow().isoformat(), + # Timestamp UTC ISO 8601 with the trailing +00:00 removed + date=datetime.datetime.now(datetime.timezone.utc).isoformat()[0:-6], # eg. "Linux", "Darwin" os_name=platform.system(), # eg. 
12.5.0 diff --git a/ddtrace/internal/telemetry/writer.py b/ddtrace/internal/telemetry/writer.py index 71de6b03907..2be240c06fd 100644 --- a/ddtrace/internal/telemetry/writer.py +++ b/ddtrace/internal/telemetry/writer.py @@ -118,11 +118,17 @@ def send_event(self, request: Dict) -> Optional[httplib.HTTPResponse]: conn.request("POST", self._endpoint, rb_json, headers) resp = get_connection_response(conn) if resp.status < 300: - log.debug("sent %d in %.5fs to %s. response: %s", len(rb_json), sw.elapsed(), self.url, resp.status) + log.debug( + "Instrumentation Telemetry sent %d in %.5fs to %s. response: %s", + len(rb_json), + sw.elapsed(), + self.url, + resp.status, + ) else: - log.debug("failed to send telemetry to %s. response: %s", self.url, resp.status) - except Exception: - log.debug("failed to send telemetry to %s.", self.url, exc_info=True) + log.debug("Failed to send Instrumentation Telemetry to %s. response: %s", self.url, resp.status) + except Exception as e: + log.debug("Failed to send Instrumentation Telemetry to %s. Error: %s", self.url, str(e)) finally: if conn is not None: conn.close() diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py new file mode 100644 index 00000000000..23aa4cd3caa --- /dev/null +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -0,0 +1,213 @@ +import traceback +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +from ddtrace.internal.logger import get_logger +from ddtrace.internal.telemetry import telemetry_writer +from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT +from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL +from ddtrace.internal.utils.version import parse_version +from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS +from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS +from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX + + +logger = get_logger(__name__) + + +class RagasDependencies: + """ + A helper class to store instances of ragas classes and functions + that may or may not exist in a user's environment. + """ + + def __init__(self): + import ragas + + self.ragas_version = parse_version(ragas.__version__) + if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): + raise NotImplementedError( + "Ragas version: {} is not supported".format(self.ragas_version), + ) + + from ragas.llms import llm_factory + + self.llm_factory = llm_factory + + from ragas.llms.output_parser import RagasoutputParser + + self.RagasoutputParser = RagasoutputParser + + from ragas.metrics import context_precision + + self.context_precision = context_precision + + from ragas.metrics.base import ensembler + + self.ensembler = ensembler + + from ragas.metrics import faithfulness + + self.faithfulness = faithfulness + + from ragas.metrics.base import get_segmenter + + self.get_segmenter = get_segmenter + + from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers + + self.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers + + from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers + + self.StatementsAnswers = StatementsAnswers + + +def _get_ml_app_for_ragas_trace(span_event: dict) -> str: + """ + The `ml_app` for spans generated from ragas traces will be named `dd-ragas-{ml_app}`, + or `dd-ragas` if `ml_app` is not present in the span event. 
+ """ + tags: List[str] = span_event.get("tags", []) + ml_app = None + for tag in tags: + if isinstance(tag, str) and tag.startswith("ml_app:"): + ml_app = tag.split(":")[1] + break + if not ml_app: + return RAGAS_ML_APP_PREFIX + return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) + + +class BaseRagasEvaluator: + """A class used by EvaluatorRunner to conduct ragas evaluations + on LLM Observability span events. The job of an Evaluator is to take a span and + submit evaluation metrics based on the span's attributes. + + Extenders of this class should only need to implement the `evaluate` method. + """ + + LABEL = "ragas" + METRIC_TYPE = "score" + + def __init__(self, llmobs_service): + """ + Initialize an evaluator that uses the ragas library to generate a score on finished LLM spans. + + :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and + submitting evaluation metrics. + + Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. + """ + self.llmobs_service = llmobs_service + self.ragas_version = "unknown" + telemetry_state = "ok" + try: + self.ragas_dependencies = RagasDependencies() + self.ragas_version = self.ragas_dependencies.ragas_version + except ImportError as e: + telemetry_state = "fail_import_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except AttributeError as e: + telemetry_state = "fail_attribute_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except NotImplementedError as e: + telemetry_state = "fail_not_supported" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except Exception as e: + telemetry_state = "fail_unknown" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + finally: + telemetry_writer.add_count_metric( + namespace=TELEMETRY_APM_PRODUCT.LLMOBS, + name="evaluators.init", + value=1, + tags=( + ("evaluator_label", self.LABEL), + ("state", telemetry_state), + ("evaluator_version", self.ragas_version), + ), + ) + if telemetry_state != "ok": + telemetry_writer.add_log( + level=TELEMETRY_LOG_LEVEL.ERROR, + message="Failed to import Ragas dependencies", + stack_trace=traceback.format_exc(), + tags={"evaluator_version": self.ragas_version}, + ) + + def run_and_submit_evaluation(self, span_event: dict): + if not span_event: + return + score_result_or_failure, metric_metadata = self.evaluate(span_event) + telemetry_writer.add_count_metric( + TELEMETRY_APM_PRODUCT.LLMOBS, + "evaluators.run", + 1, + tags=( + ("evaluator_label", self.LABEL), + ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), + ("evaluator_version", self.ragas_version), + ), + ) + if isinstance(score_result_or_failure, float): + self.llmobs_service.submit_evaluation( + span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, + label=self.LABEL, + metric_type=self.METRIC_TYPE, + value=score_result_or_failure, + metadata=metric_metadata, + ) + + def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: + raise NotImplementedError("evaluate method must be implemented by individual evaluators") + + def _extract_evaluation_inputs_from_span(self, span_event: dict) -> Optional[dict]: + """ + Extracts the question, answer, and context used as inputs for a ragas evaluation on a span 
event. + """ + with self.llmobs_service.workflow("dd-ragas.extract_evaluation_inputs_from_span") as extract_inputs_workflow: + self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) + question, answer, contexts = None, None, None + + meta_io = span_event.get("meta") + if meta_io is None: + return None + + meta_input = meta_io.get("input") + meta_output = meta_io.get("output") + + if not (meta_input and meta_output): + return None + + prompt = meta_input.get("prompt") + if prompt is None: + logger.debug("Failed to extract `prompt` from span for ragas evaluation") + return None + prompt_variables = prompt.get("variables") + + input_messages = meta_input.get("messages") + + messages = meta_output.get("messages") + if messages is not None and len(messages) > 0: + answer = messages[-1].get("content") + + if prompt_variables: + context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) + question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) + contexts = [prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)] + question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) + + if not question and input_messages is not None and len(input_messages) > 0: + question = input_messages[-1].get("content") + + self.llmobs_service.annotate( + span=extract_inputs_workflow, output_data={"question": question, "contexts": contexts, "answer": answer} + ) + if any(field is None for field in (question, contexts, answer)): + logger.debug("Failed to extract inputs required for ragas evaluation") + return None + + return {"question": question, "contexts": contexts, "answer": answer} diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py index d651c2443a4..98725b1f27e 100644 --- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py +++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py @@ -1,73 +1,22 @@ import json import math -import traceback from typing import List from typing import Optional from typing import Tuple from typing import Union from ddtrace.internal.logger import get_logger -from ddtrace.internal.telemetry import telemetry_writer -from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT -from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL -from ddtrace.internal.utils.version import parse_version from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA -from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS -from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX +from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator +from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace logger = get_logger(__name__) -class MiniRagas: - """ - A helper class to store instances of ragas classes and functions - that may or may not exist in a user's environment. - """ - - llm_factory = None - RagasoutputParser = None - faithfulness = None - ensembler = None - get_segmenter = None - StatementFaithfulnessAnswers = None - StatementsAnswers = None - - -def _get_ml_app_for_ragas_trace(span_event: dict) -> str: - """ - The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-` - or `dd-ragas` if `ml_app` is not present in the span event. 
- """ - tags = span_event.get("tags", []) # list[str] - ml_app = None - for tag in tags: - if isinstance(tag, str) and tag.startswith("ml_app:"): - ml_app = tag.split(":")[1] - break - if not ml_app: - return RAGAS_ML_APP_PREFIX - return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) - - -def _get_faithfulness_instance() -> Optional[object]: - """ - This helper function ensures the faithfulness instance used in - ragas evaluator is updated with the latest ragas faithfulness - instance AND has an non-null llm - """ - if MiniRagas.faithfulness is None: - return None - ragas_faithfulness_instance = MiniRagas.faithfulness - if not ragas_faithfulness_instance.llm: - ragas_faithfulness_instance.llm = MiniRagas.llm_factory() - return ragas_faithfulness_instance - - -class RagasFaithfulnessEvaluator: +class RagasFaithfulnessEvaluator(BaseRagasEvaluator): """A class used by EvaluatorRunner to conduct ragas faithfulness evaluations on LLM Observability span events. The job of an Evaluator is to take a span and submit evaluation metrics based on the span's attributes. @@ -95,98 +44,30 @@ def __init__(self, llmobs_service): Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. """ - self.llmobs_service = llmobs_service - self.ragas_version = "unknown" - telemetry_state = "ok" - try: - import ragas - - self.ragas_version = parse_version(ragas.__version__) - if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): - raise NotImplementedError( - "Ragas version: {} is not supported for `ragas_faithfulness` evaluator".format(self.ragas_version), - ) - - from ragas.llms import llm_factory - - MiniRagas.llm_factory = llm_factory - - from ragas.llms.output_parser import RagasoutputParser - - MiniRagas.RagasoutputParser = RagasoutputParser - - from ragas.metrics import faithfulness - - MiniRagas.faithfulness = faithfulness - - from ragas.metrics.base import ensembler - - MiniRagas.ensembler = ensembler - - from ragas.metrics.base import get_segmenter - - MiniRagas.get_segmenter = get_segmenter - - from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers - - MiniRagas.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers - - from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers - - MiniRagas.StatementsAnswers = StatementsAnswers - except Exception as e: - telemetry_state = "fail" - telemetry_writer.add_log( - level=TELEMETRY_LOG_LEVEL.ERROR, - message="Failed to import Ragas dependencies", - stack_trace=traceback.format_exc(), - tags={"ragas_version": self.ragas_version}, - ) - raise NotImplementedError("Failed to load dependencies for `ragas_faithfulness` evaluator") from e - finally: - telemetry_writer.add_count_metric( - namespace=TELEMETRY_APM_PRODUCT.LLMOBS, - name="evaluators.init", - value=1, - tags=( - ("evaluator_label", self.LABEL), - ("state", telemetry_state), - ("ragas_version", self.ragas_version), - ), - ) - - self.ragas_faithfulness_instance = _get_faithfulness_instance() - self.llm_output_parser_for_generated_statements = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementsAnswers + super().__init__(llmobs_service) + self.ragas_faithfulness_instance = self._get_faithfulness_instance() + self.llm_output_parser_for_generated_statements = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementsAnswers ) - self.llm_output_parser_for_faithfulness_score = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementFaithfulnessAnswers + 
self.llm_output_parser_for_faithfulness_score = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementFaithfulnessAnswers + ) - self.split_answer_into_sentences = MiniRagas.get_segmenter( + self.split_answer_into_sentences = self.ragas_dependencies.get_segmenter( language=self.ragas_faithfulness_instance.nli_statements_message.language, clean=False ) - def run_and_submit_evaluation(self, span_event: dict): - if not span_event: - return - score_result_or_failure, metric_metadata = self.evaluate(span_event) - telemetry_writer.add_count_metric( - TELEMETRY_APM_PRODUCT.LLMOBS, - "evaluators.run", - 1, - tags=( - ("evaluator_label", self.LABEL), - ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), - ), - ) - if isinstance(score_result_or_failure, float): - self.llmobs_service.submit_evaluation( - span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=score_result_or_failure, - metadata=metric_metadata, - ) + def _get_faithfulness_instance(self) -> Optional[object]: + """ + This helper function ensures the faithfulness instance used in + ragas evaluator is updated with the latest ragas faithfulness + instance AND has a non-null llm + """ + if self.ragas_dependencies.faithfulness is None: + return None + ragas_faithfulness_instance = self.ragas_dependencies.faithfulness + if not ragas_faithfulness_instance.llm: + ragas_faithfulness_instance.llm = self.ragas_dependencies.llm_factory() + return ragas_faithfulness_instance def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: """ @@ -196,7 +77,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] If the ragas faithfulness instance does not have `llm` set, we set `llm` using the `llm_factory()` method from ragas which defaults to openai's gpt-4o-turbo. 
""" - self.ragas_faithfulness_instance = _get_faithfulness_instance() + self.ragas_faithfulness_instance = self._get_faithfulness_instance() if not self.ragas_faithfulness_instance: return "fail_faithfulness_is_none", {} @@ -220,16 +101,16 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] span=ragas_faithfulness_workflow ) - faithfulness_inputs = self._extract_faithfulness_inputs(span_event) + faithfulness_inputs = self._extract_evaluation_inputs_from_span(span_event) if faithfulness_inputs is None: logger.debug( - "Failed to extract question and context from span sampled for ragas_faithfulness evaluation" + "Failed to extract evaluation inputs from span sampled for `ragas_faithfulness` evaluation" ) return "fail_extract_faithfulness_inputs", evaluation_metadata question = faithfulness_inputs["question"] answer = faithfulness_inputs["answer"] - context = faithfulness_inputs["context"] + context = " ".join(faithfulness_inputs["contexts"]) statements = self._create_statements(question, answer) if statements is None: @@ -318,9 +199,9 @@ def _create_verdicts(self, context: str, statements: List[str]): return None # collapse multiple generations into a single faithfulness list - faithfulness_list = MiniRagas.ensembler.from_discrete(raw_faithfulness_list, "verdict") # type: ignore + faithfulness_list = self.ragas_dependencies.ensembler.from_discrete(raw_faithfulness_list, "verdict") try: - return MiniRagas.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) # type: ignore + return self.ragas_dependencies.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) except Exception as e: logger.debug("Failed to parse faithfulness_list", exc_info=e) return None @@ -330,59 +211,6 @@ def _create_verdicts(self, context: str, statements: List[str]): output_data=faithfulness_list, ) - def _extract_faithfulness_inputs(self, span_event: dict) -> Optional[dict]: - """ - Extracts the question, answer, and context used as inputs to faithfulness - evaluation from a span event. 
- - question - input.prompt.variables.question OR input.messages[-1].content - context - input.prompt.variables.context - answer - output.messages[-1].content - """ - with self.llmobs_service.workflow("dd-ragas.extract_faithfulness_inputs") as extract_inputs_workflow: - self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) - question, answer, context = None, None, None - - meta_io = span_event.get("meta") - if meta_io is None: - return None - - meta_input = meta_io.get("input") - meta_output = meta_io.get("output") - - if not (meta_input and meta_output): - return None - - prompt = meta_input.get("prompt") - if prompt is None: - logger.debug("Failed to extract `prompt` from span for `ragas_faithfulness` evaluation") - return None - prompt_variables = prompt.get("variables") - - input_messages = meta_input.get("messages") - - messages = meta_output.get("messages") - if messages is not None and len(messages) > 0: - answer = messages[-1].get("content") - - if prompt_variables: - context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) - question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) - context = " ".join([prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)]) - question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) - - if not question and input_messages is not None and len(input_messages) > 0: - question = input_messages[-1].get("content") - - self.llmobs_service.annotate( - span=extract_inputs_workflow, output_data={"question": question, "context": context, "answer": answer} - ) - if any(field is None for field in (question, context, answer)): - logger.debug("Failed to extract inputs required for faithfulness evaluation") - return None - - return {"question": question, "context": context, "answer": answer} - def _create_statements_prompt(self, answer, question): # Returns: `ragas.llms.PromptValue` object with self.llmobs_service.task("dd-ragas.create_statements_prompt"): diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index bf45e618e01..3d26998f1b4 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -64,13 +64,15 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None): ("state", evaluator_init_state), ), ) + else: + raise ValueError("Parsed unsupported evaluator: {}".format(evaluator)) def start(self, *args, **kwargs): if not self.evaluators: logger.debug("no evaluators configured, not starting %r", self.__class__.__name__) return super(EvaluatorRunner, self).start() - logger.debug("started %r to %r", self.__class__.__name__) + logger.debug("started %r", self.__class__.__name__) def _stop_service(self) -> None: """ diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 49815151118..b4f1dc1b2f6 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -3,7 +3,9 @@ import time from typing import Any from typing import Dict +from typing import List from typing import Optional +from typing import Tuple from typing import Union import ddtrace @@ -11,8 +13,12 @@ from ddtrace import config from ddtrace import patch from ddtrace._trace.context import Context +from ddtrace.constants import ERROR_MSG +from ddtrace.constants import ERROR_STACK +from ddtrace.constants import ERROR_TYPE from ddtrace.ext import SpanTypes from ddtrace.internal import atexit +from ddtrace.internal import core from ddtrace.internal import forksafe 
from ddtrace.internal._rand import rand64bits from ddtrace.internal.compat import ensure_text @@ -22,8 +28,10 @@ from ddtrace.internal.service import ServiceStatusError from ddtrace.internal.telemetry import telemetry_writer from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT +from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning from ddtrace.internal.utils.formats import asbool from ddtrace.internal.utils.formats import parse_tags_str +from ddtrace.llmobs import _constants as constants from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID from ddtrace.llmobs._constants import INPUT_DOCUMENTS from ddtrace.llmobs._constants import INPUT_MESSAGES @@ -45,11 +53,11 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._evaluators.runner import EvaluatorRunner -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor from ddtrace.llmobs._utils import AnnotationContext from ddtrace.llmobs._utils import _get_llmobs_parent_id from ddtrace.llmobs._utils import _get_ml_app from ddtrace.llmobs._utils import _get_session_id +from ddtrace.llmobs._utils import _get_span_name from ddtrace.llmobs._utils import _inject_llmobs_parent_id from ddtrace.llmobs._utils import safe_json from ddtrace.llmobs._utils import validate_prompt @@ -59,6 +67,7 @@ from ddtrace.llmobs.utils import ExportedLLMObsSpan from ddtrace.llmobs.utils import Messages from ddtrace.propagation.http import HTTPPropagator +from ddtrace.vendor.debtcollector import deprecate log = get_logger(__name__) @@ -81,34 +90,157 @@ class LLMObs(Service): def __init__(self, tracer=None): super(LLMObs, self).__init__() self.tracer = tracer or ddtrace.tracer - self._llmobs_span_writer = None - self._llmobs_span_writer = LLMObsSpanWriter( is_agentless=config._llmobs_agentless_enabled, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._llmobs_eval_metric_writer = LLMObsEvalMetricWriter( site=config._dd_site, api_key=config._dd_api_key, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._evaluator_runner = EvaluatorRunner( interval=float(os.getenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 1.0)), llmobs_service=self, ) - self._trace_processor = LLMObsTraceProcessor(self._llmobs_span_writer, self._evaluator_runner) forksafe.register(self._child_after_fork) self._annotations = [] self._annotation_context_lock = forksafe.RLock() - self.tracer.on_start_span(self._do_annotations) - def _do_annotations(self, span): + def _on_span_start(self, span): + if self.enabled and span.span_type == SpanTypes.LLM: + self._do_annotations(span) + + def _on_span_finish(self, span): + if self.enabled and span.span_type == SpanTypes.LLM: + self._submit_llmobs_span(span) + + def _submit_llmobs_span(self, span: Span) -> None: + """Generate and submit an LLMObs span event to be sent to LLMObs.""" + span_event = None + is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" + is_ragas_integration_span = False + try: + span_event, is_ragas_integration_span = self._llmobs_span_event(span) + self._llmobs_span_writer.enqueue(span_event) + except (KeyError, TypeError): + log.error( + "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True + ) + finally: + if not span_event or not is_llm_span or is_ragas_integration_span: + return + if self._evaluator_runner: + 
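+                # At this point the span event was generated successfully for an llm-kind span that is
+                # not itself a Ragas evaluation span, so it is handed off to the evaluator runner.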
self._evaluator_runner.enqueue(span_event, span) + + @classmethod + def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: + """Span event object structure.""" + span_kind = span._get_ctx_item(SPAN_KIND) + if not span_kind: + raise KeyError("Span kind not found in span context") + meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} + if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: + meta["model_name"] = span._get_ctx_item(MODEL_NAME) + meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() + meta["metadata"] = span._get_ctx_item(METADATA) or {} + if span._get_ctx_item(INPUT_PARAMETERS): + meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) + if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: + meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) + if span._get_ctx_item(INPUT_VALUE) is not None: + meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) + if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: + meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) + if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: + meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) + if span._get_ctx_item(OUTPUT_VALUE) is not None: + meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) + if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: + meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) + if span._get_ctx_item(INPUT_PROMPT) is not None: + prompt_json_str = span._get_ctx_item(INPUT_PROMPT) + if span_kind != "llm": + log.warning( + "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." 
+ ) + else: + meta["input"]["prompt"] = prompt_json_str + if span.error: + meta.update( + { + ERROR_MSG: span.get_tag(ERROR_MSG), + ERROR_STACK: span.get_tag(ERROR_STACK), + ERROR_TYPE: span.get_tag(ERROR_TYPE), + } + ) + if not meta["input"]: + meta.pop("input") + if not meta["output"]: + meta.pop("output") + metrics = span._get_ctx_item(METRICS) or {} + ml_app = _get_ml_app(span) + + is_ragas_integration_span = False + + if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX): + is_ragas_integration_span = True + + span._set_ctx_item(ML_APP, ml_app) + parent_id = str(_get_llmobs_parent_id(span) or "undefined") + + llmobs_span_event = { + "trace_id": "{:x}".format(span.trace_id), + "span_id": str(span.span_id), + "parent_id": parent_id, + "name": _get_span_name(span), + "start_ns": span.start_ns, + "duration": span.duration_ns, + "status": "error" if span.error else "ok", + "meta": meta, + "metrics": metrics, + } + session_id = _get_session_id(span) + if session_id is not None: + span._set_ctx_item(SESSION_ID, session_id) + llmobs_span_event["session_id"] = session_id + + llmobs_span_event["tags"] = cls._llmobs_tags( + span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span + ) + return llmobs_span_event, is_ragas_integration_span + + @staticmethod + def _llmobs_tags( + span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False + ) -> List[str]: + tags = { + "version": config.version or "", + "env": config.env or "", + "service": span.service or "", + "source": "integration", + "ml_app": ml_app, + "ddtrace.version": ddtrace.__version__, + "language": "python", + "error": span.error, + } + err_type = span.get_tag(ERROR_TYPE) + if err_type: + tags["error_type"] = err_type + if session_id: + tags["session_id"] = session_id + if is_ragas_integration_span: + tags[constants.RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" + existing_tags = span._get_ctx_item(TAGS) + if existing_tags is not None: + tags.update(existing_tags) + return ["{}:{}".format(k, v) for k, v in tags.items()] + + def _do_annotations(self, span: Span) -> None: # get the current span context # only do the annotations if it matches the context if span.span_type != SpanTypes.LLM: # do this check to avoid the warning log in `annotate` @@ -120,20 +252,14 @@ def _do_annotations(self, span): if current_context_id == context_id: self.annotate(span, **annotation_kwargs) - def _child_after_fork(self): + def _child_after_fork(self) -> None: self._llmobs_span_writer = self._llmobs_span_writer.recreate() self._llmobs_eval_metric_writer = self._llmobs_eval_metric_writer.recreate() self._evaluator_runner = self._evaluator_runner.recreate() - self._trace_processor._span_writer = self._llmobs_span_writer - self._trace_processor._evaluator_runner = self._evaluator_runner if self.enabled: self._start_service() def _start_service(self) -> None: - tracer_filters = self.tracer._filters - if not any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in tracer_filters): - tracer_filters += [self._trace_processor] - self.tracer.configure(settings={"FILTERS": tracer_filters}) try: self._llmobs_span_writer.start() self._llmobs_eval_metric_writer.start() @@ -146,6 +272,10 @@ def _start_service(self) -> None: log.debug("Error starting evaluator runner") def _stop_service(self) -> None: + # Remove listener hooks for span events + core.reset_listeners("trace.span_start", self._on_span_start) + core.reset_listeners("trace.span_finish", self._on_span_finish) + try: 
self._evaluator_runner.stop() # flush remaining evaluation spans & evaluations @@ -160,11 +290,7 @@ def _stop_service(self) -> None: except ServiceStatusError: log.debug("Error stopping LLMObs writers") - try: - forksafe.unregister(self._child_after_fork) - self.tracer.shutdown() - except Exception: - log.warning("Failed to shutdown tracer", exc_info=True) + forksafe.unregister(self._child_after_fork) @classmethod def enable( @@ -244,6 +370,10 @@ def enable( cls.enabled = True cls._instance.start() + # Register hooks for span events + core.on("trace.span_start", cls._instance._on_span_start) + core.on("trace.span_finish", cls._instance._on_span_finish) + atexit.register(cls.disable) telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, True) @@ -265,7 +395,6 @@ def disable(cls) -> None: cls._instance.stop() cls.enabled = False - cls._instance.tracer.deregister_on_start_span(cls._instance._do_annotations) telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, False) log.debug("%s disabled", cls.__name__) @@ -785,6 +914,127 @@ def _tag_metrics(span: Span, metrics: Dict[str, Any]) -> None: return span._set_ctx_item(METRICS, metrics) + @classmethod + def submit_evaluation_for( + cls, + label: str, + metric_type: str, + value: Union[str, int, float], + span: Optional[dict] = None, + span_with_tag_value: Optional[Dict[str, str]] = None, + tags: Optional[Dict[str, str]] = None, + ml_app: Optional[str] = None, + timestamp_ms: Optional[int] = None, + ) -> None: + """ + Submits a custom evaluation metric for a given span. + + :param str label: The name of the evaluation metric. + :param str metric_type: The type of the evaluation metric. One of "categorical", "score". + :param value: The value of the evaluation metric. + Must be a string (categorical), integer (score), or float (score). + :param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying + the span associated with this evaluation. + :param dict span_with_tag_value: A dictionary with the format {'tag_key': str, 'tag_value': str} + uniquely identifying the span associated with this evaluation. + :param tags: A dictionary of string key-value pairs to tag the evaluation metric with. + :param str ml_app: The name of the ML application + :param int timestamp_ms: The unix timestamp in milliseconds when the evaluation metric result was generated. + If not set, the current time will be used. + """ + if cls.enabled is False: + log.debug( + "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. ", + "Evaluation metric data will not be sent.", + ) + return + + has_exactly_one_joining_key = (span is not None) ^ (span_with_tag_value is not None) + + if not has_exactly_one_joining_key: + raise ValueError( + "Exactly one of `span` or `span_with_tag_value` must be specified to submit an evaluation metric." + ) + + join_on = {} + if span is not None: + if ( + not isinstance(span, dict) + or not isinstance(span.get("span_id"), str) + or not isinstance(span.get("trace_id"), str) + ): + raise TypeError( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." 
+ ) + join_on["span"] = span + elif span_with_tag_value is not None: + if ( + not isinstance(span_with_tag_value, dict) + or not isinstance(span_with_tag_value.get("tag_key"), str) + or not isinstance(span_with_tag_value.get("tag_value"), str) + ): + raise TypeError( + "`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values" + ) + join_on["tag"] = { + "key": span_with_tag_value.get("tag_key"), + "value": span_with_tag_value.get("tag_value"), + } + + timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000) + + if not isinstance(timestamp_ms, int) or timestamp_ms < 0: + raise ValueError("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent") + + if not label: + raise ValueError("label must be the specified name of the evaluation metric.") + + metric_type = metric_type.lower() + if metric_type not in ("categorical", "score"): + raise ValueError("metric_type must be one of 'categorical' or 'score'.") + + if metric_type == "categorical" and not isinstance(value, str): + raise TypeError("value must be a string for a categorical metric.") + if metric_type == "score" and not isinstance(value, (int, float)): + raise TypeError("value must be an integer or float for a score metric.") + + if tags is not None and not isinstance(tags, dict): + log.warning("tags must be a dictionary of string key-value pairs.") + tags = {} + + evaluation_tags = { + "ddtrace.version": ddtrace.__version__, + "ml_app": ml_app, + } + + if tags: + for k, v in tags.items(): + try: + evaluation_tags[ensure_text(k)] = ensure_text(v) + except TypeError: + log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.") + + ml_app = ml_app if ml_app else config._llmobs_ml_app + if not ml_app: + log.warning( + "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " + "Ensure this configuration is set before running your application." + ) + return + + evaluation_metric = { + "join_on": join_on, + "label": str(label), + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "{}_value".format(metric_type): value, + "ml_app": ml_app, + "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()], + } + + cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric) + @classmethod def submit_evaluation( cls, @@ -797,6 +1047,13 @@ def submit_evaluation( timestamp_ms: Optional[int] = None, metadata: Optional[Dict[str, object]] = None, ) -> None: + deprecate( + "Using `LLMObs.submit_evaluation` is deprecated", + message="Please use `LLMObs.submit_evaluation_for` instead.", + removal_version="3.0.0", + category=DDTraceDeprecationWarning, + ) + """ Submits a custom evaluation metric for a given span ID and trace ID. @@ -812,7 +1069,7 @@ def submit_evaluation( evaluation metric. """ if cls.enabled is False: - log.warning( + log.debug( "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." ) return @@ -888,8 +1145,7 @@ def submit_evaluation( log.warning("Failed to parse tags. 
Tags for evaluation metrics must be strings.") evaluation_metric = { - "span_id": span_id, - "trace_id": trace_id, + "join_on": {"span": {"span_id": span_id, "trace_id": trace_id}}, "label": str(label), "metric_type": metric_type.lower(), "timestamp_ms": timestamp_ms, diff --git a/ddtrace/llmobs/_trace_processor.py b/ddtrace/llmobs/_trace_processor.py deleted file mode 100644 index 231d53d7626..00000000000 --- a/ddtrace/llmobs/_trace_processor.py +++ /dev/null @@ -1,177 +0,0 @@ -from typing import Any -from typing import Dict -from typing import List -from typing import Optional -from typing import Tuple - -import ddtrace -from ddtrace import Span -from ddtrace import config -from ddtrace._trace.processor import TraceProcessor -from ddtrace.constants import ERROR_MSG -from ddtrace.constants import ERROR_STACK -from ddtrace.constants import ERROR_TYPE -from ddtrace.ext import SpanTypes -from ddtrace.internal.logger import get_logger -from ddtrace.llmobs._constants import INPUT_DOCUMENTS -from ddtrace.llmobs._constants import INPUT_MESSAGES -from ddtrace.llmobs._constants import INPUT_PARAMETERS -from ddtrace.llmobs._constants import INPUT_PROMPT -from ddtrace.llmobs._constants import INPUT_VALUE -from ddtrace.llmobs._constants import METADATA -from ddtrace.llmobs._constants import METRICS -from ddtrace.llmobs._constants import ML_APP -from ddtrace.llmobs._constants import MODEL_NAME -from ddtrace.llmobs._constants import MODEL_PROVIDER -from ddtrace.llmobs._constants import OUTPUT_DOCUMENTS -from ddtrace.llmobs._constants import OUTPUT_MESSAGES -from ddtrace.llmobs._constants import OUTPUT_VALUE -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX -from ddtrace.llmobs._constants import RUNNER_IS_INTEGRATION_SPAN_TAG -from ddtrace.llmobs._constants import SESSION_ID -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._constants import TAGS -from ddtrace.llmobs._utils import _get_llmobs_parent_id -from ddtrace.llmobs._utils import _get_ml_app -from ddtrace.llmobs._utils import _get_session_id -from ddtrace.llmobs._utils import _get_span_name -from ddtrace.llmobs._utils import safe_json - - -log = get_logger(__name__) - - -class LLMObsTraceProcessor(TraceProcessor): - """ - Processor that extracts LLM-type spans in a trace to submit as separate LLMObs span events to LLM Observability. 
- """ - - def __init__(self, llmobs_span_writer, evaluator_runner=None): - self._span_writer = llmobs_span_writer - self._evaluator_runner = evaluator_runner - - def process_trace(self, trace: List[Span]) -> Optional[List[Span]]: - if not trace: - return None - for span in trace: - if span.span_type == SpanTypes.LLM: - self.submit_llmobs_span(span) - return None if config._llmobs_agentless_enabled else trace - - def submit_llmobs_span(self, span: Span) -> None: - """Generate and submit an LLMObs span event to be sent to LLMObs.""" - span_event = None - is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" - is_ragas_integration_span = False - try: - span_event, is_ragas_integration_span = self._llmobs_span_event(span) - self._span_writer.enqueue(span_event) - except (KeyError, TypeError): - log.error("Error generating LLMObs span event for span %s, likely due to malformed span", span) - finally: - if not span_event or not is_llm_span or is_ragas_integration_span: - return - if self._evaluator_runner: - self._evaluator_runner.enqueue(span_event, span) - - def _llmobs_span_event(self, span: Span) -> Tuple[Dict[str, Any], bool]: - """Span event object structure.""" - span_kind = span._get_ctx_item(SPAN_KIND) - if not span_kind: - raise KeyError("Span kind not found in span context") - meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} - if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: - meta["model_name"] = span._get_ctx_item(MODEL_NAME) - meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() - meta["metadata"] = span._get_ctx_item(METADATA) or {} - if span._get_ctx_item(INPUT_PARAMETERS): - meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) - if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: - meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) - if span._get_ctx_item(INPUT_VALUE) is not None: - meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) - if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: - meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) - if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: - meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) - if span._get_ctx_item(OUTPUT_VALUE) is not None: - meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) - if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: - meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) - if span._get_ctx_item(INPUT_PROMPT) is not None: - prompt_json_str = span._get_ctx_item(INPUT_PROMPT) - if span_kind != "llm": - log.warning( - "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." 
- ) - else: - meta["input"]["prompt"] = prompt_json_str - if span.error: - meta.update( - { - ERROR_MSG: span.get_tag(ERROR_MSG), - ERROR_STACK: span.get_tag(ERROR_STACK), - ERROR_TYPE: span.get_tag(ERROR_TYPE), - } - ) - if not meta["input"]: - meta.pop("input") - if not meta["output"]: - meta.pop("output") - metrics = span._get_ctx_item(METRICS) or {} - ml_app = _get_ml_app(span) - - is_ragas_integration_span = False - - if ml_app.startswith(RAGAS_ML_APP_PREFIX): - is_ragas_integration_span = True - - span._set_ctx_item(ML_APP, ml_app) - parent_id = str(_get_llmobs_parent_id(span) or "undefined") - - llmobs_span_event = { - "trace_id": "{:x}".format(span.trace_id), - "span_id": str(span.span_id), - "parent_id": parent_id, - "name": _get_span_name(span), - "start_ns": span.start_ns, - "duration": span.duration_ns, - "status": "error" if span.error else "ok", - "meta": meta, - "metrics": metrics, - } - session_id = _get_session_id(span) - if session_id is not None: - span._set_ctx_item(SESSION_ID, session_id) - llmobs_span_event["session_id"] = session_id - - llmobs_span_event["tags"] = self._llmobs_tags( - span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span - ) - return llmobs_span_event, is_ragas_integration_span - - @staticmethod - def _llmobs_tags( - span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False - ) -> List[str]: - tags = { - "version": config.version or "", - "env": config.env or "", - "service": span.service or "", - "source": "integration", - "ml_app": ml_app, - "ddtrace.version": ddtrace.__version__, - "language": "python", - "error": span.error, - } - err_type = span.get_tag(ERROR_TYPE) - if err_type: - tags["error_type"] = err_type - if session_id: - tags["session_id"] = session_id - if is_ragas_integration_span: - tags[RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" - existing_tags = span._get_ctx_item(TAGS) - if existing_tags is not None: - tags.update(existing_tags) - return ["{}:{}".format(k, v) for k, v in tags.items()] diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index c1b1c4a776c..dd616db8bef 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -135,9 +135,12 @@ def _get_ml_app(span: Span) -> str: ml_app = span._get_ctx_item(ML_APP) if ml_app: return ml_app - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - ml_app = nearest_llmobs_ancestor._get_ctx_item(ML_APP) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + ml_app = llmobs_parent._get_ctx_item(ML_APP) + if ml_app is not None: + return ml_app + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return ml_app or config._llmobs_ml_app or "unknown-ml-app" @@ -149,9 +152,12 @@ def _get_session_id(span: Span) -> Optional[str]: session_id = span._get_ctx_item(SESSION_ID) if session_id: return session_id - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - session_id = nearest_llmobs_ancestor._get_ctx_item(SESSION_ID) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + session_id = llmobs_parent._get_ctx_item(SESSION_ID) + if session_id is not None: + return session_id + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return session_id diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 5a293f05c4e..5880019d67f 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -55,8 +55,7 @@ class LLMObsSpanEvent(TypedDict): class 
LLMObsEvaluationMetricEvent(TypedDict, total=False): - span_id: str - trace_id: str + join_on: Dict[str, Dict[str, str]] metric_type: str label: str categorical_value: str @@ -107,6 +106,13 @@ def periodic(self) -> None: events = self._buffer self._buffer = [] + if not self._headers.get("DD-API-KEY"): + logger.warning( + "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. ", + "Ensure this configuration is set before running your application.", + ) + return + data = self._data(events) enc_llm_events = safe_json(data) conn = httplib.HTTPSConnection(self._intake, 443, timeout=self._timeout) @@ -154,7 +160,7 @@ def __init__(self, site: str, api_key: str, interval: float, timeout: float) -> super(LLMObsEvalMetricWriter, self).__init__(site, api_key, interval, timeout) self._event_type = "evaluation_metric" self._buffer = [] - self._endpoint = "/api/intake/llm-obs/v1/eval-metric" + self._endpoint = "/api/intake/llm-obs/v2/eval-metric" self._intake = "api.%s" % self._site # type: str def enqueue(self, event: LLMObsEvaluationMetricEvent) -> None: diff --git a/ddtrace/profiling/collector/_lock.py b/ddtrace/profiling/collector/_lock.py index 6dedf3295f7..ec62c5c0eee 100644 --- a/ddtrace/profiling/collector/_lock.py +++ b/ddtrace/profiling/collector/_lock.py @@ -179,69 +179,63 @@ def acquire(self, *args, **kwargs): def _release(self, inner_func, *args, **kwargs): # type (typing.Any, typing.Any) -> None + + start = None + if hasattr(self, "_self_acquired_at"): + # _self_acquired_at is only set when the acquire was captured + # if it's not set, we're not capturing the release + start = self._self_acquired_at + try: return inner_func(*args, **kwargs) finally: - try: - if hasattr(self, "_self_acquired_at"): - try: - end = time.monotonic_ns() - thread_id, thread_name = _current_thread() - task_id, task_name, task_frame = _task.get_task(thread_id) - lock_name = ( - "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc - ) - - if task_frame is None: - # See the comments in _acquire - frame = sys._getframe(2) - else: - frame = task_frame - - frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes) - - if self._self_export_libdd_enabled: - thread_native_id = _threading.get_thread_native_id(thread_id) - - handle = ddup.SampleHandle() - handle.push_monotonic_ns(end) - handle.push_lock_name(lock_name) - handle.push_release( - end - self._self_acquired_at, 1 - ) # AFAICT, capture_pct does not adjust anything here - handle.push_threadinfo(thread_id, thread_native_id, thread_name) - handle.push_task_id(task_id) - handle.push_task_name(task_name) - - if self._self_tracer is not None: - handle.push_span(self._self_tracer.current_span()) - for frame in frames: - handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno) - handle.flush_sample() - else: - event = self.RELEASE_EVENT_CLASS( - lock_name=lock_name, - frames=frames, - nframes=nframes, - thread_id=thread_id, - thread_name=thread_name, - task_id=task_id, - task_name=task_name, - locked_for_ns=end - self._self_acquired_at, - sampling_pct=self._self_capture_sampler.capture_pct, - ) - - if self._self_tracer is not None: - event.set_trace_info( - self._self_tracer.current_span(), self._self_endpoint_collection_enabled - ) - - self._self_recorder.push_event(event) - finally: - del self._self_acquired_at - except Exception as e: - LOG.warning("Error recording lock release event: %s", e) - pass # nosec + if start is not None: + end = 
time.monotonic_ns() + thread_id, thread_name = _current_thread() + task_id, task_name, task_frame = _task.get_task(thread_id) + lock_name = "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc + + if task_frame is None: + # See the comments in _acquire + frame = sys._getframe(2) + else: + frame = task_frame + + frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes) + + if self._self_export_libdd_enabled: + thread_native_id = _threading.get_thread_native_id(thread_id) + + handle = ddup.SampleHandle() + handle.push_monotonic_ns(end) + handle.push_lock_name(lock_name) + handle.push_release(end - start, 1) # AFAICT, capture_pct does not adjust anything here + handle.push_threadinfo(thread_id, thread_native_id, thread_name) + handle.push_task_id(task_id) + handle.push_task_name(task_name) + + if self._self_tracer is not None: + handle.push_span(self._self_tracer.current_span()) + for frame in frames: + handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno) + handle.flush_sample() + else: + event = self.RELEASE_EVENT_CLASS( + lock_name=lock_name, + frames=frames, + nframes=nframes, + thread_id=thread_id, + thread_name=thread_name, + task_id=task_id, + task_name=task_name, + locked_for_ns=end - start, + sampling_pct=self._self_capture_sampler.capture_pct, + ) + + if self._self_tracer is not None: + event.set_trace_info(self._self_tracer.current_span(), self._self_endpoint_collection_enabled) + + self._self_recorder.push_event(event) def release(self, *args, **kwargs): return self._release(self.__wrapped__.release, *args, **kwargs) diff --git a/ddtrace/profiling/exporter/http.py b/ddtrace/profiling/exporter/http.py index 6700e584ade..b4ec6994d72 100644 --- a/ddtrace/profiling/exporter/http.py +++ b/ddtrace/profiling/exporter/http.py @@ -220,8 +220,18 @@ def export( "family": "python", "attachments": [item["filename"].decode("utf-8") for item in data], "tags_profiler": self._get_tags(service), - "start": (datetime.datetime.utcfromtimestamp(start_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"), - "end": (datetime.datetime.utcfromtimestamp(end_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"), + "start": ( + datetime.datetime.fromtimestamp(start_time_ns / 1e9, tz=datetime.timezone.utc) + .replace(microsecond=0) + .isoformat()[0:-6] # removes the trailing +00:00 portion of the time + + "Z" + ), + "end": ( + datetime.datetime.fromtimestamp(end_time_ns / 1e9, tz=datetime.timezone.utc) + .replace(microsecond=0) + .isoformat()[0:-6] # removes the trailing +00:00 portion of the time + + "Z" + ), } # type: Dict[str, Any] if self.endpoint_call_counter_span_processor is not None: diff --git a/ddtrace/propagation/http.py b/ddtrace/propagation/http.py index a1664664ace..563ee838d84 100644 --- a/ddtrace/propagation/http.py +++ b/ddtrace/propagation/http.py @@ -101,6 +101,7 @@ def _possible_header(header): _POSSIBLE_HTTP_HEADER_B3_FLAGS = _possible_header(_HTTP_HEADER_B3_FLAGS) _POSSIBLE_HTTP_HEADER_TRACEPARENT = _possible_header(_HTTP_HEADER_TRACEPARENT) _POSSIBLE_HTTP_HEADER_TRACESTATE = _possible_header(_HTTP_HEADER_TRACESTATE) +_POSSIBLE_HTTP_BAGGAGE_HEADER = _possible_header(_HTTP_HEADER_BAGGAGE) # https://www.w3.org/TR/trace-context/#traceparent-header-field-values @@ -937,7 +938,7 @@ def _inject(span_context: Context, headers: Dict[str, str]) -> None: @staticmethod def _extract(headers: Dict[str, str]) -> Context: - header_value = headers.get(_HTTP_HEADER_BAGGAGE) + header_value = 
_extract_header_value(_POSSIBLE_HTTP_BAGGAGE_HEADER, headers) if not header_value: return Context(baggage={}) diff --git a/hatch.toml b/hatch.toml index 38471028f8f..b6555885ad0 100644 --- a/hatch.toml +++ b/hatch.toml @@ -214,11 +214,11 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] django = ["~=2.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9", "3.10"] +python = ["3.8", "3.9", "3.10"] django = ["~=3.2"] [[envs.appsec_threats_django.matrix]] @@ -226,11 +226,11 @@ python = ["3.8", "3.10"] django = ["==4.0.10"] [[envs.appsec_threats_django.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.11", "3.13"] django = ["~=4.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.10", "3.12"] +python = ["3.10", "3.13"] django = ["~=5.1"] @@ -262,21 +262,21 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_flask.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] flask = ["~=1.1"] markupsafe = ["~=1.1"] [[envs.appsec_threats_flask.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] flask = ["==2.1.3"] werkzeug = ["<3.0"] [[envs.appsec_threats_flask.matrix]] -python = ["3.8", "3.9", "3.12"] +python = ["3.8", "3.10", "3.13"] flask = ["~=2.3"] [[envs.appsec_threats_flask.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.11", "3.13"] flask = ["~=3.0"] ## ASM Native IAST module @@ -327,16 +327,16 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_fastapi.matrix]] -python = ["3.7", "3.9", "3.11"] +python = ["3.8", "3.10", "3.13"] fastapi = ["==0.86.0"] anyio = ["==3.7.1"] [[envs.appsec_threats_fastapi.matrix]] -python = ["3.7", "3.9", "3.12"] +python = ["3.8", "3.10", "3.13"] fastapi = ["==0.94.1"] [[envs.appsec_threats_fastapi.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.10", "3.13"] fastapi = ["~=0.114.2"] diff --git a/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml b/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml new file mode 100644 index 00000000000..ad0eacb28e8 --- /dev/null +++ b/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + tracer: This fix resolves an issue where baggage header extraction was case sensitive and didn't accept the header prepended with HTTP. + Now the baggage header will be extracted regardless of casing and the HTTP format. + diff --git a/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml b/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml new file mode 100644 index 00000000000..afaf95876d5 --- /dev/null +++ b/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + openai: Introduces automatic extraction of token usage from streamed chat completions. + Unless ``stream_options: {"include_usage": False}`` is explicitly set on your streamed chat completion request, + the OpenAI integration will add ``stream_options: {"include_usage": True}`` to your request and automatically extract the token usage chunk from the streamed response. 
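A minimal usage sketch of the streamed token-usage behavior described in the release note above; the model name and prompt are illustrative and not taken from this patch:

    import openai

    client = openai.OpenAI()

    # No stream_options passed: with the ddtrace OpenAI integration enabled, the request is
    # expected to go out with stream_options={"include_usage": True} so the final usage chunk
    # can be read to report token metrics.
    stream = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Who won the world series in 2020?"}],
        stream=True,
    )
    for chunk in stream:
        pass

    # Explicitly opting out is expected to leave the request unchanged, and no usage chunk
    # is extracted from the streamed response.
    stream = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Who won the world series in 2020?"}],
        stream=True,
        stream_options={"include_usage": False},
    )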
diff --git a/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml b/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml new file mode 100644 index 00000000000..4d77fddb710 --- /dev/null +++ b/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + exception replay: include missing nonlocal variables in snapshot log messages. diff --git a/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml b/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml new file mode 100644 index 00000000000..702e2538b99 --- /dev/null +++ b/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + LLM Observability: Resolves an issue where enabling LLM Observability in agentless mode would result in traces also being sent to the agent proxy endpoint. \ No newline at end of file diff --git a/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml new file mode 100644 index 00000000000..5912a415022 --- /dev/null +++ b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + LLM Observability: Resolves an issue where configuring custom trace filters/processors onto the tracer would disable LLM Observability. + Note that if LLM Observability is enabled in agentless mode, writing APM traces must be explicitly disabled by setting `DD_TRACE_ENABLED=0`. diff --git a/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml new file mode 100644 index 00000000000..de86c8227b6 --- /dev/null +++ b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + profiling: This fix resolves a data race issue accessing lock's acquired + time, leading to an ``AttributeError``: ``_Profiled_ThreadingLock`` object + has no attribute ``self_acquired_at`` diff --git a/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml b/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml new file mode 100644 index 00000000000..89744bf9be2 --- /dev/null +++ b/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + ASM: This introduces the support for command injection for Exploit Prevention. With previous support of shell injection with os.system, + this provides automatic instrumentation for subprocess module functions and os.spawn* functions, + ensuring monitoring and blocking for Exploit Prevention on those endpoints. diff --git a/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml b/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml new file mode 100644 index 00000000000..c2e4b25f255 --- /dev/null +++ b/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml @@ -0,0 +1,17 @@ +--- +features: + - | + LLM Observability: This introduces the `LLMObs.submit_evaluation_for` method, which provides the ability to join a custom evaluation + to a span using a tag key-value pair on the span. The tag key-value pair is expected to uniquely identify a single span. + Tag-based joining is an alternative to the existing method of joining evaluations to spans using trace and span IDs. + Example usage: + - Evaluation joined by tag: `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": "message_id", "tag_value": "dummy_message_id"}, label="rating", ...)`. 
+ - Evaluation joined by trace/span ID: `LLMObs.submit_evaluation_for(span={"trace_id": "...", "span_id": "..."}, label="rating", ...)`. +deprecations: + - | + LLM Observability: `LLMObs.submit_evaluation` is deprecated and will be removed in ddtrace 3.0.0. + As an alternative to `LLMObs.submit_evaluation`, you can use `LLMObs.submit_evaluation_for` instead. + To migrate, replace `LLMObs.submit_evaluation(span_context={"span_id": ..., "trace_id": ...}, ...)` with: + `LLMObs.submit_evaluation_for(span={"span_id": ..., "trace_id": ...}, ...) + You may also join an evaluation to a span using a tag key-value pair like so: + `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": ..., "tag_val": ...}, ...)`. diff --git a/riotfile.py b/riotfile.py index 3f62a0cce06..7365c481e7c 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2956,8 +2956,8 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT name="llmobs", command="pytest {cmdargs} tests/llmobs", pkgs={"vcrpy": latest, "pytest-asyncio": "==0.21.1"}, - pys=select_pys(min_version="3.7"), venvs=[ + Venv(pys="3.7"), Venv(pys=select_pys(min_version="3.8"), pkgs={"ragas": "==0.1.21", "langchain": latest}), ], ), diff --git a/scripts/gen_circleci_config.py b/scripts/gen_circleci_config.py index bc51f2c5519..3a68a1a7975 100644 --- a/scripts/gen_circleci_config.py +++ b/scripts/gen_circleci_config.py @@ -17,10 +17,13 @@ def gen_required_suites(template: dict) -> None: required_suites = template["requires_tests"]["requires"] = [] for_each_testrun_needed( suites=sorted( - set(n.rpartition("::")[-1] for n, s in get_suites().items() if not s.get("skip", False)) - & set(template["jobs"].keys()) + set( + n + for n, s in get_suites().items() + if not s.get("skip", False) and n.rpartition("::")[-1] in template["jobs"] + ) ), - action=lambda suite: required_suites.append(suite), + action=lambda suite: required_suites.append(suite.rpartition("::")[-1]), git_selections=extract_git_commit_selections(os.getenv("GIT_COMMIT_DESC", "")), ) diff --git a/tests/appsec/appsec/rules-rasp-blocking.json b/tests/appsec/appsec/rules-rasp-blocking.json index f2f8c4d7955..e5038e4a7c2 100644 --- a/tests/appsec/appsec/rules-rasp-blocking.json +++ b/tests/appsec/appsec/rules-rasp-blocking.json @@ -201,6 +201,55 @@ "stack_trace", "block" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace", + "block" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp-disabled.json b/tests/appsec/appsec/rules-rasp-disabled.json index 4a0943a34fb..ec67b186732 100644 --- a/tests/appsec/appsec/rules-rasp-disabled.json +++ b/tests/appsec/appsec/rules-rasp-disabled.json @@ -201,6 +201,55 @@ "on_match": [ "stack_trace" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "enabled": false, + "tags": { + "type": 
"command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp-redirecting.json b/tests/appsec/appsec/rules-rasp-redirecting.json index a7a53db6e3b..6e2080b2dbf 100644 --- a/tests/appsec/appsec/rules-rasp-redirecting.json +++ b/tests/appsec/appsec/rules-rasp-redirecting.json @@ -211,6 +211,55 @@ "stack_trace", "block" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace", + "block" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp.json b/tests/appsec/appsec/rules-rasp.json index c1a6822d261..d73672392af 100644 --- a/tests/appsec/appsec/rules-rasp.json +++ b/tests/appsec/appsec/rules-rasp.json @@ -197,6 +197,54 @@ "on_match": [ "stack_trace" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/test_remoteconfiguration.py b/tests/appsec/appsec/test_remoteconfiguration.py index f00167706dc..1d2c47bc190 100644 --- a/tests/appsec/appsec/test_remoteconfiguration.py +++ b/tests/appsec/appsec/test_remoteconfiguration.py @@ -117,7 +117,7 @@ def test_rc_activation_states_off(tracer, appsec_enabled, rc_value, remote_confi @pytest.mark.parametrize( "rc_enabled, appsec_enabled, capability", [ - (True, "true", "D4HkA/w="), # All capabilities except ASM_ACTIVATION + (True, "true", "L4HkA/w="), # All capabilities except ASM_ACTIVATION (False, "true", ""), (True, "false", "gAAAAA=="), (False, "false", ""), @@ -142,7 +142,7 @@ def 
test_rc_capabilities(rc_enabled, appsec_enabled, capability, tracer): @pytest.mark.parametrize( "env_rules, expected", [ - ({}, "D4HkA/4="), # All capabilities + ({}, "L4HkA/4="), # All capabilities ({"_asm_static_rule_file": DEFAULT.RULES}, "gAAAAg=="), # Only ASM_FEATURES ], ) diff --git a/tests/appsec/contrib_appsec/django_app/urls.py b/tests/appsec/contrib_appsec/django_app/urls.py index 77ad7a7f0a6..aaff69169b5 100644 --- a/tests/appsec/contrib_appsec/django_app/urls.py +++ b/tests/appsec/contrib_appsec/django_app/urls.py @@ -1,5 +1,6 @@ import os import sqlite3 +import subprocess import tempfile import django @@ -129,13 +130,33 @@ def rasp(request, endpoint: str): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return HttpResponse("<\\br>\n".join(res)) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return HttpResponse("<\\br>\n".join(res)) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/fastapi_app/app.py b/tests/appsec/contrib_appsec/fastapi_app/app.py index 10b7b430543..c5b765c4bbb 100644 --- a/tests/appsec/contrib_appsec/fastapi_app/app.py +++ b/tests/appsec/contrib_appsec/fastapi_app/app.py @@ -1,6 +1,7 @@ import asyncio import os import sqlite3 +import subprocess from typing import Optional from fastapi import FastAPI @@ -178,13 +179,33 @@ async def rasp(endpoint: str, request: Request): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return HTMLResponse("<\\br>\n".join(res)) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return HTMLResponse("<\\br>\n".join(res)) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except 
Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/flask_app/app.py b/tests/appsec/contrib_appsec/flask_app/app.py index 5270229d3e9..939a7cad678 100644 --- a/tests/appsec/contrib_appsec/flask_app/app.py +++ b/tests/appsec/contrib_appsec/flask_app/app.py @@ -1,5 +1,6 @@ import os import sqlite3 +import subprocess from typing import Optional from flask import Flask @@ -126,13 +127,33 @@ def rasp(endpoint: str): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return "<\\br>\n".join(res) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return "<\\br>\n".join(res) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/utils.py b/tests/appsec/contrib_appsec/utils.py index 315caa49a5d..d3691e2bea3 100644 --- a/tests/appsec/contrib_appsec/utils.py +++ b/tests/appsec/contrib_appsec/utils.py @@ -1308,11 +1308,19 @@ def test_stream_response( + [("sql_injection", "user_id_1=1 OR 1=1&user_id_2=1 OR 1=1", "rasp-942-100", ("dispatch",))] + [ ( - "command_injection", - "cmd_1=$(cat /etc/passwd 1>%262 ; echo .)&cmd_2=$(uname -a 1>%262 ; echo .)", + "shell_injection", + "cmdsys_1=$(cat /etc/passwd 1>%262 ; echo .)&cmdrun_2=$(uname -a 1>%262 ; echo .)", "rasp-932-100", ("system", "rasp"), ) + ] + + [ + ( + "command_injection", + "cmda_1=/sbin/ping&cmds_2=/usr/bin/ls%20-la", + "rasp-932-110", + ("Popen", "rasp"), + ) ], ) @pytest.mark.parametrize( @@ -1381,11 +1389,23 @@ def validate_top_function(trace): trace ), f"unknown top function {trace['frames'][0]} {[t['function'] for t in trace['frames'][:4]]}" # assert mocked.call_args_list == [] + expected_rule_type = "command_injection" if endpoint == "shell_injection" else endpoint + expected_variant = ( + "exec" if endpoint == "command_injection" else "shell" if endpoint == "shell_injection" else None + ) matches = [t for c, n, t in telemetry_calls if c == "CountMetric" and n == "appsec.rasp.rule.match"] - assert matches == [(("rule_type", endpoint), ("waf_version", DDWAF_VERSION))], matches + if expected_variant: + expected_tags = ( + ("rule_type", expected_rule_type), + ("rule_variant", expected_variant), + ("waf_version", DDWAF_VERSION), + ) + else: + expected_tags = (("rule_type", expected_rule_type), ("waf_version", DDWAF_VERSION)) + assert matches == [expected_tags], matches evals = [t for c, n, t in telemetry_calls if c == "CountMetric" and n == "appsec.rasp.rule.eval"] # there may have been multiple 
evaluations of other rules too - assert (("rule_type", endpoint), ("waf_version", DDWAF_VERSION)) in evals + assert expected_tags in evals if action_level == 2: assert get_tag("rasp.request.done") is None, get_tag("rasp.request.done") else: @@ -1509,7 +1529,7 @@ def test_fingerprinting(self, interface, root_span, get_tag, asm_enabled, user_a def test_iast(self, interface, root_span, get_tag): from ddtrace.ext import http - url = "/rasp/command_injection/?cmd=." + url = "/rasp/command_injection/?cmds=." self.update_tracer(interface) response = interface.client.get(url) assert self.status(response) == 200 diff --git a/tests/appsec/iast/_ast/test_ast_patching.py b/tests/appsec/iast/_ast/test_ast_patching.py index cf0fabd14e4..d014496942b 100644 --- a/tests/appsec/iast/_ast/test_ast_patching.py +++ b/tests/appsec/iast/_ast/test_ast_patching.py @@ -9,7 +9,9 @@ from ddtrace.appsec._constants import IAST from ddtrace.appsec._iast._ast.ast_patching import _in_python_stdlib from ddtrace.appsec._iast._ast.ast_patching import _should_iast_patch +from ddtrace.appsec._iast._ast.ast_patching import _trie_has_prefix_for from ddtrace.appsec._iast._ast.ast_patching import astpatch_module +from ddtrace.appsec._iast._ast.ast_patching import build_trie from ddtrace.appsec._iast._ast.ast_patching import visit_ast from ddtrace.internal.utils.formats import asbool from tests.utils import override_env @@ -308,3 +310,87 @@ def test_astpatch_dir_patched_with_or_without_custom_dir(module_name, expected_n # Check that all the symbols in the expected set are in the patched dir() result for name in expected_names: assert name in patched_dir + + +def test_build_trie(): + from ddtrace.appsec._iast._ast.ast_patching import build_trie + + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert dict(trie) == { + "a": { + "b": { + "c": {"": None}, + }, + }, + "d": { + "e": { + "f": {"": None}, + }, + }, + "g": { + "h": { + "i": {"": None}, + }, + }, + "j": { + "k": { + "l": {"": None}, + }, + }, + "m": { + "n": { + "o": {"": None}, + }, + }, + "p": { + "q": { + "r": {"": None}, + }, + }, + "s": { + "t": { + "u": {"": None}, + }, + }, + "v": { + "w": { + "x": {"": None}, + }, + }, + "y": { + "z": {"": None}, + }, + } + + +def test_trie_has_string_match(): + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert _trie_has_prefix_for(trie, "abc") + assert not _trie_has_prefix_for(trie, "ab") + assert _trie_has_prefix_for(trie, "abcd") + assert _trie_has_prefix_for(trie, "def") + assert not _trie_has_prefix_for(trie, "de") + assert _trie_has_prefix_for(trie, "defg") + assert _trie_has_prefix_for(trie, "ghi") + assert not _trie_has_prefix_for(trie, "gh") + assert _trie_has_prefix_for(trie, "ghij") + assert _trie_has_prefix_for(trie, "jkl") + assert not _trie_has_prefix_for(trie, "jk") + assert _trie_has_prefix_for(trie, "jklm") + assert _trie_has_prefix_for(trie, "mno") + assert not _trie_has_prefix_for(trie, "mn") + assert _trie_has_prefix_for(trie, "mnop") + assert _trie_has_prefix_for(trie, "pqr") + assert not _trie_has_prefix_for(trie, "pq") + assert _trie_has_prefix_for(trie, "pqrs") + assert _trie_has_prefix_for(trie, "stu") + assert not _trie_has_prefix_for(trie, "st") + assert _trie_has_prefix_for(trie, "stuv") + assert _trie_has_prefix_for(trie, "vwx") + assert not _trie_has_prefix_for(trie, "vw") + assert _trie_has_prefix_for(trie, "vwxy") + assert _trie_has_prefix_for(trie, "yz") + assert not _trie_has_prefix_for(trie, "y") + assert 
_trie_has_prefix_for(trie, "yza") + assert not _trie_has_prefix_for(trie, "z") + assert not _trie_has_prefix_for(trie, "zzz") diff --git a/tests/appsec/iast/taint_sinks/test_command_injection.py b/tests/appsec/iast/taint_sinks/test_command_injection.py index b716f594e85..ab611c1969b 100644 --- a/tests/appsec/iast/taint_sinks/test_command_injection.py +++ b/tests/appsec/iast/taint_sinks/test_command_injection.py @@ -123,7 +123,7 @@ def test_popen_wait_shell_true(iast_context_defaults): _assert_vulnerability("test_popen_wait_shell_true", source_name=source_name) -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") @pytest.mark.parametrize( "function,mode,arguments,tag", [ @@ -156,11 +156,11 @@ def test_osspawn_variants(iast_context_defaults, function, mode, arguments, tag) if "spawnv" in cleaned_name: # label test_osspawn_variants2 - function(mode, copied_args[0], copied_args) + function(mode, copied_args[0], copied_args[1:]) label = "test_osspawn_variants2" else: # label test_osspawn_variants1 - function(mode, copied_args[0], *copied_args) + function(mode, copied_args[0], *copied_args[1:]) label = "test_osspawn_variants1" _assert_vulnerability( @@ -171,7 +171,7 @@ def test_osspawn_variants(iast_context_defaults, function, mode, arguments, tag) ) -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") def test_multiple_cmdi(iast_context_defaults): _BAD_DIR = taint_pyobject( pyobject=_BAD_DIR_DEFAULT, @@ -193,7 +193,7 @@ def test_multiple_cmdi(iast_context_defaults): assert len(list(data["vulnerabilities"])) == 2 -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") def test_string_cmdi(iast_context_defaults): cmd = taint_pyobject( pyobject="dir -l .", diff --git a/tests/appsec/iast_packages/packages/pkg_pyjwt.py b/tests/appsec/iast_packages/packages/pkg_pyjwt.py index 4712f6cee0f..ec43d8a17d2 100644 --- a/tests/appsec/iast_packages/packages/pkg_pyjwt.py +++ b/tests/appsec/iast_packages/packages/pkg_pyjwt.py @@ -3,6 +3,7 @@ https://pypi.org/project/PyJWT/ """ + import datetime from flask import Blueprint @@ -25,7 +26,10 @@ def pkg_pyjwt_view(): secret_key = "your-256-bit-secret" user_payload = request.args.get("package_param", "default-user") - payload = {"user": user_payload, "exp": datetime.datetime.utcnow() + datetime.timedelta(seconds=30)} + payload = { + "user": user_payload, + "exp": datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(seconds=30), + } try: # Encode the payload to create a JWT diff --git a/tests/contrib/elasticsearch/test_elasticsearch.py b/tests/contrib/elasticsearch/test_elasticsearch.py index b80b4486e71..4a480c550c8 100644 --- a/tests/contrib/elasticsearch/test_elasticsearch.py +++ b/tests/contrib/elasticsearch/test_elasticsearch.py @@ -1,6 +1,7 @@ import datetime from http.client import HTTPConnection from importlib import import_module +import json import time import pytest @@ -167,7 +168,12 @@ def test_elasticsearch(self): es.index(id=10, body={"name": "ten", "created": datetime.date(2016, 1, 1)}, **args) es.index(id=11, body={"name": "eleven", "created": datetime.date(2016, 2, 1)}, **args) es.index(id=12, body={"name": "twelve", "created": datetime.date(2016, 3, 1)}, **args) - result = es.search(sort=["name:desc"], size=100, body={"query": 
{"match_all": {}}}, **args) + result = es.search( + sort={"name": {"order": "desc", "unmapped_type": "keyword"}}, + size=100, + body={"query": {"match_all": {}}}, + **args, + ) assert len(result["hits"]["hits"]) == 3, result spans = self.get_spans() @@ -183,13 +189,25 @@ def test_elasticsearch(self): assert url.endswith("/_search") assert url == span.get_tag("elasticsearch.url") if elasticsearch.__version__ >= (8, 0, 0): - assert span.get_tag("elasticsearch.body").replace(" ", "") == '{"query":{"match_all":{}},"size":100}' - assert set(span.get_tag("elasticsearch.params").split("&")) == {"sort=name%3Adesc"} - assert set(span.get_tag(http.QUERY_STRING).split("&")) == {"sort=name%3Adesc"} + # Key order is not consistent, parse into dict to compare + body = json.loads(span.get_tag("elasticsearch.body")) + assert body == { + "query": {"match_all": {}}, + "sort": {"name": {"order": "desc", "unmapped_type": "keyword"}}, + "size": 100, + } + assert not span.get_tag("elasticsearch.params") + assert not span.get_tag(http.QUERY_STRING) else: assert span.get_tag("elasticsearch.body").replace(" ", "") == '{"query":{"match_all":{}}}' - assert set(span.get_tag("elasticsearch.params").split("&")) == {"sort=name%3Adesc", "size=100"} - assert set(span.get_tag(http.QUERY_STRING).split("&")) == {"sort=name%3Adesc", "size=100"} + assert set(span.get_tag("elasticsearch.params").split("&")) == { + "sort=%7B%27name%27%3A+%7B%27order%27%3A+%27desc%27%2C+%27unmapped_type%27%3A+%27keyword%27%7D%7D", + "size=100", + } + assert set(span.get_tag(http.QUERY_STRING).split("&")) == { + "sort=%7B%27name%27%3A+%7B%27order%27%3A+%27desc%27%2C+%27unmapped_type%27%3A+%27keyword%27%7D%7D", + "size=100", + } assert span.get_tag("component") == "elasticsearch" assert span.get_tag("span.kind") == "client" diff --git a/tests/contrib/openai/test_openai_llmobs.py b/tests/contrib/openai/test_openai_llmobs.py index a1a2b93a5ca..a145877c8c8 100644 --- a/tests/contrib/openai/test_openai_llmobs.py +++ b/tests/contrib/openai/test_openai_llmobs.py @@ -518,11 +518,17 @@ async def test_chat_completion_azure_async( ) ) - def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer): + @pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" + ) + def test_chat_completion_stream_explicit_no_tokens( + self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer + ): """Ensure llmobs records are emitted for chat completion endpoints when configured. Also ensure the llmobs records have the correct tagging including trace/span ID for trace correlation. """ + with get_openai_vcr(subdirectory_name="v1").use_cassette("chat_completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: with mock.patch("ddtrace.contrib.internal.openai.utils._est_tokens") as mock_est: @@ -534,7 +540,11 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs expected_completion = "The Los Angeles Dodgers won the World Series in 2020." 
client = openai.OpenAI() resp = client.chat.completions.create( - model=model, messages=input_messages, stream=True, user="ddtrace-test" + model=model, + messages=input_messages, + stream=True, + user="ddtrace-test", + stream_options={"include_usage": False}, ) for chunk in resp: resp_model = chunk.model @@ -547,7 +557,7 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs model_provider="openai", input_messages=input_messages, output_messages=[{"content": expected_completion, "role": "assistant"}], - metadata={"stream": True, "user": "ddtrace-test"}, + metadata={"stream": True, "stream_options": {"include_usage": False}, "user": "ddtrace-test"}, token_metrics={"input_tokens": 8, "output_tokens": 8, "total_tokens": 16}, tags={"ml_app": "", "service": "tests.contrib.openai"}, ) @@ -557,20 +567,14 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available in 1.26.0+" ) def test_chat_completion_stream_tokens(self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer): - """ - Ensure llmobs records are emitted for chat completion endpoints when configured - with the `stream_options={"include_usage": True}`. - Also ensure the llmobs records have the correct tagging including trace/span ID for trace correlation. - """ + """Assert that streamed token chunk extraction logic works when options are not explicitly passed from user.""" with get_openai_vcr(subdirectory_name="v1").use_cassette("chat_completion_streamed_tokens.yaml"): model = "gpt-3.5-turbo" resp_model = model input_messages = [{"role": "user", "content": "Who won the world series in 2020?"}] expected_completion = "The Los Angeles Dodgers won the World Series in 2020." client = openai.OpenAI() - resp = client.chat.completions.create( - model=model, messages=input_messages, stream=True, stream_options={"include_usage": True} - ) + resp = client.chat.completions.create(model=model, messages=input_messages, stream=True) for chunk in resp: resp_model = chunk.model span = mock_tracer.pop_traces()[0][0] @@ -671,7 +675,6 @@ def test_chat_completion_tool_call_stream(self, openai, ddtrace_global_config, m messages=[{"role": "user", "content": chat_completion_input_description}], user="ddtrace-test", stream=True, - stream_options={"include_usage": True}, ) for chunk in resp: resp_model = chunk.model diff --git a/tests/contrib/openai/test_openai_v1.py b/tests/contrib/openai/test_openai_v1.py index f13de144fc5..91737d9e5eb 100644 --- a/tests/contrib/openai/test_openai_v1.py +++ b/tests/contrib/openai/test_openai_v1.py @@ -921,128 +921,78 @@ def test_span_finish_on_stream_error(openai, openai_vcr, snapshot_tracer): ) -def test_completion_stream(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.snapshot +@pytest.mark.skipif(TIKTOKEN_AVAILABLE, reason="This test estimates token counts") +def test_completion_stream_est_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.OpenAI() resp = client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) - chunks = [c for c in resp] - - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion + _ = [c for c in resp] - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="This test computes token counts using tiktoken") +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +def test_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): + with openai_vcr.use_cassette("completion_streamed.yaml"): + with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: + mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] + client = openai.OpenAI() + resp = client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) + _ = [c for c in resp] -async def test_completion_async_stream(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="This test computes token counts using tiktoken") +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +async def test_completion_async_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.AsyncOpenAI() - resp = await client.completions.create(model="ada", prompt="Hello world", stream=True) - chunks = [c async for c in resp] - - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion - - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls + resp = await client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) + _ = [c async for c in resp] @pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 6, 0), + parse_version(openai_module.version.VERSION) < (1, 6, 0) or not TIKTOKEN_AVAILABLE, reason="Streamed response context managers are only available v1.6.0+", ) -def test_completion_stream_context_manager(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +def test_completion_stream_context_manager(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.OpenAI() with client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) as resp: - chunks = [c for c in resp] + _ = [c for c in resp] - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion - - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" +) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") +def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): + """Assert that streamed token chunk extraction logic works automatically.""" + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): + with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: + mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] + client = openai.OpenAI() + resp = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Who won the world series in 2020?"}], + stream=True, + user="ddtrace-test", + n=None, + ) + _ = [c for c in resp] -def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" +) +def test_chat_completion_stream_explicit_no_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): + """Assert that streamed token chunk extraction logic is avoided if explicitly set to False by the user.""" with openai_vcr.use_cassette("chat_completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] @@ -1054,20 +1004,16 @@ def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_trace {"role": "user", "content": "Who won the world series in 2020?"}, ], stream=True, + stream_options={"include_usage": False}, user="ddtrace-test", n=None, ) - prompt_tokens = 8 span = snapshot_tracer.current_span() chunks = [c for c in resp] assert len(chunks) == 15 completion = "".join([c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None]) assert completion == expected_completion - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") 
== "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - expected_tags = [ "version:", "env:", @@ -1087,16 +1033,19 @@ def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_trace expected_tags += ["openai.estimated:true"] if TIKTOKEN_AVAILABLE: expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls + assert mock.call.distribution("tokens.prompt", 8, tags=expected_tags) in mock_metrics.mock_calls assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available in 1.26.0+" +) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") async def test_chat_completion_async_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed.yaml"): + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." client = openai.AsyncOpenAI() resp = await client.chat.completions.create( model="gpt-3.5-turbo", @@ -1104,99 +1053,21 @@ async def test_chat_completion_async_stream(openai, openai_vcr, mock_metrics, sn {"role": "user", "content": "Who won the world series in 2020?"}, ], stream=True, + n=None, user="ddtrace-test", ) - prompt_tokens = 8 - span = snapshot_tracer.current_span() - chunks = [c async for c in resp] - assert len(chunks) == 15 - completion = "".join([c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None]) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - expected_tags += ["openai.estimated:true"] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - - -@pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available 
in 1.26.0+" -) -def test_chat_completion_stream_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." - client = openai.OpenAI() - resp = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Who won the world series in 2020?"}], - stream=True, - user="ddtrace-test", - n=None, - stream_options={"include_usage": True}, - ) - span = snapshot_tracer.current_span() - chunks = [c for c in resp] - completion = "".join( - [c.choices[0].delta.content for c in chunks if c.choices and c.choices[0].delta.content is not None] - ) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.prompt", 17, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", 19, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", 36, tags=expected_tags) in mock_metrics.mock_calls + _ = [c async for c in resp] @pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 6, 0), - reason="Streamed response context managers are only available v1.6.0+", + parse_version(openai_module.version.VERSION) < (1, 26, 0), + reason="Streamed response context managers are only available v1.6.0+, tokens available 1.26.0+", ) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") async def test_chat_completion_async_stream_context_manager(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed.yaml"): + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." 
client = openai.AsyncOpenAI() async with await client.chat.completions.create( model="gpt-3.5-turbo", @@ -1207,41 +1078,7 @@ async def test_chat_completion_async_stream_context_manager(openai, openai_vcr, user="ddtrace-test", n=None, ) as resp: - prompt_tokens = 8 - span = snapshot_tracer.current_span() - chunks = [c async for c in resp] - assert len(chunks) == 15 - completion = "".join( - [c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None] - ) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - expected_tags += ["openai.estimated:true"] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls + _ = [c async for c in resp] @pytest.mark.snapshot( diff --git a/tests/contrib/urllib3/test_urllib3.py b/tests/contrib/urllib3/test_urllib3.py index 2f0c447ee65..841e2c826ab 100644 --- a/tests/contrib/urllib3/test_urllib3.py +++ b/tests/contrib/urllib3/test_urllib3.py @@ -12,6 +12,7 @@ from ddtrace.ext import http from ddtrace.internal.schema import DEFAULT_SPAN_SERVICE_NAME from ddtrace.pin import Pin +from ddtrace.settings.asm import config as asm_config from tests.contrib.config import HTTPBIN_CONFIG from tests.opentracer.utils import init_tracer from tests.utils import TracerTestCase @@ -527,12 +528,16 @@ def test_distributed_tracing_disabled(self): timeout=mock.ANY, ) + @pytest.mark.skip(reason="urlib3 does not set the ASM Manual keep tag so x-datadog headers are not propagated") def test_distributed_tracing_apm_opt_out_true(self): """Tests distributed tracing headers are passed by default""" # Check that distributed tracing headers are passed down; raise an error rather than make the # request since we don't care about the response at all config.urllib3["distributed_tracing"] = True self.tracer.enabled = False + # Ensure the ASM SpanProcessor is set + self.tracer.configure(appsec_standalone_enabled=True, appsec_enabled=True) + assert asm_config._apm_opt_out with mock.patch( "urllib3.connectionpool.HTTPConnectionPool._make_request", side_effect=ValueError ) as m_make_request: @@ -580,6 +585,9 @@ def test_distributed_tracing_apm_opt_out_false(self): """Test with distributed tracing disabled does not propagate the headers""" config.urllib3["distributed_tracing"] = True self.tracer.enabled = False + # Ensure the ASM SpanProcessor is set. 
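        # With appsec_standalone_enabled=False the tracer presumably stays out of APM opt-out
        # mode (asm_config._apm_opt_out remains False), so the x-datadog propagation headers
        # are not expected on the outgoing request in this test.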
+ self.tracer.configure(appsec_standalone_enabled=False, appsec_enabled=True) + assert not asm_config._apm_opt_out with mock.patch( "urllib3.connectionpool.HTTPConnectionPool._make_request", side_effect=ValueError ) as m_make_request: diff --git a/tests/debugging/exception/test_replay.py b/tests/debugging/exception/test_replay.py index 9aae75dae47..8261bfb5b47 100644 --- a/tests/debugging/exception/test_replay.py +++ b/tests/debugging/exception/test_replay.py @@ -294,3 +294,23 @@ def c(foo=42): self.assert_span_count(6) # no new snapshots assert len(uploader.collector.queue) == 3 + + def test_debugger_exception_in_closure(self): + def b(): + with self.trace("b"): + nonloc = 4 + + def a(v): + if nonloc: + raise ValueError("hello", v) + + a(nonloc) + + with exception_replay() as uploader: + with with_rate_limiter(RateLimiter(limit_rate=1, raise_on_exceed=False)): + with pytest.raises(ValueError): + b() + + assert all( + s.line_capture["locals"]["nonloc"] == {"type": "int", "value": "4"} for s in uploader.collector.queue + ) diff --git a/tests/debugging/test_safety.py b/tests/debugging/test_safety.py index 3acb0288924..cc44ca9ca12 100644 --- a/tests/debugging/test_safety.py +++ b/tests/debugging/test_safety.py @@ -15,7 +15,10 @@ def assert_args(args): assert set(dict(_safety.get_args(inspect.currentframe().f_back)).keys()) == args def assert_locals(_locals): - assert set(dict(_safety.get_locals(inspect.currentframe().f_back)).keys()) == _locals + assert set(dict(_safety.get_locals(inspect.currentframe().f_back)).keys()) == _locals | { + "assert_args", + "assert_locals", + } def assert_globals(_globals): assert set(dict(_safety.get_globals(inspect.currentframe().f_back)).keys()) == _globals diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 0ecdde36ee6..4e60a8f3996 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -210,11 +210,13 @@ def _get_llmobs_parent_id(span: Span): def _expected_llmobs_eval_metric_event( - span_id, - trace_id, metric_type, label, ml_app, + tag_key=None, + tag_value=None, + span_id=None, + trace_id=None, timestamp_ms=None, categorical_value=None, score_value=None, @@ -223,8 +225,7 @@ def _expected_llmobs_eval_metric_event( metadata=None, ): eval_metric_event = { - "span_id": span_id, - "trace_id": trace_id, + "join_on": {}, "metric_type": metric_type, "label": label, "tags": [ @@ -232,6 +233,10 @@ def _expected_llmobs_eval_metric_event( "ml_app:{}".format(ml_app if ml_app is not None else "unnamed-ml-app"), ], } + if tag_key is not None and tag_value is not None: + eval_metric_event["join_on"]["tag"] = {"key": tag_key, "value": tag_value} + if span_id is not None and trace_id is not None: + eval_metric_event["join_on"]["span"] = {"span_id": span_id, "trace_id": trace_id} if categorical_value is not None: eval_metric_event["categorical_value"] = categorical_value if score_value is not None: @@ -542,8 +547,7 @@ def run_and_submit_evaluation(self, span): def _dummy_evaluator_eval_metric_event(span_id, trace_id): return LLMObsEvaluationMetricEvent( - span_id=span_id, - trace_id=trace_id, + join_on={"span": {"span_id": span_id, "trace_id": trace_id}}, score_value=1.0, ml_app="unnamed-ml-app", timestamp_ms=mock.ANY, @@ -553,7 +557,46 @@ def _dummy_evaluator_eval_metric_event(span_id, trace_id): ) -def _expected_ragas_spans(ragas_inputs=None): +def _expected_ragas_context_precision_spans(ragas_inputs=None): + if not ragas_inputs: + ragas_inputs = default_ragas_inputs + return [ + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": 
"undefined", + "name": "dd-ragas.context_precision", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": "1.0"}, + }, + "metrics": {}, + "tags": expected_ragas_trace_tags(), + }, + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": mock.ANY, + "name": "dd-ragas.extract_evaluation_inputs_from_span", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": mock.ANY}, + }, + "metrics": {}, + "tags": expected_ragas_trace_tags(), + }, + ] + + +def _expected_ragas_faithfulness_spans(ragas_inputs=None): if not ragas_inputs: ragas_inputs = default_ragas_inputs return [ @@ -581,7 +624,7 @@ def _expected_ragas_spans(ragas_inputs=None): "trace_id": mock.ANY, "span_id": mock.ANY, "parent_id": mock.ANY, - "name": "dd-ragas.extract_faithfulness_inputs", + "name": "dd-ragas.extract_evaluation_inputs_from_span", "start_ns": mock.ANY, "duration": mock.ANY, "status": "ok", diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index a7d467b3985..5a63b7e2b8f 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -31,26 +31,6 @@ def pytest_configure(config): config.addinivalue_line("markers", "vcr_logs: mark test to use recorded request/responses") -@pytest.fixture -def mock_llmobs_span_writer(): - patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter") - LLMObsSpanWriterMock = patcher.start() - m = mock.MagicMock() - LLMObsSpanWriterMock.return_value = m - yield m - patcher.stop() - - -@pytest.fixture -def mock_llmobs_span_agentless_writer(): - patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter") - LLMObsSpanWriterMock = patcher.start() - m = mock.MagicMock() - LLMObsSpanWriterMock.return_value = m - yield m - patcher.stop() - - @pytest.fixture def mock_llmobs_eval_metric_writer(): patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsEvalMetricWriter") @@ -85,10 +65,7 @@ def mock_llmobs_submit_evaluation(): def mock_http_writer_send_payload_response(): with mock.patch( "ddtrace.internal.writer.HTTPWriter._send_payload", - return_value=Response( - status=200, - body="{}", - ), + return_value=Response(status=200, body="{}"), ): yield @@ -124,9 +101,10 @@ def mock_evaluator_sampler_logs(): @pytest.fixture -def mock_http_writer_logs(): - with mock.patch("ddtrace.internal.writer.writer.log") as m: +def mock_llmobs_logs(): + with mock.patch("ddtrace.llmobs._llmobs.log") as m: yield m + m.reset_mock() @pytest.fixture @@ -139,44 +117,6 @@ def default_global_config(): return {"_dd_api_key": "", "_llmobs_ml_app": "unnamed-ml-app"} -@pytest.fixture -def LLMObs( - mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, ddtrace_global_config -): - global_config = default_global_config() - global_config.update(ddtrace_global_config) - with override_global_config(global_config): - dummy_tracer = DummyTracer() - llmobs_service.enable(_tracer=dummy_tracer) - yield llmobs_service - llmobs_service.disable() - - -@pytest.fixture -def AgentlessLLMObs( - mock_llmobs_span_agentless_writer, - mock_llmobs_eval_metric_writer, - mock_llmobs_evaluator_runner, - ddtrace_global_config, -): - global_config = default_global_config() - global_config.update(ddtrace_global_config) - global_config.update(dict(_llmobs_agentless_enabled=True)) - with override_global_config(global_config): - dummy_tracer = DummyTracer() - 
llmobs_service.enable(_tracer=dummy_tracer) - yield llmobs_service - llmobs_service.disable() - - -@pytest.fixture -def disabled_llmobs(): - prev = llmobs_service.enabled - llmobs_service.enabled = False - yield - llmobs_service.enabled = prev - - @pytest.fixture def mock_ragas_dependencies_not_present(): import ragas @@ -189,18 +129,22 @@ def mock_ragas_dependencies_not_present(): @pytest.fixture -def ragas(mock_llmobs_span_writer, mock_llmobs_eval_metric_writer): +def ragas(mock_llmobs_eval_metric_writer): with override_global_config(dict(_dd_api_key="")): - import ragas - + try: + import ragas + except ImportError: + pytest.skip("Ragas not installed") with override_env(dict(OPENAI_API_KEY=os.getenv("OPENAI_API_KEY", ""))): yield ragas @pytest.fixture def reset_ragas_faithfulness_llm(): - import ragas - + try: + import ragas + except ImportError: + pytest.skip("Ragas not installed") previous_llm = ragas.metrics.faithfulness.llm yield ragas.metrics.faithfulness.llm = previous_llm @@ -243,16 +187,25 @@ def llmobs_span_writer(): @pytest.fixture -def llmobs(monkeypatch, tracer, llmobs_env, llmobs_span_writer): +def llmobs( + ddtrace_global_config, + monkeypatch, + tracer, + llmobs_env, + llmobs_span_writer, + mock_llmobs_eval_metric_writer, + mock_llmobs_evaluator_runner, +): for env, val in llmobs_env.items(): monkeypatch.setenv(env, val) - + global_config = default_global_config() + global_config.update(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))) + global_config.update(ddtrace_global_config) # TODO: remove once rest of tests are moved off of global config tampering - with override_global_config(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))): + with override_global_config(global_config): llmobs_service.enable(_tracer=tracer) llmobs_service._instance._llmobs_span_writer = llmobs_span_writer - llmobs_service._instance._trace_processor._span_writer = llmobs_span_writer - yield llmobs + yield llmobs_service llmobs_service.disable() diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml index 61c26ff7bf0..f767f5de303 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500942}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568298743}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: 
'{"data":{"id":"e66c93b9-ca0a-4f0a-9207-497e0a1b6eec","type":"evaluation_metric","attributes":{"metrics":[{"id":"5fb5ed5d-20c1-4f34-abf9-c0bdc09680e3","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500942,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: '{"data":{"id":"5b998846-53af-4b0e-a658-fd9e06726d6d","type":"evaluation_metric","attributes":{"metrics":[{"id":"jbGbAMC7Rk","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568298743,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:41 GMT + - Mon, 25 Nov 2024 20:58:19 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml index 92498e86e9e..f4404b30832 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500339}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297450}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"36d88c24-d7d4-4d3e-853c-b695aff61344","type":"evaluation_metric","attributes":{"metrics":[{"id":"0c189d9c-a730-4c5d-bbc2-55ef3455900f","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249500339,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: '{"data":{"id":"49c5c927-76f1-4de4-ad97-e1a0a159229f","type":"evaluation_metric","attributes":{"metrics":[{"id":"okVf1U4XzA","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568297450,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: content-length: - - '330' + - '325' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:17 GMT 
strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml index 68fe0315870..ef6f4cf445e 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml @@ -1,15 +1,16 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500253}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297307}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}' @@ -21,7 +22,7 @@ interactions: content-type: - application/json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:17 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload x-content-type-options: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml index 61da12cd3fa..3638a1cf608 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml @@ -1,32 +1,30 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249589510}, - {"span_id": "12345678901", "trace_id": "98765432101", "metric_type": "categorical", - "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", - "timestamp_ms": 1724249589510}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568728793}, {"join_on": {"span": {"span_id": "12345678901", + "trace_id": "98765432101"}}, "metric_type": "categorical", "categorical_value": + "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1732568728793}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - 
string: '{"data":{"id":"2ccffdfc-024b-49e6-881c-4e4d1c5f450e","type":"evaluation_metric","attributes":{"metrics":[{"id":"ed072901-fd70-4417-9cab-1bad62b6ac09","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249589510,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9},{"id":"16175a34-7c25-43ca-8551-bd2f7242ab77","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249589510,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: '{"data":{"id":"844be0cd-9dd4-45d3-9763-8ccb20f4e7c8","type":"evaluation_metric","attributes":{"metrics":[{"id":"IZhAbBsXBJ","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568728793,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9},{"id":"ME868fTl0T","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568728793,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: - Connection: - - keep-alive - Content-Length: - - '538' - Content-Type: - - application/vnd.api+json - Date: - - Wed, 21 Aug 2024 14:13:09 GMT + content-length: + - '528' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com + content-type: + - application/vnd.api+json + date: + - Mon, 25 Nov 2024 21:05:29 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml index 1394f9fbb43..65bb0fa1562 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500471}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297772}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"5bd1b0b7-0acd-46e2-8ff6-3ee6a92457b6","type":"evaluation_metric","attributes":{"metrics":[{"id":"d8aa2a23-3137-4c49-b87b-d1eb1c3af04e","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500471,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: 
'{"data":{"id":"d1518236-84b1-4b47-9cbc-ffc24188b5cc","type":"evaluation_metric","attributes":{"metrics":[{"id":"jiKtwDKR0B","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568297772,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:18 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml index c9797ace419..c31d610bd57 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249982978}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568764624}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"aba22157-cc3a-4601-a6a5-7afa99eee73e","type":"evaluation_metric","attributes":{"metrics":[{"id":"c2f6f63c-17ca-48c3-ad2d-676b2a35e726","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249982978,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: '{"data":{"id":"5352c11a-dcdd-449b-af72-2ae0b5dac3a1","type":"evaluation_metric","attributes":{"metrics":[{"id":"WmMD7E_fAD","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568764624,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:19:45 GMT + - Mon, 25 Nov 2024 21:06:04 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: @@ -34,28 +35,29 @@ interactions: code: 202 message: Accepted - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 
1724249983284}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568765127}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"0bc39c40-6c72-4b11-9eea-826248f9fe37","type":"evaluation_metric","attributes":{"metrics":[{"id":"7da7eb5b-32d2-43b3-adf5-208313f822c5","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249983284,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: '{"data":{"id":"d39e806e-40c5-4b3c-b539-440390afca85","type":"evaluation_metric","attributes":{"metrics":[{"id":"403hQLmrQW","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568765127,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: content-length: - - '330' + - '325' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:19:45 GMT + - Mon, 25 Nov 2024 21:06:05 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml index e2e17e715cf..f5deea8ef90 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml @@ -1,28 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "123", "trace_id": "1234", "label": "dummy", "metric_type": "score", "timestamp_ms": - 1729569649880, "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022", - "ml_app:unnamed-ml-app"]}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type": + "score", "timestamp_ms": 1732569321978, "score_value": 1.0, "ml_app": "unnamed-ml-app", + "tags": ["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122", "ml_app:unnamed-ml-app"]}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"2131dbc0-d085-401c-8b2d-8506a9ac8c13","type":"evaluation_metric","attributes":{"metrics":[{"id":"YutAyQc6F4","trace_id":"1234","span_id":"123","timestamp_ms":1729569649880,"ml_app":"unnamed-ml-app","metric_type":"score","label":"dummy","score_value":1,"tags":["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022","ml_app:unnamed-ml-app"]}]}}}' + string: 
'{"data":{"id":"06c00db0-1898-44be-ae0b-f0149f819c59","type":"evaluation_metric","attributes":{"metrics":[{"id":"1DrSMXmWcP","join_on":{"span":{"trace_id":"1234","span_id":"123"}},"timestamp_ms":1732569321978,"ml_app":"unnamed-ml-app","metric_type":"score","label":"dummy","score_value":1,"tags":["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122","ml_app:unnamed-ml-app"]}]}}}' headers: content-length: - - '357' + - '378' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Tue, 22 Oct 2024 04:00:50 GMT + - Mon, 25 Nov 2024 21:15:22 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml similarity index 76% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml index 757f875443f..367024a712d 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml @@ -73,19 +73,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA2yRW4vbMBCF3/0rxDzHi+2mySZvbUOhsPRGoSxxsBV5bKuVJaGZ9ELIfy9yvEnK - 7oseztH5NGd0TIQA3cBagOolq8Gb9M3D5tPi1zuz+r0qHjavv27a74+vPj5+sG8z8wVmMeH2P1Dx - U+pOucEbZO3s2VYBJWOk5stitchXy3w+GoNr0MRY5zmdu3TQVqdFVszTbJnm91O6d1ohwVpsEyGE - OI5nnNM2+AfWIps9KQMSyQ5hfbkkBARnogKSSBNLyzC7mspZRjuOXtf19lgCYVQUViO+HPmiBNKx - U6iIJeOAlila2xK+9SiU9JqlEa4V74O0CoUm8VkGTXcl7E67uq5vHw3YHkjG4vZgzKSfLi2M63xw - e5r8i95qq6mvAkpyNk5M7DyM7ikRYjdu6/DfAsAHN3iu2P1EG4GLLD/z4PpJV7dYTCY7luYmVSxn - L/CqBllqQzf7BiVVj801miU35Z4/+hLiXFDb7hklmUhAf4lxqFptOww+6PMPtr6a3xeqKORyryA5 - Jf8AAAD//wMAn6C7Cc8CAAA= + H4sIAAAAAAAAA4xSwYrbMBS8+yvEO8eL403iTW49bEtuWdpCIQ62Ij/bam1J6L1AS8i/FznZ2Mtu + oRcdZt6MZp50joQAXcFGgGolq9518acf0hyX6+02W26T3e7Ly/r5JTltv9rvNT/DLCjs8ScqflU9 + KNu7Dllbc6WVR8kYXOfZY7pcrVeLx4HobYVdkDWO44WNe210nCbpIk6yeP50U7dWKyTYiH0khBDn + 4Qw5TYW/YSOS2SvSI5FsEDb3ISHA2y4gIIk0sTQMs5FU1jCaIXpZlvtzDoQBUVgM9vngL3IgHTr5 + glgy9miYArXP4VuLQkmnWXbC1uKzl0ah0CR20mt6yOFwOZRlOb3UY30iGYqbU9fd8Mu9RWcb5+2R + bvwdr7XR1BYeJVkTEhNbBwN7iYQ4DNs6vVkAOG97xwXbX2iC4SqZX/1gfKSRTVc3ki3LbqJKs9kH + fkWFLHVHk32DkqrFapSOjyNPlbYTIpq0fp/mI+9rc22a/7EfCaXQMVaF81hp9bbxOOYx/OF/jd23 + PAQG+kOMfVFr06B3Xl9/UO2KJEuWx/opUwlEl+gvAAAA//8DABrEjBtPAwAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8d6b5b701f294367-EWR + - 8e84af2fba19c952-IAD Connection: - keep-alive Content-Encoding: @@ -93,14 +93,14 @@ interactions: Content-Type: - application/json Date: - - Tue, 22 Oct 2024 17:55:15 GMT + - Mon, 25 Nov 2024 21:20:43 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=iQaF937ylY7BvvBCyWYQoxiJwi1nBp5.LILrHLw1uno-1729619715-1.0.1.1-jS4Dz7yc_ud.hKZlJ_CAZkSQesqzVkfrA5F30zI7CtJsbEKyAiuVlpX0CPf816UtlhXQEW8T5nsc.UvnsCOzOw; - path=/; expires=Tue, 22-Oct-24 18:25:15 GMT; domain=.api.openai.com; HttpOnly; + - 
__cf_bm=CVMxqIcHUHNjX56k1MKjj4MgiYNVAlg_B7yyVaP_z1o-1732569643-1.0.1.1-HOtZfXprHWr_DjtorQ_ZK6bbSmcOsBrphniRCaC9XQ2tTtO5JVpyDQK1HRFo3kUE9GEi9J.sR0_L6nBtXlGj8w; + path=/; expires=Mon, 25-Nov-24 21:50:43 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=wQzHCwLW6CPU768K_tlLklWp36I8zYCVJkKlAMtnMkk-1729619715162-0.0.1.1-604800000; + - _cfuvid=sqtPZaucqBJu1r4exJtYym3vbKmuuSO6o0np5VglPsw-1732569643935-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -113,7 +113,7 @@ interactions: openai-organization: - datadog-staging openai-processing-ms: - - '496' + - '370' openai-version: - '2020-10-01' strict-transport-security: @@ -131,7 +131,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_33b8cddecaab8b8bc36e90f58f844636 + - req_02ed729afc2d9083921e3fe5b7528550 status: code: 200 message: OK @@ -193,8 +193,8 @@ interactions: content-type: - application/json cookie: - - __cf_bm=iQaF937ylY7BvvBCyWYQoxiJwi1nBp5.LILrHLw1uno-1729619715-1.0.1.1-jS4Dz7yc_ud.hKZlJ_CAZkSQesqzVkfrA5F30zI7CtJsbEKyAiuVlpX0CPf816UtlhXQEW8T5nsc.UvnsCOzOw; - _cfuvid=wQzHCwLW6CPU768K_tlLklWp36I8zYCVJkKlAMtnMkk-1729619715162-0.0.1.1-604800000 + - __cf_bm=CVMxqIcHUHNjX56k1MKjj4MgiYNVAlg_B7yyVaP_z1o-1732569643-1.0.1.1-HOtZfXprHWr_DjtorQ_ZK6bbSmcOsBrphniRCaC9XQ2tTtO5JVpyDQK1HRFo3kUE9GEi9J.sR0_L6nBtXlGj8w; + _cfuvid=sqtPZaucqBJu1r4exJtYym3vbKmuuSO6o0np5VglPsw-1732569643935-0.0.1.1-604800000 host: - api.openai.com user-agent: @@ -220,19 +220,20 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA2xSQW7bMBC86xWLPVuBpTqR7VuAoGiBAmmLHhrEgUVTK2tdiSTIdZDA8N8Lyorl - ILnwMLMznB3ykAAgV7gE1I0S3bk2vf1xd19M/c3u4frhu/37Tet73qm7383r7fwXTqLCbnak5U11 - pW3nWhK25kRrT0ooumZFvrjJFkV23ROdraiNsq2TdGbTjg2n+TSfpdMizeaDurGsKeASHhMAgEN/ - xpymohdcwnTyhnQUgtoSLs9DAOhtGxFUIXAQZQQnI6mtETJ99LIsHw8rDKKEOjKywiWs8E9DoJVj - US3YGr56ZTQBB/ipPIerFU5ghZ5UsGYUnD3ioIKKPWkBT46EYy3RSRoCNrX1neoh5+0zV1QBm57r - k73IcMMz+Yp1nyk7PpVlebmEp3ofVCzS7Nt2wI/nVlq7dd5uwsCf8ZoNh2Z9Ch8bCGId9uwxAXjq - 29+/KxSdt52Ttdh/ZKJhUcxPfjg++sh+WQykWFHtiM+zYvKJ37oiUdyGi/dDrXRD1SidJhfLfbz0 - M4vTgmy2H1ySwQnDaxDq1jWbLXnn+fQjareezXOd56rYaEyOyX8AAAD//wMAUtzROh8DAAA= + H4sIAAAAAAAAA4xTwW6bQBC98xWjPZsIOzg43Fqp7a2y2iqqFEew3h1gWthd7Y6jpJb/vVrsGEdJ + pV44vDfv8eYN7BMAQVqUIFQnWQ2uTz/8lEZ9/NI9//mafVt9Wt61uxbvtt+3Oa3XYhYVdvsLFb+o + rpQdXI9M1hxp5VEyRtd5cb1Y3tze5PlIDFZjH2Wt4zS36UCG0kW2yNOsSOerk7qzpDCIEu4TAID9 + +Iw5jcYnUUI2e0EGDEG2KMrzEIDwto+IkCFQYGlYzCZSWcNoxuh1Xd/vNyKwZBzQ8EaUsBE/OgQl + HbHswTbw2UujECjAWnoKVxsxg43wKIM1k+DsEQclaPKoGDw6ZIq1RCfuEMg01g9yhJy3j6RRA5mR + G5M98ekNj+g1qTHT/PBQ1/XlEh6bXZCxSLPr+xN+OLfS29Z5uw0n/ow3ZCh01TF8bCCwdWJkDwnA + w9j+7lWhwnk7OK7Y/kYTDYtidfQT09En9vr2RLJl2U/4al7M3vGrNLKkPlzcTyipOtSTdDq23Gmy + F0RysfXbNO95Hzcn0/6P/UQohY5RV85jvMmrjacxj/Gf+NfYueUxsAjPgXGoGjIteufp+EU2rsqK + bLltVoXKRHJI/gIAAP//AwDgYzoinwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8d6b5b744e034367-EWR + - 8e84af32dc70c952-IAD Connection: - keep-alive Content-Encoding: @@ -240,7 +241,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 22 Oct 2024 17:55:16 GMT + - Mon, 25 Nov 2024 21:20:45 GMT Server: - cloudflare Transfer-Encoding: @@ -254,7 +255,7 @@ interactions: openai-organization: - datadog-staging openai-processing-ms: - - '749' + - '1168' openai-version: - '2020-10-01' strict-transport-security: @@ -272,35 +273,37 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_fbb01161a03eb6f478ff52314b72cfd6 + - req_702ebaa1edbab95fb42f52baa4b34661 status: code: 200 
message: OK - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "6877142543397072040", "trace_id": "6717e70200000000a99ea8ad36f4f36d", "label": - "ragas_faithfulness", "metric_type": "score", "timestamp_ms": 1729619716093, - "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022", - "ml_app:unnamed-ml-app"]}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "7678809694384023494", "trace_id": "6744ea2b00000000995e7b2ceabfce01"}}, + "label": "ragas_faithfulness", "metric_type": "score", "timestamp_ms": 1732569645205, + "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122", + "ml_app:unnamed-ml-app"], "metadata": {"_dd.evaluation_kind": "faithfulness", + "_dd.evaluation_span": {"span_id": "5771061714047746387", "trace_id": "6744ea2b000000007099aeb477077763"}, + "_dd.faithfulness_disagreements": []}}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"99fa371c-457c-4d2b-8d4c-61657e0ffd48","type":"evaluation_metric","attributes":{"metrics":[{"id":"CbapxUnzcX","trace_id":"6717e70200000000a99ea8ad36f4f36d","span_id":"6877142543397072040","timestamp_ms":1729619716093,"ml_app":"unnamed-ml-app","metric_type":"score","label":"ragas_faithfulness","score_value":1,"tags":["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022","ml_app:unnamed-ml-app"]}]}}}' + string: '{"data":{"id":"f1470aa7-b97f-4809-825d-6932af26a81c","type":"evaluation_metric","attributes":{"metrics":[{"id":"EPRU-72kfP","join_on":{"span":{"trace_id":"6744ea2b00000000995e7b2ceabfce01","span_id":"7678809694384023494"}},"timestamp_ms":1732569645205,"ml_app":"unnamed-ml-app","metric_type":"score","label":"ragas_faithfulness","score_value":1,"tags":["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122","ml_app:unnamed-ml-app"],"metadata":{"_dd.evaluation_kind":"faithfulness","_dd.evaluation_span":{"span_id":"5771061714047746387","trace_id":"6744ea2b000000007099aeb477077763"},"_dd.faithfulness_disagreements":[]}}]}}}' headers: content-length: - - '414' + - '623' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Tue, 22 Oct 2024 17:55:17 GMT + - Mon, 25 Nov 2024 21:20:45 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml similarity index 80% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml index 8efe7391c90..2100bb3d305 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml +++ 
b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml @@ -51,7 +51,7 @@ interactions: host: - api.openai.com user-agent: - - OpenAI/Python 1.47.1 + - OpenAI/Python 1.52.0 x-stainless-arch: - arm64 x-stainless-async: @@ -61,7 +61,9 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.47.1 + - 1.52.0 + x-stainless-retry-count: + - '0' x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -71,19 +73,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//dJHBbtswEETv+gpiz1YgCY7k+JbCKAoEQYKih7SWIdH0SmJDkQR3jbYw - /O8FZcd2D73wMI8znF0eEiFA72ApQA2S1ehN+vipfFytqq+rt7eHX4v9+3NY/aDnL5y9DE/fYRYd - bvsTFX+47pQbvUHWzp6wCigZY2peFVWRLcryfgKj26GJtt5zOnfpqK1Oi6yYp1mV5ouze3BaIcFS - rBMhhDhMZ+xpd/gbliKbfSgjEskeYXm5JAQEZ6ICkkgTS8swu0LlLKOdqrdtuz7UQBgVhc0UX0/5 - ogbScabQEEvGES1TROsavg0olPSapRGuE5+DtAqFJvEqg6a7GjbHTdu2t48G7PYk4+B2b8xZP16m - MK73wW3pzC96p62moQkoydnYmNh5mOgxEWIzbWv/zwLABzd6bti9o42BZZaf8uD6SVdalGfIjqW5 - cRXV/1zNDllqQzc7h1NDbftrQnapOc0J9IcYx6bTtsfggz59QeebfLudl3lZdQ+QHJO/AAAA//8D - AL2Ti/mQAgAA + H4sIAAAAAAAAA4xSwY7aMBS85yusdyarkAKh3Payh6qVVitUVQWUGOclcevYrt9D7Rbx75UDS1h1 + K/Xiw8yb8cyzj4kQoGtYCVCdZNV7k95/kaZtnpZr3++f159+Pk4/L35P5cevPz6YJ5hEhdt/Q8Uv + qjvlem+QtbNnWgWUjNF1WrzL54v383w2EL2r0URZ6zmdubTXVqd5ls/SrEiny4u6c1ohwUpsEiGE + OA5nzGlr/AUrkU1ekB6JZIuwug4JAcGZiIAk0sTSMkxGUjnLaIfoVVVtjlsgjIjCcrDfDv5iC6Rj + p1ASS8YeLVOkNltYdyiU9JqlEa4RD0FahUKTeJRB090WdqddVVW3lwZsDiRjcXsw5oKfri2Ma31w + e7rwV7zRVlNXBpTkbExM7DwM7CkRYjds6/BqAeCD6z2X7L6jjYaLbHr2g/GRRjZfXEh2LM2NKi8m + b/iVNbLUhm72DUqqDutROj6OPNTa3RDJTeu/07zlfW6ubfs/9iOhFHrGuvQBa61eNx7HAsY//K+x + 65aHwEDPxNiXjbYtBh/0+Qc1vsyKbL5vloXKIDklfwAAAP//AwB8IvReTwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c856bee184d42d1-EWR + - 8e84ac4858349c52-IAD Connection: - keep-alive Content-Encoding: @@ -91,14 +93,14 @@ interactions: Content-Type: - application/json Date: - - Tue, 24 Sep 2024 20:11:06 GMT + - Mon, 25 Nov 2024 21:18:45 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=nMe4XLsotHph1aKmM6xJotYxeBsTIpCG1ULeQ2oiKLc-1727208666-1.0.1.1-eM1elzOCEnpbPLkOO61HSBvaeQPYHEyO4Ba3P2NsxkYV23Fybb7E8tIipei4YDbhyDiLXybnT7H0ETvjbsV89g; - path=/; expires=Tue, 24-Sep-24 20:41:06 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=9IkhXEbzhF0QSjoZOW.EVqEQoEHTaAK7pnQ6K1m4EfY-1732569525-1.0.1.1-8FRhFy6jBsuonirbdG9jJ_IHnhXUakqpLsEg10YYrkhce9PwlXOKXNA2hiZwNqpM3D2TP2X4eFcZJjdEZt6.qQ; + path=/; expires=Mon, 25-Nov-24 21:48:45 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=lKBj.JPFMKz3LiJyz12GeZI73UndAQfhN.5aqiwYPHA-1727208666121-0.0.1.1-604800000; + - _cfuvid=YGxjg63ZVaAJESO.Ouzjnkmhsg2izo9JySj6zJ3MRuc-1732569525322-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -106,10 +108,12 @@ interactions: - nosniff access-control-expose-headers: - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 openai-organization: - datadog-staging openai-processing-ms: - - '576' + - '469' openai-version: - '2020-10-01' strict-transport-security: @@ -127,7 +131,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_ef3f2830eaf13bceea5db3a7369affda + - req_5d6c0d3f36d4cba76fbfea5b6c9f63fe status: code: 200 message: OK @@ -189,12 +193,12 @@ interactions: content-type: - application/json cookie: - - __cf_bm=nMe4XLsotHph1aKmM6xJotYxeBsTIpCG1ULeQ2oiKLc-1727208666-1.0.1.1-eM1elzOCEnpbPLkOO61HSBvaeQPYHEyO4Ba3P2NsxkYV23Fybb7E8tIipei4YDbhyDiLXybnT7H0ETvjbsV89g; - 
_cfuvid=lKBj.JPFMKz3LiJyz12GeZI73UndAQfhN.5aqiwYPHA-1727208666121-0.0.1.1-604800000 + - __cf_bm=9IkhXEbzhF0QSjoZOW.EVqEQoEHTaAK7pnQ6K1m4EfY-1732569525-1.0.1.1-8FRhFy6jBsuonirbdG9jJ_IHnhXUakqpLsEg10YYrkhce9PwlXOKXNA2hiZwNqpM3D2TP2X4eFcZJjdEZt6.qQ; + _cfuvid=YGxjg63ZVaAJESO.Ouzjnkmhsg2izo9JySj6zJ3MRuc-1732569525322-0.0.1.1-604800000 host: - api.openai.com user-agent: - - OpenAI/Python 1.47.1 + - OpenAI/Python 1.52.0 x-stainless-arch: - arm64 x-stainless-async: @@ -204,7 +208,9 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.47.1 + - 1.52.0 + x-stainless-retry-count: + - '0' x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -214,19 +220,20 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//dFFBbtswELz7FQuerUByDcnRLS3aS4umSBugSBRINLWSNpFIglwHLgz/ - vaCsSOmhFx5mdoazs6cVgKBa5CBUJ1kNto9uPqY3n+/U3e39/WH49vzQyObn7fevx98v2adErIPC - 7J9R8ZvqSpnB9shk9IVWDiVjcE2yTbaJd2majsRgauyDrLUcbU00kKZoE2+2UZxFyW5Sd4YUepHD - 4woA4DS+Iaeu8ShyiNdvyIDeyxZFPg8BCGf6gAjpPXmWmsV6IZXRjHqMXlXV46kQniXjgJoLkUMh - fnUISlpi2YNp4IuTWiGQhx/Skb8qxBoK4VB6oxfB7BEGJdTkUDE4tMgUaglO3CGQbowb5AhZZ16p - xhpIj9yY7MjTD6/oalJjpuT8VFXV+yUcNgcvQ5H60PcTfp5b6U1rndn7iZ/xhjT5rryEDw14NlaM - 7HkF8DS2f/inUGGdGSyXbF5QB8Ms2138xHL0hf1wPZFsWPYLvkuy/6nKGllS79/dcKqXdLs4xHPM - cU/h/3jGoWxIt+iso8tJG1sm+/02TdKsuRar8+ovAAAA//8DADp8axngAgAA + H4sIAAAAAAAAA4xTwWrbQBC96yuGPVtBVuzK8a0YcimhLQRSiIO03h1Zk652l91xSDD+97KyYzk0 + hV50eG/e05s30j4DEKTFEoTqJKvem/zrL2m6h3v78Ganqzv5c7XaVIv+rrTfv3UzMUkKt3lGxe+q + K+V6b5DJ2SOtAkrG5Dqtrsv5l5t5OR+I3mk0Sbb1nM9c3pOlvCzKWV5U+XRxUneOFEaxhMcMAGA/ + PFNOq/FVLKGYvCM9xii3KJbnIQARnEmIkDFSZGlZTEZSOctoh+hN0zzu1yKyZOzR8losYS3uOwQl + PbE04Fq4DdIqBIrwQwaKV2sxgbUIKKOzo+DskQYlaAqoGAJ6ZEq1JCfuEMi2LvRygHxwL6RRA9mB + G5K98ukNLxg0qSHT9PDUNM3lEgHbXZSpSLsz5oQfzq0Yt/XBbeKJP+MtWYpdfQyfGojsvBjYQwbw + NLS/+1Co8MH1nmt2v9Emw6paHP3EePSRvb45kexYmhFfTKvJJ361RpZk4sX9hJKqQz1Kx2PLnSZ3 + QWQXW/+d5jPv4+Zkt/9jPxJKoWfUtQ+YbvJh43EsYPon/jV2bnkILOJbZOzrluwWgw90/CJbXxdV + Md+0i0oVIjtkfwAAAP//AwD0sdbanwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c856bf38f3242d1-EWR + - 8e84ac4daf039c52-IAD Connection: - keep-alive Content-Encoding: @@ -234,7 +241,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 24 Sep 2024 20:11:06 GMT + - Mon, 25 Nov 2024 21:18:46 GMT Server: - cloudflare Transfer-Encoding: @@ -243,10 +250,12 @@ interactions: - nosniff access-control-expose-headers: - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 openai-organization: - datadog-staging openai-processing-ms: - - '523' + - '1256' openai-version: - '2020-10-01' strict-transport-security: @@ -264,7 +273,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_07733e2c20ff88f138f2ab4cd6a71cc6 + - req_a58af2c6e743ac15ac528fb6233d9436 status: code: 200 message: OK diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml 
b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml diff --git a/tests/llmobs/test_llmobs.py b/tests/llmobs/test_llmobs.py index 1bae7efe9ed..6cf19fc3e2c 100644 --- a/tests/llmobs/test_llmobs.py +++ b/tests/llmobs/test_llmobs.py @@ -1,4 +1,3 @@ -import mock import pytest from ddtrace.ext import SpanTypes @@ -8,12 +7,6 @@ from tests.llmobs._utils import _expected_llmobs_llm_span_event -@pytest.fixture -def mock_logs(): - with mock.patch("ddtrace.llmobs._trace_processor.log") as mock_logs: - yield mock_logs - - class TestMLApp: @pytest.mark.parametrize("llmobs_env", [{"DD_LLMOBS_ML_APP": ""}]) def test_tag_defaults_to_env_var(self, tracer, llmobs_env, llmobs_events): @@ -228,19 +221,19 @@ def test_model_and_provider_are_set(tracer, llmobs_events): assert span_event["meta"]["model_provider"] == "model_provider" -def test_malformed_span_logs_error_instead_of_raising(mock_logs, tracer, llmobs_events): +def test_malformed_span_logs_error_instead_of_raising(tracer, llmobs_events, mock_llmobs_logs): """Test that a trying to create a span event from a malformed span will log an error instead of crashing.""" with tracer.trace("root_llm_span", span_type=SpanTypes.LLM) as llm_span: # span does not have SPAN_KIND tag pass - mock_logs.error.assert_called_once_with( - "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span + mock_llmobs_logs.error.assert_called_with( + "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span, exc_info=True ) assert len(llmobs_events) == 0 -def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events): - """Test that the LLMObsTraceProcessor only creates LLMObs span events for LLM span types.""" +def test_only_generate_span_events_from_llmobs_spans(tracer, llmobs_events): + """Test that we only generate LLMObs span events for LLM span types.""" with tracer.trace("root_llm_span", service="tests.llmobs", span_type=SpanTypes.LLM) as root_span: root_span._set_ctx_item(const.SPAN_KIND, "llm") with tracer.trace("child_span"): @@ -250,5 +243,5 @@ def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events): expected_grandchild_llmobs_span["parent_id"] = str(root_span.span_id) assert len(llmobs_events) == 2 - assert llmobs_events[0] == _expected_llmobs_llm_span_event(root_span, "llm") - assert llmobs_events[1] == expected_grandchild_llmobs_span + assert llmobs_events[1] == 
_expected_llmobs_llm_span_event(root_span, "llm") + assert llmobs_events[0] == expected_grandchild_llmobs_span diff --git a/tests/llmobs/test_llmobs_decorators.py b/tests/llmobs/test_llmobs_decorators.py index e94d72aec64..056de72ee96 100644 --- a/tests/llmobs/test_llmobs_decorators.py +++ b/tests/llmobs/test_llmobs_decorators.py @@ -19,7 +19,7 @@ def mock_logs(): yield mock_logs -def test_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): +def test_llm_decorator_with_llmobs_disabled_logs_warning(llmobs, mock_logs): for decorator_name, decorator in (("llm", llm), ("embedding", embedding)): @decorator( @@ -28,13 +28,13 @@ def test_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): def f(): pass - LLMObs.disable() + llmobs.disable() f() mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) mock_logs.reset_mock() -def test_non_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): +def test_non_llm_decorator_with_llmobs_disabled_logs_warning(llmobs, mock_logs): for decorator_name, decorator in ( ("task", task), ("workflow", workflow), @@ -47,53 +47,49 @@ def test_non_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): def f(): pass - LLMObs.disable() + llmobs.disable() f() mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) mock_logs.reset_mock() -def test_llm_decorator(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="test_model", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider", session_id="test_session_id" ) -def test_llm_decorator_no_model_name_sets_default(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_no_model_name_sets_default(llmobs, llmobs_events): @llm(model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="custom", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider", session_id="test_session_id" ) -def test_llm_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_default_kwargs(llmobs, llmobs_events): @llm def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="custom") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="custom" ) -def test_embedding_decorator(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator(llmobs, llmobs_events): @embedding( model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id" ) @@ -101,173 +97,157 @@ def f(): pass f() - span = 
LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="test_model", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider", session_id="test_session_id" ) -def test_embedding_decorator_no_model_name_sets_default(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator_no_model_name_sets_default(llmobs, llmobs_events): @embedding(model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="custom", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider", session_id="test_session_id" ) -def test_embedding_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator_default_kwargs(llmobs, llmobs_events): @embedding def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="custom") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="custom" ) -def test_retrieval_decorator(LLMObs, mock_llmobs_span_writer): +def test_retrieval_decorator(llmobs, llmobs_events): @retrieval(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "retrieval", session_id="test_session_id") -def test_retrieval_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_retrieval_decorator_default_kwargs(llmobs, llmobs_events): @retrieval() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "retrieval")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "retrieval") -def test_task_decorator(LLMObs, mock_llmobs_span_writer): +def test_task_decorator(llmobs, llmobs_events): @task(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id="test_session_id") -def test_task_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_task_decorator_default_kwargs(llmobs, llmobs_events): @task() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + span 
= llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_tool_decorator(LLMObs, mock_llmobs_span_writer): +def test_tool_decorator(llmobs, llmobs_events): @tool(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool", session_id="test_session_id") -def test_tool_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_tool_decorator_default_kwargs(llmobs, llmobs_events): @tool() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_workflow_decorator(LLMObs, mock_llmobs_span_writer): +def test_workflow_decorator(llmobs, llmobs_events): @workflow(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow", session_id="test_session_id") -def test_workflow_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_workflow_decorator_default_kwargs(llmobs, llmobs_events): @workflow() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_agent_decorator(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator(llmobs, llmobs_events): @agent(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent", session_id="test_session_id") -def test_agent_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator_default_kwargs(llmobs, llmobs_events): @agent() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_llm_decorator_with_error(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_with_error(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): raise ValueError("test_error") with pytest.raises(ValueError): f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - session_id="test_session_id", - 
error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + session_id="test_session_id", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_non_llm_decorators_with_error(LLMObs, mock_llmobs_span_writer): +def test_non_llm_decorators_with_error(llmobs, llmobs_events): for decorator_name, decorator in [("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)]: @decorator(name="test_function", session_id="test_session_id") @@ -276,23 +256,21 @@ def f(): with pytest.raises(ValueError): f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - session_id="test_session_id", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + session_id="test_session_id", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_llm_annotate(LLMObs, mock_llmobs_span_writer): +def test_llm_annotate(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): - LLMObs.annotate( + llmobs.annotate( parameters={"temperature": 0.9, "max_tokens": 50}, input_data=[{"content": "test_prompt"}], output_data=[{"content": "test_response"}], @@ -301,27 +279,25 @@ def f(): ) f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - input_messages=[{"content": "test_prompt"}], - output_messages=[{"content": "test_response"}], - parameters={"temperature": 0.9, "max_tokens": 50}, - token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, - tags={"custom_tag": "tag_value"}, - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + input_messages=[{"content": "test_prompt"}], + output_messages=[{"content": "test_response"}], + parameters={"temperature": 0.9, "max_tokens": 50}, + token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, + tags={"custom_tag": "tag_value"}, + session_id="test_session_id", ) -def test_llm_annotate_raw_string_io(LLMObs, mock_llmobs_span_writer): +def test_llm_annotate_raw_string_io(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): - LLMObs.annotate( + llmobs.annotate( parameters={"temperature": 0.9, "max_tokens": 50}, input_data="test_prompt", output_data="test_response", @@ -330,24 +306,22 @@ def f(): ) f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - 
input_messages=[{"content": "test_prompt"}], - output_messages=[{"content": "test_response"}], - parameters={"temperature": 0.9, "max_tokens": 50}, - token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, - tags={"custom_tag": "tag_value"}, - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + input_messages=[{"content": "test_prompt"}], + output_messages=[{"content": "test_response"}], + parameters={"temperature": 0.9, "max_tokens": 50}, + token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, + tags={"custom_tag": "tag_value"}, + session_id="test_session_id", ) -def test_non_llm_decorators_no_args(LLMObs, mock_llmobs_span_writer): +def test_non_llm_decorators_no_args(llmobs, llmobs_events): """Test that using the decorators without any arguments, i.e. @tool, works the same as @tool(...).""" for decorator_name, decorator in [ ("task", task), @@ -362,11 +336,11 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, decorator_name)) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event(span, decorator_name) -def test_agent_decorator_no_args(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator_no_args(llmobs, llmobs_events): """Test that using agent decorator without any arguments, i.e. @agent, works the same as @agent(...).""" @agent @@ -374,11 +348,11 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): +def test_ml_app_override(llmobs, llmobs_events): """Test that setting ml_app kwarg on the LLMObs decorators will override the DD_LLMOBS_ML_APP value.""" for decorator_name, decorator in [("task", task), ("workflow", workflow), ("tool", tool)]: @@ -387,9 +361,9 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, decorator_name, tags={"ml_app": "test_ml_app"}) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, decorator_name, tags={"ml_app": "test_ml_app"} ) @llm(model_name="test_model", ml_app="test_ml_app") @@ -397,11 +371,9 @@ def g(): pass g() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} ) @embedding(model_name="test_model", ml_app="test_ml_app") @@ -409,15 +381,13 @@ def h(): pass h() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] 
== _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} ) -async def test_non_llm_async_decorators(LLMObs, mock_llmobs_span_writer): +async def test_non_llm_async_decorators(llmobs, llmobs_events): """Test that decorators work with async functions.""" for decorator_name, decorator in [ ("task", task), @@ -432,11 +402,11 @@ async def f(): pass await f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, decorator_name)) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event(span, decorator_name) -async def test_llm_async_decorators(LLMObs, mock_llmobs_span_writer): +async def test_llm_async_decorators(llmobs, llmobs_events): """Test that decorators work with async functions.""" for decorator_name, decorator in [("llm", llm), ("embedding", embedding)]: @@ -445,15 +415,13 @@ async def f(): pass await f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, decorator_name, model_name="test_model", model_provider="test_provider" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, decorator_name, model_name="test_model", model_provider="test_provider" ) -def test_automatic_annotation_non_llm_decorators(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_non_llm_decorators(llmobs, llmobs_events): """Test that automatic input/output annotation works for non-LLM decorators.""" for decorator_name, decorator in (("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)): @@ -462,19 +430,17 @@ def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - input_value=str({"prompt": "test_prompt", "arg_2": "arg_2", "kwarg_2": 12345}), - output_value="test_prompt", - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + input_value=str({"prompt": "test_prompt", "arg_2": "arg_2", "kwarg_2": 12345}), + output_value="test_prompt", + session_id="test_session_id", ) -def test_automatic_annotation_retrieval_decorator(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_retrieval_decorator(llmobs, llmobs_events): """Test that automatic input annotation works for retrieval decorators.""" @retrieval(session_id="test_session_id") @@ -482,18 +448,16 @@ def test_retrieval(query, arg_2, kwarg_1=None, kwarg_2=None): return [{"name": "name", "id": "1234567890", "score": 0.9}] test_retrieval("test_query", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "retrieval", - input_value=str({"query": "test_query", "arg_2": "arg_2", "kwarg_2": 12345}), - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "retrieval", + input_value=str({"query": "test_query", "arg_2": "arg_2", "kwarg_2": 12345}), + session_id="test_session_id", ) -def test_automatic_annotation_off_non_llm_decorators(LLMObs, 
mock_llmobs_span_writer): +def test_automatic_annotation_off_non_llm_decorators(llmobs, llmobs_events): """Test disabling automatic input/output annotation for non-LLM decorators.""" for decorator_name, decorator in ( ("task", task), @@ -508,35 +472,33 @@ def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, decorator_name, session_id="test_session_id") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, decorator_name, session_id="test_session_id" ) -def test_automatic_annotation_off_if_manually_annotated(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_off_if_manually_annotated(llmobs, llmobs_events): """Test disabling automatic input/output annotation for non-LLM decorators.""" for decorator_name, decorator in (("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)): @decorator(name="test_function", session_id="test_session_id") def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): - LLMObs.annotate(input_data="my custom input", output_data="my custom output") + llmobs.annotate(input_data="my custom input", output_data="my custom output") return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - session_id="test_session_id", - input_value="my custom input", - output_value="my custom output", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + session_id="test_session_id", + input_value="my custom input", + output_value="my custom output", ) -def test_generator_sync(LLMObs, mock_llmobs_span_writer): +def test_generator_sync(llmobs, llmobs_events): """ Test that decorators work with generator functions. The span should finish after the generator is exhausted. @@ -556,7 +518,7 @@ def f(): for i in range(3): yield i - LLMObs.annotate( + llmobs.annotate( input_data="hello", output_data="world", ) @@ -566,7 +528,7 @@ def f(): assert e == i i += 1 - span = LLMObs._instance.tracer.pop()[0] + span = llmobs._instance.tracer.pop()[0] if decorator_name == "llm": expected_span_event = _expected_llmobs_llm_span_event( span, @@ -594,10 +556,10 @@ def f(): span, decorator_name, input_value="hello", output_value="world" ) - mock_llmobs_span_writer.enqueue.assert_called_with(expected_span_event) + assert llmobs_events[-1] == expected_span_event -async def test_generator_async(LLMObs, mock_llmobs_span_writer): +async def test_generator_async(llmobs, llmobs_events): """ Test that decorators work with generator functions. The span should finish after the generator is exhausted. 
@@ -617,7 +579,7 @@ async def f(): for i in range(3): yield i - LLMObs.annotate( + llmobs.annotate( input_data="hello", output_data="world", ) @@ -627,7 +589,7 @@ async def f(): assert e == i i += 1 - span = LLMObs._instance.tracer.pop()[0] + span = llmobs._instance.tracer.pop()[0] if decorator_name == "llm": expected_span_event = _expected_llmobs_llm_span_event( span, @@ -655,11 +617,11 @@ async def f(): span, decorator_name, input_value="hello", output_value="world" ) - mock_llmobs_span_writer.enqueue.assert_called_with(expected_span_event) + assert llmobs_events[-1] == expected_span_event -def test_generator_sync_with_llmobs_disabled(LLMObs, mock_logs): - LLMObs.disable() +def test_generator_sync_with_llmobs_disabled(llmobs, mock_logs): + llmobs.disable() @workflow() def f(): @@ -684,10 +646,11 @@ def g(): i += 1 mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) + llmobs.enable() -async def test_generator_async_with_llmobs_disabled(LLMObs, mock_logs): - LLMObs.disable() +async def test_generator_async_with_llmobs_disabled(llmobs, mock_logs): + llmobs.disable() @workflow() async def f(): @@ -712,9 +675,10 @@ async def g(): i += 1 mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) + llmobs.enable() -def test_generator_sync_finishes_span_on_error(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_finishes_span_on_error(llmobs, llmobs_events): """Tests that""" @workflow() @@ -728,19 +692,17 @@ def f(): for _ in f(): pass - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -async def test_generator_async_finishes_span_on_error(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_finishes_span_on_error(llmobs, llmobs_events): @workflow() async def f(): for i in range(3): @@ -752,19 +714,17 @@ async def f(): async for _ in f(): pass - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_generator_sync_send(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_send(llmobs, llmobs_events): @workflow() def f(): while True: @@ -780,16 +740,11 @@ def f(): assert gen.send(4) == 16 gen.close() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - ) - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -async def test_generator_async_send(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_send(llmobs, llmobs_events): @workflow() async def f(): while True: @@ -805,16 
+760,11 @@ async def f(): await gen.aclose() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - ) - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_generator_sync_throw(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_throw(llmobs, llmobs_events): @workflow() def f(): for i in range(3): @@ -825,19 +775,17 @@ def f(): next(gen) gen.throw(ValueError("test_error")) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -async def test_generator_async_throw(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_throw(llmobs, llmobs_events): @workflow() async def f(): for i in range(3): @@ -848,19 +796,17 @@ async def f(): await gen.asend(None) await gen.athrow(ValueError("test_error")) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_generator_exit_exception_sync(LLMObs, mock_llmobs_span_writer): +def test_generator_exit_exception_sync(llmobs, llmobs_events): @workflow() def get_next_element(alist): for element in alist: @@ -873,14 +819,12 @@ def get_next_element(alist): if element == 5: break - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - input_value=str({"alist": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}), - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + input_value=str({"alist": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}), + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) diff --git a/tests/llmobs/test_llmobs_eval_metric_writer.py b/tests/llmobs/test_llmobs_eval_metric_writer.py index 2b8341e1616..eb168ef5a00 100644 --- a/tests/llmobs/test_llmobs_eval_metric_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_writer.py @@ -7,15 +7,19 @@ from ddtrace.llmobs._writer import LLMObsEvalMetricWriter -INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric" +INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric" DD_SITE = "datad0g.com" dd_api_key = os.getenv("DD_API_KEY", default="") def _categorical_metric_event(): return { - "span_id": "12345678901", - "trace_id": "98765432101", + 
"join_on": { + "span": { + "span_id": "12345678901", + "trace_id": "98765432101", + }, + }, "metric_type": "categorical", "categorical_value": "very", "label": "toxicity", @@ -26,8 +30,12 @@ def _categorical_metric_event(): def _score_metric_event(): return { - "span_id": "12345678902", - "trace_id": "98765432102", + "join_on": { + "span": { + "span_id": "12345678902", + "trace_id": "98765432102", + }, + }, "metric_type": "score", "label": "sentiment", "score_value": 0.9, @@ -69,6 +77,18 @@ def test_send_metric_bad_api_key(mock_writer_logs): ) +@pytest.mark.vcr_logs +def test_send_metric_no_api_key(mock_writer_logs): + llmobs_eval_metric_writer = LLMObsEvalMetricWriter(site="datad0g.com", api_key="", interval=1000, timeout=1) + llmobs_eval_metric_writer.start() + llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.periodic() + mock_writer_logs.warning.assert_called_with( + "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. ", + "Ensure this configuration is set before running your application.", + ) + + @pytest.mark.vcr_logs def test_send_categorical_metric(mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(site="datad0g.com", api_key=dd_api_key, interval=1000, timeout=1) @@ -125,6 +145,18 @@ def test_send_multiple_events(mock_writer_logs): def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): + env = os.environ.copy() + pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] + if "PYTHONPATH" in env: + pypath.append(env["PYTHONPATH"]) + env.update( + { + "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), + "DD_SITE": "datad0g.com", + "PYTHONPATH": ":".join(pypath), + "DD_LLMOBS_ML_APP": "unnamed-ml-app", + } + ) out, err, status, pid = run_python_code_in_subprocess( """ import atexit @@ -144,6 +176,7 @@ def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): llmobs_eval_metric_writer.start() llmobs_eval_metric_writer.enqueue(_score_metric_event()) """, + env=env, ) assert status == 0, err assert out == b"" diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 7ee7d510276..40c9fb5bd2b 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -22,7 +22,7 @@ def test_evaluator_runner_start(mock_evaluator_logs): evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=mock.MagicMock())) evaluator_runner.start() - mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r to %r", "EvaluatorRunner")]) + mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r", "EvaluatorRunner")]) def test_evaluator_runner_buffer_limit(mock_evaluator_logs): @@ -34,9 +34,9 @@ def test_evaluator_runner_buffer_limit(mock_evaluator_logs): ) -def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): - evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) +def test_evaluator_runner_periodic_enqueues_eval_metric(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=llmobs)) evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) 
evaluator_runner.periodic() mock_llmobs_eval_metric_writer.enqueue.assert_called_once_with( @@ -45,9 +45,9 @@ def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval @pytest.mark.vcr_logs -def test_evaluator_runner_timed_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): - evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) +def test_evaluator_runner_timed_enqueues_eval_metric(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=llmobs)) evaluator_runner.start() evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) @@ -64,15 +64,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] if "PYTHONPATH" in env: pypath.append(env["PYTHONPATH"]) - env.update( - { - "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), - "DD_SITE": "datad0g.com", - "PYTHONPATH": ":".join(pypath), - "DD_LLMOBS_ML_APP": "unnamed-ml-app", - "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", - } - ) + env.update({"PYTHONPATH": ":".join(pypath), "_DD_LLMOBS_EVALUATOR_INTERVAL": "5"}) out, err, status, pid = run_python_code_in_subprocess( """ import os @@ -87,7 +79,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces ctx = logs_vcr.use_cassette("tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml") ctx.__enter__() atexit.register(lambda: ctx.__exit__()) -LLMObs.enable() +LLMObs.enable(api_key="dummy-api-key", site="datad0g.com", ml_app="unnamed-ml-app") LLMObs._instance._evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) LLMObs._instance._evaluator_runner.start() LLMObs._instance._evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, None) @@ -99,6 +91,12 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces assert err == b"" +def test_evaluator_runner_unsupported_evaluator(): + with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}): + with pytest.raises(ValueError): + EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) + + def test_evaluator_runner_sampler_single_rule(monkeypatch): monkeypatch.setenv( EvaluatorRunnerSampler.SAMPLING_RULES_ENV_VAR, diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_evaluators.py similarity index 76% rename from tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py rename to tests/llmobs/test_llmobs_ragas_evaluators.py index 1f78b538f24..251b2642040 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_evaluators.py @@ -6,36 +6,39 @@ from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator from ddtrace.span import Span from tests.llmobs._utils import _expected_llmobs_llm_span_event -from tests.llmobs._utils import _expected_ragas_spans +from tests.llmobs._utils import _expected_ragas_faithfulness_spans from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_messages from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt +pytest.importorskip("ragas", reason="Tests require ragas to be available on user env") + + def _llm_span_without_io(): return _expected_llmobs_llm_span_event(Span("dummy")) -def 
test_ragas_evaluator_init(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - assert rf_evaluator.llmobs_service == LLMObs +def test_ragas_evaluator_init(ragas, llmobs): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) + assert rf_evaluator.llmobs_service == llmobs assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() -def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_ragas_dependencies_not_present, ragas): +def test_ragas_faithfulness_throws_if_dependencies_not_present(llmobs, mock_ragas_dependencies_not_present, ragas): with pytest.raises(NotImplementedError, match="Failed to load dependencies for `ragas_faithfulness` evaluator"): - RagasFaithfulnessEvaluator(LLMObs) + RagasFaithfulnessEvaluator(llmobs) -def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) +def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, llmobs): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) assert failure_msg == "fail_extract_faithfulness_inputs" assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 def test_ragas_faithfulness_has_modified_faithfulness_instance( - ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs + ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, llmobs ): """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" from ragas.llms import BaseRagasLLM @@ -53,7 +56,7 @@ def agenerate_text(self) -> str: faithfulness.llm = FirstDummyLLM() - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" @@ -74,9 +77,9 @@ def agenerate_text(self, statements) -> str: @pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit_evaluation): +def test_ragas_faithfulness_submits_evaluation(ragas, llmobs, mock_llmobs_submit_evaluation): """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() rf_evaluator.run_and_submit_evaluation(llm_span) rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( @@ -101,10 +104,10 @@ def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit @pytest.mark.vcr_logs def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( - ragas, LLMObs, mock_llmobs_submit_evaluation + ragas, llmobs, mock_llmobs_submit_evaluation ): """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _llm_span_with_expected_ragas_inputs_in_messages() rf_evaluator.run_and_submit_evaluation(llm_span) rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( @@ -128,9 +131,9 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages @pytest.mark.vcr_logs -def 
test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, LLMObs, mock_llmobs_submit_evaluation): +def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, llmobs, mock_llmobs_submit_evaluation): """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _expected_llmobs_llm_span_event( Span("dummy"), prompt={ @@ -167,19 +170,17 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, L @pytest.mark.vcr_logs -def test_ragas_faithfulness_emits_traces(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) +def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - assert rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_count == 7 - calls = rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_args_list - - spans = [call[0][0] for call in calls] - + ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] + ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) + assert len(ragas_spans) == 7 # check name, io, span kinds match - assert spans == _expected_ragas_spans() + assert ragas_spans == _expected_ragas_faithfulness_spans() # verify the trace structure - root_span = spans[0] + root_span = ragas_spans[0] root_span_id = root_span["span_id"] assert root_span["parent_id"] == "undefined" assert root_span["meta"] is not None @@ -187,16 +188,15 @@ def test_ragas_faithfulness_emits_traces(ragas, LLMObs): assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) assert isinstance(root_span["meta"]["metadata"]["statements"], list) root_span_trace_id = root_span["trace_id"] - for child_span in spans[1:]: + for child_span in ragas_spans[1:]: assert child_span["trace_id"] == root_span_trace_id - assert spans[1]["parent_id"] == root_span_id # input extraction (task) - assert spans[2]["parent_id"] == root_span_id # create statements (workflow) - assert spans[4]["parent_id"] == root_span_id # create verdicts (workflow) - assert spans[6]["parent_id"] == root_span_id # create score (task) - - assert spans[3]["parent_id"] == spans[2]["span_id"] # create statements prompt (task) - assert spans[5]["parent_id"] == spans[4]["span_id"] # create verdicts prompt (task) + assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) + assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) + assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) + assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) + assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) + assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess): @@ -206,14 +206,11 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log pypath.append(env["PYTHONPATH"]) env.update( { - "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), - "DD_SITE": "datad0g.com", "PYTHONPATH": ":".join(pypath), "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"), - "DD_LLMOBS_ML_APP": 
"unnamed-ml-app", "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", - "DD_LLMOBS_AGENTLESS_ENABLED": "true", + "DD_TRACE_ENABLED": "0", } ) out, err, status, pid = run_python_code_in_subprocess( @@ -228,20 +225,14 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log from tests.llmobs._utils import logs_vcr ctx = logs_vcr.use_cassette( - "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml" + "tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml" ) ctx.__enter__() atexit.register(lambda: ctx.__exit__()) -with mock.patch( - "ddtrace.internal.writer.HTTPWriter._send_payload", - return_value=Response( - status=200, - body="{}", - ), -): - LLMObs.enable() +with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload", return_value=Response(status=200, body="{}")): + LLMObs.enable(api_key="dummy-api-key", site="datad0g.com", ml_app="unnamed-ml-app", agentless_enabled=True) LLMObs._instance._evaluator_runner.enqueue(_llm_span_with_expected_ragas_inputs_in_messages(), None) -""", + """, env=env, ) assert status == 0, err diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 98748250c3a..dad6accdcfb 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -1,4 +1,5 @@ import os +import re import threading import time @@ -7,9 +8,7 @@ import ddtrace from ddtrace._trace.context import Context -from ddtrace._trace.span import Span from ddtrace.ext import SpanTypes -from ddtrace.filters import TraceFilter from ddtrace.internal.service import ServiceStatus from ddtrace.llmobs import LLMObs as llmobs_service from ddtrace.llmobs._constants import INPUT_DOCUMENTS @@ -31,7 +30,8 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._llmobs import SUPPORTED_LLMOBS_INTEGRATIONS -from ddtrace.llmobs._llmobs import LLMObsTraceProcessor +from ddtrace.llmobs._writer import LLMObsAgentlessEventClient +from ddtrace.llmobs._writer import LLMObsProxiedEventClient from ddtrace.llmobs.utils import Prompt from tests.llmobs._utils import _expected_llmobs_eval_metric_event from tests.llmobs._utils import _expected_llmobs_llm_span_event @@ -41,23 +41,16 @@ from tests.utils import override_global_config -@pytest.fixture -def mock_logs(): - with mock.patch("ddtrace.llmobs._llmobs.log") as mock_logs: - yield mock_logs +RAGAS_AVAILABLE = os.getenv("RAGAS_AVAILABLE", False) def run_llmobs_trace_filter(dummy_tracer): - for trace_filter in dummy_tracer._filters: - if isinstance(trace_filter, LLMObsTraceProcessor): - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span.set_tag_str(SPAN_KIND, "llm") - trace1 = [root_llm_span] - return trace_filter.process_trace(trace1) - raise ValueError("LLMObsTraceProcessor not found in tracer filters.") + with dummy_tracer.trace("span1", span_type=SpanTypes.LLM) as span: + span.set_tag_str(SPAN_KIND, "llm") + return dummy_tracer._writer.pop() -def test_service_enable(): +def test_service_enable_proxy_default(): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -65,22 +58,22 @@ def test_service_enable(): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for 
tracer_filter in dummy_tracer._filters) + assert isinstance(llmobs_instance._llmobs_span_writer._clients[0], LLMObsProxiedEventClient) assert run_llmobs_trace_filter(dummy_tracer) is not None llmobs_service.disable() -def test_service_enable_with_apm_disabled(monkeypatch): - with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): +def test_enable_agentless(): + with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer, agentless_enabled=True) llmobs_instance = llmobs_service._instance assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) - assert run_llmobs_trace_filter(dummy_tracer) is None + assert isinstance(llmobs_instance._llmobs_span_writer._clients[0], LLMObsAgentlessEventClient) + assert run_llmobs_trace_filter(dummy_tracer) is not None llmobs_service.disable() @@ -118,7 +111,7 @@ def test_service_enable_no_ml_app_specified(): assert llmobs_service._instance._evaluator_runner.status.value == "stopped" -def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs): +def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() monkeypatch.setenv("DD_LLMOBS_APP_NAME", "test_ml_app") @@ -126,11 +119,13 @@ def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs): assert llmobs_service.enabled is True assert llmobs_service._instance._llmobs_eval_metric_writer.status.value == "running" assert llmobs_service._instance._llmobs_span_writer.status.value == "running" - mock_logs.warning.assert_called_once_with("`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead.") + mock_llmobs_logs.warning.assert_called_once_with( + "`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead." 
+ ) llmobs_service.disable() -def test_service_enable_already_enabled(mock_logs): +def test_service_enable_already_enabled(mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -139,9 +134,8 @@ def test_service_enable_already_enabled(mock_logs): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) llmobs_service.disable() - mock_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) + mock_llmobs_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) @mock.patch("ddtrace.llmobs._llmobs.patch") @@ -203,107 +197,83 @@ def test_service_enable_does_not_override_global_patch_config(mock_tracer_patch, llmobs_service.disable() -def test_start_span_while_disabled_logs_warning(LLMObs, mock_logs): - LLMObs.disable() - _ = LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.tool(name="test_tool") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.task(name="test_task") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.workflow(name="test_workflow") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.agent(name="test_agent") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - - -def test_start_span_uses_kind_as_default_name(LLMObs): - with LLMObs.llm(model_name="test_model", model_provider="test_provider") as span: +def test_start_span_while_disabled_logs_warning(llmobs, mock_llmobs_logs): + llmobs.disable() + _ = llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.tool(name="test_tool") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.task(name="test_task") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.workflow(name="test_workflow") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.agent(name="test_agent") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + + +def test_start_span_uses_kind_as_default_name(llmobs): + with llmobs.llm(model_name="test_model", model_provider="test_provider") as span: assert span.name == "llm" - with LLMObs.tool() as span: + with llmobs.tool() as span: assert span.name == "tool" - with LLMObs.task() as span: + with llmobs.task() as span: assert span.name == "task" - with LLMObs.workflow() as span: + with llmobs.workflow() as span: assert span.name == "workflow" - with LLMObs.agent() as span: + with llmobs.agent() as span: assert span.name == "agent" -def test_start_span_with_session_id(LLMObs): - with LLMObs.llm(model_name="test_model", session_id="test_session_id") as span: +def test_start_span_with_session_id(llmobs): + with 
llmobs.llm(model_name="test_model", session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.tool(session_id="test_session_id") as span: + with llmobs.tool(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.task(session_id="test_session_id") as span: + with llmobs.task(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.workflow(session_id="test_session_id") as span: + with llmobs.workflow(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.agent(session_id="test_session_id") as span: + with llmobs.agent(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" -def test_session_id_becomes_top_level_field(LLMObs, mock_llmobs_span_writer): - session_id = "test_session_id" - with LLMObs.task(session_id=session_id) as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) - - -def test_session_id_becomes_top_level_field_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): +def test_session_id_becomes_top_level_field(llmobs, llmobs_events): session_id = "test_session_id" - with AgentlessLLMObs.task(session_id=session_id) as span: + with llmobs.task(session_id=session_id) as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) - + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) -def test_llm_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - assert span.name == "test_llm_call" - assert span.resource == "llm" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "llm" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") - ) - -def test_llm_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span(llmobs, llmobs_events): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "llm" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider" ) -def test_llm_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.llm(name="test_llm_call", 
model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider" ) -def test_default_model_provider_set_to_custom(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call") as span: +def test_default_model_provider_set_to_custom(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" @@ -312,88 +282,57 @@ def test_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_tool_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.tool(name="test_tool") as span: - assert span.name == "test_tool" - assert span.resource == "tool" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_tool_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.tool(name="test_tool") as span: +def test_tool_span(llmobs, llmobs_events): + with llmobs.tool(name="test_tool") as span: assert span.name == "test_tool" assert span.resource == "tool" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_task_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task") as span: - assert span.name == "test_task" - assert span.resource == "task" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_task_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task") as span: +def test_task_span(llmobs, llmobs_events): + with llmobs.task(name="test_task") as span: assert span.name == "test_task" assert span.resource == "task" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) - - -def test_workflow_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.workflow(name="test_workflow") as span: - assert span.name == "test_workflow" - assert span.resource == "workflow" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_workflow_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.workflow(name="test_workflow") as span: +def test_workflow_span(llmobs, llmobs_events): + with llmobs.workflow(name="test_workflow") as span: assert span.name == "test_workflow" assert span.resource == "workflow" assert span.span_type == "llm" assert 
span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_agent_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.agent(name="test_agent") as span: +def test_agent_span(llmobs, llmobs_events): + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span.resource == "agent" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_agent_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.agent(name="test_agent") as span: - assert span.name == "test_agent" - assert span.resource == "agent" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) - - -def test_embedding_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(name="test_embedding", model_provider="test_provider") as span: +def test_embedding_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.embedding(name="test_embedding", model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider" ) -def test_embedding_default_model_provider_set_to_custom(LLMObs): - with LLMObs.embedding(model_name="test_model", name="test_embedding") as span: +def test_embedding_default_model_provider_set_to_custom(llmobs): + with llmobs.embedding(model_name="test_model", name="test_embedding") as span: assert span.name == "test_embedding" assert span.resource == "embedding" assert span.span_type == "llm" @@ -402,198 +341,182 @@ def test_embedding_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_embedding_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: - assert span.name == "test_embedding" - assert span.resource == "embedding" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "embedding" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") - ) - - -def test_embedding_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.embedding( - model_name="test_model", name="test_embedding", model_provider="test_provider" - ) as span: +def test_embedding_span(llmobs, llmobs_events): + with llmobs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: assert span.name == "test_embedding" assert span.resource == 
"embedding" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "embedding" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider" ) -def test_annotate_no_active_span_logs_warning(LLMObs, mock_logs): - LLMObs.annotate(parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_annotate_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + llmobs.annotate(parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_annotate_non_llm_span_logs_warning(LLMObs, mock_logs): +def test_annotate_non_llm_span_logs_warning(llmobs, mock_llmobs_logs): dummy_tracer = DummyTracer() with dummy_tracer.trace("root") as non_llmobs_span: - LLMObs.annotate(span=non_llmobs_span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.annotate(span=non_llmobs_span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_annotate_finished_span_does_nothing(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_annotate_finished_span_does_nothing(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: pass - LLMObs.annotate(span=span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Cannot annotate a finished span.") + llmobs.annotate(span=span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Cannot annotate a finished span.") -def test_annotate_parameters(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) +def test_annotate_parameters(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) assert span._get_ctx_item(INPUT_PARAMETERS) == {"temperature": 0.9, "max_tokens": 50} - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "Setting parameters is deprecated, please set parameters and other metadata as tags instead." 
) -def test_annotate_metadata(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) +def test_annotate_metadata(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) assert span._get_ctx_item(METADATA) == {"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3} -def test_annotate_metadata_wrong_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata="wrong_metadata") +def test_annotate_metadata_wrong_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata="wrong_metadata") assert span._get_ctx_item(METADATA) is None - mock_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() -def test_annotate_tag(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) +def test_annotate_tag(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) assert span._get_ctx_item(TAGS) == {"test_tag_name": "test_tag_value", "test_numeric_tag": 10} -def test_annotate_tag_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags=12345) +def test_annotate_tag_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags=12345) assert span._get_ctx_item(TAGS) is None - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_tags must be a dictionary of string key - primitive value pairs." 
) -def test_annotate_input_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, input_data="test_input") +def test_annotate_input_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, input_data="test_input") assert llm_span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input"}] - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data="test_input") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data="test_input") assert task_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data="test_input") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data="test_input") assert tool_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data="test_input") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data="test_input") assert workflow_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data="test_input") + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data="test_input") assert retrieval_span._get_ctx_item(INPUT_VALUE) == "test_input" -def test_annotate_numeric_io(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=0, output_data=0) +def test_annotate_numeric_io(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=0, output_data=0) assert task_span._get_ctx_item(INPUT_VALUE) == "0" assert task_span._get_ctx_item(OUTPUT_VALUE) == "0" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=1.23, output_data=1.23) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=1.23, output_data=1.23) assert task_span._get_ctx_item(INPUT_VALUE) == "1.23" assert task_span._get_ctx_item(OUTPUT_VALUE) == "1.23" -def test_annotate_input_serializable_value(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=["test_input"]) +def test_annotate_input_serializable_value(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=["test_input"]) assert task_span._get_ctx_item(INPUT_VALUE) == str(["test_input"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data={"test_input": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data={"test_input": "hello world"}) assert tool_span._get_ctx_item(INPUT_VALUE) == str({"test_input": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data=("asd", 123)) assert workflow_span._get_ctx_item(INPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert 
agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) assert retrieval_span._get_ctx_item(INPUT_VALUE) == str([0, 1, 2, 3, 4]) -def test_annotate_input_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) +def test_annotate_input_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) assert span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input", "role": "human"}] -def test_annotate_input_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": object()}]) +def test_annotate_input_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": object()}]) assert span._get_ctx_item(INPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) -def test_llmobs_annotate_incorrect_message_content_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) +def test_llmobs_annotate_incorrect_message_content_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_document_str(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data="test_document_text") +def test_annotate_document_str(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data="test_document_text") documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data="test_document_text") + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data="test_document_text") documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_dict(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": "test_document_text"}) +def test_annotate_document_dict(llmobs): + 
with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": "test_document_text"}) documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data={"text": "test_document_text"}) + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data={"text": "test_document_text"}) documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_list(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_document_list(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -605,8 +528,8 @@ def test_annotate_document_list(LLMObs): assert documents[1]["name"] == "name" assert documents[1]["id"] == "id" assert documents[1]["score"] == 0.9 - with LLMObs.retrieval() as span: - LLMObs.annotate( + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -620,129 +543,131 @@ def test_annotate_document_list(LLMObs): assert documents[1]["score"] == 0.9 -def test_annotate_incorrect_document_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": 123}) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_document_no_text_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_incorrect_document_field_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input 
documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_incorrect_document_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": 123}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_document_no_text_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_incorrect_document_field_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate( + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - 
mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) -def test_annotate_output_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data="test_output") +def test_annotate_output_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data="test_output") assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output"}] - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data="test_output") + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data="test_output") assert embedding_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data="test_output") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data="test_output") assert task_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data="test_output") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data="test_output") assert tool_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data="test_output") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data="test_output") assert workflow_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_serializable_value(LLMObs): - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) +def test_annotate_output_serializable_value(llmobs): + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) assert embedding_span._get_ctx_item(OUTPUT_VALUE) == str([[0, 1, 2, 3], [4, 5, 6, 7]]) - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data=["test_output"]) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data=["test_output"]) assert task_span._get_ctx_item(OUTPUT_VALUE) == str(["test_output"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data={"test_output": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data={"test_output": "hello world"}) assert tool_span._get_ctx_item(OUTPUT_VALUE) == str({"test_output": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data=("asd", 123)) assert workflow_span._get_ctx_item(OUTPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert 
agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) +def test_annotate_output_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output", "role": "human"}] -def test_annotate_output_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": object()}]) +def test_annotate_output_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": object()}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_metrics(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) +def test_annotate_metrics(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) assert span._get_ctx_item(METRICS) == {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} -def test_annotate_metrics_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, metrics=12345) +def test_annotate_metrics_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, metrics=12345) assert llm_span._get_ctx_item(METRICS) is None - mock_logs.warning.assert_called_once_with("metrics must be a dictionary of string key - numeric value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "metrics must be a dictionary of string key - numeric value pairs." 
+ ) + mock_llmobs_logs.reset_mock() -def test_annotate_prompt_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -761,9 +686,9 @@ def test_annotate_prompt_dict(LLMObs): } -def test_annotate_prompt_dict_with_context_var_keys(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict_with_context_var_keys(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -784,9 +709,9 @@ def test_annotate_prompt_dict_with_context_var_keys(LLMObs): } -def test_annotate_prompt_typed_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_typed_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt=Prompt( template="{var1} {var3}", @@ -807,47 +732,30 @@ def test_annotate_prompt_typed_dict(LLMObs): } -def test_annotate_prompt_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, prompt="prompt") +def test_annotate_prompt_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, prompt="prompt") assert span._get_ctx_item(INPUT_PROMPT) is None - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() - LLMObs.annotate(span=span, prompt={"template": 1}) - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() + llmobs.annotate(span=span, prompt={"template": 1}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() -def test_span_error_sets_error(LLMObs, mock_llmobs_span_writer): +def test_span_error_sets_error(llmobs, llmobs_events): with pytest.raises(ValueError): - with LLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: + with llmobs.llm(model_name="test_model", model_provider="test_model_provider") as span: raise ValueError("test error message") - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) - ) - - -def test_span_error_sets_error_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with pytest.raises(ValueError): - with AgentlessLLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: - raise ValueError("test error message") - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + model_name="test_model", + model_provider="test_model_provider", + error="builtins.ValueError", + 
error_message="test error message", + error_stack=span.get_tag("error.stack"), ) @@ -855,218 +763,142 @@ def test_span_error_sets_error_agentless(AgentlessLLMObs, mock_llmobs_span_agent "ddtrace_global_config", [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_tags(ddtrace_global_config, LLMObs, mock_llmobs_span_writer, monkeypatch): - with LLMObs.task(name="test_task") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) - ) - - -@pytest.mark.parametrize( - "ddtrace_global_config", - [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], -) -def test_tags_agentless(ddtrace_global_config, AgentlessLLMObs, mock_llmobs_span_agentless_writer, monkeypatch): - with AgentlessLLMObs.task(name="test_task") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) - ) - - -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with LLMObs.tool(name="test_tool", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with LLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with LLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with LLMObs.workflow(name="test_workflow", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with LLMObs.agent(name="test_agent", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with LLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: +def test_tags(ddtrace_global_config, llmobs, llmobs_events, monkeypatch): + with llmobs.task(name="test_task") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "task", + tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, ) -def test_ml_app_override_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task", ml_app="test_app") as span: +def test_ml_app_override(llmobs, llmobs_events): + with llmobs.task(name="test_task", 
ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.tool(name="test_tool", ml_app="test_app") as span: + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) + with llmobs.tool(name="test_tool", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: + assert len(llmobs_events) == 2 + assert llmobs_events[1] == _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) + with llmobs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 3 + assert llmobs_events[2] == _expected_llmobs_llm_span_event( + span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with AgentlessLLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: + with llmobs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 4 + assert llmobs_events[3] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with AgentlessLLMObs.workflow(name="test_workflow", ml_app="test_app") as span: + with llmobs.workflow(name="test_workflow", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.agent(name="test_agent", ml_app="test_app") as span: + assert len(llmobs_events) == 5 + assert llmobs_events[4] == _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) + with llmobs.agent(name="test_agent", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: + assert len(llmobs_events) == 6 + assert llmobs_events[5] == _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) + with llmobs.retrieval(name="test_retrieval", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) - ) + assert len(llmobs_events) == 7 + assert llmobs_events[6] == _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) -def test_export_span_specified_span_is_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.export_span(span="asd") - mock_logs.warning.assert_called_once_with("Failed to export span. 
Span must be a valid Span object.") +def test_export_span_specified_span_is_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span(span="asd") + mock_llmobs_logs.warning.assert_called_once_with("Failed to export span. Span must be a valid Span object.") -def test_export_span_specified_span_is_not_llmobs_span_raises_warning(LLMObs, mock_logs): +def test_export_span_specified_span_is_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): with DummyTracer().trace("non_llmobs_span") as span: - LLMObs.export_span(span=span) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.export_span(span=span) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_specified_span_returns_span_context(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span(span=span) +def test_export_span_specified_span_returns_span_context(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span(span=span) assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_export_span_no_specified_span_no_active_span_raises_warning(LLMObs, mock_logs): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_export_span_no_specified_span_no_active_span_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_export_span_active_span_not_llmobs_span_raises_warning(LLMObs, mock_logs): - with LLMObs._instance.tracer.trace("non_llmobs_span"): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") +def test_export_span_active_span_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): + with llmobs._instance.tracer.trace("non_llmobs_span"): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_no_specified_span_returns_exported_active_span(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span() +def test_export_span_no_specified_span_returns_exported_active_span(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span() assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_submit_evaluation_llmobs_disabled_raises_warning(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.submit_evaluation( - span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" - ) - mock_logs.warning.assert_called_once_with( - "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." 
- ) - - -def test_submit_evaluation_no_api_key_raises_warning(AgentlessLLMObs, mock_logs): +def test_submit_evaluation_no_api_key_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="")): - AgentlessLLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_ml_app_raises_warning(LLMObs, mock_logs): +def test_submit_evaluation_ml_app_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_llmobs_ml_app="")): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_span_context_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( +def test_submit_evaluation_span_context_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_context must be a dictionary containing both span_id and trace_id keys. " "LLMObs.export_span() can be used to generate this dictionary from a given span." ) -def test_submit_evaluation_empty_span_or_trace_id_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_span_or_trace_id_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." ) - mock_logs.reset_mock() - LLMObs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." 
) -def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_timestamp_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", @@ -1074,35 +906,35 @@ def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): ml_app="dummy", timestamp_ms="invalid", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent" ) -def test_submit_evaluation_empty_label_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_label_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") + mock_llmobs_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") -def test_submit_evaluation_incorrect_metric_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_metric_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") - mock_logs.reset_mock() - LLMObs.submit_evaluation( + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") -def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_numerical_value_raises_unsupported_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call( "The evaluation metric type 'numerical' is unsupported. Use 'score' instead. 
" @@ -1112,44 +944,44 @@ def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mo ) -def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call("value must be an integer or float for a score metric."), ] ) -def test_submit_evaluation_incorrect_score_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_score_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="score", value="high" ) - mock_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") + mock_llmobs_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") -def test_submit_evaluation_invalid_tags_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_tags_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", tags=["invalid"], ) - mock_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") + mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") -def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_metadata_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", metadata=1, ) - mock_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") + mock_llmobs_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") @pytest.mark.parametrize( @@ -1157,9 +989,9 @@ def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): [dict(_llmobs_ml_app="test_app_name")], ) def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( - LLMObs, mock_logs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1167,8 +999,10 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( tags={1: 2, "foo": "bar"}, ml_app="dummy", ) - mock_logs.warning.assert_called_once_with("Failed to parse tags. Tags for evaluation metrics must be strings.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "Failed to parse tags. Tags for evaluation metrics must be strings." 
+ ) + mock_llmobs_logs.reset_mock() mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( ml_app="dummy", @@ -1186,8 +1020,8 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_tags(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1212,8 +1046,8 @@ def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_with_metadata_enqueues_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1235,7 +1069,7 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) ) mock_llmobs_eval_metric_writer.reset() - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1257,8 +1091,8 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) -def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_categorical_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1276,9 +1110,9 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="toxicity", metric_type="categorical", value="high", @@ -1296,8 +1130,8 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) -def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_score_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="sentiment", metric_type="score", @@ -1310,9 +1144,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" + with llmobs.llm(model_name="test_model", name="test_llm_call", 
model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" ) mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( @@ -1327,9 +1161,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metric( - LLMObs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", @@ -1342,9 +1176,9 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="token_count", metric_type="numerical", value=35, @@ -1362,148 +1196,148 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) -def test_flush_calls_periodic_agentless( - AgentlessLLMObs, mock_llmobs_span_agentless_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner -): - AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_called_once() - mock_llmobs_eval_metric_writer.periodic.assert_called_once() - mock_llmobs_evaluator_runner.periodic.assert_called_once() - - def test_flush_does_not_call_periodic_when_llmobs_is_disabled( - LLMObs, - mock_llmobs_span_writer, + llmobs, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, - mock_logs, - disabled_llmobs, + mock_llmobs_logs, ): - LLMObs.flush() - mock_llmobs_span_writer.periodic.assert_not_called() + llmobs.enabled = False + llmobs.flush() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] ) -def test_flush_does_not_call_periodic_when_llmobs_is_disabled_agentless( - AgentlessLLMObs, - mock_llmobs_span_agentless_writer, - mock_llmobs_eval_metric_writer, - mock_llmobs_evaluator_runner, - mock_logs, - disabled_llmobs, -): - AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_not_called() - mock_llmobs_eval_metric_writer.periodic.assert_not_called() - mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( - [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] - ) - - -def test_inject_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with( +def test_inject_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.inject_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be injected." 
) assert headers == {} -def test_inject_distributed_headers_not_dict_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers("not a dictionary", span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") +def test_inject_distributed_headers_not_dict_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers("not a dictionary", span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == "not a dictionary" - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(123, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(123, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == 123 - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(None, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(None, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers is None -def test_inject_distributed_headers_no_active_span_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with("No span provided and no currently active span found.") +def test_inject_distributed_headers_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no currently active span found.") assert headers == {} -def test_inject_distributed_headers_span_calls_httppropagator_inject(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_span_calls_httppropagator_inject(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.propagation.http.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=span) + llmobs.inject_distributed_headers({}, span=span) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_inject_distributed_headers_current_active_span_injected(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_current_active_span_injected(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=None) + llmobs.inject_distributed_headers({}, span=None) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_activate_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.activate_distributed_headers({}) - mock_logs.warning.assert_called_once_with( +def test_activate_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + llmobs.activate_distributed_headers({}) + mock_llmobs_logs.warning.assert_called_once_with( 
"LLMObs.activate_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be activated." ) -def test_activate_distributed_headers_calls_httppropagator_extract(LLMObs, mock_logs): +def test_activate_distributed_headers_calls_httppropagator_extract(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_extract.assert_called_once_with({}) -def test_activate_distributed_headers_no_trace_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_trace_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(span_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_span_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_span_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(trace_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456") mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") mock_activate.assert_called_once_with(dummy_context) -def test_activate_distributed_headers_activates_context(LLMObs, mock_logs): +def test_activate_distributed_headers_activates_context(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456", meta={PROPAGATED_PARENT_ID_KEY: "789"}) mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_activate.assert_called_once_with(dummy_context) +def test_listener_hooks_enqueue_correct_writer(run_python_code_in_subprocess): + """ + Regression test that 
ensures that listener hooks enqueue span events to the correct writer, + not the default writer created at startup. + """ + env = os.environ.copy() + pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] + if "PYTHONPATH" in env: + pypath.append(env["PYTHONPATH"]) + env.update({"PYTHONPATH": ":".join(pypath), "DD_TRACE_ENABLED": "0"}) + out, err, status, pid = run_python_code_in_subprocess( + """ +from ddtrace.llmobs import LLMObs + +LLMObs.enable(ml_app="repro-issue", agentless_enabled=True, api_key="foobar.baz", site="datad0g.com") +with LLMObs.agent("dummy"): + pass +""", + env=env, + ) + assert status == 0, err + assert out == b"" + agentless_writer_log = b"failed to send traces to intake at https://llmobs-intake.datad0g.com/api/v2/llmobs: HTTP error status 403, reason Forbidden\n" # noqa: E501 + agent_proxy_log = b"failed to send, dropping 1 traces to intake at http://localhost:8126/evp_proxy/v2/api/v2/llmobs after 5 retries" # noqa: E501 + assert err == agentless_writer_log + assert agent_proxy_log not in err + + def test_llmobs_fork_recreates_and_restarts_span_writer(): """Test that forking a process correctly recreates and restarts the LLMObsSpanWriter.""" with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"): @@ -1514,16 +1348,10 @@ def test_llmobs_fork_recreates_and_restarts_span_writer(): if pid: # parent assert llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._llmobs_span_writer == original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._llmobs_span_writer != original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1569,18 +1397,10 @@ def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluato if pid: # parent assert llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._evaluator_runner == original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._evaluator_runner != original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1667,42 +1487,6 @@ def test_llmobs_fork_evaluator_runner_run(monkeypatch): llmobs_service.disable() -def test_llmobs_fork_custom_filter(monkeypatch): - """Test that forking a process correctly keeps any custom filters.""" - - class CustomFilter(TraceFilter): - def process_trace(self, trace): - return trace - - monkeypatch.setenv("_DD_LLMOBS_WRITER_INTERVAL", 5.0) - with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"): - tracer = DummyTracer() - custom_filter = CustomFilter() - tracer.configure(settings={"FILTERS": [custom_filter]}) - 
llmobs_service.enable(_tracer=tracer, ml_app="test_app") - assert custom_filter in llmobs_service._instance.tracer._filters - pid = os.fork() - if pid: # parent - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - else: # child - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - llmobs_service.disable() - os._exit(12) - - _, status = os.waitpid(pid, 0) - exit_code = os.WEXITSTATUS(status) - assert exit_code == 12 - llmobs_service.disable() - - def test_llmobs_fork_disabled(monkeypatch): """Test that after being disabled the service remains disabled when forking""" monkeypatch.setenv("DD_LLMOBS_ENABLED", "0") @@ -1746,46 +1530,46 @@ def test_llmobs_fork_disabled_then_enabled(monkeypatch): svc.disable() -def test_llmobs_with_evaluator_runner(LLMObs, mock_llmobs_evaluator_runner): - with LLMObs.llm(model_name="test_model"): +def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner): + with llmobs.llm(model_name="test_model"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 1 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 1 -def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): +def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.workflow(name="test"): +def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.workflow(name="test"): pass - with LLMObs.agent(name="test"): + with llmobs.agent(name="test"): pass - with LLMObs.task(name="test"): + with llmobs.task(name="test"): pass - with LLMObs.embedding(model_name="test"): + with llmobs.embedding(model_name="test"): pass - with LLMObs.retrieval(name="test"): + with llmobs.retrieval(name="test"): pass - with LLMObs.tool(name="test"): + with llmobs.tool(name="test"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_annotation_context_modifies_span_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_modifies_span_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -def test_annotation_context_modifies_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") 
as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1793,80 +1577,80 @@ def test_annotation_context_modifies_prompt(LLMObs): } -def test_annotation_context_modifies_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -def test_annotation_context_finished_context_does_not_modify_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): +def test_annotation_context_finished_context_does_not_modify_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -def test_annotation_context_finished_context_does_not_modify_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): +def test_annotation_context_finished_context_does_not_modify_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -def test_annotation_context_finished_context_does_not_modify_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): +def test_annotation_context_finished_context_does_not_modify_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -def test_annotation_context_nested(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested(llmobs): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} -def test_annotation_context_nested_overrides_name(LLMObs): - with LLMObs.annotation_context(name="unexpected"): - with LLMObs.annotation_context(name="expected"): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested_overrides_name(llmobs): + with llmobs.annotation_context(name="unexpected"): + with llmobs.annotation_context(name="expected"): + with llmobs.agent(name="test_agent") as span: assert span.name == "expected" -def test_annotation_context_nested_maintains_trace_structure(LLMObs, mock_llmobs_span_writer): +def test_annotation_context_nested_maintains_trace_structure(llmobs, llmobs_events): """This test makes sure starting/stopping annotation contexts do not modify the llmobs trace structure""" - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span") as parent_span: - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.workflow(name="child_span") as child_span: + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span") as parent_span: + with llmobs.annotation_context(tags={"foo": 
"baz"}): + with llmobs.workflow(name="child_span") as child_span: assert child_span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} assert parent_span._get_ctx_item(TAGS) == {"foo": "bar", "boo": "bar"} - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - parent_span, child_span = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] + assert len(llmobs_events) == 2 + parent_span, child_span = llmobs_events[1], llmobs_events[0] assert child_span["trace_id"] == parent_span["trace_id"] assert child_span["span_id"] != parent_span["span_id"] assert child_span["parent_id"] == parent_span["span_id"] assert parent_span["parent_id"] == "undefined" - mock_llmobs_span_writer.reset_mock() - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span"): +def test_annotation_context_separate_traces_maintained(llmobs, llmobs_events): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span"): pass - with LLMObs.workflow(name="child_span"): + with llmobs.workflow(name="child_span"): pass - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - trace_one, trace_two = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] - assert trace_one["trace_id"] != trace_two["trace_id"] - assert trace_one["span_id"] != trace_two["span_id"] - assert trace_two["parent_id"] == "undefined" - assert trace_one["parent_id"] == "undefined" + assert len(llmobs_events) == 2 + agent_span, workflow_span = llmobs_events[1], llmobs_events[0] + assert agent_span["trace_id"] != workflow_span["trace_id"] + assert agent_span["span_id"] != workflow_span["span_id"] + assert workflow_span["parent_id"] == "undefined" + assert agent_span["parent_id"] == "undefined" -def test_annotation_context_only_applies_to_local_context(LLMObs): +def test_annotation_context_only_applies_to_local_context(llmobs): """ tests that annotation contexts only apply to spans belonging to the same trace context and not globally to all spans. 
@@ -1882,8 +1666,8 @@ def test_annotation_context_only_applies_to_local_context(LLMObs): def context_one(): nonlocal agent_has_correct_name nonlocal agent_has_correct_tags - with LLMObs.annotation_context(name="expected_agent", tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: + with llmobs.annotation_context(name="expected_agent", tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: event.wait() agent_has_correct_tags = span._get_ctx_item(TAGS) == {"foo": "bar"} agent_has_correct_name = span.name == "expected_agent" @@ -1892,9 +1676,9 @@ def context_one(): def context_two(): nonlocal tool_has_correct_name nonlocal tool_does_not_have_tags - with LLMObs.agent(name="test_agent"): - with LLMObs.annotation_context(name="expected_tool"): - with LLMObs.tool(name="test_tool") as tool_span: + with llmobs.agent(name="test_agent"): + with llmobs.annotation_context(name="expected_tool"): + with llmobs.tool(name="test_tool") as tool_span: event.wait() tool_does_not_have_tags = tool_span._get_ctx_item(TAGS) is None tool_has_correct_name = tool_span.name == "expected_tool" @@ -1904,7 +1688,7 @@ def context_two(): thread_one.start() thread_two.start() - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span._get_ctx_item(TAGS) is None @@ -1920,15 +1704,15 @@ def context_two(): assert tool_does_not_have_tags is True -async def test_annotation_context_async_modifies_span_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_modifies_span_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -async def test_annotation_context_async_modifies_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1936,41 +1720,42 @@ async def test_annotation_context_async_modifies_prompt(LLMObs): } -async def test_annotation_context_async_modifies_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -async def test_annotation_context_async_finished_context_does_not_modify_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): +async def test_annotation_context_async_finished_context_does_not_modify_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -async def test_annotation_context_async_finished_context_does_not_modify_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): +async def 
test_annotation_context_async_finished_context_does_not_modify_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -async def test_annotation_context_finished_context_async_does_not_modify_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): +async def test_annotation_context_finished_context_async_does_not_modify_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -async def test_annotation_context_async_nested(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - async with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_nested(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + async with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} def test_service_enable_starts_evaluator_runner_when_evaluators_exist(): + pytest.importorskip("ragas") with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")): dummy_tracer = DummyTracer() @@ -1994,3 +1779,293 @@ def test_service_enable_does_not_start_evaluator_runner(): assert llmobs_service._instance._llmobs_span_writer.status.value == "running" assert llmobs_service._instance._evaluator_runner.status.value == "stopped" llmobs_service.disable() + + +def test_submit_evaluation_llmobs_disabled_raises_debug(llmobs, mock_llmobs_logs): + llmobs.disable() + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation( + span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" + ) + mock_llmobs_logs.debug.assert_called_once_with( + "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." + ) + + +def test_submit_evaluation_for_no_ml_app_raises_warning(llmobs, mock_llmobs_logs): + with override_global_config(dict(_llmobs_ml_app="")): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + ) + mock_llmobs_logs.warning.assert_called_once_with( + "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " + "Ensure this configuration is set before running your application." + ) + + +def test_submit_evaluation_for_span_incorrect_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=re.escape( + ( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." 
+ ) + ), + ): + llmobs.submit_evaluation_for(span="asd", label="toxicity", metric_type="categorical", value="high") + + +def test_submit_evaluation_for_span_with_tag_value_incorrect_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value="asd", label="toxicity", metric_type="categorical", value="high" + ) + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_key": "hi", "tag_value": 1}, + label="toxicity", + metric_type="categorical", + value="high", + ) + + +def test_submit_evaluation_for_empty_span_or_trace_id_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=re.escape( + ( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." + ) + ), + ): + llmobs.submit_evaluation_for( + span={"trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" + ) + with pytest.raises( + TypeError, + match=re.escape( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." + ), + ): + llmobs.submit_evaluation_for(span={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") + + +def test_submit_evaluation_for_span_with_tag_value_empty_key_or_val_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_value": "123"}, label="toxicity", metric_type="categorical", value="high" + ) + + +def test_submit_evaluation_for_invalid_timestamp_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + ValueError, match="timestamp_ms must be a non-negative integer. 
Evaluation metric data will not be sent" + ): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="", + metric_type="categorical", + value="high", + ml_app="dummy", + timestamp_ms="invalid", + ) + + +def test_submit_evaluation_for_empty_label_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(ValueError, match="label must be the specified name of the evaluation metric."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", value="high" + ) + + +def test_submit_evaluation_for_incorrect_metric_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(ValueError, match="metric_type must be one of 'categorical' or 'score'."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high" + ) + with pytest.raises(ValueError, match="metric_type must be one of 'categorical' or 'score'."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high" + ) + + +def test_submit_evaluation_for_incorrect_score_value_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(TypeError, match="value must be an integer or float for a score metric."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="score", value="high" + ) + + +def test_submit_evaluation_for_invalid_tags_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags=["invalid"], + ) + mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") + + +@pytest.mark.parametrize( + "ddtrace_global_config", + [dict(_llmobs_ml_app="test_app_name")], +) +def test_submit_evaluation_for_non_string_tags_raises_warning_but_still_submits( + llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer +): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags={1: 2, "foo": "bar"}, + ml_app="dummy", + ) + mock_llmobs_logs.warning.assert_called_once_with( + "Failed to parse tags. Tags for evaluation metrics must be strings." 
+ ) + mock_llmobs_logs.reset_mock() + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:dummy", "foo:bar"], + ) + ) + + +@pytest.mark.parametrize( + "ddtrace_global_config", + [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], +) +def test_submit_evaluation_for_metric_tags(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"}, + ml_app="ml_app_override", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="ml_app_override", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"], + ) + ) + + +def test_submit_evaluation_for_span_with_tag_value_enqueues_writer_with_categorical_metric( + llmobs, mock_llmobs_eval_metric_writer +): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_key": "tag_key", "tag_value": "tag_val"}, + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + tag_key="tag_key", + tag_value="tag_val", + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + + +def test_submit_evaluation_for_enqueues_writer_with_categorical_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + mock_llmobs_eval_metric_writer.reset_mock() + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation_for( + span=llmobs.export_span(span), + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id=str(span.span_id), + trace_id="{:x}".format(span.trace_id), + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + + +def test_submit_evaluation_for_enqueues_writer_with_score_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="sentiment", + metric_type="score", + value=0.9, + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + span_id="123", trace_id="456", label="sentiment", metric_type="score", score_value=0.9, ml_app="dummy" + ) + ) + mock_llmobs_eval_metric_writer.reset_mock() + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation_for( + 
span=llmobs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + span_id=str(span.span_id), + trace_id="{:x}".format(span.trace_id), + label="sentiment", + metric_type="score", + score_value=0.9, + ml_app="dummy", + ) + ) diff --git a/tests/llmobs/test_llmobs_span_agent_writer.py b/tests/llmobs/test_llmobs_span_agent_writer.py index 76fe0f21aef..d16bb9f0e2c 100644 --- a/tests/llmobs/test_llmobs_span_agent_writer.py +++ b/tests/llmobs/test_llmobs_span_agent_writer.py @@ -44,7 +44,8 @@ def test_flush_queue_when_event_cause_queue_to_exceed_payload_limit( [ mock.call("flushing queue because queuing next event will exceed EVP payload limit"), mock.call("encode %d LLMObs span events to be sent", 5), - ] + ], + any_order=True, ) diff --git a/tests/llmobs/test_llmobs_span_agentless_writer.py b/tests/llmobs/test_llmobs_span_agentless_writer.py index 4882f3553d8..4a54faf130d 100644 --- a/tests/llmobs/test_llmobs_span_agentless_writer.py +++ b/tests/llmobs/test_llmobs_span_agentless_writer.py @@ -75,26 +75,25 @@ def test_truncating_oversized_events(mock_writer_logs, mock_http_writer_send_pay ) -def test_send_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_chat_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() +@mock.patch("ddtrace.internal.writer.writer.log") def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put_response_forbidden): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) @@ -109,7 +108,7 @@ def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put ) -def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_timed_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -122,10 +121,9 @@ def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_wr llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to 
be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_multiple_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -135,10 +133,9 @@ def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 2)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): +def test_send_on_exit(run_python_code_in_subprocess): env = os.environ.copy() pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] if "PYTHONPATH" in env: diff --git a/tests/llmobs/test_llmobs_trace_processor.py b/tests/llmobs/test_llmobs_trace_processor.py deleted file mode 100644 index b55286d49c8..00000000000 --- a/tests/llmobs/test_llmobs_trace_processor.py +++ /dev/null @@ -1,36 +0,0 @@ -import mock - -from ddtrace._trace.span import Span -from ddtrace.ext import SpanTypes -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor -from tests.utils import override_global_config - - -def test_processor_returns_all_traces_by_default(): - """Test that the LLMObsTraceProcessor returns all traces by default.""" - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_all_traces_if_not_agentless(): - """Test that the LLMObsTraceProcessor returns all traces if DD_LLMOBS_AGENTLESS_ENABLED is not set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=False)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_none_in_agentless_mode(): - """Test that the LLMObsTraceProcessor returns None if DD_LLMOBS_AGENTLESS_ENABLED is set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=True)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) is None diff --git a/tests/llmobs/test_propagation.py b/tests/llmobs/test_propagation.py index d892c6b98a2..e3ab9c80d66 100644 --- a/tests/llmobs/test_propagation.py +++ b/tests/llmobs/test_propagation.py @@ -157,39 +157,39 @@ def test_no_llmobs_parent_id_propagated_if_no_llmobs_spans(run_python_code_in_su assert _get_llmobs_parent_id(span) == "undefined" -def test_inject_distributed_headers_simple(LLMObs): +def test_inject_distributed_headers_simple(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM) as root_span: - request_headers = 
LLMObs.inject_distributed_headers({}, span=root_span) + request_headers = llmobs.inject_distributed_headers({}, span=root_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_nested_llmobs_non_llmobs(LLMObs): +def test_inject_distributed_headers_nested_llmobs_non_llmobs(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM): with dummy_tracer.trace("Non-LLMObs span") as child_span: - request_headers = LLMObs.inject_distributed_headers({}, span=child_span) + request_headers = llmobs.inject_distributed_headers({}, span=child_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_non_llmobs_root_span(LLMObs): +def test_inject_distributed_headers_non_llmobs_root_span(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("Non-LLMObs span"): with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM) as child_span: - request_headers = LLMObs.inject_distributed_headers({}, span=child_span) + request_headers = llmobs.inject_distributed_headers({}, span=child_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_nested_llmobs_spans(LLMObs): +def test_inject_distributed_headers_nested_llmobs_spans(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM): with dummy_tracer.trace("LLMObs child span", span_type=SpanTypes.LLM): with dummy_tracer.trace("Last LLMObs child span", span_type=SpanTypes.LLM) as last_llmobs_span: - request_headers = LLMObs.inject_distributed_headers({}, span=last_llmobs_span) + request_headers = llmobs.inject_distributed_headers({}, span=last_llmobs_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID is propagated in the headers in a simple distributed scenario. Service A (subprocess) has a root LLMObs span and a non-LLMObs child span. Service B (outside subprocess) has a LLMObs span. @@ -216,16 +216,15 @@ def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple( env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) - with LLMObs.workflow("LLMObs span") as span: + llmobs.activate_distributed_headers(headers) + with llmobs.workflow("LLMObs span") as span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == headers["_DD_LLMOBS_SPAN_ID"] -def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID is propagated in the headers in a more complex trace. Service A (subprocess) has a root LLMObs span and a non-LLMObs child span. Service B (outside subprocess) has a non-LLMObs local root span and a LLMObs child span. 
@@ -252,19 +251,18 @@ def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_pyt env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) + llmobs.activate_distributed_headers(headers) dummy_tracer = DummyTracer() with dummy_tracer.trace("Non-LLMObs span") as span: - with LLMObs.llm(model_name="llm_model", name="LLMObs span") as llm_span: + with llmobs.llm(model_name="llm_model", name="LLMObs span") as llm_span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == headers["_DD_LLMOBS_SPAN_ID"] assert _get_llmobs_parent_id(llm_span) == headers["_DD_LLMOBS_SPAN_ID"] -def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID (None) is extracted from the headers in a simple distributed scenario. Service A (subprocess) has spans, but none are LLMObs spans. Service B (outside subprocess) has a LLMObs span. @@ -289,10 +287,9 @@ def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_ env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) - with LLMObs.task("LLMObs span") as span: + llmobs.activate_distributed_headers(headers) + with llmobs.task("LLMObs span") as span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == "undefined" diff --git a/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json b/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json new file mode 100644 index 00000000000..fe7c9e3b0f2 --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json @@ -0,0 +1,53 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createChatCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "67741fca00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/chat/completions", + "openai.request.messages.0.content": "Who won the world series in 2020?", + "openai.request.messages.0.name": "", + "openai.request.messages.0.role": "user", + "openai.request.method": "POST", + "openai.request.model": "gpt-3.5-turbo", + "openai.request.n": "None", + "openai.request.stream": "True", + "openai.request.user": "ddtrace-test", + "openai.response.choices.0.finish_reason": "stop", + "openai.response.choices.0.message.content": "The Los Angeles Dodgers won the World Series in 2020.", + "openai.response.choices.0.message.role": "assistant", + "openai.response.model": "gpt-3.5-turbo-0301", + "openai.user.api_key": "sk-...key>", + "runtime-id": "d174f65e33314f43ad1de8cf0a5ca4e0" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + 
"_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 0, + "openai.response.completion_tokens_estimated": 0, + "openai.response.usage.completion_tokens": 19, + "openai.response.usage.prompt_tokens": 17, + "openai.response.usage.total_tokens": 36, + "process_id": 22982 + }, + "duration": 29869000, + "start": 1735663562179157000 + }]] diff --git a/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json b/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json new file mode 100644 index 00000000000..7cf644cfb3d --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json @@ -0,0 +1,49 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6774231f00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/completions", + "openai.request.method": "POST", + "openai.request.model": "ada", + "openai.request.n": "None", + "openai.request.prompt.0": "Hello world", + "openai.request.stream": "True", + "openai.response.choices.0.finish_reason": "length", + "openai.response.choices.0.text": "! ... A page layouts page drawer? ... Interesting. The \"Tools\" is", + "openai.response.model": "ada", + "openai.user.api_key": "sk-...key>", + "runtime-id": "11872c9ca653441db861b108a4f795eb" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 0, + "openai.response.completion_tokens_estimated": 0, + "openai.response.usage.completion_tokens": 2, + "openai.response.usage.prompt_tokens": 2, + "openai.response.usage.total_tokens": 4, + "process_id": 27488 + }, + "duration": 28739000, + "start": 1735664415266386000 + }]] diff --git a/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json b/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json new file mode 100644 index 00000000000..445dc39db98 --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json @@ -0,0 +1,49 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "677c221c00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/completions", + "openai.request.method": "POST", + "openai.request.model": "ada", + "openai.request.n": "None", + "openai.request.prompt.0": "Hello world", + 
"openai.request.stream": "True", + "openai.response.choices.0.finish_reason": "length", + "openai.response.choices.0.text": "! ... A page layouts page drawer? ... Interesting. The \"Tools\" is", + "openai.response.model": "ada", + "openai.user.api_key": "sk-...key>", + "runtime-id": "24f8e851c87e4f758c73d6acd0aaf82b" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 1, + "openai.response.completion_tokens_estimated": 1, + "openai.response.usage.completion_tokens": 16, + "openai.response.usage.prompt_tokens": 2, + "openai.response.usage.total_tokens": 18, + "process_id": 47101 + }, + "duration": 37957000, + "start": 1736188444222291000 + }]] diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py index 9579f35ce04..e15c3be08b9 100644 --- a/tests/telemetry/test_writer.py +++ b/tests/telemetry/test_writer.py @@ -651,7 +651,7 @@ def test_send_failing_request(mock_status, telemetry_writer): telemetry_writer.periodic(force_flush=True) # asserts unsuccessful status code was logged log.debug.assert_called_with( - "failed to send telemetry to %s. response: %s", + "Failed to send Instrumentation Telemetry to %s. response: %s", telemetry_writer._client.url, mock_status, ) diff --git a/tests/tracer/test_propagation.py b/tests/tracer/test_propagation.py index 61fec650a70..0d4c5d7c01d 100644 --- a/tests/tracer/test_propagation.py +++ b/tests/tracer/test_propagation.py @@ -1888,6 +1888,14 @@ def test_extract_tracecontext(headers, expected_context): B3_SINGLE_HEADERS_VALID, CONTEXT_EMPTY, ), + ( + "baggage_case_insensitive", + None, + {"BAgGage": "key1=val1,key2=val2"}, + { + "baggage": {"key1": "val1", "key2": "val2"}, + }, + ), # All valid headers ( "valid_all_headers_default_style", @@ -2278,14 +2286,14 @@ def test_propagation_extract_w_config(name, styles, headers, expected_context, r overrides = {} if styles is not None: overrides["_propagation_style_extract"] = styles - with override_global_config(overrides): - context = HTTPPropagator.extract(headers) - if not expected_context.get("tracestate"): - assert context == Context(**expected_context) - else: - copied_expectation = expected_context.copy() - tracestate = copied_expectation.pop("tracestate") - assert context == Context(**copied_expectation, meta={"tracestate": tracestate}) + with override_global_config(overrides): + context = HTTPPropagator.extract(headers) + if not expected_context.get("tracestate"): + assert context == Context(**expected_context) + else: + copied_expectation = expected_context.copy() + tracestate = copied_expectation.pop("tracestate") + assert context == Context(**copied_expectation, meta={"tracestate": tracestate}) EXTRACT_OVERRIDE_FIXTURES = [