From 629139727e33822e6305835dcb3a57f0c4008a8b Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 9 Jan 2025 18:17:21 -0500 Subject: [PATCH 1/4] fix(profiling): access lock acquired time only when the lock is held (#11881) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/profiling/collector/_lock.py | 114 +++++++++--------- ...ing-lock-acquired-at-e308547cffdca9f7.yaml | 6 + 2 files changed, 60 insertions(+), 60 deletions(-) create mode 100644 releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml diff --git a/ddtrace/profiling/collector/_lock.py b/ddtrace/profiling/collector/_lock.py index 6dedf3295f7..ec62c5c0eee 100644 --- a/ddtrace/profiling/collector/_lock.py +++ b/ddtrace/profiling/collector/_lock.py @@ -179,69 +179,63 @@ def acquire(self, *args, **kwargs): def _release(self, inner_func, *args, **kwargs): # type (typing.Any, typing.Any) -> None + + start = None + if hasattr(self, "_self_acquired_at"): + # _self_acquired_at is only set when the acquire was captured + # if it's not set, we're not capturing the release + start = self._self_acquired_at + try: return inner_func(*args, **kwargs) finally: - try: - if hasattr(self, "_self_acquired_at"): - try: - end = time.monotonic_ns() - thread_id, thread_name = _current_thread() - task_id, task_name, task_frame = _task.get_task(thread_id) - lock_name = ( - "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc - ) - - if task_frame is None: - # See the comments in _acquire - frame = sys._getframe(2) - else: - frame = task_frame - - frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes) - - if self._self_export_libdd_enabled: - thread_native_id = _threading.get_thread_native_id(thread_id) - - handle = ddup.SampleHandle() - handle.push_monotonic_ns(end) - handle.push_lock_name(lock_name) - handle.push_release( - end - self._self_acquired_at, 1 - ) # AFAICT, capture_pct does not adjust anything here - handle.push_threadinfo(thread_id, thread_native_id, thread_name) - handle.push_task_id(task_id) - handle.push_task_name(task_name) - - if self._self_tracer is not None: - handle.push_span(self._self_tracer.current_span()) - for 
frame in frames:
-                                handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno)
-                            handle.flush_sample()
-                        else:
-                            event = self.RELEASE_EVENT_CLASS(
-                                lock_name=lock_name,
-                                frames=frames,
-                                nframes=nframes,
-                                thread_id=thread_id,
-                                thread_name=thread_name,
-                                task_id=task_id,
-                                task_name=task_name,
-                                locked_for_ns=end - self._self_acquired_at,
-                                sampling_pct=self._self_capture_sampler.capture_pct,
-                            )
-
-                            if self._self_tracer is not None:
-                                event.set_trace_info(
-                                    self._self_tracer.current_span(), self._self_endpoint_collection_enabled
-                                )
-
-                            self._self_recorder.push_event(event)
-                    finally:
-                        del self._self_acquired_at
-            except Exception as e:
-                LOG.warning("Error recording lock release event: %s", e)
-                pass  # nosec
+            if start is not None:
+                end = time.monotonic_ns()
+                thread_id, thread_name = _current_thread()
+                task_id, task_name, task_frame = _task.get_task(thread_id)
+                lock_name = "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc
+
+                if task_frame is None:
+                    # See the comments in _acquire
+                    frame = sys._getframe(2)
+                else:
+                    frame = task_frame
+
+                frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes)
+
+                if self._self_export_libdd_enabled:
+                    thread_native_id = _threading.get_thread_native_id(thread_id)
+
+                    handle = ddup.SampleHandle()
+                    handle.push_monotonic_ns(end)
+                    handle.push_lock_name(lock_name)
+                    handle.push_release(end - start, 1)  # AFAICT, capture_pct does not adjust anything here
+                    handle.push_threadinfo(thread_id, thread_native_id, thread_name)
+                    handle.push_task_id(task_id)
+                    handle.push_task_name(task_name)
+
+                    if self._self_tracer is not None:
+                        handle.push_span(self._self_tracer.current_span())
+                    for frame in frames:
+                        handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno)
+                    handle.flush_sample()
+                else:
+                    event = self.RELEASE_EVENT_CLASS(
+                        lock_name=lock_name,
+                        frames=frames,
+                        nframes=nframes,
+                        thread_id=thread_id,
+                        thread_name=thread_name,
+                        task_id=task_id,
+                        task_name=task_name,
+                        locked_for_ns=end - start,
+                        sampling_pct=self._self_capture_sampler.capture_pct,
+                    )
+
+                    if self._self_tracer is not None:
+                        event.set_trace_info(self._self_tracer.current_span(), self._self_endpoint_collection_enabled)
+
+                    self._self_recorder.push_event(event)
 
     def release(self, *args, **kwargs):
         return self._release(self.__wrapped__.release, *args, **kwargs)
diff --git a/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml
new file mode 100644
index 00000000000..de86c8227b6
--- /dev/null
+++ b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    profiling: This fix resolves a data race when accessing a lock's acquired
+    time, which could lead to an ``AttributeError``: ``_Profiled_ThreadingLock``
+    object has no attribute ``_self_acquired_at``.

From afe439e311ee1b3dd23684a5df72020a0e95e77e Mon Sep 17 00:00:00 2001
From: Juanjo Alvarez Martinez
Date: Fri, 10 Jan 2025 10:14:31 +0100
Subject: [PATCH 2/4] chore(iast): optimize _should_iast_patch (#11885)

## Description

Optimize the `_should_iast_patch` function by:

- Using a trie for the prefix matching of module names against the
  DENY / ALLOW lists (sketched below).
- Moving a call to `lower()` that was being done on each call to
  `_in_python_stdlib()` into the creation of the `_NOT_PATCH_MODULE_NAMES`
  static set.

These two changes (especially the second) make the function 14x faster.
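For illustration, a minimal self-contained sketch of the trie-based prefix
match (hypothetical names; the actual implementation is `build_trie` /
`_trie_has_prefix_for` in the diff below):

```python
# Sketch: O(len(module_name)) prefix lookup instead of scanning every
# DENY/ALLOW entry with str.startswith().
class TrieNode:
    __slots__ = ("children", "is_end")

    def __init__(self):
        self.children = {}   # one child node per character
        self.is_end = False  # True when a full list entry terminates here


def build_trie(words):
    root = TrieNode()
    for word in words:
        node = root
        for char in word:
            node = node.children.setdefault(char, TrieNode())
        node.is_end = True
    return root


def has_prefix_for(trie, string):
    # Walk character by character; succeed as soon as an entire list
    # entry has been consumed, i.e. the entry is a prefix of `string`.
    node = trie
    for char in string:
        node = node.children.get(char)
        if node is None:
            return False
        if node.is_end:
            return True
    return node.is_end


denylist = build_trie(["django.", "flask."])
assert has_prefix_for(denylist, "django.db.models.")
assert not has_prefix_for(denylist, "djan")
```

The lookup cost is bounded by the length of the module name alone, whereas a
`str.startswith(tuple)` check also scans every allow/deny entry.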
Signed-off-by: Juanjo Alvarez ## Checklist - [X] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --------- Signed-off-by: Juanjo Alvarez --- ddtrace/appsec/_iast/_ast/ast_patching.py | 52 ++++++++++- .../appsec/_python_info/stdlib/__init__.py | 2 +- tests/appsec/iast/_ast/test_ast_patching.py | 86 +++++++++++++++++++ 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/ddtrace/appsec/_iast/_ast/ast_patching.py b/ddtrace/appsec/_iast/_ast/ast_patching.py index 7e2258bd556..2c7e958d087 100644 --- a/ddtrace/appsec/_iast/_ast/ast_patching.py +++ b/ddtrace/appsec/_iast/_ast/ast_patching.py @@ -7,6 +7,7 @@ from sys import version_info import textwrap from types import ModuleType +from typing import Iterable from typing import Optional from typing import Text from typing import Tuple @@ -327,6 +328,49 @@ log = get_logger(__name__) +class _TrieNode: + __slots__ = ("children", "is_end") + + def __init__(self): + self.children = {} + self.is_end = False + + def __iter__(self): + if self.is_end: + yield ("", None) + else: + for k, v in self.children.items(): + yield (k, dict(v)) + + +def build_trie(words: Iterable[str]) -> _TrieNode: + root = _TrieNode() + for word in words: + node = root + for char in word: + if char not in node.children: + node.children[char] = _TrieNode() + node = node.children[char] + node.is_end = True + return root + + +_TRIE_ALLOWLIST = build_trie(IAST_ALLOWLIST) +_TRIE_DENYLIST = build_trie(IAST_DENYLIST) + + +def _trie_has_prefix_for(trie: _TrieNode, string: str) -> bool: + node = trie + for char in string: + node = node.children.get(char) + if not node: + return False + + if node.is_end: + return True + return node.is_end + + def get_encoding(module_path: Text) -> Text: """ First tries to detect the encoding for the file, @@ -341,11 +385,11 @@ def get_encoding(module_path: Text) -> Text: return ENCODING -_NOT_PATCH_MODULE_NAMES = _stdlib_for_python_version() | set(builtin_module_names) +_NOT_PATCH_MODULE_NAMES = {i.lower() for i in _stdlib_for_python_version() | set(builtin_module_names)} def _in_python_stdlib(module_name: str) -> bool: - return module_name.split(".")[0].lower() in [x.lower() for x in _NOT_PATCH_MODULE_NAMES] + return 
module_name.split(".")[0].lower() in _NOT_PATCH_MODULE_NAMES def _should_iast_patch(module_name: Text) -> bool: @@ -359,10 +403,10 @@ def _should_iast_patch(module_name: Text) -> bool: # diff = max_allow - max_deny # return diff > 0 or (diff == 0 and not _in_python_stdlib_or_third_party(module_name)) dotted_module_name = module_name.lower() + "." - if dotted_module_name.startswith(IAST_ALLOWLIST): + if _trie_has_prefix_for(_TRIE_ALLOWLIST, dotted_module_name): log.debug("IAST: allowing %s. it's in the IAST_ALLOWLIST", module_name) return True - if dotted_module_name.startswith(IAST_DENYLIST): + if _trie_has_prefix_for(_TRIE_DENYLIST, dotted_module_name): log.debug("IAST: denying %s. it's in the IAST_DENYLIST", module_name) return False if _in_python_stdlib(module_name): diff --git a/ddtrace/appsec/_python_info/stdlib/__init__.py b/ddtrace/appsec/_python_info/stdlib/__init__.py index a040e57f859..e745c392f55 100644 --- a/ddtrace/appsec/_python_info/stdlib/__init__.py +++ b/ddtrace/appsec/_python_info/stdlib/__init__.py @@ -19,5 +19,5 @@ from .module_names_py312 import STDLIB_MODULE_NAMES -def _stdlib_for_python_version(): # type: () -> set +def _stdlib_for_python_version(): # type: () -> set[str] return STDLIB_MODULE_NAMES diff --git a/tests/appsec/iast/_ast/test_ast_patching.py b/tests/appsec/iast/_ast/test_ast_patching.py index cf0fabd14e4..d014496942b 100644 --- a/tests/appsec/iast/_ast/test_ast_patching.py +++ b/tests/appsec/iast/_ast/test_ast_patching.py @@ -9,7 +9,9 @@ from ddtrace.appsec._constants import IAST from ddtrace.appsec._iast._ast.ast_patching import _in_python_stdlib from ddtrace.appsec._iast._ast.ast_patching import _should_iast_patch +from ddtrace.appsec._iast._ast.ast_patching import _trie_has_prefix_for from ddtrace.appsec._iast._ast.ast_patching import astpatch_module +from ddtrace.appsec._iast._ast.ast_patching import build_trie from ddtrace.appsec._iast._ast.ast_patching import visit_ast from ddtrace.internal.utils.formats import asbool from tests.utils import override_env @@ -308,3 +310,87 @@ def test_astpatch_dir_patched_with_or_without_custom_dir(module_name, expected_n # Check that all the symbols in the expected set are in the patched dir() result for name in expected_names: assert name in patched_dir + + +def test_build_trie(): + from ddtrace.appsec._iast._ast.ast_patching import build_trie + + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert dict(trie) == { + "a": { + "b": { + "c": {"": None}, + }, + }, + "d": { + "e": { + "f": {"": None}, + }, + }, + "g": { + "h": { + "i": {"": None}, + }, + }, + "j": { + "k": { + "l": {"": None}, + }, + }, + "m": { + "n": { + "o": {"": None}, + }, + }, + "p": { + "q": { + "r": {"": None}, + }, + }, + "s": { + "t": { + "u": {"": None}, + }, + }, + "v": { + "w": { + "x": {"": None}, + }, + }, + "y": { + "z": {"": None}, + }, + } + + +def test_trie_has_string_match(): + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert _trie_has_prefix_for(trie, "abc") + assert not _trie_has_prefix_for(trie, "ab") + assert _trie_has_prefix_for(trie, "abcd") + assert _trie_has_prefix_for(trie, "def") + assert not _trie_has_prefix_for(trie, "de") + assert _trie_has_prefix_for(trie, "defg") + assert _trie_has_prefix_for(trie, "ghi") + assert not _trie_has_prefix_for(trie, "gh") + assert _trie_has_prefix_for(trie, "ghij") + assert _trie_has_prefix_for(trie, "jkl") + assert not _trie_has_prefix_for(trie, "jk") + assert _trie_has_prefix_for(trie, "jklm") + assert 
_trie_has_prefix_for(trie, "mno") + assert not _trie_has_prefix_for(trie, "mn") + assert _trie_has_prefix_for(trie, "mnop") + assert _trie_has_prefix_for(trie, "pqr") + assert not _trie_has_prefix_for(trie, "pq") + assert _trie_has_prefix_for(trie, "pqrs") + assert _trie_has_prefix_for(trie, "stu") + assert not _trie_has_prefix_for(trie, "st") + assert _trie_has_prefix_for(trie, "stuv") + assert _trie_has_prefix_for(trie, "vwx") + assert not _trie_has_prefix_for(trie, "vw") + assert _trie_has_prefix_for(trie, "vwxy") + assert _trie_has_prefix_for(trie, "yz") + assert not _trie_has_prefix_for(trie, "y") + assert _trie_has_prefix_for(trie, "yza") + assert not _trie_has_prefix_for(trie, "z") + assert not _trie_has_prefix_for(trie, "zzz") From 1020545f6db0f4a89a378acd33e90cbcbae7fc1f Mon Sep 17 00:00:00 2001 From: Christophe Papazian <114495376+christophe-papazian@users.noreply.github.com> Date: Fri, 10 Jan 2025 14:30:41 +0100 Subject: [PATCH 3/4] chore(asm): update python version for threats tests (#11893) Update the python version for threats tests, to include python 3.13 when available. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- hatch.toml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hatch.toml b/hatch.toml index 0baca1fd235..1f073332547 100644 --- a/hatch.toml +++ b/hatch.toml @@ -214,11 +214,11 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] django = ["~=2.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9", "3.10"] +python = ["3.8", "3.9", "3.10"] django = ["~=3.2"] [[envs.appsec_threats_django.matrix]] @@ -226,11 +226,11 @@ python = ["3.8", "3.10"] django = ["==4.0.10"] [[envs.appsec_threats_django.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.11", "3.13"] django = ["~=4.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.10", "3.12"] +python = ["3.10", "3.13"] django = ["~=5.1"] @@ -262,21 +262,21 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in 
.circleci/config.templ.yml
 [[envs.appsec_threats_flask.matrix]]
-python = ["3.7", "3.9"]
+python = ["3.8", "3.9"]
 flask = ["~=1.1"]
 markupsafe = ["~=1.1"]
 
 [[envs.appsec_threats_flask.matrix]]
-python = ["3.7", "3.9"]
+python = ["3.8", "3.9"]
 flask = ["==2.1.3"]
 werkzeug = ["<3.0"]
 
 [[envs.appsec_threats_flask.matrix]]
-python = ["3.8", "3.9", "3.12"]
+python = ["3.8", "3.10", "3.13"]
 flask = ["~=2.3"]
 
 [[envs.appsec_threats_flask.matrix]]
-python = ["3.8", "3.10", "3.12"]
+python = ["3.8", "3.11", "3.13"]
 flask = ["~=3.0"]
 
 ## ASM Native IAST module
@@ -327,16 +327,16 @@ test = [
 # if you add or remove a version here, please also update the parallelism parameter
 # in .circleci/config.templ.yml
 [[envs.appsec_threats_fastapi.matrix]]
-python = ["3.7", "3.9", "3.11"]
+python = ["3.8", "3.10", "3.13"]
 fastapi = ["==0.86.0"]
 anyio = ["==3.7.1"]
 
 [[envs.appsec_threats_fastapi.matrix]]
-python = ["3.7", "3.9", "3.12"]
+python = ["3.8", "3.10", "3.13"]
 fastapi = ["==0.94.1"]
 
 [[envs.appsec_threats_fastapi.matrix]]
-python = ["3.8", "3.10", "3.12"]
+python = ["3.8", "3.10", "3.13"]
 fastapi = ["~=0.114.2"]


From 5e688237d4baaee86a74461e0dd0162a9634907d Mon Sep 17 00:00:00 2001
From: lievan <42917263+lievan@users.noreply.github.com>
Date: Fri, 10 Jan 2025 09:56:49 -0500
Subject: [PATCH 4/4] chore(llmobs): refactor out ragas base evaluator (#11846)

Creates a `RagasBaseEvaluator` class that ragas evaluators can share.
This is a split-out version of
[this PR](https://github.com/DataDog/dd-trace-py/pull/11716) that has
incorporated changes based on the first round of comments from @Yun-Kim
and @Kyle-Verhoog.

This class contains shared logic for:
- checking that the ragas dependencies are present, and raising otherwise
- the `run_and_submit_evaluation` function, which takes a span event, calls
  evaluate to generate a score, and then submits an evaluation
- extracting the question, contexts, and answer from an LLM span

A simplified sketch of the resulting evaluator shape is included below.

### Misc changes
- rename tests to specify they are faithfulness-specific tests
- raise when we parse an unsupported evaluator instead of logging a warning

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan

---
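Not part of the applied diff (text between the `---` separator and the
diffstat is ignored by `git am`): a simplified, hypothetical sketch of the
evaluator shape described above, with telemetry and LLM Observability
plumbing omitted; the real classes follow in the diff.

```python
from typing import Optional, Tuple, Union


class BaseRagasEvaluator:
    """Shared skeleton: dependency checks, input extraction, and metric
    submission live here; subclasses only implement evaluate()."""

    LABEL = "ragas"
    METRIC_TYPE = "score"

    def run_and_submit_evaluation(self, span_event: dict) -> None:
        if not span_event:
            return
        score_or_failure, metadata = self.evaluate(span_event)
        if isinstance(score_or_failure, float):
            # The real implementation calls llmobs_service.submit_evaluation().
            print("submit", self.LABEL, score_or_failure, metadata)

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        raise NotImplementedError("evaluate method must be implemented by individual evaluators")


class FaithfulnessSketch(BaseRagasEvaluator):
    LABEL = "ragas_faithfulness"

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        # A real evaluator extracts question/contexts/answer from the span
        # and scores them with ragas; failures are reported as strings.
        return 1.0, {}


FaithfulnessSketch().run_and_submit_evaluation({"trace_id": "t", "span_id": "s"})
```

Concrete evaluators override only `LABEL` and `evaluate()`; dependency
checks and submission stay in the base class, which is the split this
patch makes.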
ddtrace/llmobs/_evaluators/ragas/base.py | 213 ++++++++++++++++ .../llmobs/_evaluators/ragas/faithfulness.py | 228 +++--------------- ddtrace/llmobs/_evaluators/runner.py | 4 +- tests/llmobs/_utils.py | 43 +++- ...emits_traces_and_evaluations_on_exit.yaml} | 0 ...test_ragas_faithfulness_emits_traces.yaml} | 0 ...agas_faithfulness_submits_evaluation.yaml} | 0 ..._evaluation_on_span_with_custom_keys.yaml} | 0 ...on_on_span_with_question_in_messages.yaml} | 0 tests/llmobs/test_llmobs_evaluator_runner.py | 8 +- ...tor.py => test_llmobs_ragas_evaluators.py} | 9 +- 11 files changed, 297 insertions(+), 208 deletions(-) create mode 100644 ddtrace/llmobs/_evaluators/ragas/base.py rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml => tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml} (100%) rename tests/llmobs/{test_llmobs_ragas_faithfulness_evaluator.py => test_llmobs_ragas_evaluators.py} (97%) diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py new file mode 100644 index 00000000000..23aa4cd3caa --- /dev/null +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -0,0 +1,213 @@ +import traceback +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +from ddtrace.internal.logger import get_logger +from ddtrace.internal.telemetry import telemetry_writer +from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT +from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL +from ddtrace.internal.utils.version import parse_version +from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS +from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS +from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX + + +logger = get_logger(__name__) + + +class RagasDependencies: + """ + A helper class to store instances of ragas classes and functions + that may or may not exist in a user's environment. 
+    """
+
+    def __init__(self):
+        import ragas
+
+        self.ragas_version = parse_version(ragas.__version__)
+        if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10):
+            raise NotImplementedError(
+                "Ragas version: {} is not supported".format(self.ragas_version),
+            )
+
+        from ragas.llms import llm_factory
+
+        self.llm_factory = llm_factory
+
+        from ragas.llms.output_parser import RagasoutputParser
+
+        self.RagasoutputParser = RagasoutputParser
+
+        from ragas.metrics import context_precision
+
+        self.context_precision = context_precision
+
+        from ragas.metrics.base import ensembler
+
+        self.ensembler = ensembler
+
+        from ragas.metrics import faithfulness
+
+        self.faithfulness = faithfulness
+
+        from ragas.metrics.base import get_segmenter
+
+        self.get_segmenter = get_segmenter
+
+        from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers
+
+        self.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers
+
+        from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers
+
+        self.StatementsAnswers = StatementsAnswers
+
+
+def _get_ml_app_for_ragas_trace(span_event: dict) -> str:
+    """
+    The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-<ml_app>`
+    or `dd-ragas` if `ml_app` is not present in the span event.
+    """
+    tags: List[str] = span_event.get("tags", [])
+    ml_app = None
+    for tag in tags:
+        if isinstance(tag, str) and tag.startswith("ml_app:"):
+            ml_app = tag.split(":")[1]
+            break
+    if not ml_app:
+        return RAGAS_ML_APP_PREFIX
+    return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app)
+
+
+class BaseRagasEvaluator:
+    """A class used by EvaluatorRunner to conduct ragas evaluations
+    on LLM Observability span events. The job of an Evaluator is to take a span and
+    submit evaluation metrics based on the span's attributes.
+
+    Extenders of this class should only need to implement the `evaluate` method.
+    """
+
+    LABEL = "ragas"
+    METRIC_TYPE = "score"
+
+    def __init__(self, llmobs_service):
+        """
+        Initialize an evaluator that uses the ragas library to generate a score on finished LLM spans.
+
+        :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
+                               submitting evaluation metrics.
+
+        Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
+ """ + self.llmobs_service = llmobs_service + self.ragas_version = "unknown" + telemetry_state = "ok" + try: + self.ragas_dependencies = RagasDependencies() + self.ragas_version = self.ragas_dependencies.ragas_version + except ImportError as e: + telemetry_state = "fail_import_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except AttributeError as e: + telemetry_state = "fail_attribute_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except NotImplementedError as e: + telemetry_state = "fail_not_supported" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except Exception as e: + telemetry_state = "fail_unknown" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + finally: + telemetry_writer.add_count_metric( + namespace=TELEMETRY_APM_PRODUCT.LLMOBS, + name="evaluators.init", + value=1, + tags=( + ("evaluator_label", self.LABEL), + ("state", telemetry_state), + ("evaluator_version", self.ragas_version), + ), + ) + if telemetry_state != "ok": + telemetry_writer.add_log( + level=TELEMETRY_LOG_LEVEL.ERROR, + message="Failed to import Ragas dependencies", + stack_trace=traceback.format_exc(), + tags={"evaluator_version": self.ragas_version}, + ) + + def run_and_submit_evaluation(self, span_event: dict): + if not span_event: + return + score_result_or_failure, metric_metadata = self.evaluate(span_event) + telemetry_writer.add_count_metric( + TELEMETRY_APM_PRODUCT.LLMOBS, + "evaluators.run", + 1, + tags=( + ("evaluator_label", self.LABEL), + ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), + ("evaluator_version", self.ragas_version), + ), + ) + if isinstance(score_result_or_failure, float): + self.llmobs_service.submit_evaluation( + span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, + label=self.LABEL, + metric_type=self.METRIC_TYPE, + value=score_result_or_failure, + metadata=metric_metadata, + ) + + def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: + raise NotImplementedError("evaluate method must be implemented by individual evaluators") + + def _extract_evaluation_inputs_from_span(self, span_event: dict) -> Optional[dict]: + """ + Extracts the question, answer, and context used as inputs for a ragas evaluation on a span event. 
+ """ + with self.llmobs_service.workflow("dd-ragas.extract_evaluation_inputs_from_span") as extract_inputs_workflow: + self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) + question, answer, contexts = None, None, None + + meta_io = span_event.get("meta") + if meta_io is None: + return None + + meta_input = meta_io.get("input") + meta_output = meta_io.get("output") + + if not (meta_input and meta_output): + return None + + prompt = meta_input.get("prompt") + if prompt is None: + logger.debug("Failed to extract `prompt` from span for ragas evaluation") + return None + prompt_variables = prompt.get("variables") + + input_messages = meta_input.get("messages") + + messages = meta_output.get("messages") + if messages is not None and len(messages) > 0: + answer = messages[-1].get("content") + + if prompt_variables: + context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) + question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) + contexts = [prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)] + question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) + + if not question and input_messages is not None and len(input_messages) > 0: + question = input_messages[-1].get("content") + + self.llmobs_service.annotate( + span=extract_inputs_workflow, output_data={"question": question, "contexts": contexts, "answer": answer} + ) + if any(field is None for field in (question, contexts, answer)): + logger.debug("Failed to extract inputs required for ragas evaluation") + return None + + return {"question": question, "contexts": contexts, "answer": answer} diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py index d651c2443a4..98725b1f27e 100644 --- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py +++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py @@ -1,73 +1,22 @@ import json import math -import traceback from typing import List from typing import Optional from typing import Tuple from typing import Union from ddtrace.internal.logger import get_logger -from ddtrace.internal.telemetry import telemetry_writer -from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT -from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL -from ddtrace.internal.utils.version import parse_version from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA -from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS -from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX +from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator +from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace logger = get_logger(__name__) -class MiniRagas: - """ - A helper class to store instances of ragas classes and functions - that may or may not exist in a user's environment. - """ - - llm_factory = None - RagasoutputParser = None - faithfulness = None - ensembler = None - get_segmenter = None - StatementFaithfulnessAnswers = None - StatementsAnswers = None - - -def _get_ml_app_for_ragas_trace(span_event: dict) -> str: - """ - The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-` - or `dd-ragas` if `ml_app` is not present in the span event. 
- """ - tags = span_event.get("tags", []) # list[str] - ml_app = None - for tag in tags: - if isinstance(tag, str) and tag.startswith("ml_app:"): - ml_app = tag.split(":")[1] - break - if not ml_app: - return RAGAS_ML_APP_PREFIX - return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) - - -def _get_faithfulness_instance() -> Optional[object]: - """ - This helper function ensures the faithfulness instance used in - ragas evaluator is updated with the latest ragas faithfulness - instance AND has an non-null llm - """ - if MiniRagas.faithfulness is None: - return None - ragas_faithfulness_instance = MiniRagas.faithfulness - if not ragas_faithfulness_instance.llm: - ragas_faithfulness_instance.llm = MiniRagas.llm_factory() - return ragas_faithfulness_instance - - -class RagasFaithfulnessEvaluator: +class RagasFaithfulnessEvaluator(BaseRagasEvaluator): """A class used by EvaluatorRunner to conduct ragas faithfulness evaluations on LLM Observability span events. The job of an Evaluator is to take a span and submit evaluation metrics based on the span's attributes. @@ -95,98 +44,30 @@ def __init__(self, llmobs_service): Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. """ - self.llmobs_service = llmobs_service - self.ragas_version = "unknown" - telemetry_state = "ok" - try: - import ragas - - self.ragas_version = parse_version(ragas.__version__) - if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): - raise NotImplementedError( - "Ragas version: {} is not supported for `ragas_faithfulness` evaluator".format(self.ragas_version), - ) - - from ragas.llms import llm_factory - - MiniRagas.llm_factory = llm_factory - - from ragas.llms.output_parser import RagasoutputParser - - MiniRagas.RagasoutputParser = RagasoutputParser - - from ragas.metrics import faithfulness - - MiniRagas.faithfulness = faithfulness - - from ragas.metrics.base import ensembler - - MiniRagas.ensembler = ensembler - - from ragas.metrics.base import get_segmenter - - MiniRagas.get_segmenter = get_segmenter - - from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers - - MiniRagas.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers - - from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers - - MiniRagas.StatementsAnswers = StatementsAnswers - except Exception as e: - telemetry_state = "fail" - telemetry_writer.add_log( - level=TELEMETRY_LOG_LEVEL.ERROR, - message="Failed to import Ragas dependencies", - stack_trace=traceback.format_exc(), - tags={"ragas_version": self.ragas_version}, - ) - raise NotImplementedError("Failed to load dependencies for `ragas_faithfulness` evaluator") from e - finally: - telemetry_writer.add_count_metric( - namespace=TELEMETRY_APM_PRODUCT.LLMOBS, - name="evaluators.init", - value=1, - tags=( - ("evaluator_label", self.LABEL), - ("state", telemetry_state), - ("ragas_version", self.ragas_version), - ), - ) - - self.ragas_faithfulness_instance = _get_faithfulness_instance() - self.llm_output_parser_for_generated_statements = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementsAnswers + super().__init__(llmobs_service) + self.ragas_faithfulness_instance = self._get_faithfulness_instance() + self.llm_output_parser_for_generated_statements = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementsAnswers ) - self.llm_output_parser_for_faithfulness_score = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementFaithfulnessAnswers + 
self.llm_output_parser_for_faithfulness_score = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementFaithfulnessAnswers ) - self.split_answer_into_sentences = MiniRagas.get_segmenter( + self.split_answer_into_sentences = self.ragas_dependencies.get_segmenter( language=self.ragas_faithfulness_instance.nli_statements_message.language, clean=False ) - def run_and_submit_evaluation(self, span_event: dict): - if not span_event: - return - score_result_or_failure, metric_metadata = self.evaluate(span_event) - telemetry_writer.add_count_metric( - TELEMETRY_APM_PRODUCT.LLMOBS, - "evaluators.run", - 1, - tags=( - ("evaluator_label", self.LABEL), - ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), - ), - ) - if isinstance(score_result_or_failure, float): - self.llmobs_service.submit_evaluation( - span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=score_result_or_failure, - metadata=metric_metadata, - ) + def _get_faithfulness_instance(self) -> Optional[object]: + """ + This helper function ensures the faithfulness instance used in + ragas evaluator is updated with the latest ragas faithfulness + instance AND has an non-null llm + """ + if self.ragas_dependencies.faithfulness is None: + return None + ragas_faithfulness_instance = self.ragas_dependencies.faithfulness + if not ragas_faithfulness_instance.llm: + ragas_faithfulness_instance.llm = self.ragas_dependencies.llm_factory() + return ragas_faithfulness_instance def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: """ @@ -196,7 +77,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] If the ragas faithfulness instance does not have `llm` set, we set `llm` using the `llm_factory()` method from ragas which defaults to openai's gpt-4o-turbo. 
""" - self.ragas_faithfulness_instance = _get_faithfulness_instance() + self.ragas_faithfulness_instance = self._get_faithfulness_instance() if not self.ragas_faithfulness_instance: return "fail_faithfulness_is_none", {} @@ -220,16 +101,16 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] span=ragas_faithfulness_workflow ) - faithfulness_inputs = self._extract_faithfulness_inputs(span_event) + faithfulness_inputs = self._extract_evaluation_inputs_from_span(span_event) if faithfulness_inputs is None: logger.debug( - "Failed to extract question and context from span sampled for ragas_faithfulness evaluation" + "Failed to extract evaluation inputs from span sampled for `ragas_faithfulness` evaluation" ) return "fail_extract_faithfulness_inputs", evaluation_metadata question = faithfulness_inputs["question"] answer = faithfulness_inputs["answer"] - context = faithfulness_inputs["context"] + context = " ".join(faithfulness_inputs["contexts"]) statements = self._create_statements(question, answer) if statements is None: @@ -318,9 +199,9 @@ def _create_verdicts(self, context: str, statements: List[str]): return None # collapse multiple generations into a single faithfulness list - faithfulness_list = MiniRagas.ensembler.from_discrete(raw_faithfulness_list, "verdict") # type: ignore + faithfulness_list = self.ragas_dependencies.ensembler.from_discrete(raw_faithfulness_list, "verdict") try: - return MiniRagas.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) # type: ignore + return self.ragas_dependencies.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) except Exception as e: logger.debug("Failed to parse faithfulness_list", exc_info=e) return None @@ -330,59 +211,6 @@ def _create_verdicts(self, context: str, statements: List[str]): output_data=faithfulness_list, ) - def _extract_faithfulness_inputs(self, span_event: dict) -> Optional[dict]: - """ - Extracts the question, answer, and context used as inputs to faithfulness - evaluation from a span event. 
- - question - input.prompt.variables.question OR input.messages[-1].content - context - input.prompt.variables.context - answer - output.messages[-1].content - """ - with self.llmobs_service.workflow("dd-ragas.extract_faithfulness_inputs") as extract_inputs_workflow: - self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) - question, answer, context = None, None, None - - meta_io = span_event.get("meta") - if meta_io is None: - return None - - meta_input = meta_io.get("input") - meta_output = meta_io.get("output") - - if not (meta_input and meta_output): - return None - - prompt = meta_input.get("prompt") - if prompt is None: - logger.debug("Failed to extract `prompt` from span for `ragas_faithfulness` evaluation") - return None - prompt_variables = prompt.get("variables") - - input_messages = meta_input.get("messages") - - messages = meta_output.get("messages") - if messages is not None and len(messages) > 0: - answer = messages[-1].get("content") - - if prompt_variables: - context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) - question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) - context = " ".join([prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)]) - question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) - - if not question and input_messages is not None and len(input_messages) > 0: - question = input_messages[-1].get("content") - - self.llmobs_service.annotate( - span=extract_inputs_workflow, output_data={"question": question, "context": context, "answer": answer} - ) - if any(field is None for field in (question, context, answer)): - logger.debug("Failed to extract inputs required for faithfulness evaluation") - return None - - return {"question": question, "context": context, "answer": answer} - def _create_statements_prompt(self, answer, question): # Returns: `ragas.llms.PromptValue` object with self.llmobs_service.task("dd-ragas.create_statements_prompt"): diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index bf45e618e01..3d26998f1b4 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -64,13 +64,15 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None): ("state", evaluator_init_state), ), ) + else: + raise ValueError("Parsed unsupported evaluator: {}".format(evaluator)) def start(self, *args, **kwargs): if not self.evaluators: logger.debug("no evaluators configured, not starting %r", self.__class__.__name__) return super(EvaluatorRunner, self).start() - logger.debug("started %r to %r", self.__class__.__name__) + logger.debug("started %r", self.__class__.__name__) def _stop_service(self) -> None: """ diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 0ecdde36ee6..32bbce849db 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -553,7 +553,46 @@ def _dummy_evaluator_eval_metric_event(span_id, trace_id): ) -def _expected_ragas_spans(ragas_inputs=None): +def _expected_ragas_context_precision_spans(ragas_inputs=None): + if not ragas_inputs: + ragas_inputs = default_ragas_inputs + return [ + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": "undefined", + "name": "dd-ragas.context_precision", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": "1.0"}, + }, + "metrics": {}, + "tags": 
expected_ragas_trace_tags(), + }, + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": mock.ANY, + "name": "dd-ragas.extract_evaluation_inputs_from_span", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": mock.ANY}, + }, + "metrics": {}, + "tags": expected_ragas_trace_tags(), + }, + ] + + +def _expected_ragas_faithfulness_spans(ragas_inputs=None): if not ragas_inputs: ragas_inputs = default_ragas_inputs return [ @@ -581,7 +620,7 @@ def _expected_ragas_spans(ragas_inputs=None): "trace_id": mock.ANY, "span_id": mock.ANY, "parent_id": mock.ANY, - "name": "dd-ragas.extract_faithfulness_inputs", + "name": "dd-ragas.extract_evaluation_inputs_from_span", "start_ns": mock.ANY, "duration": mock.ANY, "status": "ok", diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml similarity index 100% rename from 
tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 128c4639946..eb0be25c91b 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -22,7 +22,7 @@ def test_evaluator_runner_start(mock_evaluator_logs): evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=mock.MagicMock())) evaluator_runner.start() - mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r to %r", "EvaluatorRunner")]) + mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r", "EvaluatorRunner")]) def test_evaluator_runner_buffer_limit(mock_evaluator_logs): @@ -99,6 +99,12 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces assert err == b"" +def test_evaluator_runner_unsupported_evaluator(): + with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}): + with pytest.raises(ValueError): + EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) + + def test_evaluator_runner_sampler_single_rule(monkeypatch): monkeypatch.setenv( EvaluatorRunnerSampler.SAMPLING_RULES_ENV_VAR, diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_evaluators.py similarity index 97% rename from tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py rename to tests/llmobs/test_llmobs_ragas_evaluators.py index 39e315b37e4..9df6c392470 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_evaluators.py @@ -6,7 +6,7 @@ from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator from ddtrace.span import Span from tests.llmobs._utils import _expected_llmobs_llm_span_event -from tests.llmobs._utils import _expected_ragas_spans +from tests.llmobs._utils import _expected_ragas_faithfulness_spans from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_messages from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt @@ -177,7 +177,8 @@ def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) assert len(ragas_spans) == 7 # check name, io, span kinds match - assert ragas_spans == _expected_ragas_spans() + assert ragas_spans == _expected_ragas_faithfulness_spans() + # verify the trace structure root_span = ragas_spans[0] root_span_id = root_span["span_id"] @@ -212,7 +213,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log "DD_LLMOBS_ML_APP": "unnamed-ml-app", "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", - "DD_LLMOBS_AGENTLESS_ENABLED": "true", + "DD_LLMOBS_AGENTLESS_ENABLED": "1", } ) out, err, status, pid = run_python_code_in_subprocess( @@ -227,7 +228,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log from tests.llmobs._utils import logs_vcr ctx = logs_vcr.use_cassette( - "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml" + 
"tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml" ) ctx.__enter__() atexit.register(lambda: ctx.__exit__())