From 629139727e33822e6305835dcb3a57f0c4008a8b Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Thu, 9 Jan 2025 18:17:21 -0500 Subject: [PATCH 1/4] fix(profiling): access lock acquired time only when the lock is held (#11881) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/profiling/collector/_lock.py | 114 +++++++++--------- ...ing-lock-acquired-at-e308547cffdca9f7.yaml | 6 + 2 files changed, 60 insertions(+), 60 deletions(-) create mode 100644 releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml diff --git a/ddtrace/profiling/collector/_lock.py b/ddtrace/profiling/collector/_lock.py index 6dedf3295f7..ec62c5c0eee 100644 --- a/ddtrace/profiling/collector/_lock.py +++ b/ddtrace/profiling/collector/_lock.py @@ -179,69 +179,63 @@ def acquire(self, *args, **kwargs): def _release(self, inner_func, *args, **kwargs): # type (typing.Any, typing.Any) -> None + + start = None + if hasattr(self, "_self_acquired_at"): + # _self_acquired_at is only set when the acquire was captured + # if it's not set, we're not capturing the release + start = self._self_acquired_at + try: return inner_func(*args, **kwargs) finally: - try: - if hasattr(self, "_self_acquired_at"): - try: - end = time.monotonic_ns() - thread_id, thread_name = _current_thread() - task_id, task_name, task_frame = _task.get_task(thread_id) - lock_name = ( - "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc - ) - - if task_frame is None: - # See the comments in _acquire - frame = sys._getframe(2) - else: - frame = task_frame - - frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes) - - if self._self_export_libdd_enabled: - thread_native_id = _threading.get_thread_native_id(thread_id) - - handle = ddup.SampleHandle() - handle.push_monotonic_ns(end) - handle.push_lock_name(lock_name) - handle.push_release( - end - self._self_acquired_at, 1 - ) # AFAICT, capture_pct does not adjust anything here - handle.push_threadinfo(thread_id, thread_native_id, thread_name) - handle.push_task_id(task_id) - handle.push_task_name(task_name) - - if self._self_tracer is not None: - handle.push_span(self._self_tracer.current_span()) - for 
frame in frames:
-                                handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno)
-                            handle.flush_sample()
-                        else:
-                            event = self.RELEASE_EVENT_CLASS(
-                                lock_name=lock_name,
-                                frames=frames,
-                                nframes=nframes,
-                                thread_id=thread_id,
-                                thread_name=thread_name,
-                                task_id=task_id,
-                                task_name=task_name,
-                                locked_for_ns=end - self._self_acquired_at,
-                                sampling_pct=self._self_capture_sampler.capture_pct,
-                            )
-
-                            if self._self_tracer is not None:
-                                event.set_trace_info(
-                                    self._self_tracer.current_span(), self._self_endpoint_collection_enabled
-                                )
-
-                            self._self_recorder.push_event(event)
-                    finally:
-                        del self._self_acquired_at
-            except Exception as e:
-                LOG.warning("Error recording lock release event: %s", e)
-                pass  # nosec
+            if start is not None:
+                end = time.monotonic_ns()
+                thread_id, thread_name = _current_thread()
+                task_id, task_name, task_frame = _task.get_task(thread_id)
+                lock_name = "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc
+
+                if task_frame is None:
+                    # See the comments in _acquire
+                    frame = sys._getframe(2)
+                else:
+                    frame = task_frame
+
+                frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes)
+
+                if self._self_export_libdd_enabled:
+                    thread_native_id = _threading.get_thread_native_id(thread_id)
+
+                    handle = ddup.SampleHandle()
+                    handle.push_monotonic_ns(end)
+                    handle.push_lock_name(lock_name)
+                    handle.push_release(end - start, 1)  # AFAICT, capture_pct does not adjust anything here
+                    handle.push_threadinfo(thread_id, thread_native_id, thread_name)
+                    handle.push_task_id(task_id)
+                    handle.push_task_name(task_name)
+
+                    if self._self_tracer is not None:
+                        handle.push_span(self._self_tracer.current_span())
+                    for frame in frames:
+                        handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno)
+                    handle.flush_sample()
+                else:
+                    event = self.RELEASE_EVENT_CLASS(
+                        lock_name=lock_name,
+                        frames=frames,
+                        nframes=nframes,
+                        thread_id=thread_id,
+                        thread_name=thread_name,
+                        task_id=task_id,
+                        task_name=task_name,
+                        locked_for_ns=end - start,
+                        sampling_pct=self._self_capture_sampler.capture_pct,
+                    )
+
+                    if self._self_tracer is not None:
+                        event.set_trace_info(self._self_tracer.current_span(), self._self_endpoint_collection_enabled)
+
+                    self._self_recorder.push_event(event)
 
     def release(self, *args, **kwargs):
         return self._release(self.__wrapped__.release, *args, **kwargs)
diff --git a/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml
new file mode 100644
index 00000000000..de86c8227b6
--- /dev/null
+++ b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    profiling: This fix resolves a data race when accessing a lock's acquired
+    time, which could lead to an ``AttributeError``: ``_Profiled_ThreadingLock``
+    object has no attribute ``_self_acquired_at``.

From afe439e311ee1b3dd23684a5df72020a0e95e77e Mon Sep 17 00:00:00 2001
From: Juanjo Alvarez Martinez
Date: Fri, 10 Jan 2025 10:14:31 +0100
Subject: [PATCH 2/4] chore(iast): optimize _should_iast_patch (#11885)

## Description

Optimize the `_should_iast_patch` function by:

- Using a trie for the prefix matching of module names against the
  DENY / ALLOW lists (sketched below).
- Moving a call to `lower()` that was being done on each call to
  `_in_python_stdlib()` into the creation of the `_NOT_PATCH_MODULE_NAMES`
  static set.

These two changes (especially the second) make the function 14x faster.
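For illustration, a minimal self-contained sketch of the trie-based prefix
match (hypothetical names; the actual implementation is `build_trie` /
`_trie_has_prefix_for` in the diff below):

```python
# Sketch: O(len(module_name)) prefix lookup instead of scanning every
# DENY/ALLOW entry with str.startswith().
class TrieNode:
    __slots__ = ("children", "is_end")

    def __init__(self):
        self.children = {}   # one child node per character
        self.is_end = False  # True when a full list entry terminates here


def build_trie(words):
    root = TrieNode()
    for word in words:
        node = root
        for char in word:
            node = node.children.setdefault(char, TrieNode())
        node.is_end = True
    return root


def has_prefix_for(trie, string):
    # Walk character by character; succeed as soon as an entire list
    # entry has been consumed, i.e. the entry is a prefix of `string`.
    node = trie
    for char in string:
        node = node.children.get(char)
        if node is None:
            return False
        if node.is_end:
            return True
    return node.is_end


denylist = build_trie(["django.", "flask."])
assert has_prefix_for(denylist, "django.db.models.")
assert not has_prefix_for(denylist, "djan")
```

The lookup cost is bounded by the length of the module name alone, whereas a
`str.startswith(tuple)` check also scans every allow/deny entry.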
Signed-off-by: Juanjo Alvarez ## Checklist - [X] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --------- Signed-off-by: Juanjo Alvarez --- ddtrace/appsec/_iast/_ast/ast_patching.py | 52 ++++++++++- .../appsec/_python_info/stdlib/__init__.py | 2 +- tests/appsec/iast/_ast/test_ast_patching.py | 86 +++++++++++++++++++ 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/ddtrace/appsec/_iast/_ast/ast_patching.py b/ddtrace/appsec/_iast/_ast/ast_patching.py index 7e2258bd556..2c7e958d087 100644 --- a/ddtrace/appsec/_iast/_ast/ast_patching.py +++ b/ddtrace/appsec/_iast/_ast/ast_patching.py @@ -7,6 +7,7 @@ from sys import version_info import textwrap from types import ModuleType +from typing import Iterable from typing import Optional from typing import Text from typing import Tuple @@ -327,6 +328,49 @@ log = get_logger(__name__) +class _TrieNode: + __slots__ = ("children", "is_end") + + def __init__(self): + self.children = {} + self.is_end = False + + def __iter__(self): + if self.is_end: + yield ("", None) + else: + for k, v in self.children.items(): + yield (k, dict(v)) + + +def build_trie(words: Iterable[str]) -> _TrieNode: + root = _TrieNode() + for word in words: + node = root + for char in word: + if char not in node.children: + node.children[char] = _TrieNode() + node = node.children[char] + node.is_end = True + return root + + +_TRIE_ALLOWLIST = build_trie(IAST_ALLOWLIST) +_TRIE_DENYLIST = build_trie(IAST_DENYLIST) + + +def _trie_has_prefix_for(trie: _TrieNode, string: str) -> bool: + node = trie + for char in string: + node = node.children.get(char) + if not node: + return False + + if node.is_end: + return True + return node.is_end + + def get_encoding(module_path: Text) -> Text: """ First tries to detect the encoding for the file, @@ -341,11 +385,11 @@ def get_encoding(module_path: Text) -> Text: return ENCODING -_NOT_PATCH_MODULE_NAMES = _stdlib_for_python_version() | set(builtin_module_names) +_NOT_PATCH_MODULE_NAMES = {i.lower() for i in _stdlib_for_python_version() | set(builtin_module_names)} def _in_python_stdlib(module_name: str) -> bool: - return module_name.split(".")[0].lower() in [x.lower() for x in _NOT_PATCH_MODULE_NAMES] + return 
module_name.split(".")[0].lower() in _NOT_PATCH_MODULE_NAMES def _should_iast_patch(module_name: Text) -> bool: @@ -359,10 +403,10 @@ def _should_iast_patch(module_name: Text) -> bool: # diff = max_allow - max_deny # return diff > 0 or (diff == 0 and not _in_python_stdlib_or_third_party(module_name)) dotted_module_name = module_name.lower() + "." - if dotted_module_name.startswith(IAST_ALLOWLIST): + if _trie_has_prefix_for(_TRIE_ALLOWLIST, dotted_module_name): log.debug("IAST: allowing %s. it's in the IAST_ALLOWLIST", module_name) return True - if dotted_module_name.startswith(IAST_DENYLIST): + if _trie_has_prefix_for(_TRIE_DENYLIST, dotted_module_name): log.debug("IAST: denying %s. it's in the IAST_DENYLIST", module_name) return False if _in_python_stdlib(module_name): diff --git a/ddtrace/appsec/_python_info/stdlib/__init__.py b/ddtrace/appsec/_python_info/stdlib/__init__.py index a040e57f859..e745c392f55 100644 --- a/ddtrace/appsec/_python_info/stdlib/__init__.py +++ b/ddtrace/appsec/_python_info/stdlib/__init__.py @@ -19,5 +19,5 @@ from .module_names_py312 import STDLIB_MODULE_NAMES -def _stdlib_for_python_version(): # type: () -> set +def _stdlib_for_python_version(): # type: () -> set[str] return STDLIB_MODULE_NAMES diff --git a/tests/appsec/iast/_ast/test_ast_patching.py b/tests/appsec/iast/_ast/test_ast_patching.py index cf0fabd14e4..d014496942b 100644 --- a/tests/appsec/iast/_ast/test_ast_patching.py +++ b/tests/appsec/iast/_ast/test_ast_patching.py @@ -9,7 +9,9 @@ from ddtrace.appsec._constants import IAST from ddtrace.appsec._iast._ast.ast_patching import _in_python_stdlib from ddtrace.appsec._iast._ast.ast_patching import _should_iast_patch +from ddtrace.appsec._iast._ast.ast_patching import _trie_has_prefix_for from ddtrace.appsec._iast._ast.ast_patching import astpatch_module +from ddtrace.appsec._iast._ast.ast_patching import build_trie from ddtrace.appsec._iast._ast.ast_patching import visit_ast from ddtrace.internal.utils.formats import asbool from tests.utils import override_env @@ -308,3 +310,87 @@ def test_astpatch_dir_patched_with_or_without_custom_dir(module_name, expected_n # Check that all the symbols in the expected set are in the patched dir() result for name in expected_names: assert name in patched_dir + + +def test_build_trie(): + from ddtrace.appsec._iast._ast.ast_patching import build_trie + + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert dict(trie) == { + "a": { + "b": { + "c": {"": None}, + }, + }, + "d": { + "e": { + "f": {"": None}, + }, + }, + "g": { + "h": { + "i": {"": None}, + }, + }, + "j": { + "k": { + "l": {"": None}, + }, + }, + "m": { + "n": { + "o": {"": None}, + }, + }, + "p": { + "q": { + "r": {"": None}, + }, + }, + "s": { + "t": { + "u": {"": None}, + }, + }, + "v": { + "w": { + "x": {"": None}, + }, + }, + "y": { + "z": {"": None}, + }, + } + + +def test_trie_has_string_match(): + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert _trie_has_prefix_for(trie, "abc") + assert not _trie_has_prefix_for(trie, "ab") + assert _trie_has_prefix_for(trie, "abcd") + assert _trie_has_prefix_for(trie, "def") + assert not _trie_has_prefix_for(trie, "de") + assert _trie_has_prefix_for(trie, "defg") + assert _trie_has_prefix_for(trie, "ghi") + assert not _trie_has_prefix_for(trie, "gh") + assert _trie_has_prefix_for(trie, "ghij") + assert _trie_has_prefix_for(trie, "jkl") + assert not _trie_has_prefix_for(trie, "jk") + assert _trie_has_prefix_for(trie, "jklm") + assert 
_trie_has_prefix_for(trie, "mno") + assert not _trie_has_prefix_for(trie, "mn") + assert _trie_has_prefix_for(trie, "mnop") + assert _trie_has_prefix_for(trie, "pqr") + assert not _trie_has_prefix_for(trie, "pq") + assert _trie_has_prefix_for(trie, "pqrs") + assert _trie_has_prefix_for(trie, "stu") + assert not _trie_has_prefix_for(trie, "st") + assert _trie_has_prefix_for(trie, "stuv") + assert _trie_has_prefix_for(trie, "vwx") + assert not _trie_has_prefix_for(trie, "vw") + assert _trie_has_prefix_for(trie, "vwxy") + assert _trie_has_prefix_for(trie, "yz") + assert not _trie_has_prefix_for(trie, "y") + assert _trie_has_prefix_for(trie, "yza") + assert not _trie_has_prefix_for(trie, "z") + assert not _trie_has_prefix_for(trie, "zzz") From 1020545f6db0f4a89a378acd33e90cbcbae7fc1f Mon Sep 17 00:00:00 2001 From: Christophe Papazian <114495376+christophe-papazian@users.noreply.github.com> Date: Fri, 10 Jan 2025 14:30:41 +0100 Subject: [PATCH 3/4] chore(asm): update python version for threats tests (#11893) Update the python version for threats tests, to include python 3.13 when available. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- hatch.toml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hatch.toml b/hatch.toml index 0baca1fd235..1f073332547 100644 --- a/hatch.toml +++ b/hatch.toml @@ -214,11 +214,11 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] django = ["~=2.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9", "3.10"] +python = ["3.8", "3.9", "3.10"] django = ["~=3.2"] [[envs.appsec_threats_django.matrix]] @@ -226,11 +226,11 @@ python = ["3.8", "3.10"] django = ["==4.0.10"] [[envs.appsec_threats_django.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.11", "3.13"] django = ["~=4.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.10", "3.12"] +python = ["3.10", "3.13"] django = ["~=5.1"] @@ -262,21 +262,21 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in 
.circleci/config.templ.yml
 [[envs.appsec_threats_flask.matrix]]
-python = ["3.7", "3.9"]
+python = ["3.8", "3.9"]
 flask = ["~=1.1"]
 markupsafe = ["~=1.1"]
 
 [[envs.appsec_threats_flask.matrix]]
-python = ["3.7", "3.9"]
+python = ["3.8", "3.9"]
 flask = ["==2.1.3"]
 werkzeug = ["<3.0"]
 
 [[envs.appsec_threats_flask.matrix]]
-python = ["3.8", "3.9", "3.12"]
+python = ["3.8", "3.10", "3.13"]
 flask = ["~=2.3"]
 
 [[envs.appsec_threats_flask.matrix]]
-python = ["3.8", "3.10", "3.12"]
+python = ["3.8", "3.11", "3.13"]
 flask = ["~=3.0"]
 
 ## ASM Native IAST module
@@ -327,16 +327,16 @@ test = [
 # if you add or remove a version here, please also update the parallelism parameter
 # in .circleci/config.templ.yml
 [[envs.appsec_threats_fastapi.matrix]]
-python = ["3.7", "3.9", "3.11"]
+python = ["3.8", "3.10", "3.13"]
 fastapi = ["==0.86.0"]
 anyio = ["==3.7.1"]
 
 [[envs.appsec_threats_fastapi.matrix]]
-python = ["3.7", "3.9", "3.12"]
+python = ["3.8", "3.10", "3.13"]
 fastapi = ["==0.94.1"]
 
 [[envs.appsec_threats_fastapi.matrix]]
-python = ["3.8", "3.10", "3.12"]
+python = ["3.8", "3.10", "3.13"]
 fastapi = ["~=0.114.2"]


From 5e688237d4baaee86a74461e0dd0162a9634907d Mon Sep 17 00:00:00 2001
From: lievan <42917263+lievan@users.noreply.github.com>
Date: Fri, 10 Jan 2025 09:56:49 -0500
Subject: [PATCH 4/4] chore(llmobs): refactor out ragas base evaluator (#11846)

Creates a `RagasBaseEvaluator` class that ragas evaluators can share.
This is a split-out version of
[this PR](https://github.com/DataDog/dd-trace-py/pull/11716) that has
incorporated changes based on the first round of comments from @Yun-Kim
and @Kyle-Verhoog.

This class contains shared logic for:
- checking that the ragas dependencies are present, and raising otherwise
- the `run_and_submit_evaluation` function, which takes a span event, calls
  evaluate to generate a score, and then submits an evaluation
- extracting the question, contexts, and answer from an LLM span

A simplified sketch of the resulting evaluator shape is included below.

### Misc changes
- rename tests to specify they are faithfulness-specific tests
- raise when we parse an unsupported evaluator instead of logging a warning

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan

---
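Not part of the applied diff (text between the `---` separator and the
diffstat is ignored by `git am`): a simplified, hypothetical sketch of the
evaluator shape described above, with telemetry and LLM Observability
plumbing omitted; the real classes follow in the diff.

```python
from typing import Optional, Tuple, Union


class BaseRagasEvaluator:
    """Shared skeleton: dependency checks, input extraction, and metric
    submission live here; subclasses only implement evaluate()."""

    LABEL = "ragas"
    METRIC_TYPE = "score"

    def run_and_submit_evaluation(self, span_event: dict) -> None:
        if not span_event:
            return
        score_or_failure, metadata = self.evaluate(span_event)
        if isinstance(score_or_failure, float):
            # The real implementation calls llmobs_service.submit_evaluation().
            print("submit", self.LABEL, score_or_failure, metadata)

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        raise NotImplementedError("evaluate method must be implemented by individual evaluators")


class FaithfulnessSketch(BaseRagasEvaluator):
    LABEL = "ragas_faithfulness"

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        # A real evaluator extracts question/contexts/answer from the span
        # and scores them with ragas; failures are reported as strings.
        return 1.0, {}


FaithfulnessSketch().run_and_submit_evaluation({"trace_id": "t", "span_id": "s"})
```

Concrete evaluators override only `LABEL` and `evaluate()`; dependency
checks and submission stay in the base class, which is the split this
patch makes.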
ddtrace/llmobs/_evaluators/ragas/base.py | 213 ++++++++++++++++ .../llmobs/_evaluators/ragas/faithfulness.py | 228 +++--------------- ddtrace/llmobs/_evaluators/runner.py | 4 +- tests/llmobs/_utils.py | 43 +++- ...emits_traces_and_evaluations_on_exit.yaml} | 0 ...test_ragas_faithfulness_emits_traces.yaml} | 0 ...agas_faithfulness_submits_evaluation.yaml} | 0 ..._evaluation_on_span_with_custom_keys.yaml} | 0 ...on_on_span_with_question_in_messages.yaml} | 0 tests/llmobs/test_llmobs_evaluator_runner.py | 8 +- ...tor.py => test_llmobs_ragas_evaluators.py} | 9 +- 11 files changed, 297 insertions(+), 208 deletions(-) create mode 100644 ddtrace/llmobs/_evaluators/ragas/base.py rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml => tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml => tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml} (100%) rename tests/llmobs/{test_llmobs_ragas_faithfulness_evaluator.py => test_llmobs_ragas_evaluators.py} (97%) diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py new file mode 100644 index 00000000000..23aa4cd3caa --- /dev/null +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -0,0 +1,213 @@ +import traceback +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +from ddtrace.internal.logger import get_logger +from ddtrace.internal.telemetry import telemetry_writer +from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT +from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL +from ddtrace.internal.utils.version import parse_version +from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS +from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS +from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX + + +logger = get_logger(__name__) + + +class RagasDependencies: + """ + A helper class to store instances of ragas classes and functions + that may or may not exist in a user's environment. 
+    """
+
+    def __init__(self):
+        import ragas
+
+        self.ragas_version = parse_version(ragas.__version__)
+        if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10):
+            raise NotImplementedError(
+                "Ragas version: {} is not supported".format(self.ragas_version),
+            )
+
+        from ragas.llms import llm_factory
+
+        self.llm_factory = llm_factory
+
+        from ragas.llms.output_parser import RagasoutputParser
+
+        self.RagasoutputParser = RagasoutputParser
+
+        from ragas.metrics import context_precision
+
+        self.context_precision = context_precision
+
+        from ragas.metrics.base import ensembler
+
+        self.ensembler = ensembler
+
+        from ragas.metrics import faithfulness
+
+        self.faithfulness = faithfulness
+
+        from ragas.metrics.base import get_segmenter
+
+        self.get_segmenter = get_segmenter
+
+        from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers
+
+        self.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers
+
+        from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers
+
+        self.StatementsAnswers = StatementsAnswers
+
+
+def _get_ml_app_for_ragas_trace(span_event: dict) -> str:
+    """
+    The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-<ml_app>`
+    or `dd-ragas` if `ml_app` is not present in the span event.
+    """
+    tags: List[str] = span_event.get("tags", [])
+    ml_app = None
+    for tag in tags:
+        if isinstance(tag, str) and tag.startswith("ml_app:"):
+            ml_app = tag.split(":")[1]
+            break
+    if not ml_app:
+        return RAGAS_ML_APP_PREFIX
+    return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app)
+
+
+class BaseRagasEvaluator:
+    """A class used by EvaluatorRunner to conduct ragas evaluations
+    on LLM Observability span events. The job of an Evaluator is to take a span and
+    submit evaluation metrics based on the span's attributes.
+
+    Extenders of this class should only need to implement the `evaluate` method.
+    """
+
+    LABEL = "ragas"
+    METRIC_TYPE = "score"
+
+    def __init__(self, llmobs_service):
+        """
+        Initialize an evaluator that uses the ragas library to generate a score on finished LLM spans.
+
+        :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
+                               submitting evaluation metrics.
+
+        Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
+ """ + self.llmobs_service = llmobs_service + self.ragas_version = "unknown" + telemetry_state = "ok" + try: + self.ragas_dependencies = RagasDependencies() + self.ragas_version = self.ragas_dependencies.ragas_version + except ImportError as e: + telemetry_state = "fail_import_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except AttributeError as e: + telemetry_state = "fail_attribute_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except NotImplementedError as e: + telemetry_state = "fail_not_supported" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except Exception as e: + telemetry_state = "fail_unknown" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + finally: + telemetry_writer.add_count_metric( + namespace=TELEMETRY_APM_PRODUCT.LLMOBS, + name="evaluators.init", + value=1, + tags=( + ("evaluator_label", self.LABEL), + ("state", telemetry_state), + ("evaluator_version", self.ragas_version), + ), + ) + if telemetry_state != "ok": + telemetry_writer.add_log( + level=TELEMETRY_LOG_LEVEL.ERROR, + message="Failed to import Ragas dependencies", + stack_trace=traceback.format_exc(), + tags={"evaluator_version": self.ragas_version}, + ) + + def run_and_submit_evaluation(self, span_event: dict): + if not span_event: + return + score_result_or_failure, metric_metadata = self.evaluate(span_event) + telemetry_writer.add_count_metric( + TELEMETRY_APM_PRODUCT.LLMOBS, + "evaluators.run", + 1, + tags=( + ("evaluator_label", self.LABEL), + ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), + ("evaluator_version", self.ragas_version), + ), + ) + if isinstance(score_result_or_failure, float): + self.llmobs_service.submit_evaluation( + span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, + label=self.LABEL, + metric_type=self.METRIC_TYPE, + value=score_result_or_failure, + metadata=metric_metadata, + ) + + def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: + raise NotImplementedError("evaluate method must be implemented by individual evaluators") + + def _extract_evaluation_inputs_from_span(self, span_event: dict) -> Optional[dict]: + """ + Extracts the question, answer, and context used as inputs for a ragas evaluation on a span event. 
+ """ + with self.llmobs_service.workflow("dd-ragas.extract_evaluation_inputs_from_span") as extract_inputs_workflow: + self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) + question, answer, contexts = None, None, None + + meta_io = span_event.get("meta") + if meta_io is None: + return None + + meta_input = meta_io.get("input") + meta_output = meta_io.get("output") + + if not (meta_input and meta_output): + return None + + prompt = meta_input.get("prompt") + if prompt is None: + logger.debug("Failed to extract `prompt` from span for ragas evaluation") + return None + prompt_variables = prompt.get("variables") + + input_messages = meta_input.get("messages") + + messages = meta_output.get("messages") + if messages is not None and len(messages) > 0: + answer = messages[-1].get("content") + + if prompt_variables: + context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) + question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) + contexts = [prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)] + question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) + + if not question and input_messages is not None and len(input_messages) > 0: + question = input_messages[-1].get("content") + + self.llmobs_service.annotate( + span=extract_inputs_workflow, output_data={"question": question, "contexts": contexts, "answer": answer} + ) + if any(field is None for field in (question, contexts, answer)): + logger.debug("Failed to extract inputs required for ragas evaluation") + return None + + return {"question": question, "contexts": contexts, "answer": answer} diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py index d651c2443a4..98725b1f27e 100644 --- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py +++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py @@ -1,73 +1,22 @@ import json import math -import traceback from typing import List from typing import Optional from typing import Tuple from typing import Union from ddtrace.internal.logger import get_logger -from ddtrace.internal.telemetry import telemetry_writer -from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT -from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL -from ddtrace.internal.utils.version import parse_version from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA -from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS -from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX +from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator +from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace logger = get_logger(__name__) -class MiniRagas: - """ - A helper class to store instances of ragas classes and functions - that may or may not exist in a user's environment. - """ - - llm_factory = None - RagasoutputParser = None - faithfulness = None - ensembler = None - get_segmenter = None - StatementFaithfulnessAnswers = None - StatementsAnswers = None - - -def _get_ml_app_for_ragas_trace(span_event: dict) -> str: - """ - The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-` - or `dd-ragas` if `ml_app` is not present in the span event. 
- """ - tags = span_event.get("tags", []) # list[str] - ml_app = None - for tag in tags: - if isinstance(tag, str) and tag.startswith("ml_app:"): - ml_app = tag.split(":")[1] - break - if not ml_app: - return RAGAS_ML_APP_PREFIX - return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) - - -def _get_faithfulness_instance() -> Optional[object]: - """ - This helper function ensures the faithfulness instance used in - ragas evaluator is updated with the latest ragas faithfulness - instance AND has an non-null llm - """ - if MiniRagas.faithfulness is None: - return None - ragas_faithfulness_instance = MiniRagas.faithfulness - if not ragas_faithfulness_instance.llm: - ragas_faithfulness_instance.llm = MiniRagas.llm_factory() - return ragas_faithfulness_instance - - -class RagasFaithfulnessEvaluator: +class RagasFaithfulnessEvaluator(BaseRagasEvaluator): """A class used by EvaluatorRunner to conduct ragas faithfulness evaluations on LLM Observability span events. The job of an Evaluator is to take a span and submit evaluation metrics based on the span's attributes. @@ -95,98 +44,30 @@ def __init__(self, llmobs_service): Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. """ - self.llmobs_service = llmobs_service - self.ragas_version = "unknown" - telemetry_state = "ok" - try: - import ragas - - self.ragas_version = parse_version(ragas.__version__) - if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): - raise NotImplementedError( - "Ragas version: {} is not supported for `ragas_faithfulness` evaluator".format(self.ragas_version), - ) - - from ragas.llms import llm_factory - - MiniRagas.llm_factory = llm_factory - - from ragas.llms.output_parser import RagasoutputParser - - MiniRagas.RagasoutputParser = RagasoutputParser - - from ragas.metrics import faithfulness - - MiniRagas.faithfulness = faithfulness - - from ragas.metrics.base import ensembler - - MiniRagas.ensembler = ensembler - - from ragas.metrics.base import get_segmenter - - MiniRagas.get_segmenter = get_segmenter - - from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers - - MiniRagas.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers - - from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers - - MiniRagas.StatementsAnswers = StatementsAnswers - except Exception as e: - telemetry_state = "fail" - telemetry_writer.add_log( - level=TELEMETRY_LOG_LEVEL.ERROR, - message="Failed to import Ragas dependencies", - stack_trace=traceback.format_exc(), - tags={"ragas_version": self.ragas_version}, - ) - raise NotImplementedError("Failed to load dependencies for `ragas_faithfulness` evaluator") from e - finally: - telemetry_writer.add_count_metric( - namespace=TELEMETRY_APM_PRODUCT.LLMOBS, - name="evaluators.init", - value=1, - tags=( - ("evaluator_label", self.LABEL), - ("state", telemetry_state), - ("ragas_version", self.ragas_version), - ), - ) - - self.ragas_faithfulness_instance = _get_faithfulness_instance() - self.llm_output_parser_for_generated_statements = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementsAnswers + super().__init__(llmobs_service) + self.ragas_faithfulness_instance = self._get_faithfulness_instance() + self.llm_output_parser_for_generated_statements = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementsAnswers ) - self.llm_output_parser_for_faithfulness_score = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementFaithfulnessAnswers + 
self.llm_output_parser_for_faithfulness_score = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementFaithfulnessAnswers ) - self.split_answer_into_sentences = MiniRagas.get_segmenter( + self.split_answer_into_sentences = self.ragas_dependencies.get_segmenter( language=self.ragas_faithfulness_instance.nli_statements_message.language, clean=False ) - def run_and_submit_evaluation(self, span_event: dict): - if not span_event: - return - score_result_or_failure, metric_metadata = self.evaluate(span_event) - telemetry_writer.add_count_metric( - TELEMETRY_APM_PRODUCT.LLMOBS, - "evaluators.run", - 1, - tags=( - ("evaluator_label", self.LABEL), - ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), - ), - ) - if isinstance(score_result_or_failure, float): - self.llmobs_service.submit_evaluation( - span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=score_result_or_failure, - metadata=metric_metadata, - ) + def _get_faithfulness_instance(self) -> Optional[object]: + """ + This helper function ensures the faithfulness instance used in + ragas evaluator is updated with the latest ragas faithfulness + instance AND has an non-null llm + """ + if self.ragas_dependencies.faithfulness is None: + return None + ragas_faithfulness_instance = self.ragas_dependencies.faithfulness + if not ragas_faithfulness_instance.llm: + ragas_faithfulness_instance.llm = self.ragas_dependencies.llm_factory() + return ragas_faithfulness_instance def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: """ @@ -196,7 +77,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] If the ragas faithfulness instance does not have `llm` set, we set `llm` using the `llm_factory()` method from ragas which defaults to openai's gpt-4o-turbo. 
""" - self.ragas_faithfulness_instance = _get_faithfulness_instance() + self.ragas_faithfulness_instance = self._get_faithfulness_instance() if not self.ragas_faithfulness_instance: return "fail_faithfulness_is_none", {} @@ -220,16 +101,16 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] span=ragas_faithfulness_workflow ) - faithfulness_inputs = self._extract_faithfulness_inputs(span_event) + faithfulness_inputs = self._extract_evaluation_inputs_from_span(span_event) if faithfulness_inputs is None: logger.debug( - "Failed to extract question and context from span sampled for ragas_faithfulness evaluation" + "Failed to extract evaluation inputs from span sampled for `ragas_faithfulness` evaluation" ) return "fail_extract_faithfulness_inputs", evaluation_metadata question = faithfulness_inputs["question"] answer = faithfulness_inputs["answer"] - context = faithfulness_inputs["context"] + context = " ".join(faithfulness_inputs["contexts"]) statements = self._create_statements(question, answer) if statements is None: @@ -318,9 +199,9 @@ def _create_verdicts(self, context: str, statements: List[str]): return None # collapse multiple generations into a single faithfulness list - faithfulness_list = MiniRagas.ensembler.from_discrete(raw_faithfulness_list, "verdict") # type: ignore + faithfulness_list = self.ragas_dependencies.ensembler.from_discrete(raw_faithfulness_list, "verdict") try: - return MiniRagas.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) # type: ignore + return self.ragas_dependencies.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) except Exception as e: logger.debug("Failed to parse faithfulness_list", exc_info=e) return None @@ -330,59 +211,6 @@ def _create_verdicts(self, context: str, statements: List[str]): output_data=faithfulness_list, ) - def _extract_faithfulness_inputs(self, span_event: dict) -> Optional[dict]: - """ - Extracts the question, answer, and context used as inputs to faithfulness - evaluation from a span event. 
- - question - input.prompt.variables.question OR input.messages[-1].content - context - input.prompt.variables.context - answer - output.messages[-1].content - """ - with self.llmobs_service.workflow("dd-ragas.extract_faithfulness_inputs") as extract_inputs_workflow: - self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) - question, answer, context = None, None, None - - meta_io = span_event.get("meta") - if meta_io is None: - return None - - meta_input = meta_io.get("input") - meta_output = meta_io.get("output") - - if not (meta_input and meta_output): - return None - - prompt = meta_input.get("prompt") - if prompt is None: - logger.debug("Failed to extract `prompt` from span for `ragas_faithfulness` evaluation") - return None - prompt_variables = prompt.get("variables") - - input_messages = meta_input.get("messages") - - messages = meta_output.get("messages") - if messages is not None and len(messages) > 0: - answer = messages[-1].get("content") - - if prompt_variables: - context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) - question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) - context = " ".join([prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)]) - question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) - - if not question and input_messages is not None and len(input_messages) > 0: - question = input_messages[-1].get("content") - - self.llmobs_service.annotate( - span=extract_inputs_workflow, output_data={"question": question, "context": context, "answer": answer} - ) - if any(field is None for field in (question, context, answer)): - logger.debug("Failed to extract inputs required for faithfulness evaluation") - return None - - return {"question": question, "context": context, "answer": answer} - def _create_statements_prompt(self, answer, question): # Returns: `ragas.llms.PromptValue` object with self.llmobs_service.task("dd-ragas.create_statements_prompt"): diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index bf45e618e01..3d26998f1b4 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -64,13 +64,15 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None): ("state", evaluator_init_state), ), ) + else: + raise ValueError("Parsed unsupported evaluator: {}".format(evaluator)) def start(self, *args, **kwargs): if not self.evaluators: logger.debug("no evaluators configured, not starting %r", self.__class__.__name__) return super(EvaluatorRunner, self).start() - logger.debug("started %r to %r", self.__class__.__name__) + logger.debug("started %r", self.__class__.__name__) def _stop_service(self) -> None: """ diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 0ecdde36ee6..32bbce849db 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -553,7 +553,46 @@ def _dummy_evaluator_eval_metric_event(span_id, trace_id): ) -def _expected_ragas_spans(ragas_inputs=None): +def _expected_ragas_context_precision_spans(ragas_inputs=None): + if not ragas_inputs: + ragas_inputs = default_ragas_inputs + return [ + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": "undefined", + "name": "dd-ragas.context_precision", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": "1.0"}, + }, + "metrics": {}, + "tags": 
expected_ragas_trace_tags(), + }, + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": mock.ANY, + "name": "dd-ragas.extract_evaluation_inputs_from_span", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": mock.ANY}, + }, + "metrics": {}, + "tags": expected_ragas_trace_tags(), + }, + ] + + +def _expected_ragas_faithfulness_spans(ragas_inputs=None): if not ragas_inputs: ragas_inputs = default_ragas_inputs return [ @@ -581,7 +620,7 @@ def _expected_ragas_spans(ragas_inputs=None): "trace_id": mock.ANY, "span_id": mock.ANY, "parent_id": mock.ANY, - "name": "dd-ragas.extract_faithfulness_inputs", + "name": "dd-ragas.extract_evaluation_inputs_from_span", "start_ns": mock.ANY, "duration": mock.ANY, "status": "ok", diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml similarity index 100% rename from 
tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 128c4639946..eb0be25c91b 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -22,7 +22,7 @@ def test_evaluator_runner_start(mock_evaluator_logs): evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=mock.MagicMock())) evaluator_runner.start() - mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r to %r", "EvaluatorRunner")]) + mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r", "EvaluatorRunner")]) def test_evaluator_runner_buffer_limit(mock_evaluator_logs): @@ -99,6 +99,12 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces assert err == b"" +def test_evaluator_runner_unsupported_evaluator(): + with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}): + with pytest.raises(ValueError): + EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) + + def test_evaluator_runner_sampler_single_rule(monkeypatch): monkeypatch.setenv( EvaluatorRunnerSampler.SAMPLING_RULES_ENV_VAR, diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_evaluators.py similarity index 97% rename from tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py rename to tests/llmobs/test_llmobs_ragas_evaluators.py index 39e315b37e4..9df6c392470 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_evaluators.py @@ -6,7 +6,7 @@ from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator from ddtrace.span import Span from tests.llmobs._utils import _expected_llmobs_llm_span_event -from tests.llmobs._utils import _expected_ragas_spans +from tests.llmobs._utils import _expected_ragas_faithfulness_spans from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_messages from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt @@ -177,7 +177,8 @@ def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) assert len(ragas_spans) == 7 # check name, io, span kinds match - assert ragas_spans == _expected_ragas_spans() + assert ragas_spans == _expected_ragas_faithfulness_spans() + # verify the trace structure root_span = ragas_spans[0] root_span_id = root_span["span_id"] @@ -212,7 +213,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log "DD_LLMOBS_ML_APP": "unnamed-ml-app", "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", - "DD_LLMOBS_AGENTLESS_ENABLED": "true", + "DD_LLMOBS_AGENTLESS_ENABLED": "1", } ) out, err, status, pid = run_python_code_in_subprocess( @@ -227,7 +228,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log from tests.llmobs._utils import logs_vcr ctx = logs_vcr.use_cassette( - "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml" + 
"tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml" ) ctx.__enter__() atexit.register(lambda: ctx.__exit__())