diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d281fe80148..23a48b6f344 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -110,6 +110,7 @@ benchmarks/base/aspects_benchmarks_generate.py @DataDog/asm-python ddtrace/appsec/ @DataDog/asm-python ddtrace/settings/asm.py @DataDog/asm-python ddtrace/contrib/subprocess/ @DataDog/asm-python +ddtrace/contrib/internal/subprocess/ @DataDog/asm-python ddtrace/contrib/flask_login/ @DataDog/asm-python ddtrace/contrib/webbrowser @DataDog/asm-python ddtrace/contrib/urllib @DataDog/asm-python diff --git a/.riot/requirements/16562eb.txt b/.riot/requirements/16562eb.txt new file mode 100644 index 00000000000..e2aac88c146 --- /dev/null +++ b/.riot/requirements/16562eb.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.7 +# by the following command: +# +# pip-compile --allow-unsafe --config=pyproject.toml --no-annotate --resolver=backtracking .riot/requirements/16562eb.in +# +attrs==24.2.0 +coverage[toml]==7.2.7 +exceptiongroup==1.2.2 +hypothesis==6.45.0 +idna==3.10 +importlib-metadata==6.7.0 +iniconfig==2.0.0 +mock==5.1.0 +multidict==6.0.5 +opentracing==2.4.0 +packaging==24.0 +pluggy==1.2.0 +pytest==7.4.4 +pytest-asyncio==0.21.1 +pytest-cov==4.1.0 +pytest-mock==3.11.1 +pyyaml==6.0.1 +six==1.17.0 +sortedcontainers==2.4.0 +tomli==2.0.1 +typing-extensions==4.7.1 +urllib3==1.26.20 +vcrpy==4.4.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.15.0 diff --git a/ddtrace/_trace/tracer.py b/ddtrace/_trace/tracer.py index f815e0f184e..af9b09d3e02 100644 --- a/ddtrace/_trace/tracer.py +++ b/ddtrace/_trace/tracer.py @@ -41,6 +41,7 @@ from ddtrace.internal.atexit import register_on_exit_signal from ddtrace.internal.constants import SAMPLING_DECISION_TRACE_TAG_KEY from ddtrace.internal.constants import SPAN_API_DATADOG +from ddtrace.internal.core import dispatch from ddtrace.internal.dogstatsd import get_dogstatsd_client from ddtrace.internal.logger import get_logger from ddtrace.internal.peer_service.processor import PeerServiceProcessor @@ -849,7 +850,7 @@ def _start_span( for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors): p.on_span_start(span) self._hooks.emit(self.__class__.start_span, span) - + dispatch("trace.span_start", (span,)) return span start_span = _start_span @@ -866,6 +867,8 @@ def _on_span_finish(self, span: Span) -> None: for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors): p.on_span_finish(span) + dispatch("trace.span_finish", (span,)) + if log.isEnabledFor(logging.DEBUG): log.debug("finishing span %s (enabled:%s)", span._pprint(), self.enabled) @@ -940,18 +943,23 @@ def trace( ) def current_root_span(self) -> Optional[Span]: - """Returns the root span of the current execution. + """Returns the local root span of the current execution/process. + + Note: This cannot be used to access the true root span of the trace + in a distributed tracing setup if the actual root span occurred in + another execution/process. - This is useful for attaching information related to the trace as a - whole without needing to add to child spans. + This is useful for attaching information to the local root span + of the current execution/process, which is often also the service + entry span. 
For example:: - # get the root span - root_span = tracer.current_root_span() + # get the local root span + local_root_span = tracer.current_root_span() # set the host just once on the root span - if root_span: - root_span.set_tag('host', '127.0.0.1') + if local_root_span: + local_root_span.set_tag('host', '127.0.0.1') """ span = self.current_span() if span is None: diff --git a/ddtrace/appsec/_capabilities.py b/ddtrace/appsec/_capabilities.py index c173f2d6471..c999b61cb97 100644 --- a/ddtrace/appsec/_capabilities.py +++ b/ddtrace/appsec/_capabilities.py @@ -31,6 +31,7 @@ class Flags(enum.IntFlag): ASM_SESSION_FINGERPRINT = 1 << 33 ASM_NETWORK_FINGERPRINT = 1 << 34 ASM_HEADER_FINGERPRINT = 1 << 35 + ASM_RASP_CMDI = 1 << 37 _ALL_ASM_BLOCKING = ( @@ -49,7 +50,7 @@ class Flags(enum.IntFlag): | Flags.ASM_HEADER_FINGERPRINT ) -_ALL_RASP = Flags.ASM_RASP_SQLI | Flags.ASM_RASP_LFI | Flags.ASM_RASP_SSRF | Flags.ASM_RASP_SHI +_ALL_RASP = Flags.ASM_RASP_SQLI | Flags.ASM_RASP_LFI | Flags.ASM_RASP_SSRF | Flags.ASM_RASP_SHI | Flags.ASM_RASP_CMDI _FEATURE_REQUIRED = Flags.ASM_ACTIVATION | Flags.ASM_AUTO_USER diff --git a/ddtrace/appsec/_common_module_patches.py b/ddtrace/appsec/_common_module_patches.py index 215d8b05ee6..0b455dbba6b 100644 --- a/ddtrace/appsec/_common_module_patches.py +++ b/ddtrace/appsec/_common_module_patches.py @@ -7,16 +7,20 @@ from typing import Callable from typing import Dict from typing import Iterable +from typing import List +from typing import Union from wrapt import FunctionWrapper from wrapt import resolve_path import ddtrace from ddtrace.appsec._asm_request_context import get_blocked +from ddtrace.appsec._constants import EXPLOIT_PREVENTION from ddtrace.appsec._constants import WAF_ACTIONS from ddtrace.appsec._iast._iast_request_context import is_iast_request_enabled from ddtrace.appsec._iast._metrics import _set_metric_iast_instrumented_sink from ddtrace.appsec._iast.constants import VULN_PATH_TRAVERSAL +import ddtrace.contrib.internal.subprocess.patch as subprocess_patch from ddtrace.internal import core from ddtrace.internal._exceptions import BlockingException from ddtrace.internal._unpatched import _gc as gc @@ -30,6 +34,9 @@ _is_patched = False +_RASP_SYSTEM = "rasp_os.system" +_RASP_POPEN = "rasp_Popen" + def patch_common_modules(): global _is_patched @@ -39,7 +46,10 @@ def patch_common_modules(): try_wrap_function_wrapper("urllib.request", "OpenerDirector.open", wrapped_open_ED4CF71136E15EBF) try_wrap_function_wrapper("_io", "BytesIO.read", wrapped_read_F3E51D71B4EC16EF) try_wrap_function_wrapper("_io", "StringIO.read", wrapped_read_F3E51D71B4EC16EF) - try_wrap_function_wrapper("os", "system", wrapped_system_5542593D237084A7) + # ensure that the subprocess patch is applied even after one click activation + subprocess_patch.patch() + subprocess_patch.add_str_callback(_RASP_SYSTEM, wrapped_system_5542593D237084A7) + subprocess_patch.add_lst_callback(_RASP_POPEN, popen_FD233052260D8B4D) core.on("asm.block.dbapi.execute", execute_4C9BAC8E228EB347) if asm_config._iast_enabled: _set_metric_iast_instrumented_sink(VULN_PATH_TRAVERSAL) @@ -54,6 +64,8 @@ def unpatch_common_modules(): try_unwrap("urllib.request", "OpenerDirector.open") try_unwrap("_io", "BytesIO.read") try_unwrap("_io", "StringIO.read") + subprocess_patch.del_str_callback(_RASP_SYSTEM) + subprocess_patch.del_lst_callback(_RASP_POPEN) _is_patched = False @@ -106,7 +118,6 @@ def wrapped_open_CFDDB7ABBA9081B6(original_open_callable, instance, args, kwargs try: from ddtrace.appsec._asm_request_context import 
call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -124,7 +135,9 @@ def wrapped_open_CFDDB7ABBA9081B6(original_open_callable, instance, args, kwargs rule_type=EXPLOIT_PREVENTION.TYPE.LFI, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "lfi", filename) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.LFI, filename + ) try: return original_open_callable(*args, **kwargs) except Exception as e: @@ -151,7 +164,6 @@ def wrapped_open_ED4CF71136E15EBF(original_open_callable, instance, args, kwargs try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -168,7 +180,9 @@ def wrapped_open_ED4CF71136E15EBF(original_open_callable, instance, args, kwargs rule_type=EXPLOIT_PREVENTION.TYPE.SSRF, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "ssrf", url) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SSRF, url + ) return original_open_callable(*args, **kwargs) @@ -191,7 +205,6 @@ def wrapped_request_D8CB81E472AF98A2(original_request_callable, instance, args, try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # open is used during module initialization # and shouldn't be changed at that time @@ -206,50 +219,67 @@ def wrapped_request_D8CB81E472AF98A2(original_request_callable, instance, args, rule_type=EXPLOIT_PREVENTION.TYPE.SSRF, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "ssrf", url) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SSRF, url + ) return original_request_callable(*args, **kwargs) -def wrapped_system_5542593D237084A7(original_command_callable, instance, args, kwargs): +def wrapped_system_5542593D237084A7(command: str) -> None: """ wrapper for os.system function """ - command = args[0] if args else kwargs.get("command", None) - if command is not None: - if asm_config._iast_enabled and is_iast_request_enabled(): - from ddtrace.appsec._iast.taint_sinks.command_injection import _iast_report_cmdi - - _iast_report_cmdi(command) - - if ( - asm_config._asm_enabled - and asm_config._ep_enabled - and ddtrace.tracer._appsec_processor is not None - and ddtrace.tracer._appsec_processor.rasp_cmdi_enabled - ): - try: - from ddtrace.appsec._asm_request_context import call_waf_callback - from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION - except ImportError: - return original_command_callable(*args, **kwargs) - - if in_asm_context(): - res = call_waf_callback( - {EXPLOIT_PREVENTION.ADDRESS.CMDI: command}, - crop_trace="wrapped_system_5542593D237084A7", - rule_type=EXPLOIT_PREVENTION.TYPE.CMDI, + if ( + asm_config._asm_enabled + and asm_config._ep_enabled + and ddtrace.tracer._appsec_processor is not None + and 
ddtrace.tracer._appsec_processor.rasp_shi_enabled + ): + try: + from ddtrace.appsec._asm_request_context import call_waf_callback + from ddtrace.appsec._asm_request_context import in_asm_context + except ImportError: + return + + if in_asm_context(): + res = call_waf_callback( + {EXPLOIT_PREVENTION.ADDRESS.SHI: command}, + crop_trace="wrapped_system_5542593D237084A7", + rule_type=EXPLOIT_PREVENTION.TYPE.SHI, + ) + if res and _must_block(res.actions): + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SHI, command + ) + + +def popen_FD233052260D8B4D(arg_list: Union[List[str], str]) -> None: + """ + listener for subprocess.Popen class + """ + if ( + asm_config._asm_enabled + and asm_config._ep_enabled + and ddtrace.tracer._appsec_processor is not None + and ddtrace.tracer._appsec_processor.rasp_cmdi_enabled + ): + try: + from ddtrace.appsec._asm_request_context import call_waf_callback + from ddtrace.appsec._asm_request_context import in_asm_context + except ImportError: + return + + if in_asm_context(): + res = call_waf_callback( + {EXPLOIT_PREVENTION.ADDRESS.CMDI: arg_list if isinstance(arg_list, list) else [arg_list]}, + crop_trace="popen_FD233052260D8B4D", + rule_type=EXPLOIT_PREVENTION.TYPE.CMDI, + ) + if res and _must_block(res.actions): + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.CMDI, arg_list ) - if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "cmdi", command) - try: - return original_command_callable(*args, **kwargs) - except Exception as e: - previous_frame = e.__traceback__.tb_frame.f_back - raise e.with_traceback( - e.__traceback__.__class__(None, previous_frame, previous_frame.f_lasti, previous_frame.f_lineno) - ) _DB_DIALECTS = { @@ -279,7 +309,6 @@ def execute_4C9BAC8E228EB347(instrument_self, query, args, kwargs) -> None: try: from ddtrace.appsec._asm_request_context import call_waf_callback from ddtrace.appsec._asm_request_context import in_asm_context - from ddtrace.appsec._constants import EXPLOIT_PREVENTION except ImportError: # execute is used during module initialization # and shouldn't be changed at that time @@ -296,7 +325,9 @@ def execute_4C9BAC8E228EB347(instrument_self, query, args, kwargs) -> None: rule_type=EXPLOIT_PREVENTION.TYPE.SQLI, ) if res and _must_block(res.actions): - raise BlockingException(get_blocked(), "exploit_prevention", "sqli", query) + raise BlockingException( + get_blocked(), EXPLOIT_PREVENTION.BLOCKING, EXPLOIT_PREVENTION.TYPE.SQLI, query + ) def try_unwrap(module, name): diff --git a/ddtrace/appsec/_constants.py b/ddtrace/appsec/_constants.py index 83cb53e78ff..45a96834cc1 100644 --- a/ddtrace/appsec/_constants.py +++ b/ddtrace/appsec/_constants.py @@ -202,7 +202,8 @@ class WAF_DATA_NAMES(metaclass=Constant_Class): # EPHEMERAL ADDRESSES PROCESSOR_SETTINGS: Literal["waf.context.processor"] = "waf.context.processor" - CMDI_ADDRESS: Literal["server.sys.shell.cmd"] = "server.sys.shell.cmd" + CMDI_ADDRESS: Literal["server.sys.exec.cmd"] = "server.sys.exec.cmd" + SHI_ADDRESS: Literal["server.sys.shell.cmd"] = "server.sys.shell.cmd" LFI_ADDRESS: Literal["server.io.fs.file"] = "server.io.fs.file" SSRF_ADDRESS: Literal["server.io.net.url"] = "server.io.net.url" SQLI_ADDRESS: Literal["server.db.statement"] = "server.db.statement" @@ -328,6 +329,7 @@ class DEFAULT(metaclass=Constant_Class): class EXPLOIT_PREVENTION(metaclass=Constant_Class): + BLOCKING: Literal["exploit_prevention"] = 
"exploit_prevention" STACK_TRACE_ID: Literal["stack_id"] = "stack_id" EP_ENABLED: Literal["DD_APPSEC_RASP_ENABLED"] = "DD_APPSEC_RASP_ENABLED" STACK_TRACE_ENABLED: Literal["DD_APPSEC_STACK_TRACE_ENABLED"] = "DD_APPSEC_STACK_TRACE_ENABLED" @@ -339,6 +341,7 @@ class EXPLOIT_PREVENTION(metaclass=Constant_Class): class TYPE(metaclass=Constant_Class): CMDI: Literal["command_injection"] = "command_injection" + SHI: Literal["shell_injection"] = "shell_injection" LFI: Literal["lfi"] = "lfi" SSRF: Literal["ssrf"] = "ssrf" SQLI: Literal["sql_injection"] = "sql_injection" @@ -346,6 +349,7 @@ class TYPE(metaclass=Constant_Class): class ADDRESS(metaclass=Constant_Class): CMDI: Literal["CMDI_ADDRESS"] = "CMDI_ADDRESS" LFI: Literal["LFI_ADDRESS"] = "LFI_ADDRESS" + SHI: Literal["SHI_ADDRESS"] = "SHI_ADDRESS" SSRF: Literal["SSRF_ADDRESS"] = "SSRF_ADDRESS" SQLI: Literal["SQLI_ADDRESS"] = "SQLI_ADDRESS" SQLI_TYPE: Literal["SQLI_SYSTEM_ADDRESS"] = "SQLI_SYSTEM_ADDRESS" diff --git a/ddtrace/appsec/_iast/_ast/ast_patching.py b/ddtrace/appsec/_iast/_ast/ast_patching.py index 7e2258bd556..2c7e958d087 100644 --- a/ddtrace/appsec/_iast/_ast/ast_patching.py +++ b/ddtrace/appsec/_iast/_ast/ast_patching.py @@ -7,6 +7,7 @@ from sys import version_info import textwrap from types import ModuleType +from typing import Iterable from typing import Optional from typing import Text from typing import Tuple @@ -327,6 +328,49 @@ log = get_logger(__name__) +class _TrieNode: + __slots__ = ("children", "is_end") + + def __init__(self): + self.children = {} + self.is_end = False + + def __iter__(self): + if self.is_end: + yield ("", None) + else: + for k, v in self.children.items(): + yield (k, dict(v)) + + +def build_trie(words: Iterable[str]) -> _TrieNode: + root = _TrieNode() + for word in words: + node = root + for char in word: + if char not in node.children: + node.children[char] = _TrieNode() + node = node.children[char] + node.is_end = True + return root + + +_TRIE_ALLOWLIST = build_trie(IAST_ALLOWLIST) +_TRIE_DENYLIST = build_trie(IAST_DENYLIST) + + +def _trie_has_prefix_for(trie: _TrieNode, string: str) -> bool: + node = trie + for char in string: + node = node.children.get(char) + if not node: + return False + + if node.is_end: + return True + return node.is_end + + def get_encoding(module_path: Text) -> Text: """ First tries to detect the encoding for the file, @@ -341,11 +385,11 @@ def get_encoding(module_path: Text) -> Text: return ENCODING -_NOT_PATCH_MODULE_NAMES = _stdlib_for_python_version() | set(builtin_module_names) +_NOT_PATCH_MODULE_NAMES = {i.lower() for i in _stdlib_for_python_version() | set(builtin_module_names)} def _in_python_stdlib(module_name: str) -> bool: - return module_name.split(".")[0].lower() in [x.lower() for x in _NOT_PATCH_MODULE_NAMES] + return module_name.split(".")[0].lower() in _NOT_PATCH_MODULE_NAMES def _should_iast_patch(module_name: Text) -> bool: @@ -359,10 +403,10 @@ def _should_iast_patch(module_name: Text) -> bool: # diff = max_allow - max_deny # return diff > 0 or (diff == 0 and not _in_python_stdlib_or_third_party(module_name)) dotted_module_name = module_name.lower() + "." - if dotted_module_name.startswith(IAST_ALLOWLIST): + if _trie_has_prefix_for(_TRIE_ALLOWLIST, dotted_module_name): log.debug("IAST: allowing %s. it's in the IAST_ALLOWLIST", module_name) return True - if dotted_module_name.startswith(IAST_DENYLIST): + if _trie_has_prefix_for(_TRIE_DENYLIST, dotted_module_name): log.debug("IAST: denying %s. 
it's in the IAST_DENYLIST", module_name) return False if _in_python_stdlib(module_name): diff --git a/ddtrace/appsec/_iast/_pytest_plugin.py b/ddtrace/appsec/_iast/_pytest_plugin.py index 672acc4a031..82c23c53174 100644 --- a/ddtrace/appsec/_iast/_pytest_plugin.py +++ b/ddtrace/appsec/_iast/_pytest_plugin.py @@ -27,6 +27,8 @@ def ddtrace_iast(request, ddspan): Optionally output the test as failed if vulnerabilities are found. """ yield + if ddspan is None: + return data = ddspan.get_tag(IAST.JSON) if not data: return diff --git a/ddtrace/appsec/_iast/taint_sinks/command_injection.py b/ddtrace/appsec/_iast/taint_sinks/command_injection.py index ee22b294bfc..2607c6c9447 100644 --- a/ddtrace/appsec/_iast/taint_sinks/command_injection.py +++ b/ddtrace/appsec/_iast/taint_sinks/command_injection.py @@ -1,18 +1,15 @@ -import os -import subprocess # nosec from typing import List from typing import Union -from ddtrace.appsec._common_module_patches import try_unwrap from ddtrace.appsec._constants import IAST_SPAN_TAGS from ddtrace.appsec._iast import oce from ddtrace.appsec._iast._iast_request_context import is_iast_request_enabled from ddtrace.appsec._iast._metrics import _set_metric_iast_executed_sink from ddtrace.appsec._iast._metrics import _set_metric_iast_instrumented_sink from ddtrace.appsec._iast._metrics import increment_iast_span_metric -from ddtrace.appsec._iast._patch import try_wrap_function_wrapper from ddtrace.appsec._iast._taint_tracking._taint_objects import is_pyobject_tainted from ddtrace.appsec._iast.constants import VULN_CMDI +import ddtrace.contrib.internal.subprocess.patch as subprocess_patch from ddtrace.internal.logger import get_logger from ddtrace.settings.asm import config as asm_config @@ -26,48 +23,20 @@ def get_version() -> str: return "" -def patch(): - if not asm_config._iast_enabled: - return - - if not getattr(os, "_datadog_cmdi_patch", False): - # all os.spawn* variants eventually use this one: - try_wrap_function_wrapper("os", "_spawnvef", _iast_cmdi_osspawn) - - if not getattr(subprocess, "_datadog_cmdi_patch", False): - try_wrap_function_wrapper("subprocess", "Popen.__init__", _iast_cmdi_subprocess_init) +_IAST_CMDI = "iast_cmdi" - os._datadog_cmdi_patch = True - subprocess._datadog_cmdi_patch = True - _set_metric_iast_instrumented_sink(VULN_CMDI) +def patch(): + if asm_config._iast_enabled: + subprocess_patch.patch() + subprocess_patch.add_str_callback(_IAST_CMDI, _iast_report_cmdi) + subprocess_patch.add_lst_callback(_IAST_CMDI, _iast_report_cmdi) + _set_metric_iast_instrumented_sink(VULN_CMDI) def unpatch() -> None: - try_unwrap("os", "system") - try_unwrap("os", "_spawnvef") - try_unwrap("subprocess", "Popen.__init__") - - os._datadog_cmdi_patch = False # type: ignore[attr-defined] - subprocess._datadog_cmdi_patch = False # type: ignore[attr-defined] - - -def _iast_cmdi_osspawn(wrapped, instance, args, kwargs): - mode, file, func_args, _, _ = args - _iast_report_cmdi(func_args) - - if hasattr(wrapped, "__func__"): - return wrapped.__func__(instance, *args, **kwargs) - return wrapped(*args, **kwargs) - - -def _iast_cmdi_subprocess_init(wrapped, instance, args, kwargs): - cmd_args = args[0] if len(args) else kwargs["args"] - _iast_report_cmdi(cmd_args) - - if hasattr(wrapped, "__func__"): - return wrapped.__func__(instance, *args, **kwargs) - return wrapped(*args, **kwargs) + subprocess_patch.del_str_callback(_IAST_CMDI) + subprocess_patch.del_lst_callback(_IAST_CMDI) @oce.register diff --git a/ddtrace/appsec/_metrics.py b/ddtrace/appsec/_metrics.py index 
f8713dc5ea7..cbe8490d717 100644 --- a/ddtrace/appsec/_metrics.py +++ b/ddtrace/appsec/_metrics.py @@ -1,4 +1,5 @@ from ddtrace.appsec import _asm_request_context +from ddtrace.appsec import _constants from ddtrace.appsec._ddwaf import version as _version from ddtrace.appsec._deduplications import deduplication from ddtrace.internal import telemetry @@ -64,6 +65,15 @@ def _set_waf_init_metric(info): log.warning("Error reporting ASM WAF init metrics", exc_info=True) +_TYPES_AND_TAGS = { + _constants.EXPLOIT_PREVENTION.TYPE.CMDI: (("rule_type", "command_injection"), ("rule_variant", "exec")), + _constants.EXPLOIT_PREVENTION.TYPE.SHI: (("rule_type", "command_injection"), ("rule_variant", "shell")), + _constants.EXPLOIT_PREVENTION.TYPE.LFI: (("rule_type", "lfi"),), + _constants.EXPLOIT_PREVENTION.TYPE.SSRF: (("rule_type", "ssrf"),), + _constants.EXPLOIT_PREVENTION.TYPE.SQLI: (("rule_type", "sql_injection"),), +} + + def _set_waf_request_metrics(*args): try: result = _asm_request_context.get_waf_telemetry_results() @@ -94,10 +104,7 @@ def _set_waf_request_metrics(*args): TELEMETRY_NAMESPACE_TAG_APPSEC, n, float(value), - tags=( - ("rule_type", rule_type), - ("waf_version", DDWAF_VERSION), - ), + tags=_TYPES_AND_TAGS.get(rule_type, ()) + (("waf_version", DDWAF_VERSION),), ) except Exception: diff --git a/ddtrace/appsec/_processor.py b/ddtrace/appsec/_processor.py index 06328d1201a..54a9f624afe 100644 --- a/ddtrace/appsec/_processor.py +++ b/ddtrace/appsec/_processor.py @@ -202,6 +202,10 @@ def _update_rules(self, new_rules: Dict[str, Any]) -> bool: def rasp_lfi_enabled(self) -> bool: return WAF_DATA_NAMES.LFI_ADDRESS in self._addresses_to_keep + @property + def rasp_shi_enabled(self) -> bool: + return WAF_DATA_NAMES.SHI_ADDRESS in self._addresses_to_keep + @property def rasp_cmdi_enabled(self) -> bool: return WAF_DATA_NAMES.CMDI_ADDRESS in self._addresses_to_keep diff --git a/ddtrace/appsec/_python_info/stdlib/__init__.py b/ddtrace/appsec/_python_info/stdlib/__init__.py index a040e57f859..e745c392f55 100644 --- a/ddtrace/appsec/_python_info/stdlib/__init__.py +++ b/ddtrace/appsec/_python_info/stdlib/__init__.py @@ -19,5 +19,5 @@ from .module_names_py312 import STDLIB_MODULE_NAMES -def _stdlib_for_python_version(): # type: () -> set +def _stdlib_for_python_version(): # type: () -> set[str] return STDLIB_MODULE_NAMES diff --git a/ddtrace/contrib/internal/openai/_endpoint_hooks.py b/ddtrace/contrib/internal/openai/_endpoint_hooks.py index 73a2b2511c9..979e1774a8a 100644 --- a/ddtrace/contrib/internal/openai/_endpoint_hooks.py +++ b/ddtrace/contrib/internal/openai/_endpoint_hooks.py @@ -255,6 +255,14 @@ def _record_request(self, pin, integration, span, args, kwargs): span.set_tag_str("openai.request.messages.%d.content" % idx, integration.trunc(str(content))) span.set_tag_str("openai.request.messages.%d.role" % idx, str(role)) span.set_tag_str("openai.request.messages.%d.name" % idx, str(name)) + if parse_version(OPENAI_VERSION) >= (1, 26) and kwargs.get("stream"): + if kwargs.get("stream_options", {}).get("include_usage", None) is not None: + # Only perform token chunk auto-extraction if this option is not explicitly set + return + span._set_ctx_item("_dd.auto_extract_token_chunk", True) + stream_options = kwargs.get("stream_options", {}) + stream_options["include_usage"] = True + kwargs["stream_options"] = stream_options def _record_response(self, pin, integration, span, args, kwargs, resp, error): resp = super()._record_response(pin, integration, span, args, kwargs, resp, error) diff --git 
a/ddtrace/contrib/internal/openai/utils.py b/ddtrace/contrib/internal/openai/utils.py index d967383e366..f5dfc10efef 100644 --- a/ddtrace/contrib/internal/openai/utils.py +++ b/ddtrace/contrib/internal/openai/utils.py @@ -48,11 +48,28 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.__wrapped__.__exit__(exc_type, exc_val, exc_tb) def __iter__(self): - return self + exception_raised = False + try: + for chunk in self.__wrapped__: + self._extract_token_chunk(chunk) + yield chunk + _loop_handler(self._dd_span, chunk, self._streamed_chunks) + except Exception: + self._dd_span.set_exc_info(*sys.exc_info()) + exception_raised = True + raise + finally: + if not exception_raised: + _process_finished_stream( + self._dd_integration, self._dd_span, self._kwargs, self._streamed_chunks, self._is_completion + ) + self._dd_span.finish() + self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) def __next__(self): try: chunk = self.__wrapped__.__next__() + self._extract_token_chunk(chunk) _loop_handler(self._dd_span, chunk, self._streamed_chunks) return chunk except StopIteration: @@ -68,6 +85,22 @@ def __next__(self): self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) raise + def _extract_token_chunk(self, chunk): + """Attempt to extract the token chunk (last chunk in the stream) from the streamed response.""" + if not self._dd_span._get_ctx_item("_dd.auto_extract_token_chunk"): + return + choice = getattr(chunk, "choices", [None])[0] + if not getattr(choice, "finish_reason", None): + # Only the second-last chunk in the stream with token usage enabled will have finish_reason set + return + try: + # User isn't expecting last token chunk to be present since it's not part of the default streamed response, + # so we consume it and extract the token usage metadata before it reaches the user. 
+ usage_chunk = self.__wrapped__.__next__() + self._streamed_chunks[0].insert(0, usage_chunk) + except (StopIteration, GeneratorExit): + return + class TracedOpenAIAsyncStream(BaseTracedOpenAIStream): async def __aenter__(self): @@ -77,12 +110,29 @@ async def __aenter__(self): async def __aexit__(self, exc_type, exc_val, exc_tb): await self.__wrapped__.__aexit__(exc_type, exc_val, exc_tb) - def __aiter__(self): - return self + async def __aiter__(self): + exception_raised = False + try: + async for chunk in self.__wrapped__: + await self._extract_token_chunk(chunk) + yield chunk + _loop_handler(self._dd_span, chunk, self._streamed_chunks) + except Exception: + self._dd_span.set_exc_info(*sys.exc_info()) + exception_raised = True + raise + finally: + if not exception_raised: + _process_finished_stream( + self._dd_integration, self._dd_span, self._kwargs, self._streamed_chunks, self._is_completion + ) + self._dd_span.finish() + self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) async def __anext__(self): try: chunk = await self.__wrapped__.__anext__() + await self._extract_token_chunk(chunk) _loop_handler(self._dd_span, chunk, self._streamed_chunks) return chunk except StopAsyncIteration: @@ -98,6 +148,19 @@ async def __anext__(self): self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) raise + async def _extract_token_chunk(self, chunk): + """Attempt to extract the token chunk (last chunk in the stream) from the streamed response.""" + if not self._dd_span._get_ctx_item("_dd.auto_extract_token_chunk"): + return + choice = getattr(chunk, "choices", [None])[0] + if not getattr(choice, "finish_reason", None): + return + try: + usage_chunk = await self.__wrapped__.__anext__() + self._streamed_chunks[0].insert(0, usage_chunk) + except (StopAsyncIteration, GeneratorExit): + return + def _compute_token_count(content, model): # type: (Union[str, List[int]], Optional[str]) -> Tuple[bool, int] diff --git a/ddtrace/contrib/internal/subprocess/patch.py b/ddtrace/contrib/internal/subprocess/patch.py index 7380e72fdaf..76530c195df 100644 --- a/ddtrace/contrib/internal/subprocess/patch.py +++ b/ddtrace/contrib/internal/subprocess/patch.py @@ -4,8 +4,8 @@ import os import re import shlex -import subprocess # nosec from threading import RLock +from typing import Callable # noqa:F401 from typing import Deque # noqa:F401 from typing import Dict # noqa:F401 from typing import List # noqa:F401 @@ -33,45 +33,71 @@ ) -def get_version(): - # type: () -> str +def get_version() -> str: return "" -def patch(): - # type: () -> List[str] - patched = [] # type: List[str] - if not asm_config._asm_enabled: - return patched +_STR_CALLBACKS: Dict[str, Callable[[str], None]] = {} +_LST_CALLBACKS: Dict[str, Callable[[Union[List[str], str]], None]] = {} - import os - if not getattr(os, "_datadog_patch", False): - Pin().onto(os) - trace_utils.wrap(os, "system", _traced_ossystem(os)) - trace_utils.wrap(os, "fork", _traced_fork(os)) +def add_str_callback(name: str, callback: Callable[[str], None]): + _STR_CALLBACKS[name] = callback + + +def del_str_callback(name: str): + _STR_CALLBACKS.pop(name, None) + + +def add_lst_callback(name: str, callback: Callable[[Union[List[str], str]], None]): + _LST_CALLBACKS[name] = callback + + +def del_lst_callback(name: str): + _LST_CALLBACKS.pop(name, None) + - # all os.spawn* variants eventually use this one: - trace_utils.wrap(os, "_spawnvef", _traced_osspawn(os)) +def patch() -> List[str]: + if not 
(asm_config._asm_enabled or asm_config._iast_enabled): + return [] + patched: List[str] = [] + import os # nosec + import subprocess # nosec + + should_patch_system = not trace_utils.iswrapped(os.system) + should_patch_fork = not trace_utils.iswrapped(os.fork) + spawnvef = getattr(os, "_spawnvef", None) + should_patch_spawnvef = spawnvef is not None and not trace_utils.iswrapped(spawnvef) + + if should_patch_system or should_patch_fork or should_patch_spawnvef: + Pin().onto(os) + if should_patch_system: + trace_utils.wrap(os, "system", _traced_ossystem(os)) + if should_patch_fork: + trace_utils.wrap(os, "fork", _traced_fork(os)) + if should_patch_spawnvef: + # all os.spawn* variants eventually use this one: + trace_utils.wrap(os, "_spawnvef", _traced_osspawn(os)) patched.append("os") - if not getattr(subprocess, "_datadog_patch", False): + should_patch_Popen_init = not trace_utils.iswrapped(subprocess.Popen.__init__) + should_patch_Popen_wait = not trace_utils.iswrapped(subprocess.Popen.wait) + if should_patch_Popen_init or should_patch_Popen_wait: Pin().onto(subprocess) # We store the parameters on __init__ in the context and set the tags on wait # (where all the Popen objects eventually arrive, unless killed before it) - trace_utils.wrap(subprocess, "Popen.__init__", _traced_subprocess_init(subprocess)) - trace_utils.wrap(subprocess, "Popen.wait", _traced_subprocess_wait(subprocess)) - - os._datadog_patch = True - subprocess._datadog_patch = True + if should_patch_Popen_init: + trace_utils.wrap(subprocess, "Popen.__init__", _traced_subprocess_init(subprocess)) + if should_patch_Popen_wait: + trace_utils.wrap(subprocess, "Popen.wait", _traced_subprocess_wait(subprocess)) patched.append("subprocess") return patched @dataclass(eq=False) -class SubprocessCmdLineCacheEntry(object): +class SubprocessCmdLineCacheEntry: binary: Optional[str] = None arguments: Optional[List] = None truncated: bool = False @@ -80,10 +106,10 @@ class SubprocessCmdLineCacheEntry(object): as_string: Optional[str] = None -class SubprocessCmdLine(object): +class SubprocessCmdLine: # This catches the computed values into a SubprocessCmdLineCacheEntry object - _CACHE = {} # type: Dict[str, SubprocessCmdLineCacheEntry] - _CACHE_DEQUE = collections.deque() # type: Deque[str] + _CACHE: Dict[str, SubprocessCmdLineCacheEntry] = {} + _CACHE_DEQUE: Deque[str] = collections.deque() _CACHE_MAXSIZE = 32 _CACHE_LOCK = RLock() @@ -138,8 +164,7 @@ def _clear_cache(cls): ] _COMPILED_ENV_VAR_REGEXP = re.compile(r"\b[A-Z_]+=\w+") - def __init__(self, shell_args, shell=False): - # type: (Union[str, List[str]], bool) -> None + def __init__(self, shell_args: Union[str, List[str]], shell: bool = False) -> None: cache_key = str(shell_args) + str(shell) self._cache_entry = SubprocessCmdLine._CACHE.get(cache_key) if self._cache_entry: @@ -250,8 +275,7 @@ def scrub_arguments(self): self.arguments = new_args - def truncate_string(self, str_): - # type: (str) -> str + def truncate_string(self, str_: str) -> str: oversize = len(str_) - self.TRUNCATE_LIMIT if oversize <= 0: @@ -263,9 +287,7 @@ def truncate_string(self, str_): msg = ' "4kB argument truncated by %d characters"' % oversize return str_[0 : -(oversize + len(msg))] + msg - def _as_list_and_string(self): - # type: () -> Tuple[list[str], str] - + def _as_list_and_string(self) -> Tuple[List[str], str]: total_list = self.env_vars + [self.binary] + self.arguments truncated_str = self.truncate_string(shjoin(total_list)) truncated_list = shlex.split(truncated_str) @@ -290,8 +312,10 @@ def 
as_string(self): return str_res -def unpatch(): - # type: () -> None +def unpatch() -> None: + import os # nosec + import subprocess # nosec + trace_utils.unwrap(os, "system") trace_utils.unwrap(os, "_spawnvef") trace_utils.unwrap(subprocess.Popen, "__init__") @@ -299,13 +323,13 @@ def unpatch(): SubprocessCmdLine._clear_cache() - os._datadog_patch = False - subprocess._datadog_patch = False - @trace_utils.with_traced_module def _traced_ossystem(module, pin, wrapped, instance, args, kwargs): try: + if isinstance(args[0], str): + for callback in _STR_CALLBACKS.values(): + callback(args[0]) shellcmd = SubprocessCmdLine(args[0], shell=True) # nosec with pin.tracer.trace(COMMANDS.SPAN_NAME, resource=shellcmd.binary, span_type=SpanTypes.SYSTEM) as span: @@ -342,6 +366,10 @@ def _traced_fork(module, pin, wrapped, instance, args, kwargs): def _traced_osspawn(module, pin, wrapped, instance, args, kwargs): try: mode, file, func_args, _, _ = args + if isinstance(func_args, (list, tuple, str)): + commands = [file] + list(func_args) + for callback in _LST_CALLBACKS.values(): + callback(commands) shellcmd = SubprocessCmdLine(func_args, shell=False) with pin.tracer.trace(COMMANDS.SPAN_NAME, resource=shellcmd.binary, span_type=SpanTypes.SYSTEM) as span: @@ -366,6 +394,13 @@ def _traced_osspawn(module, pin, wrapped, instance, args, kwargs): def _traced_subprocess_init(module, pin, wrapped, instance, args, kwargs): try: cmd_args = args[0] if len(args) else kwargs["args"] + if isinstance(cmd_args, (list, tuple, str)): + if kwargs.get("shell", False): + for callback in _STR_CALLBACKS.values(): + callback(cmd_args) + else: + for callback in _LST_CALLBACKS.values(): + callback(cmd_args) cmd_args_list = shlex.split(cmd_args) if isinstance(cmd_args, str) else cmd_args is_shell = kwargs.get("shell", False) shellcmd = SubprocessCmdLine(cmd_args_list, shell=is_shell) # nosec diff --git a/ddtrace/debugging/_safety.py b/ddtrace/debugging/_safety.py index 118deddef40..92b38ff6bdc 100644 --- a/ddtrace/debugging/_safety.py +++ b/ddtrace/debugging/_safety.py @@ -1,5 +1,6 @@ from inspect import CO_VARARGS from inspect import CO_VARKEYWORDS +from itertools import chain from types import FrameType from typing import Any from typing import Dict @@ -23,11 +24,11 @@ def get_args(frame: FrameType) -> Iterator[Tuple[str, Any]]: def get_locals(frame: FrameType) -> Iterator[Tuple[str, Any]]: code = frame.f_code + _locals = frame.f_locals nargs = code.co_argcount + bool(code.co_flags & CO_VARARGS) + bool(code.co_flags & CO_VARKEYWORDS) - names = code.co_varnames[nargs:] - values = (frame.f_locals.get(name) for name in names) - - return zip(names, values) + return ( + (name, _locals.get(name)) for name in chain(code.co_varnames[nargs:], code.co_freevars, code.co_cellvars) + ) # include freevars and cellvars def get_globals(frame: FrameType) -> Iterator[Tuple[str, Any]]: diff --git a/ddtrace/internal/debug.py b/ddtrace/internal/debug.py index 4d533b604b6..c33ff5ad46d 100644 --- a/ddtrace/internal/debug.py +++ b/ddtrace/internal/debug.py @@ -117,8 +117,8 @@ def collect(tracer): from ddtrace._trace.tracer import log return dict( - # Timestamp UTC ISO 8601 - date=datetime.datetime.utcnow().isoformat(), + # Timestamp UTC ISO 8601 with the trailing +00:00 removed + date=datetime.datetime.now(datetime.timezone.utc).isoformat()[0:-6], # eg. "Linux", "Darwin" os_name=platform.system(), # eg. 
12.5.0 diff --git a/ddtrace/internal/telemetry/writer.py b/ddtrace/internal/telemetry/writer.py index 71de6b03907..2be240c06fd 100644 --- a/ddtrace/internal/telemetry/writer.py +++ b/ddtrace/internal/telemetry/writer.py @@ -118,11 +118,17 @@ def send_event(self, request: Dict) -> Optional[httplib.HTTPResponse]: conn.request("POST", self._endpoint, rb_json, headers) resp = get_connection_response(conn) if resp.status < 300: - log.debug("sent %d in %.5fs to %s. response: %s", len(rb_json), sw.elapsed(), self.url, resp.status) + log.debug( + "Instrumentation Telemetry sent %d in %.5fs to %s. response: %s", + len(rb_json), + sw.elapsed(), + self.url, + resp.status, + ) else: - log.debug("failed to send telemetry to %s. response: %s", self.url, resp.status) - except Exception: - log.debug("failed to send telemetry to %s.", self.url, exc_info=True) + log.debug("Failed to send Instrumentation Telemetry to %s. response: %s", self.url, resp.status) + except Exception as e: + log.debug("Failed to send Instrumentation Telemetry to %s. Error: %s", self.url, str(e)) finally: if conn is not None: conn.close() diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py new file mode 100644 index 00000000000..23aa4cd3caa --- /dev/null +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -0,0 +1,213 @@ +import traceback +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +from ddtrace.internal.logger import get_logger +from ddtrace.internal.telemetry import telemetry_writer +from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT +from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL +from ddtrace.internal.utils.version import parse_version +from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS +from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS +from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX + + +logger = get_logger(__name__) + + +class RagasDependencies: + """ + A helper class to store instances of ragas classes and functions + that may or may not exist in a user's environment. + """ + + def __init__(self): + import ragas + + self.ragas_version = parse_version(ragas.__version__) + if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): + raise NotImplementedError( + "Ragas version: {} is not supported".format(self.ragas_version), + ) + + from ragas.llms import llm_factory + + self.llm_factory = llm_factory + + from ragas.llms.output_parser import RagasoutputParser + + self.RagasoutputParser = RagasoutputParser + + from ragas.metrics import context_precision + + self.context_precision = context_precision + + from ragas.metrics.base import ensembler + + self.ensembler = ensembler + + from ragas.metrics import faithfulness + + self.faithfulness = faithfulness + + from ragas.metrics.base import get_segmenter + + self.get_segmenter = get_segmenter + + from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers + + self.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers + + from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers + + self.StatementsAnswers = StatementsAnswers + + +def _get_ml_app_for_ragas_trace(span_event: dict) -> str: + """ + The `ml_app` for spans generated from ragas traces will be named `dd-ragas-{ml_app}`, + or `dd-ragas` if `ml_app` is not present in the span event. 
+ """ + tags: List[str] = span_event.get("tags", []) + ml_app = None + for tag in tags: + if isinstance(tag, str) and tag.startswith("ml_app:"): + ml_app = tag.split(":")[1] + break + if not ml_app: + return RAGAS_ML_APP_PREFIX + return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) + + +class BaseRagasEvaluator: + """A class used by EvaluatorRunner to conduct ragas evaluations + on LLM Observability span events. The job of an Evaluator is to take a span and + submit evaluation metrics based on the span's attributes. + + Extenders of this class should only need to implement the `evaluate` method. + """ + + LABEL = "ragas" + METRIC_TYPE = "score" + + def __init__(self, llmobs_service): + """ + Initialize an evaluator that uses the ragas library to generate a score on finished LLM spans. + + :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and + submitting evaluation metrics. + + Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. + """ + self.llmobs_service = llmobs_service + self.ragas_version = "unknown" + telemetry_state = "ok" + try: + self.ragas_dependencies = RagasDependencies() + self.ragas_version = self.ragas_dependencies.ragas_version + except ImportError as e: + telemetry_state = "fail_import_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except AttributeError as e: + telemetry_state = "fail_attribute_error" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except NotImplementedError as e: + telemetry_state = "fail_not_supported" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + except Exception as e: + telemetry_state = "fail_unknown" + raise NotImplementedError("Failed to load dependencies for `{}` evaluator".format(self.LABEL)) from e + finally: + telemetry_writer.add_count_metric( + namespace=TELEMETRY_APM_PRODUCT.LLMOBS, + name="evaluators.init", + value=1, + tags=( + ("evaluator_label", self.LABEL), + ("state", telemetry_state), + ("evaluator_version", self.ragas_version), + ), + ) + if telemetry_state != "ok": + telemetry_writer.add_log( + level=TELEMETRY_LOG_LEVEL.ERROR, + message="Failed to import Ragas dependencies", + stack_trace=traceback.format_exc(), + tags={"evaluator_version": self.ragas_version}, + ) + + def run_and_submit_evaluation(self, span_event: dict): + if not span_event: + return + score_result_or_failure, metric_metadata = self.evaluate(span_event) + telemetry_writer.add_count_metric( + TELEMETRY_APM_PRODUCT.LLMOBS, + "evaluators.run", + 1, + tags=( + ("evaluator_label", self.LABEL), + ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), + ("evaluator_version", self.ragas_version), + ), + ) + if isinstance(score_result_or_failure, float): + self.llmobs_service.submit_evaluation( + span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, + label=self.LABEL, + metric_type=self.METRIC_TYPE, + value=score_result_or_failure, + metadata=metric_metadata, + ) + + def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: + raise NotImplementedError("evaluate method must be implemented by individual evaluators") + + def _extract_evaluation_inputs_from_span(self, span_event: dict) -> Optional[dict]: + """ + Extracts the question, answer, and context used as inputs for a ragas evaluation on a span 
event. + """ + with self.llmobs_service.workflow("dd-ragas.extract_evaluation_inputs_from_span") as extract_inputs_workflow: + self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) + question, answer, contexts = None, None, None + + meta_io = span_event.get("meta") + if meta_io is None: + return None + + meta_input = meta_io.get("input") + meta_output = meta_io.get("output") + + if not (meta_input and meta_output): + return None + + prompt = meta_input.get("prompt") + if prompt is None: + logger.debug("Failed to extract `prompt` from span for ragas evaluation") + return None + prompt_variables = prompt.get("variables") + + input_messages = meta_input.get("messages") + + messages = meta_output.get("messages") + if messages is not None and len(messages) > 0: + answer = messages[-1].get("content") + + if prompt_variables: + context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) + question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) + contexts = [prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)] + question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) + + if not question and input_messages is not None and len(input_messages) > 0: + question = input_messages[-1].get("content") + + self.llmobs_service.annotate( + span=extract_inputs_workflow, output_data={"question": question, "contexts": contexts, "answer": answer} + ) + if any(field is None for field in (question, contexts, answer)): + logger.debug("Failed to extract inputs required for ragas evaluation") + return None + + return {"question": question, "contexts": contexts, "answer": answer} diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py index d651c2443a4..98725b1f27e 100644 --- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py +++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py @@ -1,73 +1,22 @@ import json import math -import traceback from typing import List from typing import Optional from typing import Tuple from typing import Union from ddtrace.internal.logger import get_logger -from ddtrace.internal.telemetry import telemetry_writer -from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT -from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL -from ddtrace.internal.utils.version import parse_version from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA -from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS -from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX +from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator +from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace logger = get_logger(__name__) -class MiniRagas: - """ - A helper class to store instances of ragas classes and functions - that may or may not exist in a user's environment. - """ - - llm_factory = None - RagasoutputParser = None - faithfulness = None - ensembler = None - get_segmenter = None - StatementFaithfulnessAnswers = None - StatementsAnswers = None - - -def _get_ml_app_for_ragas_trace(span_event: dict) -> str: - """ - The `ml_app` spans generated from traces of ragas will be named as `dd-ragas-` - or `dd-ragas` if `ml_app` is not present in the span event. 
- """ - tags = span_event.get("tags", []) # list[str] - ml_app = None - for tag in tags: - if isinstance(tag, str) and tag.startswith("ml_app:"): - ml_app = tag.split(":")[1] - break - if not ml_app: - return RAGAS_ML_APP_PREFIX - return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) - - -def _get_faithfulness_instance() -> Optional[object]: - """ - This helper function ensures the faithfulness instance used in - ragas evaluator is updated with the latest ragas faithfulness - instance AND has an non-null llm - """ - if MiniRagas.faithfulness is None: - return None - ragas_faithfulness_instance = MiniRagas.faithfulness - if not ragas_faithfulness_instance.llm: - ragas_faithfulness_instance.llm = MiniRagas.llm_factory() - return ragas_faithfulness_instance - - -class RagasFaithfulnessEvaluator: +class RagasFaithfulnessEvaluator(BaseRagasEvaluator): """A class used by EvaluatorRunner to conduct ragas faithfulness evaluations on LLM Observability span events. The job of an Evaluator is to take a span and submit evaluation metrics based on the span's attributes. @@ -95,98 +44,30 @@ def __init__(self, llmobs_service): Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported. """ - self.llmobs_service = llmobs_service - self.ragas_version = "unknown" - telemetry_state = "ok" - try: - import ragas - - self.ragas_version = parse_version(ragas.__version__) - if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): - raise NotImplementedError( - "Ragas version: {} is not supported for `ragas_faithfulness` evaluator".format(self.ragas_version), - ) - - from ragas.llms import llm_factory - - MiniRagas.llm_factory = llm_factory - - from ragas.llms.output_parser import RagasoutputParser - - MiniRagas.RagasoutputParser = RagasoutputParser - - from ragas.metrics import faithfulness - - MiniRagas.faithfulness = faithfulness - - from ragas.metrics.base import ensembler - - MiniRagas.ensembler = ensembler - - from ragas.metrics.base import get_segmenter - - MiniRagas.get_segmenter = get_segmenter - - from ddtrace.llmobs._evaluators.ragas.models import StatementFaithfulnessAnswers - - MiniRagas.StatementFaithfulnessAnswers = StatementFaithfulnessAnswers - - from ddtrace.llmobs._evaluators.ragas.models import StatementsAnswers - - MiniRagas.StatementsAnswers = StatementsAnswers - except Exception as e: - telemetry_state = "fail" - telemetry_writer.add_log( - level=TELEMETRY_LOG_LEVEL.ERROR, - message="Failed to import Ragas dependencies", - stack_trace=traceback.format_exc(), - tags={"ragas_version": self.ragas_version}, - ) - raise NotImplementedError("Failed to load dependencies for `ragas_faithfulness` evaluator") from e - finally: - telemetry_writer.add_count_metric( - namespace=TELEMETRY_APM_PRODUCT.LLMOBS, - name="evaluators.init", - value=1, - tags=( - ("evaluator_label", self.LABEL), - ("state", telemetry_state), - ("ragas_version", self.ragas_version), - ), - ) - - self.ragas_faithfulness_instance = _get_faithfulness_instance() - self.llm_output_parser_for_generated_statements = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementsAnswers + super().__init__(llmobs_service) + self.ragas_faithfulness_instance = self._get_faithfulness_instance() + self.llm_output_parser_for_generated_statements = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementsAnswers ) - self.llm_output_parser_for_faithfulness_score = MiniRagas.RagasoutputParser( - pydantic_object=MiniRagas.StatementFaithfulnessAnswers + 
self.llm_output_parser_for_faithfulness_score = self.ragas_dependencies.RagasoutputParser( + pydantic_object=self.ragas_dependencies.StatementFaithfulnessAnswers + ) - self.split_answer_into_sentences = MiniRagas.get_segmenter( + self.split_answer_into_sentences = self.ragas_dependencies.get_segmenter( language=self.ragas_faithfulness_instance.nli_statements_message.language, clean=False ) - def run_and_submit_evaluation(self, span_event: dict): - if not span_event: - return - score_result_or_failure, metric_metadata = self.evaluate(span_event) - telemetry_writer.add_count_metric( - TELEMETRY_APM_PRODUCT.LLMOBS, - "evaluators.run", - 1, - tags=( - ("evaluator_label", self.LABEL), - ("state", score_result_or_failure if isinstance(score_result_or_failure, str) else "success"), - ), - ) - if isinstance(score_result_or_failure, float): - self.llmobs_service.submit_evaluation( - span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")}, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=score_result_or_failure, - metadata=metric_metadata, - ) + def _get_faithfulness_instance(self) -> Optional[object]: + """ + This helper function ensures the faithfulness instance used in + ragas evaluator is updated with the latest ragas faithfulness + instance AND has a non-null llm + """ + if self.ragas_dependencies.faithfulness is None: + return None + ragas_faithfulness_instance = self.ragas_dependencies.faithfulness + if not ragas_faithfulness_instance.llm: + ragas_faithfulness_instance.llm = self.ragas_dependencies.llm_factory() + return ragas_faithfulness_instance def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: """ @@ -196,7 +77,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] If the ragas faithfulness instance does not have `llm` set, we set `llm` using the `llm_factory()` method from ragas which defaults to openai's gpt-4o-turbo. 
""" - self.ragas_faithfulness_instance = _get_faithfulness_instance() + self.ragas_faithfulness_instance = self._get_faithfulness_instance() if not self.ragas_faithfulness_instance: return "fail_faithfulness_is_none", {} @@ -220,16 +101,16 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] span=ragas_faithfulness_workflow ) - faithfulness_inputs = self._extract_faithfulness_inputs(span_event) + faithfulness_inputs = self._extract_evaluation_inputs_from_span(span_event) if faithfulness_inputs is None: logger.debug( - "Failed to extract question and context from span sampled for ragas_faithfulness evaluation" + "Failed to extract evaluation inputs from span sampled for `ragas_faithfulness` evaluation" ) return "fail_extract_faithfulness_inputs", evaluation_metadata question = faithfulness_inputs["question"] answer = faithfulness_inputs["answer"] - context = faithfulness_inputs["context"] + context = " ".join(faithfulness_inputs["contexts"]) statements = self._create_statements(question, answer) if statements is None: @@ -318,9 +199,9 @@ def _create_verdicts(self, context: str, statements: List[str]): return None # collapse multiple generations into a single faithfulness list - faithfulness_list = MiniRagas.ensembler.from_discrete(raw_faithfulness_list, "verdict") # type: ignore + faithfulness_list = self.ragas_dependencies.ensembler.from_discrete(raw_faithfulness_list, "verdict") try: - return MiniRagas.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) # type: ignore + return self.ragas_dependencies.StatementFaithfulnessAnswers.parse_obj(faithfulness_list) except Exception as e: logger.debug("Failed to parse faithfulness_list", exc_info=e) return None @@ -330,59 +211,6 @@ def _create_verdicts(self, context: str, statements: List[str]): output_data=faithfulness_list, ) - def _extract_faithfulness_inputs(self, span_event: dict) -> Optional[dict]: - """ - Extracts the question, answer, and context used as inputs to faithfulness - evaluation from a span event. 
- - question - input.prompt.variables.question OR input.messages[-1].content - context - input.prompt.variables.context - answer - output.messages[-1].content - """ - with self.llmobs_service.workflow("dd-ragas.extract_faithfulness_inputs") as extract_inputs_workflow: - self.llmobs_service.annotate(span=extract_inputs_workflow, input_data=span_event) - question, answer, context = None, None, None - - meta_io = span_event.get("meta") - if meta_io is None: - return None - - meta_input = meta_io.get("input") - meta_output = meta_io.get("output") - - if not (meta_input and meta_output): - return None - - prompt = meta_input.get("prompt") - if prompt is None: - logger.debug("Failed to extract `prompt` from span for `ragas_faithfulness` evaluation") - return None - prompt_variables = prompt.get("variables") - - input_messages = meta_input.get("messages") - - messages = meta_output.get("messages") - if messages is not None and len(messages) > 0: - answer = messages[-1].get("content") - - if prompt_variables: - context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) - question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) - context = " ".join([prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)]) - question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) - - if not question and input_messages is not None and len(input_messages) > 0: - question = input_messages[-1].get("content") - - self.llmobs_service.annotate( - span=extract_inputs_workflow, output_data={"question": question, "context": context, "answer": answer} - ) - if any(field is None for field in (question, context, answer)): - logger.debug("Failed to extract inputs required for faithfulness evaluation") - return None - - return {"question": question, "context": context, "answer": answer} - def _create_statements_prompt(self, answer, question): # Returns: `ragas.llms.PromptValue` object with self.llmobs_service.task("dd-ragas.create_statements_prompt"): diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index bf45e618e01..3d26998f1b4 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -64,13 +64,15 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None): ("state", evaluator_init_state), ), ) + else: + raise ValueError("Parsed unsupported evaluator: {}".format(evaluator)) def start(self, *args, **kwargs): if not self.evaluators: logger.debug("no evaluators configured, not starting %r", self.__class__.__name__) return super(EvaluatorRunner, self).start() - logger.debug("started %r to %r", self.__class__.__name__) + logger.debug("started %r", self.__class__.__name__) def _stop_service(self) -> None: """ diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 49815151118..b4f1dc1b2f6 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -3,7 +3,9 @@ import time from typing import Any from typing import Dict +from typing import List from typing import Optional +from typing import Tuple from typing import Union import ddtrace @@ -11,8 +13,12 @@ from ddtrace import config from ddtrace import patch from ddtrace._trace.context import Context +from ddtrace.constants import ERROR_MSG +from ddtrace.constants import ERROR_STACK +from ddtrace.constants import ERROR_TYPE from ddtrace.ext import SpanTypes from ddtrace.internal import atexit +from ddtrace.internal import core from ddtrace.internal import forksafe 
from ddtrace.internal._rand import rand64bits from ddtrace.internal.compat import ensure_text @@ -22,8 +28,10 @@ from ddtrace.internal.service import ServiceStatusError from ddtrace.internal.telemetry import telemetry_writer from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT +from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning from ddtrace.internal.utils.formats import asbool from ddtrace.internal.utils.formats import parse_tags_str +from ddtrace.llmobs import _constants as constants from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID from ddtrace.llmobs._constants import INPUT_DOCUMENTS from ddtrace.llmobs._constants import INPUT_MESSAGES @@ -45,11 +53,11 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._evaluators.runner import EvaluatorRunner -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor from ddtrace.llmobs._utils import AnnotationContext from ddtrace.llmobs._utils import _get_llmobs_parent_id from ddtrace.llmobs._utils import _get_ml_app from ddtrace.llmobs._utils import _get_session_id +from ddtrace.llmobs._utils import _get_span_name from ddtrace.llmobs._utils import _inject_llmobs_parent_id from ddtrace.llmobs._utils import safe_json from ddtrace.llmobs._utils import validate_prompt @@ -59,6 +67,7 @@ from ddtrace.llmobs.utils import ExportedLLMObsSpan from ddtrace.llmobs.utils import Messages from ddtrace.propagation.http import HTTPPropagator +from ddtrace.vendor.debtcollector import deprecate log = get_logger(__name__) @@ -81,34 +90,157 @@ class LLMObs(Service): def __init__(self, tracer=None): super(LLMObs, self).__init__() self.tracer = tracer or ddtrace.tracer - self._llmobs_span_writer = None - self._llmobs_span_writer = LLMObsSpanWriter( is_agentless=config._llmobs_agentless_enabled, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._llmobs_eval_metric_writer = LLMObsEvalMetricWriter( site=config._dd_site, api_key=config._dd_api_key, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._evaluator_runner = EvaluatorRunner( interval=float(os.getenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 1.0)), llmobs_service=self, ) - self._trace_processor = LLMObsTraceProcessor(self._llmobs_span_writer, self._evaluator_runner) forksafe.register(self._child_after_fork) self._annotations = [] self._annotation_context_lock = forksafe.RLock() - self.tracer.on_start_span(self._do_annotations) - def _do_annotations(self, span): + def _on_span_start(self, span): + if self.enabled and span.span_type == SpanTypes.LLM: + self._do_annotations(span) + + def _on_span_finish(self, span): + if self.enabled and span.span_type == SpanTypes.LLM: + self._submit_llmobs_span(span) + + def _submit_llmobs_span(self, span: Span) -> None: + """Generate and submit an LLMObs span event to be sent to LLMObs.""" + span_event = None + is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" + is_ragas_integration_span = False + try: + span_event, is_ragas_integration_span = self._llmobs_span_event(span) + self._llmobs_span_writer.enqueue(span_event) + except (KeyError, TypeError): + log.error( + "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True + ) + finally: + if not span_event or not is_llm_span or is_ragas_integration_span: + return + if self._evaluator_runner: + 
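+                # At this point the span event was generated successfully for an llm-kind span that is
+                # not itself a Ragas evaluation span, so it is handed off to the evaluator runner.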
self._evaluator_runner.enqueue(span_event, span) + + @classmethod + def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: + """Span event object structure.""" + span_kind = span._get_ctx_item(SPAN_KIND) + if not span_kind: + raise KeyError("Span kind not found in span context") + meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} + if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: + meta["model_name"] = span._get_ctx_item(MODEL_NAME) + meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() + meta["metadata"] = span._get_ctx_item(METADATA) or {} + if span._get_ctx_item(INPUT_PARAMETERS): + meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) + if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: + meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) + if span._get_ctx_item(INPUT_VALUE) is not None: + meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) + if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: + meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) + if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: + meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) + if span._get_ctx_item(OUTPUT_VALUE) is not None: + meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) + if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: + meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) + if span._get_ctx_item(INPUT_PROMPT) is not None: + prompt_json_str = span._get_ctx_item(INPUT_PROMPT) + if span_kind != "llm": + log.warning( + "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." 
+ ) + else: + meta["input"]["prompt"] = prompt_json_str + if span.error: + meta.update( + { + ERROR_MSG: span.get_tag(ERROR_MSG), + ERROR_STACK: span.get_tag(ERROR_STACK), + ERROR_TYPE: span.get_tag(ERROR_TYPE), + } + ) + if not meta["input"]: + meta.pop("input") + if not meta["output"]: + meta.pop("output") + metrics = span._get_ctx_item(METRICS) or {} + ml_app = _get_ml_app(span) + + is_ragas_integration_span = False + + if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX): + is_ragas_integration_span = True + + span._set_ctx_item(ML_APP, ml_app) + parent_id = str(_get_llmobs_parent_id(span) or "undefined") + + llmobs_span_event = { + "trace_id": "{:x}".format(span.trace_id), + "span_id": str(span.span_id), + "parent_id": parent_id, + "name": _get_span_name(span), + "start_ns": span.start_ns, + "duration": span.duration_ns, + "status": "error" if span.error else "ok", + "meta": meta, + "metrics": metrics, + } + session_id = _get_session_id(span) + if session_id is not None: + span._set_ctx_item(SESSION_ID, session_id) + llmobs_span_event["session_id"] = session_id + + llmobs_span_event["tags"] = cls._llmobs_tags( + span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span + ) + return llmobs_span_event, is_ragas_integration_span + + @staticmethod + def _llmobs_tags( + span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False + ) -> List[str]: + tags = { + "version": config.version or "", + "env": config.env or "", + "service": span.service or "", + "source": "integration", + "ml_app": ml_app, + "ddtrace.version": ddtrace.__version__, + "language": "python", + "error": span.error, + } + err_type = span.get_tag(ERROR_TYPE) + if err_type: + tags["error_type"] = err_type + if session_id: + tags["session_id"] = session_id + if is_ragas_integration_span: + tags[constants.RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" + existing_tags = span._get_ctx_item(TAGS) + if existing_tags is not None: + tags.update(existing_tags) + return ["{}:{}".format(k, v) for k, v in tags.items()] + + def _do_annotations(self, span: Span) -> None: # get the current span context # only do the annotations if it matches the context if span.span_type != SpanTypes.LLM: # do this check to avoid the warning log in `annotate` @@ -120,20 +252,14 @@ def _do_annotations(self, span): if current_context_id == context_id: self.annotate(span, **annotation_kwargs) - def _child_after_fork(self): + def _child_after_fork(self) -> None: self._llmobs_span_writer = self._llmobs_span_writer.recreate() self._llmobs_eval_metric_writer = self._llmobs_eval_metric_writer.recreate() self._evaluator_runner = self._evaluator_runner.recreate() - self._trace_processor._span_writer = self._llmobs_span_writer - self._trace_processor._evaluator_runner = self._evaluator_runner if self.enabled: self._start_service() def _start_service(self) -> None: - tracer_filters = self.tracer._filters - if not any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in tracer_filters): - tracer_filters += [self._trace_processor] - self.tracer.configure(settings={"FILTERS": tracer_filters}) try: self._llmobs_span_writer.start() self._llmobs_eval_metric_writer.start() @@ -146,6 +272,10 @@ def _start_service(self) -> None: log.debug("Error starting evaluator runner") def _stop_service(self) -> None: + # Remove listener hooks for span events + core.reset_listeners("trace.span_start", self._on_span_start) + core.reset_listeners("trace.span_finish", self._on_span_finish) + try: 
self._evaluator_runner.stop() # flush remaining evaluation spans & evaluations @@ -160,11 +290,7 @@ def _stop_service(self) -> None: except ServiceStatusError: log.debug("Error stopping LLMObs writers") - try: - forksafe.unregister(self._child_after_fork) - self.tracer.shutdown() - except Exception: - log.warning("Failed to shutdown tracer", exc_info=True) + forksafe.unregister(self._child_after_fork) @classmethod def enable( @@ -244,6 +370,10 @@ def enable( cls.enabled = True cls._instance.start() + # Register hooks for span events + core.on("trace.span_start", cls._instance._on_span_start) + core.on("trace.span_finish", cls._instance._on_span_finish) + atexit.register(cls.disable) telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, True) @@ -265,7 +395,6 @@ def disable(cls) -> None: cls._instance.stop() cls.enabled = False - cls._instance.tracer.deregister_on_start_span(cls._instance._do_annotations) telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, False) log.debug("%s disabled", cls.__name__) @@ -785,6 +914,127 @@ def _tag_metrics(span: Span, metrics: Dict[str, Any]) -> None: return span._set_ctx_item(METRICS, metrics) + @classmethod + def submit_evaluation_for( + cls, + label: str, + metric_type: str, + value: Union[str, int, float], + span: Optional[dict] = None, + span_with_tag_value: Optional[Dict[str, str]] = None, + tags: Optional[Dict[str, str]] = None, + ml_app: Optional[str] = None, + timestamp_ms: Optional[int] = None, + ) -> None: + """ + Submits a custom evaluation metric for a given span. + + :param str label: The name of the evaluation metric. + :param str metric_type: The type of the evaluation metric. One of "categorical", "score". + :param value: The value of the evaluation metric. + Must be a string (categorical), integer (score), or float (score). + :param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying + the span associated with this evaluation. + :param dict span_with_tag_value: A dictionary with the format {'tag_key': str, 'tag_value': str} + uniquely identifying the span associated with this evaluation. + :param tags: A dictionary of string key-value pairs to tag the evaluation metric with. + :param str ml_app: The name of the ML application + :param int timestamp_ms: The unix timestamp in milliseconds when the evaluation metric result was generated. + If not set, the current time will be used. + """ + if cls.enabled is False: + log.debug( + "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. ", + "Evaluation metric data will not be sent.", + ) + return + + has_exactly_one_joining_key = (span is not None) ^ (span_with_tag_value is not None) + + if not has_exactly_one_joining_key: + raise ValueError( + "Exactly one of `span` or `span_with_tag_value` must be specified to submit an evaluation metric." + ) + + join_on = {} + if span is not None: + if ( + not isinstance(span, dict) + or not isinstance(span.get("span_id"), str) + or not isinstance(span.get("trace_id"), str) + ): + raise TypeError( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." 
+ ) + join_on["span"] = span + elif span_with_tag_value is not None: + if ( + not isinstance(span_with_tag_value, dict) + or not isinstance(span_with_tag_value.get("tag_key"), str) + or not isinstance(span_with_tag_value.get("tag_value"), str) + ): + raise TypeError( + "`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values" + ) + join_on["tag"] = { + "key": span_with_tag_value.get("tag_key"), + "value": span_with_tag_value.get("tag_value"), + } + + timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000) + + if not isinstance(timestamp_ms, int) or timestamp_ms < 0: + raise ValueError("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent") + + if not label: + raise ValueError("label must be the specified name of the evaluation metric.") + + metric_type = metric_type.lower() + if metric_type not in ("categorical", "score"): + raise ValueError("metric_type must be one of 'categorical' or 'score'.") + + if metric_type == "categorical" and not isinstance(value, str): + raise TypeError("value must be a string for a categorical metric.") + if metric_type == "score" and not isinstance(value, (int, float)): + raise TypeError("value must be an integer or float for a score metric.") + + if tags is not None and not isinstance(tags, dict): + log.warning("tags must be a dictionary of string key-value pairs.") + tags = {} + + evaluation_tags = { + "ddtrace.version": ddtrace.__version__, + "ml_app": ml_app, + } + + if tags: + for k, v in tags.items(): + try: + evaluation_tags[ensure_text(k)] = ensure_text(v) + except TypeError: + log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.") + + ml_app = ml_app if ml_app else config._llmobs_ml_app + if not ml_app: + log.warning( + "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " + "Ensure this configuration is set before running your application." + ) + return + + evaluation_metric = { + "join_on": join_on, + "label": str(label), + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "{}_value".format(metric_type): value, + "ml_app": ml_app, + "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()], + } + + cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric) + @classmethod def submit_evaluation( cls, @@ -797,6 +1047,13 @@ def submit_evaluation( timestamp_ms: Optional[int] = None, metadata: Optional[Dict[str, object]] = None, ) -> None: + deprecate( + "Using `LLMObs.submit_evaluation` is deprecated", + message="Please use `LLMObs.submit_evaluation_for` instead.", + removal_version="3.0.0", + category=DDTraceDeprecationWarning, + ) + """ Submits a custom evaluation metric for a given span ID and trace ID. @@ -812,7 +1069,7 @@ def submit_evaluation( evaluation metric. """ if cls.enabled is False: - log.warning( + log.debug( "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." ) return @@ -888,8 +1145,7 @@ def submit_evaluation( log.warning("Failed to parse tags. 
Tags for evaluation metrics must be strings.") evaluation_metric = { - "span_id": span_id, - "trace_id": trace_id, + "join_on": {"span": {"span_id": span_id, "trace_id": trace_id}}, "label": str(label), "metric_type": metric_type.lower(), "timestamp_ms": timestamp_ms, diff --git a/ddtrace/llmobs/_trace_processor.py b/ddtrace/llmobs/_trace_processor.py deleted file mode 100644 index 231d53d7626..00000000000 --- a/ddtrace/llmobs/_trace_processor.py +++ /dev/null @@ -1,177 +0,0 @@ -from typing import Any -from typing import Dict -from typing import List -from typing import Optional -from typing import Tuple - -import ddtrace -from ddtrace import Span -from ddtrace import config -from ddtrace._trace.processor import TraceProcessor -from ddtrace.constants import ERROR_MSG -from ddtrace.constants import ERROR_STACK -from ddtrace.constants import ERROR_TYPE -from ddtrace.ext import SpanTypes -from ddtrace.internal.logger import get_logger -from ddtrace.llmobs._constants import INPUT_DOCUMENTS -from ddtrace.llmobs._constants import INPUT_MESSAGES -from ddtrace.llmobs._constants import INPUT_PARAMETERS -from ddtrace.llmobs._constants import INPUT_PROMPT -from ddtrace.llmobs._constants import INPUT_VALUE -from ddtrace.llmobs._constants import METADATA -from ddtrace.llmobs._constants import METRICS -from ddtrace.llmobs._constants import ML_APP -from ddtrace.llmobs._constants import MODEL_NAME -from ddtrace.llmobs._constants import MODEL_PROVIDER -from ddtrace.llmobs._constants import OUTPUT_DOCUMENTS -from ddtrace.llmobs._constants import OUTPUT_MESSAGES -from ddtrace.llmobs._constants import OUTPUT_VALUE -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX -from ddtrace.llmobs._constants import RUNNER_IS_INTEGRATION_SPAN_TAG -from ddtrace.llmobs._constants import SESSION_ID -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._constants import TAGS -from ddtrace.llmobs._utils import _get_llmobs_parent_id -from ddtrace.llmobs._utils import _get_ml_app -from ddtrace.llmobs._utils import _get_session_id -from ddtrace.llmobs._utils import _get_span_name -from ddtrace.llmobs._utils import safe_json - - -log = get_logger(__name__) - - -class LLMObsTraceProcessor(TraceProcessor): - """ - Processor that extracts LLM-type spans in a trace to submit as separate LLMObs span events to LLM Observability. 
- """ - - def __init__(self, llmobs_span_writer, evaluator_runner=None): - self._span_writer = llmobs_span_writer - self._evaluator_runner = evaluator_runner - - def process_trace(self, trace: List[Span]) -> Optional[List[Span]]: - if not trace: - return None - for span in trace: - if span.span_type == SpanTypes.LLM: - self.submit_llmobs_span(span) - return None if config._llmobs_agentless_enabled else trace - - def submit_llmobs_span(self, span: Span) -> None: - """Generate and submit an LLMObs span event to be sent to LLMObs.""" - span_event = None - is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" - is_ragas_integration_span = False - try: - span_event, is_ragas_integration_span = self._llmobs_span_event(span) - self._span_writer.enqueue(span_event) - except (KeyError, TypeError): - log.error("Error generating LLMObs span event for span %s, likely due to malformed span", span) - finally: - if not span_event or not is_llm_span or is_ragas_integration_span: - return - if self._evaluator_runner: - self._evaluator_runner.enqueue(span_event, span) - - def _llmobs_span_event(self, span: Span) -> Tuple[Dict[str, Any], bool]: - """Span event object structure.""" - span_kind = span._get_ctx_item(SPAN_KIND) - if not span_kind: - raise KeyError("Span kind not found in span context") - meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} - if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: - meta["model_name"] = span._get_ctx_item(MODEL_NAME) - meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() - meta["metadata"] = span._get_ctx_item(METADATA) or {} - if span._get_ctx_item(INPUT_PARAMETERS): - meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) - if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: - meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) - if span._get_ctx_item(INPUT_VALUE) is not None: - meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) - if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: - meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) - if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: - meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) - if span._get_ctx_item(OUTPUT_VALUE) is not None: - meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) - if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: - meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) - if span._get_ctx_item(INPUT_PROMPT) is not None: - prompt_json_str = span._get_ctx_item(INPUT_PROMPT) - if span_kind != "llm": - log.warning( - "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." 
- ) - else: - meta["input"]["prompt"] = prompt_json_str - if span.error: - meta.update( - { - ERROR_MSG: span.get_tag(ERROR_MSG), - ERROR_STACK: span.get_tag(ERROR_STACK), - ERROR_TYPE: span.get_tag(ERROR_TYPE), - } - ) - if not meta["input"]: - meta.pop("input") - if not meta["output"]: - meta.pop("output") - metrics = span._get_ctx_item(METRICS) or {} - ml_app = _get_ml_app(span) - - is_ragas_integration_span = False - - if ml_app.startswith(RAGAS_ML_APP_PREFIX): - is_ragas_integration_span = True - - span._set_ctx_item(ML_APP, ml_app) - parent_id = str(_get_llmobs_parent_id(span) or "undefined") - - llmobs_span_event = { - "trace_id": "{:x}".format(span.trace_id), - "span_id": str(span.span_id), - "parent_id": parent_id, - "name": _get_span_name(span), - "start_ns": span.start_ns, - "duration": span.duration_ns, - "status": "error" if span.error else "ok", - "meta": meta, - "metrics": metrics, - } - session_id = _get_session_id(span) - if session_id is not None: - span._set_ctx_item(SESSION_ID, session_id) - llmobs_span_event["session_id"] = session_id - - llmobs_span_event["tags"] = self._llmobs_tags( - span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span - ) - return llmobs_span_event, is_ragas_integration_span - - @staticmethod - def _llmobs_tags( - span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False - ) -> List[str]: - tags = { - "version": config.version or "", - "env": config.env or "", - "service": span.service or "", - "source": "integration", - "ml_app": ml_app, - "ddtrace.version": ddtrace.__version__, - "language": "python", - "error": span.error, - } - err_type = span.get_tag(ERROR_TYPE) - if err_type: - tags["error_type"] = err_type - if session_id: - tags["session_id"] = session_id - if is_ragas_integration_span: - tags[RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" - existing_tags = span._get_ctx_item(TAGS) - if existing_tags is not None: - tags.update(existing_tags) - return ["{}:{}".format(k, v) for k, v in tags.items()] diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index c1b1c4a776c..dd616db8bef 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -135,9 +135,12 @@ def _get_ml_app(span: Span) -> str: ml_app = span._get_ctx_item(ML_APP) if ml_app: return ml_app - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - ml_app = nearest_llmobs_ancestor._get_ctx_item(ML_APP) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + ml_app = llmobs_parent._get_ctx_item(ML_APP) + if ml_app is not None: + return ml_app + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return ml_app or config._llmobs_ml_app or "unknown-ml-app" @@ -149,9 +152,12 @@ def _get_session_id(span: Span) -> Optional[str]: session_id = span._get_ctx_item(SESSION_ID) if session_id: return session_id - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - session_id = nearest_llmobs_ancestor._get_ctx_item(SESSION_ID) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + session_id = llmobs_parent._get_ctx_item(SESSION_ID) + if session_id is not None: + return session_id + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return session_id diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 5a293f05c4e..5880019d67f 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -55,8 +55,7 @@ class LLMObsSpanEvent(TypedDict): class 
LLMObsEvaluationMetricEvent(TypedDict, total=False): - span_id: str - trace_id: str + join_on: Dict[str, Dict[str, str]] metric_type: str label: str categorical_value: str @@ -107,6 +106,13 @@ def periodic(self) -> None: events = self._buffer self._buffer = [] + if not self._headers.get("DD-API-KEY"): + logger.warning( + "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. ", + "Ensure this configuration is set before running your application.", + ) + return + data = self._data(events) enc_llm_events = safe_json(data) conn = httplib.HTTPSConnection(self._intake, 443, timeout=self._timeout) @@ -154,7 +160,7 @@ def __init__(self, site: str, api_key: str, interval: float, timeout: float) -> super(LLMObsEvalMetricWriter, self).__init__(site, api_key, interval, timeout) self._event_type = "evaluation_metric" self._buffer = [] - self._endpoint = "/api/intake/llm-obs/v1/eval-metric" + self._endpoint = "/api/intake/llm-obs/v2/eval-metric" self._intake = "api.%s" % self._site # type: str def enqueue(self, event: LLMObsEvaluationMetricEvent) -> None: diff --git a/ddtrace/profiling/collector/_lock.py b/ddtrace/profiling/collector/_lock.py index 6dedf3295f7..ec62c5c0eee 100644 --- a/ddtrace/profiling/collector/_lock.py +++ b/ddtrace/profiling/collector/_lock.py @@ -179,69 +179,63 @@ def acquire(self, *args, **kwargs): def _release(self, inner_func, *args, **kwargs): # type (typing.Any, typing.Any) -> None + + start = None + if hasattr(self, "_self_acquired_at"): + # _self_acquired_at is only set when the acquire was captured + # if it's not set, we're not capturing the release + start = self._self_acquired_at + try: return inner_func(*args, **kwargs) finally: - try: - if hasattr(self, "_self_acquired_at"): - try: - end = time.monotonic_ns() - thread_id, thread_name = _current_thread() - task_id, task_name, task_frame = _task.get_task(thread_id) - lock_name = ( - "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc - ) - - if task_frame is None: - # See the comments in _acquire - frame = sys._getframe(2) - else: - frame = task_frame - - frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes) - - if self._self_export_libdd_enabled: - thread_native_id = _threading.get_thread_native_id(thread_id) - - handle = ddup.SampleHandle() - handle.push_monotonic_ns(end) - handle.push_lock_name(lock_name) - handle.push_release( - end - self._self_acquired_at, 1 - ) # AFAICT, capture_pct does not adjust anything here - handle.push_threadinfo(thread_id, thread_native_id, thread_name) - handle.push_task_id(task_id) - handle.push_task_name(task_name) - - if self._self_tracer is not None: - handle.push_span(self._self_tracer.current_span()) - for frame in frames: - handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno) - handle.flush_sample() - else: - event = self.RELEASE_EVENT_CLASS( - lock_name=lock_name, - frames=frames, - nframes=nframes, - thread_id=thread_id, - thread_name=thread_name, - task_id=task_id, - task_name=task_name, - locked_for_ns=end - self._self_acquired_at, - sampling_pct=self._self_capture_sampler.capture_pct, - ) - - if self._self_tracer is not None: - event.set_trace_info( - self._self_tracer.current_span(), self._self_endpoint_collection_enabled - ) - - self._self_recorder.push_event(event) - finally: - del self._self_acquired_at - except Exception as e: - LOG.warning("Error recording lock release event: %s", e) - pass # nosec + if start is not None: + end = 
time.monotonic_ns() + thread_id, thread_name = _current_thread() + task_id, task_name, task_frame = _task.get_task(thread_id) + lock_name = "%s:%s" % (self._self_init_loc, self._self_name) if self._self_name else self._self_init_loc + + if task_frame is None: + # See the comments in _acquire + frame = sys._getframe(2) + else: + frame = task_frame + + frames, nframes = _traceback.pyframe_to_frames(frame, self._self_max_nframes) + + if self._self_export_libdd_enabled: + thread_native_id = _threading.get_thread_native_id(thread_id) + + handle = ddup.SampleHandle() + handle.push_monotonic_ns(end) + handle.push_lock_name(lock_name) + handle.push_release(end - start, 1) # AFAICT, capture_pct does not adjust anything here + handle.push_threadinfo(thread_id, thread_native_id, thread_name) + handle.push_task_id(task_id) + handle.push_task_name(task_name) + + if self._self_tracer is not None: + handle.push_span(self._self_tracer.current_span()) + for frame in frames: + handle.push_frame(frame.function_name, frame.file_name, 0, frame.lineno) + handle.flush_sample() + else: + event = self.RELEASE_EVENT_CLASS( + lock_name=lock_name, + frames=frames, + nframes=nframes, + thread_id=thread_id, + thread_name=thread_name, + task_id=task_id, + task_name=task_name, + locked_for_ns=end - start, + sampling_pct=self._self_capture_sampler.capture_pct, + ) + + if self._self_tracer is not None: + event.set_trace_info(self._self_tracer.current_span(), self._self_endpoint_collection_enabled) + + self._self_recorder.push_event(event) def release(self, *args, **kwargs): return self._release(self.__wrapped__.release, *args, **kwargs) diff --git a/ddtrace/profiling/exporter/http.py b/ddtrace/profiling/exporter/http.py index 6700e584ade..b4ec6994d72 100644 --- a/ddtrace/profiling/exporter/http.py +++ b/ddtrace/profiling/exporter/http.py @@ -220,8 +220,18 @@ def export( "family": "python", "attachments": [item["filename"].decode("utf-8") for item in data], "tags_profiler": self._get_tags(service), - "start": (datetime.datetime.utcfromtimestamp(start_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"), - "end": (datetime.datetime.utcfromtimestamp(end_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"), + "start": ( + datetime.datetime.fromtimestamp(start_time_ns / 1e9, tz=datetime.timezone.utc) + .replace(microsecond=0) + .isoformat()[0:-6] # removes the trailing +00:00 portion of the time + + "Z" + ), + "end": ( + datetime.datetime.fromtimestamp(end_time_ns / 1e9, tz=datetime.timezone.utc) + .replace(microsecond=0) + .isoformat()[0:-6] # removes the trailing +00:00 portion of the time + + "Z" + ), } # type: Dict[str, Any] if self.endpoint_call_counter_span_processor is not None: diff --git a/ddtrace/propagation/http.py b/ddtrace/propagation/http.py index a1664664ace..563ee838d84 100644 --- a/ddtrace/propagation/http.py +++ b/ddtrace/propagation/http.py @@ -101,6 +101,7 @@ def _possible_header(header): _POSSIBLE_HTTP_HEADER_B3_FLAGS = _possible_header(_HTTP_HEADER_B3_FLAGS) _POSSIBLE_HTTP_HEADER_TRACEPARENT = _possible_header(_HTTP_HEADER_TRACEPARENT) _POSSIBLE_HTTP_HEADER_TRACESTATE = _possible_header(_HTTP_HEADER_TRACESTATE) +_POSSIBLE_HTTP_BAGGAGE_HEADER = _possible_header(_HTTP_HEADER_BAGGAGE) # https://www.w3.org/TR/trace-context/#traceparent-header-field-values @@ -937,7 +938,7 @@ def _inject(span_context: Context, headers: Dict[str, str]) -> None: @staticmethod def _extract(headers: Dict[str, str]) -> Context: - header_value = headers.get(_HTTP_HEADER_BAGGAGE) + header_value = 
_extract_header_value(_POSSIBLE_HTTP_BAGGAGE_HEADER, headers) if not header_value: return Context(baggage={}) diff --git a/hatch.toml b/hatch.toml index 38471028f8f..b6555885ad0 100644 --- a/hatch.toml +++ b/hatch.toml @@ -214,11 +214,11 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] django = ["~=2.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.7", "3.9", "3.10"] +python = ["3.8", "3.9", "3.10"] django = ["~=3.2"] [[envs.appsec_threats_django.matrix]] @@ -226,11 +226,11 @@ python = ["3.8", "3.10"] django = ["==4.0.10"] [[envs.appsec_threats_django.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.11", "3.13"] django = ["~=4.2"] [[envs.appsec_threats_django.matrix]] -python = ["3.10", "3.12"] +python = ["3.10", "3.13"] django = ["~=5.1"] @@ -262,21 +262,21 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_flask.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] flask = ["~=1.1"] markupsafe = ["~=1.1"] [[envs.appsec_threats_flask.matrix]] -python = ["3.7", "3.9"] +python = ["3.8", "3.9"] flask = ["==2.1.3"] werkzeug = ["<3.0"] [[envs.appsec_threats_flask.matrix]] -python = ["3.8", "3.9", "3.12"] +python = ["3.8", "3.10", "3.13"] flask = ["~=2.3"] [[envs.appsec_threats_flask.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.11", "3.13"] flask = ["~=3.0"] ## ASM Native IAST module @@ -327,16 +327,16 @@ test = [ # if you add or remove a version here, please also update the parallelism parameter # in .circleci/config.templ.yml [[envs.appsec_threats_fastapi.matrix]] -python = ["3.7", "3.9", "3.11"] +python = ["3.8", "3.10", "3.13"] fastapi = ["==0.86.0"] anyio = ["==3.7.1"] [[envs.appsec_threats_fastapi.matrix]] -python = ["3.7", "3.9", "3.12"] +python = ["3.8", "3.10", "3.13"] fastapi = ["==0.94.1"] [[envs.appsec_threats_fastapi.matrix]] -python = ["3.8", "3.10", "3.12"] +python = ["3.8", "3.10", "3.13"] fastapi = ["~=0.114.2"] diff --git a/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml b/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml new file mode 100644 index 00000000000..ad0eacb28e8 --- /dev/null +++ b/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + tracer: This fix resolves an issue where baggage header extraction was case sensitive and didn't accept the header prepended with HTTP. + Now the baggage header will be extracted regardless of casing and the HTTP format. + diff --git a/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml b/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml new file mode 100644 index 00000000000..afaf95876d5 --- /dev/null +++ b/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + openai: Introduces automatic extraction of token usage from streamed chat completions. + Unless ``stream_options: {"include_usage": False}`` is explicitly set on your streamed chat completion request, + the OpenAI integration will add ``stream_options: {"include_usage": True}`` to your request and automatically extract the token usage chunk from the streamed response. 
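A minimal usage sketch of the streamed token-usage behavior described in the release note above; the model name and prompt are illustrative and not taken from this patch:

    import openai

    client = openai.OpenAI()

    # No stream_options passed: with the ddtrace OpenAI integration enabled, the request is
    # expected to go out with stream_options={"include_usage": True} so the final usage chunk
    # can be read to report token metrics.
    stream = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Who won the world series in 2020?"}],
        stream=True,
    )
    for chunk in stream:
        pass

    # Explicitly opting out is expected to leave the request unchanged, and no usage chunk
    # is extracted from the streamed response.
    stream = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Who won the world series in 2020?"}],
        stream=True,
        stream_options={"include_usage": False},
    )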
diff --git a/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml b/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml new file mode 100644 index 00000000000..4d77fddb710 --- /dev/null +++ b/releasenotes/notes/fix-er-include-nonlocals-bbeecfbbbde35496.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + exception replay: include missing nonlocal variables in snapshot log messages. diff --git a/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml b/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml new file mode 100644 index 00000000000..702e2538b99 --- /dev/null +++ b/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + LLM Observability: Resolves an issue where enabling LLM Observability in agentless mode would result in traces also being sent to the agent proxy endpoint. \ No newline at end of file diff --git a/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml new file mode 100644 index 00000000000..5912a415022 --- /dev/null +++ b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + LLM Observability: Resolves an issue where configuring custom trace filters/processors onto the tracer would disable LLM Observability. + Note that if LLM Observability is enabled in agentless mode, writing APM traces must be explicitly disabled by setting `DD_TRACE_ENABLED=0`. diff --git a/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml new file mode 100644 index 00000000000..de86c8227b6 --- /dev/null +++ b/releasenotes/notes/profiling-lock-acquired-at-e308547cffdca9f7.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + profiling: This fix resolves a data race issue accessing lock's acquired + time, leading to an ``AttributeError``: ``_Profiled_ThreadingLock`` object + has no attribute ``self_acquired_at`` diff --git a/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml b/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml new file mode 100644 index 00000000000..89744bf9be2 --- /dev/null +++ b/releasenotes/notes/rasp_cmdi-3c7819ee9e33e447.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + ASM: This introduces the support for command injection for Exploit Prevention. With previous support of shell injection with os.system, + this provides automatic instrumentation for subprocess module functions and os.spawn* functions, + ensuring monitoring and blocking for Exploit Prevention on those endpoints. diff --git a/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml b/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml new file mode 100644 index 00000000000..c2e4b25f255 --- /dev/null +++ b/releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml @@ -0,0 +1,17 @@ +--- +features: + - | + LLM Observability: This introduces the `LLMObs.submit_evaluation_for` method, which provides the ability to join a custom evaluation + to a span using a tag key-value pair on the span. The tag key-value pair is expected to uniquely identify a single span. + Tag-based joining is an alternative to the existing method of joining evaluations to spans using trace and span IDs. + Example usage: + - Evaluation joined by tag: `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": "message_id", "tag_value": "dummy_message_id"}, label="rating", ...)`. 
+ - Evaluation joined by trace/span ID: `LLMObs.submit_evaluation_for(span={"trace_id": "...", "span_id": "..."}, label="rating", ...)`. +deprecations: + - | + LLM Observability: `LLMObs.submit_evaluation` is deprecated and will be removed in ddtrace 3.0.0. + As an alternative to `LLMObs.submit_evaluation`, you can use `LLMObs.submit_evaluation_for` instead. + To migrate, replace `LLMObs.submit_evaluation(span_context={"span_id": ..., "trace_id": ...}, ...)` with: + `LLMObs.submit_evaluation_for(span={"span_id": ..., "trace_id": ...}, ...) + You may also join an evaluation to a span using a tag key-value pair like so: + `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": ..., "tag_val": ...}, ...)`. diff --git a/riotfile.py b/riotfile.py index 3f62a0cce06..7365c481e7c 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2956,8 +2956,8 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT name="llmobs", command="pytest {cmdargs} tests/llmobs", pkgs={"vcrpy": latest, "pytest-asyncio": "==0.21.1"}, - pys=select_pys(min_version="3.7"), venvs=[ + Venv(pys="3.7"), Venv(pys=select_pys(min_version="3.8"), pkgs={"ragas": "==0.1.21", "langchain": latest}), ], ), diff --git a/scripts/gen_circleci_config.py b/scripts/gen_circleci_config.py index bc51f2c5519..3a68a1a7975 100644 --- a/scripts/gen_circleci_config.py +++ b/scripts/gen_circleci_config.py @@ -17,10 +17,13 @@ def gen_required_suites(template: dict) -> None: required_suites = template["requires_tests"]["requires"] = [] for_each_testrun_needed( suites=sorted( - set(n.rpartition("::")[-1] for n, s in get_suites().items() if not s.get("skip", False)) - & set(template["jobs"].keys()) + set( + n + for n, s in get_suites().items() + if not s.get("skip", False) and n.rpartition("::")[-1] in template["jobs"] + ) ), - action=lambda suite: required_suites.append(suite), + action=lambda suite: required_suites.append(suite.rpartition("::")[-1]), git_selections=extract_git_commit_selections(os.getenv("GIT_COMMIT_DESC", "")), ) diff --git a/tests/appsec/appsec/rules-rasp-blocking.json b/tests/appsec/appsec/rules-rasp-blocking.json index f2f8c4d7955..e5038e4a7c2 100644 --- a/tests/appsec/appsec/rules-rasp-blocking.json +++ b/tests/appsec/appsec/rules-rasp-blocking.json @@ -201,6 +201,55 @@ "stack_trace", "block" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace", + "block" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp-disabled.json b/tests/appsec/appsec/rules-rasp-disabled.json index 4a0943a34fb..ec67b186732 100644 --- a/tests/appsec/appsec/rules-rasp-disabled.json +++ b/tests/appsec/appsec/rules-rasp-disabled.json @@ -201,6 +201,55 @@ "on_match": [ "stack_trace" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "enabled": false, + "tags": { + "type": 
"command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp-redirecting.json b/tests/appsec/appsec/rules-rasp-redirecting.json index a7a53db6e3b..6e2080b2dbf 100644 --- a/tests/appsec/appsec/rules-rasp-redirecting.json +++ b/tests/appsec/appsec/rules-rasp-redirecting.json @@ -211,6 +211,55 @@ "stack_trace", "block" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace", + "block" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/rules-rasp.json b/tests/appsec/appsec/rules-rasp.json index c1a6822d261..d73672392af 100644 --- a/tests/appsec/appsec/rules-rasp.json +++ b/tests/appsec/appsec/rules-rasp.json @@ -197,6 +197,54 @@ "on_match": [ "stack_trace" ] + }, + { + "id": "rasp-932-110", + "name": "OS command injection exploit", + "tags": { + "type": "command_injection", + "category": "vulnerability_trigger", + "cwe": "77", + "capec": "1000/152/248/88", + "confidence": "0", + "module": "rasp" + }, + "conditions": [ + { + "parameters": { + "resource": [ + { + "address": "server.sys.exec.cmd" + } + ], + "params": [ + { + "address": "server.request.query" + }, + { + "address": "server.request.body" + }, + { + "address": "server.request.path_params" + }, + { + "address": "grpc.server.request.message" + }, + { + "address": "graphql.server.all_resolvers" + }, + { + "address": "graphql.server.resolver" + } + ] + }, + "operator": "cmdi_detector" + } + ], + "transformers": [], + "on_match": [ + "stack_trace" + ] } ] } \ No newline at end of file diff --git a/tests/appsec/appsec/test_remoteconfiguration.py b/tests/appsec/appsec/test_remoteconfiguration.py index f00167706dc..1d2c47bc190 100644 --- a/tests/appsec/appsec/test_remoteconfiguration.py +++ b/tests/appsec/appsec/test_remoteconfiguration.py @@ -117,7 +117,7 @@ def test_rc_activation_states_off(tracer, appsec_enabled, rc_value, remote_confi @pytest.mark.parametrize( "rc_enabled, appsec_enabled, capability", [ - (True, "true", "D4HkA/w="), # All capabilities except ASM_ACTIVATION + (True, "true", "L4HkA/w="), # All capabilities except ASM_ACTIVATION (False, "true", ""), (True, "false", "gAAAAA=="), (False, "false", ""), @@ -142,7 +142,7 @@ def 
test_rc_capabilities(rc_enabled, appsec_enabled, capability, tracer): @pytest.mark.parametrize( "env_rules, expected", [ - ({}, "D4HkA/4="), # All capabilities + ({}, "L4HkA/4="), # All capabilities ({"_asm_static_rule_file": DEFAULT.RULES}, "gAAAAg=="), # Only ASM_FEATURES ], ) diff --git a/tests/appsec/contrib_appsec/django_app/urls.py b/tests/appsec/contrib_appsec/django_app/urls.py index 77ad7a7f0a6..aaff69169b5 100644 --- a/tests/appsec/contrib_appsec/django_app/urls.py +++ b/tests/appsec/contrib_appsec/django_app/urls.py @@ -1,5 +1,6 @@ import os import sqlite3 +import subprocess import tempfile import django @@ -129,13 +130,33 @@ def rasp(request, endpoint: str): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return HttpResponse("<\\br>\n".join(res)) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return HttpResponse("<\\br>\n".join(res)) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/fastapi_app/app.py b/tests/appsec/contrib_appsec/fastapi_app/app.py index 10b7b430543..c5b765c4bbb 100644 --- a/tests/appsec/contrib_appsec/fastapi_app/app.py +++ b/tests/appsec/contrib_appsec/fastapi_app/app.py @@ -1,6 +1,7 @@ import asyncio import os import sqlite3 +import subprocess from typing import Optional from fastapi import FastAPI @@ -178,13 +179,33 @@ async def rasp(endpoint: str, request: Request): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return HTMLResponse("<\\br>\n".join(res)) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return HTMLResponse("<\\br>\n".join(res)) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except 
Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/flask_app/app.py b/tests/appsec/contrib_appsec/flask_app/app.py index 5270229d3e9..939a7cad678 100644 --- a/tests/appsec/contrib_appsec/flask_app/app.py +++ b/tests/appsec/contrib_appsec/flask_app/app.py @@ -1,5 +1,6 @@ import os import sqlite3 +import subprocess from typing import Optional from flask import Flask @@ -126,13 +127,33 @@ def rasp(endpoint: str): res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) return "<\\br>\n".join(res) + elif endpoint == "shell_injection": + res = ["shell_injection endpoint"] + for param in query_params: + if param.startswith("cmd"): + cmd = query_params[param] + try: + if param.startswith("cmdsys"): + res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + else: + res.append(f'cmd stdout: {subprocess.run(f"ls {cmd}", shell=True)}') + except Exception as e: + res.append(f"Error: {e}") + tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) + return "<\\br>\n".join(res) elif endpoint == "command_injection": res = ["command_injection endpoint"] for param in query_params: - if param.startswith("cmd"): + if param.startswith("cmda"): + cmd = query_params[param] + try: + res.append(f'cmd stdout: {subprocess.run([cmd, "-c", "3", "localhost"])}') + except Exception as e: + res.append(f"Error: {e}") + elif param.startswith("cmds"): cmd = query_params[param] try: - res.append(f'cmd stdout: {os.system(f"ls {cmd}")}') + res.append(f"cmd stdout: {subprocess.run(cmd)}") except Exception as e: res.append(f"Error: {e}") tracer.current_span()._local_root.set_tag("rasp.request.done", endpoint) diff --git a/tests/appsec/contrib_appsec/utils.py b/tests/appsec/contrib_appsec/utils.py index 315caa49a5d..d3691e2bea3 100644 --- a/tests/appsec/contrib_appsec/utils.py +++ b/tests/appsec/contrib_appsec/utils.py @@ -1308,11 +1308,19 @@ def test_stream_response( + [("sql_injection", "user_id_1=1 OR 1=1&user_id_2=1 OR 1=1", "rasp-942-100", ("dispatch",))] + [ ( - "command_injection", - "cmd_1=$(cat /etc/passwd 1>%262 ; echo .)&cmd_2=$(uname -a 1>%262 ; echo .)", + "shell_injection", + "cmdsys_1=$(cat /etc/passwd 1>%262 ; echo .)&cmdrun_2=$(uname -a 1>%262 ; echo .)", "rasp-932-100", ("system", "rasp"), ) + ] + + [ + ( + "command_injection", + "cmda_1=/sbin/ping&cmds_2=/usr/bin/ls%20-la", + "rasp-932-110", + ("Popen", "rasp"), + ) ], ) @pytest.mark.parametrize( @@ -1381,11 +1389,23 @@ def validate_top_function(trace): trace ), f"unknown top function {trace['frames'][0]} {[t['function'] for t in trace['frames'][:4]]}" # assert mocked.call_args_list == [] + expected_rule_type = "command_injection" if endpoint == "shell_injection" else endpoint + expected_variant = ( + "exec" if endpoint == "command_injection" else "shell" if endpoint == "shell_injection" else None + ) matches = [t for c, n, t in telemetry_calls if c == "CountMetric" and n == "appsec.rasp.rule.match"] - assert matches == [(("rule_type", endpoint), ("waf_version", DDWAF_VERSION))], matches + if expected_variant: + expected_tags = ( + ("rule_type", expected_rule_type), + ("rule_variant", expected_variant), + ("waf_version", DDWAF_VERSION), + ) + else: + expected_tags = (("rule_type", expected_rule_type), ("waf_version", DDWAF_VERSION)) + assert matches == [expected_tags], matches evals = [t for c, n, t in telemetry_calls if c == "CountMetric" and n == "appsec.rasp.rule.eval"] # there may have been multiple 
evaluations of other rules too - assert (("rule_type", endpoint), ("waf_version", DDWAF_VERSION)) in evals + assert expected_tags in evals if action_level == 2: assert get_tag("rasp.request.done") is None, get_tag("rasp.request.done") else: @@ -1509,7 +1529,7 @@ def test_fingerprinting(self, interface, root_span, get_tag, asm_enabled, user_a def test_iast(self, interface, root_span, get_tag): from ddtrace.ext import http - url = "/rasp/command_injection/?cmd=." + url = "/rasp/command_injection/?cmds=." self.update_tracer(interface) response = interface.client.get(url) assert self.status(response) == 200 diff --git a/tests/appsec/iast/_ast/test_ast_patching.py b/tests/appsec/iast/_ast/test_ast_patching.py index cf0fabd14e4..d014496942b 100644 --- a/tests/appsec/iast/_ast/test_ast_patching.py +++ b/tests/appsec/iast/_ast/test_ast_patching.py @@ -9,7 +9,9 @@ from ddtrace.appsec._constants import IAST from ddtrace.appsec._iast._ast.ast_patching import _in_python_stdlib from ddtrace.appsec._iast._ast.ast_patching import _should_iast_patch +from ddtrace.appsec._iast._ast.ast_patching import _trie_has_prefix_for from ddtrace.appsec._iast._ast.ast_patching import astpatch_module +from ddtrace.appsec._iast._ast.ast_patching import build_trie from ddtrace.appsec._iast._ast.ast_patching import visit_ast from ddtrace.internal.utils.formats import asbool from tests.utils import override_env @@ -308,3 +310,87 @@ def test_astpatch_dir_patched_with_or_without_custom_dir(module_name, expected_n # Check that all the symbols in the expected set are in the patched dir() result for name in expected_names: assert name in patched_dir + + +def test_build_trie(): + from ddtrace.appsec._iast._ast.ast_patching import build_trie + + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert dict(trie) == { + "a": { + "b": { + "c": {"": None}, + }, + }, + "d": { + "e": { + "f": {"": None}, + }, + }, + "g": { + "h": { + "i": {"": None}, + }, + }, + "j": { + "k": { + "l": {"": None}, + }, + }, + "m": { + "n": { + "o": {"": None}, + }, + }, + "p": { + "q": { + "r": {"": None}, + }, + }, + "s": { + "t": { + "u": {"": None}, + }, + }, + "v": { + "w": { + "x": {"": None}, + }, + }, + "y": { + "z": {"": None}, + }, + } + + +def test_trie_has_string_match(): + trie = build_trie(["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]) + assert _trie_has_prefix_for(trie, "abc") + assert not _trie_has_prefix_for(trie, "ab") + assert _trie_has_prefix_for(trie, "abcd") + assert _trie_has_prefix_for(trie, "def") + assert not _trie_has_prefix_for(trie, "de") + assert _trie_has_prefix_for(trie, "defg") + assert _trie_has_prefix_for(trie, "ghi") + assert not _trie_has_prefix_for(trie, "gh") + assert _trie_has_prefix_for(trie, "ghij") + assert _trie_has_prefix_for(trie, "jkl") + assert not _trie_has_prefix_for(trie, "jk") + assert _trie_has_prefix_for(trie, "jklm") + assert _trie_has_prefix_for(trie, "mno") + assert not _trie_has_prefix_for(trie, "mn") + assert _trie_has_prefix_for(trie, "mnop") + assert _trie_has_prefix_for(trie, "pqr") + assert not _trie_has_prefix_for(trie, "pq") + assert _trie_has_prefix_for(trie, "pqrs") + assert _trie_has_prefix_for(trie, "stu") + assert not _trie_has_prefix_for(trie, "st") + assert _trie_has_prefix_for(trie, "stuv") + assert _trie_has_prefix_for(trie, "vwx") + assert not _trie_has_prefix_for(trie, "vw") + assert _trie_has_prefix_for(trie, "vwxy") + assert _trie_has_prefix_for(trie, "yz") + assert not _trie_has_prefix_for(trie, "y") + assert 
_trie_has_prefix_for(trie, "yza") + assert not _trie_has_prefix_for(trie, "z") + assert not _trie_has_prefix_for(trie, "zzz") diff --git a/tests/appsec/iast/taint_sinks/test_command_injection.py b/tests/appsec/iast/taint_sinks/test_command_injection.py index b716f594e85..ab611c1969b 100644 --- a/tests/appsec/iast/taint_sinks/test_command_injection.py +++ b/tests/appsec/iast/taint_sinks/test_command_injection.py @@ -123,7 +123,7 @@ def test_popen_wait_shell_true(iast_context_defaults): _assert_vulnerability("test_popen_wait_shell_true", source_name=source_name) -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") @pytest.mark.parametrize( "function,mode,arguments,tag", [ @@ -156,11 +156,11 @@ def test_osspawn_variants(iast_context_defaults, function, mode, arguments, tag) if "spawnv" in cleaned_name: # label test_osspawn_variants2 - function(mode, copied_args[0], copied_args) + function(mode, copied_args[0], copied_args[1:]) label = "test_osspawn_variants2" else: # label test_osspawn_variants1 - function(mode, copied_args[0], *copied_args) + function(mode, copied_args[0], *copied_args[1:]) label = "test_osspawn_variants1" _assert_vulnerability( @@ -171,7 +171,7 @@ def test_osspawn_variants(iast_context_defaults, function, mode, arguments, tag) ) -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") def test_multiple_cmdi(iast_context_defaults): _BAD_DIR = taint_pyobject( pyobject=_BAD_DIR_DEFAULT, @@ -193,7 +193,7 @@ def test_multiple_cmdi(iast_context_defaults): assert len(list(data["vulnerabilities"])) == 2 -@pytest.mark.skipif(sys.platform != "linux", reason="Only for Linux") +@pytest.mark.skipif(sys.platform not in ["linux", "darwin"], reason="Only for Unix") def test_string_cmdi(iast_context_defaults): cmd = taint_pyobject( pyobject="dir -l .", diff --git a/tests/appsec/iast_packages/packages/pkg_pyjwt.py b/tests/appsec/iast_packages/packages/pkg_pyjwt.py index 4712f6cee0f..ec43d8a17d2 100644 --- a/tests/appsec/iast_packages/packages/pkg_pyjwt.py +++ b/tests/appsec/iast_packages/packages/pkg_pyjwt.py @@ -3,6 +3,7 @@ https://pypi.org/project/PyJWT/ """ + import datetime from flask import Blueprint @@ -25,7 +26,10 @@ def pkg_pyjwt_view(): secret_key = "your-256-bit-secret" user_payload = request.args.get("package_param", "default-user") - payload = {"user": user_payload, "exp": datetime.datetime.utcnow() + datetime.timedelta(seconds=30)} + payload = { + "user": user_payload, + "exp": datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(seconds=30), + } try: # Encode the payload to create a JWT diff --git a/tests/contrib/elasticsearch/test_elasticsearch.py b/tests/contrib/elasticsearch/test_elasticsearch.py index b80b4486e71..4a480c550c8 100644 --- a/tests/contrib/elasticsearch/test_elasticsearch.py +++ b/tests/contrib/elasticsearch/test_elasticsearch.py @@ -1,6 +1,7 @@ import datetime from http.client import HTTPConnection from importlib import import_module +import json import time import pytest @@ -167,7 +168,12 @@ def test_elasticsearch(self): es.index(id=10, body={"name": "ten", "created": datetime.date(2016, 1, 1)}, **args) es.index(id=11, body={"name": "eleven", "created": datetime.date(2016, 2, 1)}, **args) es.index(id=12, body={"name": "twelve", "created": datetime.date(2016, 3, 1)}, **args) - result = es.search(sort=["name:desc"], size=100, body={"query": 
{"match_all": {}}}, **args) + result = es.search( + sort={"name": {"order": "desc", "unmapped_type": "keyword"}}, + size=100, + body={"query": {"match_all": {}}}, + **args, + ) assert len(result["hits"]["hits"]) == 3, result spans = self.get_spans() @@ -183,13 +189,25 @@ def test_elasticsearch(self): assert url.endswith("/_search") assert url == span.get_tag("elasticsearch.url") if elasticsearch.__version__ >= (8, 0, 0): - assert span.get_tag("elasticsearch.body").replace(" ", "") == '{"query":{"match_all":{}},"size":100}' - assert set(span.get_tag("elasticsearch.params").split("&")) == {"sort=name%3Adesc"} - assert set(span.get_tag(http.QUERY_STRING).split("&")) == {"sort=name%3Adesc"} + # Key order is not consistent, parse into dict to compare + body = json.loads(span.get_tag("elasticsearch.body")) + assert body == { + "query": {"match_all": {}}, + "sort": {"name": {"order": "desc", "unmapped_type": "keyword"}}, + "size": 100, + } + assert not span.get_tag("elasticsearch.params") + assert not span.get_tag(http.QUERY_STRING) else: assert span.get_tag("elasticsearch.body").replace(" ", "") == '{"query":{"match_all":{}}}' - assert set(span.get_tag("elasticsearch.params").split("&")) == {"sort=name%3Adesc", "size=100"} - assert set(span.get_tag(http.QUERY_STRING).split("&")) == {"sort=name%3Adesc", "size=100"} + assert set(span.get_tag("elasticsearch.params").split("&")) == { + "sort=%7B%27name%27%3A+%7B%27order%27%3A+%27desc%27%2C+%27unmapped_type%27%3A+%27keyword%27%7D%7D", + "size=100", + } + assert set(span.get_tag(http.QUERY_STRING).split("&")) == { + "sort=%7B%27name%27%3A+%7B%27order%27%3A+%27desc%27%2C+%27unmapped_type%27%3A+%27keyword%27%7D%7D", + "size=100", + } assert span.get_tag("component") == "elasticsearch" assert span.get_tag("span.kind") == "client" diff --git a/tests/contrib/openai/test_openai_llmobs.py b/tests/contrib/openai/test_openai_llmobs.py index a1a2b93a5ca..a145877c8c8 100644 --- a/tests/contrib/openai/test_openai_llmobs.py +++ b/tests/contrib/openai/test_openai_llmobs.py @@ -518,11 +518,17 @@ async def test_chat_completion_azure_async( ) ) - def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer): + @pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" + ) + def test_chat_completion_stream_explicit_no_tokens( + self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer + ): """Ensure llmobs records are emitted for chat completion endpoints when configured. Also ensure the llmobs records have the correct tagging including trace/span ID for trace correlation. """ + with get_openai_vcr(subdirectory_name="v1").use_cassette("chat_completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: with mock.patch("ddtrace.contrib.internal.openai.utils._est_tokens") as mock_est: @@ -534,7 +540,11 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs expected_completion = "The Los Angeles Dodgers won the World Series in 2020." 
client = openai.OpenAI() resp = client.chat.completions.create( - model=model, messages=input_messages, stream=True, user="ddtrace-test" + model=model, + messages=input_messages, + stream=True, + user="ddtrace-test", + stream_options={"include_usage": False}, ) for chunk in resp: resp_model = chunk.model @@ -547,7 +557,7 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs model_provider="openai", input_messages=input_messages, output_messages=[{"content": expected_completion, "role": "assistant"}], - metadata={"stream": True, "user": "ddtrace-test"}, + metadata={"stream": True, "stream_options": {"include_usage": False}, "user": "ddtrace-test"}, token_metrics={"input_tokens": 8, "output_tokens": 8, "total_tokens": 16}, tags={"ml_app": "", "service": "tests.contrib.openai"}, ) @@ -557,20 +567,14 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available in 1.26.0+" ) def test_chat_completion_stream_tokens(self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer): - """ - Ensure llmobs records are emitted for chat completion endpoints when configured - with the `stream_options={"include_usage": True}`. - Also ensure the llmobs records have the correct tagging including trace/span ID for trace correlation. - """ + """Assert that streamed token chunk extraction logic works when options are not explicitly passed from user.""" with get_openai_vcr(subdirectory_name="v1").use_cassette("chat_completion_streamed_tokens.yaml"): model = "gpt-3.5-turbo" resp_model = model input_messages = [{"role": "user", "content": "Who won the world series in 2020?"}] expected_completion = "The Los Angeles Dodgers won the World Series in 2020." client = openai.OpenAI() - resp = client.chat.completions.create( - model=model, messages=input_messages, stream=True, stream_options={"include_usage": True} - ) + resp = client.chat.completions.create(model=model, messages=input_messages, stream=True) for chunk in resp: resp_model = chunk.model span = mock_tracer.pop_traces()[0][0] @@ -671,7 +675,6 @@ def test_chat_completion_tool_call_stream(self, openai, ddtrace_global_config, m messages=[{"role": "user", "content": chat_completion_input_description}], user="ddtrace-test", stream=True, - stream_options={"include_usage": True}, ) for chunk in resp: resp_model = chunk.model diff --git a/tests/contrib/openai/test_openai_v1.py b/tests/contrib/openai/test_openai_v1.py index f13de144fc5..91737d9e5eb 100644 --- a/tests/contrib/openai/test_openai_v1.py +++ b/tests/contrib/openai/test_openai_v1.py @@ -921,128 +921,78 @@ def test_span_finish_on_stream_error(openai, openai_vcr, snapshot_tracer): ) -def test_completion_stream(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.snapshot +@pytest.mark.skipif(TIKTOKEN_AVAILABLE, reason="This test estimates token counts") +def test_completion_stream_est_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.OpenAI() resp = client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) - chunks = [c for c in resp] - - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion + _ = [c for c in resp] - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="This test computes token counts using tiktoken") +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +def test_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): + with openai_vcr.use_cassette("completion_streamed.yaml"): + with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: + mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] + client = openai.OpenAI() + resp = client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) + _ = [c for c in resp] -async def test_completion_async_stream(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="This test computes token counts using tiktoken") +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +async def test_completion_async_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.AsyncOpenAI() - resp = await client.completions.create(model="ada", prompt="Hello world", stream=True) - chunks = [c async for c in resp] - - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion - - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls + resp = await client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) + _ = [c async for c in resp] @pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 6, 0), + parse_version(openai_module.version.VERSION) < (1, 6, 0) or not TIKTOKEN_AVAILABLE, reason="Streamed response context managers are only available v1.6.0+", ) -def test_completion_stream_context_manager(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +def test_completion_stream_context_manager(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.OpenAI() with client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) as resp: - chunks = [c for c in resp] + _ = [c for c in resp] - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion - - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" +) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") +def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): + """Assert that streamed token chunk extraction logic works automatically.""" + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): + with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: + mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] + client = openai.OpenAI() + resp = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Who won the world series in 2020?"}], + stream=True, + user="ddtrace-test", + n=None, + ) + _ = [c for c in resp] -def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" +) +def test_chat_completion_stream_explicit_no_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): + """Assert that streamed token chunk extraction logic is avoided if explicitly set to False by the user.""" with openai_vcr.use_cassette("chat_completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] @@ -1054,20 +1004,16 @@ def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_trace {"role": "user", "content": "Who won the world series in 2020?"}, ], stream=True, + stream_options={"include_usage": False}, user="ddtrace-test", n=None, ) - prompt_tokens = 8 span = snapshot_tracer.current_span() chunks = [c for c in resp] assert len(chunks) == 15 completion = "".join([c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None]) assert completion == expected_completion - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") 
== "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - expected_tags = [ "version:", "env:", @@ -1087,16 +1033,19 @@ def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_trace expected_tags += ["openai.estimated:true"] if TIKTOKEN_AVAILABLE: expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls + assert mock.call.distribution("tokens.prompt", 8, tags=expected_tags) in mock_metrics.mock_calls assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available in 1.26.0+" +) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") async def test_chat_completion_async_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed.yaml"): + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." client = openai.AsyncOpenAI() resp = await client.chat.completions.create( model="gpt-3.5-turbo", @@ -1104,99 +1053,21 @@ async def test_chat_completion_async_stream(openai, openai_vcr, mock_metrics, sn {"role": "user", "content": "Who won the world series in 2020?"}, ], stream=True, + n=None, user="ddtrace-test", ) - prompt_tokens = 8 - span = snapshot_tracer.current_span() - chunks = [c async for c in resp] - assert len(chunks) == 15 - completion = "".join([c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None]) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - expected_tags += ["openai.estimated:true"] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - - -@pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available 
in 1.26.0+" -) -def test_chat_completion_stream_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." - client = openai.OpenAI() - resp = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Who won the world series in 2020?"}], - stream=True, - user="ddtrace-test", - n=None, - stream_options={"include_usage": True}, - ) - span = snapshot_tracer.current_span() - chunks = [c for c in resp] - completion = "".join( - [c.choices[0].delta.content for c in chunks if c.choices and c.choices[0].delta.content is not None] - ) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.prompt", 17, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", 19, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", 36, tags=expected_tags) in mock_metrics.mock_calls + _ = [c async for c in resp] @pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 6, 0), - reason="Streamed response context managers are only available v1.6.0+", + parse_version(openai_module.version.VERSION) < (1, 26, 0), + reason="Streamed response context managers are only available v1.6.0+, tokens available 1.26.0+", ) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") async def test_chat_completion_async_stream_context_manager(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed.yaml"): + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." 
client = openai.AsyncOpenAI() async with await client.chat.completions.create( model="gpt-3.5-turbo", @@ -1207,41 +1078,7 @@ async def test_chat_completion_async_stream_context_manager(openai, openai_vcr, user="ddtrace-test", n=None, ) as resp: - prompt_tokens = 8 - span = snapshot_tracer.current_span() - chunks = [c async for c in resp] - assert len(chunks) == 15 - completion = "".join( - [c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None] - ) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - expected_tags += ["openai.estimated:true"] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls + _ = [c async for c in resp] @pytest.mark.snapshot( diff --git a/tests/contrib/urllib3/test_urllib3.py b/tests/contrib/urllib3/test_urllib3.py index 2f0c447ee65..841e2c826ab 100644 --- a/tests/contrib/urllib3/test_urllib3.py +++ b/tests/contrib/urllib3/test_urllib3.py @@ -12,6 +12,7 @@ from ddtrace.ext import http from ddtrace.internal.schema import DEFAULT_SPAN_SERVICE_NAME from ddtrace.pin import Pin +from ddtrace.settings.asm import config as asm_config from tests.contrib.config import HTTPBIN_CONFIG from tests.opentracer.utils import init_tracer from tests.utils import TracerTestCase @@ -527,12 +528,16 @@ def test_distributed_tracing_disabled(self): timeout=mock.ANY, ) + @pytest.mark.skip(reason="urlib3 does not set the ASM Manual keep tag so x-datadog headers are not propagated") def test_distributed_tracing_apm_opt_out_true(self): """Tests distributed tracing headers are passed by default""" # Check that distributed tracing headers are passed down; raise an error rather than make the # request since we don't care about the response at all config.urllib3["distributed_tracing"] = True self.tracer.enabled = False + # Ensure the ASM SpanProcessor is set + self.tracer.configure(appsec_standalone_enabled=True, appsec_enabled=True) + assert asm_config._apm_opt_out with mock.patch( "urllib3.connectionpool.HTTPConnectionPool._make_request", side_effect=ValueError ) as m_make_request: @@ -580,6 +585,9 @@ def test_distributed_tracing_apm_opt_out_false(self): """Test with distributed tracing disabled does not propagate the headers""" config.urllib3["distributed_tracing"] = True self.tracer.enabled = False + # Ensure the ASM SpanProcessor is set. 
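        # With appsec_standalone_enabled=False the tracer presumably stays out of APM opt-out
        # mode (asm_config._apm_opt_out remains False), so the x-datadog propagation headers
        # are not expected on the outgoing request in this test.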
+ self.tracer.configure(appsec_standalone_enabled=False, appsec_enabled=True) + assert not asm_config._apm_opt_out with mock.patch( "urllib3.connectionpool.HTTPConnectionPool._make_request", side_effect=ValueError ) as m_make_request: diff --git a/tests/debugging/exception/test_replay.py b/tests/debugging/exception/test_replay.py index 9aae75dae47..8261bfb5b47 100644 --- a/tests/debugging/exception/test_replay.py +++ b/tests/debugging/exception/test_replay.py @@ -294,3 +294,23 @@ def c(foo=42): self.assert_span_count(6) # no new snapshots assert len(uploader.collector.queue) == 3 + + def test_debugger_exception_in_closure(self): + def b(): + with self.trace("b"): + nonloc = 4 + + def a(v): + if nonloc: + raise ValueError("hello", v) + + a(nonloc) + + with exception_replay() as uploader: + with with_rate_limiter(RateLimiter(limit_rate=1, raise_on_exceed=False)): + with pytest.raises(ValueError): + b() + + assert all( + s.line_capture["locals"]["nonloc"] == {"type": "int", "value": "4"} for s in uploader.collector.queue + ) diff --git a/tests/debugging/test_safety.py b/tests/debugging/test_safety.py index 3acb0288924..cc44ca9ca12 100644 --- a/tests/debugging/test_safety.py +++ b/tests/debugging/test_safety.py @@ -15,7 +15,10 @@ def assert_args(args): assert set(dict(_safety.get_args(inspect.currentframe().f_back)).keys()) == args def assert_locals(_locals): - assert set(dict(_safety.get_locals(inspect.currentframe().f_back)).keys()) == _locals + assert set(dict(_safety.get_locals(inspect.currentframe().f_back)).keys()) == _locals | { + "assert_args", + "assert_locals", + } def assert_globals(_globals): assert set(dict(_safety.get_globals(inspect.currentframe().f_back)).keys()) == _globals diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 0ecdde36ee6..4e60a8f3996 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -210,11 +210,13 @@ def _get_llmobs_parent_id(span: Span): def _expected_llmobs_eval_metric_event( - span_id, - trace_id, metric_type, label, ml_app, + tag_key=None, + tag_value=None, + span_id=None, + trace_id=None, timestamp_ms=None, categorical_value=None, score_value=None, @@ -223,8 +225,7 @@ def _expected_llmobs_eval_metric_event( metadata=None, ): eval_metric_event = { - "span_id": span_id, - "trace_id": trace_id, + "join_on": {}, "metric_type": metric_type, "label": label, "tags": [ @@ -232,6 +233,10 @@ def _expected_llmobs_eval_metric_event( "ml_app:{}".format(ml_app if ml_app is not None else "unnamed-ml-app"), ], } + if tag_key is not None and tag_value is not None: + eval_metric_event["join_on"]["tag"] = {"key": tag_key, "value": tag_value} + if span_id is not None and trace_id is not None: + eval_metric_event["join_on"]["span"] = {"span_id": span_id, "trace_id": trace_id} if categorical_value is not None: eval_metric_event["categorical_value"] = categorical_value if score_value is not None: @@ -542,8 +547,7 @@ def run_and_submit_evaluation(self, span): def _dummy_evaluator_eval_metric_event(span_id, trace_id): return LLMObsEvaluationMetricEvent( - span_id=span_id, - trace_id=trace_id, + join_on={"span": {"span_id": span_id, "trace_id": trace_id}}, score_value=1.0, ml_app="unnamed-ml-app", timestamp_ms=mock.ANY, @@ -553,7 +557,46 @@ def _dummy_evaluator_eval_metric_event(span_id, trace_id): ) -def _expected_ragas_spans(ragas_inputs=None): +def _expected_ragas_context_precision_spans(ragas_inputs=None): + if not ragas_inputs: + ragas_inputs = default_ragas_inputs + return [ + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": 
"undefined", + "name": "dd-ragas.context_precision", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": "1.0"}, + }, + "metrics": {}, + "tags": expected_ragas_trace_tags(), + }, + { + "trace_id": mock.ANY, + "span_id": mock.ANY, + "parent_id": mock.ANY, + "name": "dd-ragas.extract_evaluation_inputs_from_span", + "start_ns": mock.ANY, + "duration": mock.ANY, + "status": "ok", + "meta": { + "span.kind": "workflow", + "input": {"value": mock.ANY}, + "output": {"value": mock.ANY}, + }, + "metrics": {}, + "tags": expected_ragas_trace_tags(), + }, + ] + + +def _expected_ragas_faithfulness_spans(ragas_inputs=None): if not ragas_inputs: ragas_inputs = default_ragas_inputs return [ @@ -581,7 +624,7 @@ def _expected_ragas_spans(ragas_inputs=None): "trace_id": mock.ANY, "span_id": mock.ANY, "parent_id": mock.ANY, - "name": "dd-ragas.extract_faithfulness_inputs", + "name": "dd-ragas.extract_evaluation_inputs_from_span", "start_ns": mock.ANY, "duration": mock.ANY, "status": "ok", diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index a7d467b3985..5a63b7e2b8f 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -31,26 +31,6 @@ def pytest_configure(config): config.addinivalue_line("markers", "vcr_logs: mark test to use recorded request/responses") -@pytest.fixture -def mock_llmobs_span_writer(): - patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter") - LLMObsSpanWriterMock = patcher.start() - m = mock.MagicMock() - LLMObsSpanWriterMock.return_value = m - yield m - patcher.stop() - - -@pytest.fixture -def mock_llmobs_span_agentless_writer(): - patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter") - LLMObsSpanWriterMock = patcher.start() - m = mock.MagicMock() - LLMObsSpanWriterMock.return_value = m - yield m - patcher.stop() - - @pytest.fixture def mock_llmobs_eval_metric_writer(): patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsEvalMetricWriter") @@ -85,10 +65,7 @@ def mock_llmobs_submit_evaluation(): def mock_http_writer_send_payload_response(): with mock.patch( "ddtrace.internal.writer.HTTPWriter._send_payload", - return_value=Response( - status=200, - body="{}", - ), + return_value=Response(status=200, body="{}"), ): yield @@ -124,9 +101,10 @@ def mock_evaluator_sampler_logs(): @pytest.fixture -def mock_http_writer_logs(): - with mock.patch("ddtrace.internal.writer.writer.log") as m: +def mock_llmobs_logs(): + with mock.patch("ddtrace.llmobs._llmobs.log") as m: yield m + m.reset_mock() @pytest.fixture @@ -139,44 +117,6 @@ def default_global_config(): return {"_dd_api_key": "", "_llmobs_ml_app": "unnamed-ml-app"} -@pytest.fixture -def LLMObs( - mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, ddtrace_global_config -): - global_config = default_global_config() - global_config.update(ddtrace_global_config) - with override_global_config(global_config): - dummy_tracer = DummyTracer() - llmobs_service.enable(_tracer=dummy_tracer) - yield llmobs_service - llmobs_service.disable() - - -@pytest.fixture -def AgentlessLLMObs( - mock_llmobs_span_agentless_writer, - mock_llmobs_eval_metric_writer, - mock_llmobs_evaluator_runner, - ddtrace_global_config, -): - global_config = default_global_config() - global_config.update(ddtrace_global_config) - global_config.update(dict(_llmobs_agentless_enabled=True)) - with override_global_config(global_config): - dummy_tracer = DummyTracer() - 
llmobs_service.enable(_tracer=dummy_tracer) - yield llmobs_service - llmobs_service.disable() - - -@pytest.fixture -def disabled_llmobs(): - prev = llmobs_service.enabled - llmobs_service.enabled = False - yield - llmobs_service.enabled = prev - - @pytest.fixture def mock_ragas_dependencies_not_present(): import ragas @@ -189,18 +129,22 @@ def mock_ragas_dependencies_not_present(): @pytest.fixture -def ragas(mock_llmobs_span_writer, mock_llmobs_eval_metric_writer): +def ragas(mock_llmobs_eval_metric_writer): with override_global_config(dict(_dd_api_key="")): - import ragas - + try: + import ragas + except ImportError: + pytest.skip("Ragas not installed") with override_env(dict(OPENAI_API_KEY=os.getenv("OPENAI_API_KEY", ""))): yield ragas @pytest.fixture def reset_ragas_faithfulness_llm(): - import ragas - + try: + import ragas + except ImportError: + pytest.skip("Ragas not installed") previous_llm = ragas.metrics.faithfulness.llm yield ragas.metrics.faithfulness.llm = previous_llm @@ -243,16 +187,25 @@ def llmobs_span_writer(): @pytest.fixture -def llmobs(monkeypatch, tracer, llmobs_env, llmobs_span_writer): +def llmobs( + ddtrace_global_config, + monkeypatch, + tracer, + llmobs_env, + llmobs_span_writer, + mock_llmobs_eval_metric_writer, + mock_llmobs_evaluator_runner, +): for env, val in llmobs_env.items(): monkeypatch.setenv(env, val) - + global_config = default_global_config() + global_config.update(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))) + global_config.update(ddtrace_global_config) # TODO: remove once rest of tests are moved off of global config tampering - with override_global_config(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))): + with override_global_config(global_config): llmobs_service.enable(_tracer=tracer) llmobs_service._instance._llmobs_span_writer = llmobs_span_writer - llmobs_service._instance._trace_processor._span_writer = llmobs_span_writer - yield llmobs + yield llmobs_service llmobs_service.disable() diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml index 61c26ff7bf0..f767f5de303 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500942}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568298743}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: 
'{"data":{"id":"e66c93b9-ca0a-4f0a-9207-497e0a1b6eec","type":"evaluation_metric","attributes":{"metrics":[{"id":"5fb5ed5d-20c1-4f34-abf9-c0bdc09680e3","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500942,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: '{"data":{"id":"5b998846-53af-4b0e-a658-fd9e06726d6d","type":"evaluation_metric","attributes":{"metrics":[{"id":"jbGbAMC7Rk","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568298743,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:41 GMT + - Mon, 25 Nov 2024 20:58:19 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml index 92498e86e9e..f4404b30832 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500339}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297450}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"36d88c24-d7d4-4d3e-853c-b695aff61344","type":"evaluation_metric","attributes":{"metrics":[{"id":"0c189d9c-a730-4c5d-bbc2-55ef3455900f","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249500339,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: '{"data":{"id":"49c5c927-76f1-4de4-ad97-e1a0a159229f","type":"evaluation_metric","attributes":{"metrics":[{"id":"okVf1U4XzA","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568297450,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: content-length: - - '330' + - '325' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:17 GMT 
strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml index 68fe0315870..ef6f4cf445e 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml @@ -1,15 +1,16 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500253}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297307}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}' @@ -21,7 +22,7 @@ interactions: content-type: - application/json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:17 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload x-content-type-options: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml index 61da12cd3fa..3638a1cf608 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_multiple_events.yaml @@ -1,32 +1,30 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249589510}, - {"span_id": "12345678901", "trace_id": "98765432101", "metric_type": "categorical", - "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", - "timestamp_ms": 1724249589510}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568728793}, {"join_on": {"span": {"span_id": "12345678901", + "trace_id": "98765432101"}}, "metric_type": "categorical", "categorical_value": + "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1732568728793}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - 
string: '{"data":{"id":"2ccffdfc-024b-49e6-881c-4e4d1c5f450e","type":"evaluation_metric","attributes":{"metrics":[{"id":"ed072901-fd70-4417-9cab-1bad62b6ac09","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249589510,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9},{"id":"16175a34-7c25-43ca-8551-bd2f7242ab77","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249589510,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: '{"data":{"id":"844be0cd-9dd4-45d3-9763-8ccb20f4e7c8","type":"evaluation_metric","attributes":{"metrics":[{"id":"IZhAbBsXBJ","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568728793,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9},{"id":"ME868fTl0T","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568728793,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: - Connection: - - keep-alive - Content-Length: - - '538' - Content-Type: - - application/vnd.api+json - Date: - - Wed, 21 Aug 2024 14:13:09 GMT + content-length: + - '528' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com + content-type: + - application/vnd.api+json + date: + - Mon, 25 Nov 2024 21:05:29 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml index 1394f9fbb43..65bb0fa1562 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_score_metric.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500471}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568297772}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"5bd1b0b7-0acd-46e2-8ff6-3ee6a92457b6","type":"evaluation_metric","attributes":{"metrics":[{"id":"d8aa2a23-3137-4c49-b87b-d1eb1c3af04e","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500471,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: 
'{"data":{"id":"d1518236-84b1-4b47-9cbc-ffc24188b5cc","type":"evaluation_metric","attributes":{"metrics":[{"id":"jiKtwDKR0B","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568297772,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:11:40 GMT + - Mon, 25 Nov 2024 20:58:18 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml index c9797ace419..c31d610bd57 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_timed_events.yaml @@ -1,27 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment", - "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249982978}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568764624}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"aba22157-cc3a-4601-a6a5-7afa99eee73e","type":"evaluation_metric","attributes":{"metrics":[{"id":"c2f6f63c-17ca-48c3-ad2d-676b2a35e726","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249982978,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + string: '{"data":{"id":"5352c11a-dcdd-449b-af72-2ae0b5dac3a1","type":"evaluation_metric","attributes":{"metrics":[{"id":"WmMD7E_fAD","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568764624,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' headers: content-length: - - '316' + - '311' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:19:45 GMT + - Mon, 25 Nov 2024 21:06:04 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: @@ -34,28 +35,29 @@ interactions: code: 202 message: Accepted - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value": - "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 
1724249983284}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1732568765127}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"0bc39c40-6c72-4b11-9eea-826248f9fe37","type":"evaluation_metric","attributes":{"metrics":[{"id":"7da7eb5b-32d2-43b3-adf5-208313f822c5","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249983284,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + string: '{"data":{"id":"d39e806e-40c5-4b3c-b539-440390afca85","type":"evaluation_metric","attributes":{"metrics":[{"id":"403hQLmrQW","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568765127,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' headers: content-length: - - '330' + - '325' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Wed, 21 Aug 2024 14:19:45 GMT + - Mon, 25 Nov 2024 21:06:05 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml index e2e17e715cf..f5deea8ef90 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml @@ -1,28 +1,28 @@ interactions: - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "123", "trace_id": "1234", "label": "dummy", "metric_type": "score", "timestamp_ms": - 1729569649880, "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022", - "ml_app:unnamed-ml-app"]}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type": + "score", "timestamp_ms": 1732569321978, "score_value": 1.0, "ml_app": "unnamed-ml-app", + "tags": ["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122", "ml_app:unnamed-ml-app"]}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"2131dbc0-d085-401c-8b2d-8506a9ac8c13","type":"evaluation_metric","attributes":{"metrics":[{"id":"YutAyQc6F4","trace_id":"1234","span_id":"123","timestamp_ms":1729569649880,"ml_app":"unnamed-ml-app","metric_type":"score","label":"dummy","score_value":1,"tags":["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022","ml_app:unnamed-ml-app"]}]}}}' + string: 
'{"data":{"id":"06c00db0-1898-44be-ae0b-f0149f819c59","type":"evaluation_metric","attributes":{"metrics":[{"id":"1DrSMXmWcP","join_on":{"span":{"trace_id":"1234","span_id":"123"}},"timestamp_ms":1732569321978,"ml_app":"unnamed-ml-app","metric_type":"score","label":"dummy","score_value":1,"tags":["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122","ml_app:unnamed-ml-app"]}]}}}' headers: content-length: - - '357' + - '378' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Tue, 22 Oct 2024 04:00:50 GMT + - Mon, 25 Nov 2024 21:15:22 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml similarity index 76% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml index 757f875443f..367024a712d 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml @@ -73,19 +73,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA2yRW4vbMBCF3/0rxDzHi+2mySZvbUOhsPRGoSxxsBV5bKuVJaGZ9ELIfy9yvEnK - 7oseztH5NGd0TIQA3cBagOolq8Gb9M3D5tPi1zuz+r0qHjavv27a74+vPj5+sG8z8wVmMeH2P1Dx - U+pOucEbZO3s2VYBJWOk5stitchXy3w+GoNr0MRY5zmdu3TQVqdFVszTbJnm91O6d1ohwVpsEyGE - OI5nnNM2+AfWIps9KQMSyQ5hfbkkBARnogKSSBNLyzC7mspZRjuOXtf19lgCYVQUViO+HPmiBNKx - U6iIJeOAlila2xK+9SiU9JqlEa4V74O0CoUm8VkGTXcl7E67uq5vHw3YHkjG4vZgzKSfLi2M63xw - e5r8i95qq6mvAkpyNk5M7DyM7ikRYjdu6/DfAsAHN3iu2P1EG4GLLD/z4PpJV7dYTCY7luYmVSxn - L/CqBllqQzf7BiVVj801miU35Z4/+hLiXFDb7hklmUhAf4lxqFptOww+6PMPtr6a3xeqKORyryA5 - Jf8AAAD//wMAn6C7Cc8CAAA= + H4sIAAAAAAAAA4xSwYrbMBS8+yvEO8eL403iTW49bEtuWdpCIQ62Ij/bam1J6L1AS8i/FznZ2Mtu + oRcdZt6MZp50joQAXcFGgGolq9518acf0hyX6+02W26T3e7Ly/r5JTltv9rvNT/DLCjs8ScqflU9 + KNu7Dllbc6WVR8kYXOfZY7pcrVeLx4HobYVdkDWO44WNe210nCbpIk6yeP50U7dWKyTYiH0khBDn + 4Qw5TYW/YSOS2SvSI5FsEDb3ISHA2y4gIIk0sTQMs5FU1jCaIXpZlvtzDoQBUVgM9vngL3IgHTr5 + glgy9miYArXP4VuLQkmnWXbC1uKzl0ah0CR20mt6yOFwOZRlOb3UY30iGYqbU9fd8Mu9RWcb5+2R + bvwdr7XR1BYeJVkTEhNbBwN7iYQ4DNs6vVkAOG97xwXbX2iC4SqZX/1gfKSRTVc3ki3LbqJKs9kH + fkWFLHVHk32DkqrFapSOjyNPlbYTIpq0fp/mI+9rc22a/7EfCaXQMVaF81hp9bbxOOYx/OF/jd23 + PAQG+kOMfVFr06B3Xl9/UO2KJEuWx/opUwlEl+gvAAAA//8DABrEjBtPAwAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8d6b5b701f294367-EWR + - 8e84af2fba19c952-IAD Connection: - keep-alive Content-Encoding: @@ -93,14 +93,14 @@ interactions: Content-Type: - application/json Date: - - Tue, 22 Oct 2024 17:55:15 GMT + - Mon, 25 Nov 2024 21:20:43 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=iQaF937ylY7BvvBCyWYQoxiJwi1nBp5.LILrHLw1uno-1729619715-1.0.1.1-jS4Dz7yc_ud.hKZlJ_CAZkSQesqzVkfrA5F30zI7CtJsbEKyAiuVlpX0CPf816UtlhXQEW8T5nsc.UvnsCOzOw; - path=/; expires=Tue, 22-Oct-24 18:25:15 GMT; domain=.api.openai.com; HttpOnly; + - 
__cf_bm=CVMxqIcHUHNjX56k1MKjj4MgiYNVAlg_B7yyVaP_z1o-1732569643-1.0.1.1-HOtZfXprHWr_DjtorQ_ZK6bbSmcOsBrphniRCaC9XQ2tTtO5JVpyDQK1HRFo3kUE9GEi9J.sR0_L6nBtXlGj8w; + path=/; expires=Mon, 25-Nov-24 21:50:43 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=wQzHCwLW6CPU768K_tlLklWp36I8zYCVJkKlAMtnMkk-1729619715162-0.0.1.1-604800000; + - _cfuvid=sqtPZaucqBJu1r4exJtYym3vbKmuuSO6o0np5VglPsw-1732569643935-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -113,7 +113,7 @@ interactions: openai-organization: - datadog-staging openai-processing-ms: - - '496' + - '370' openai-version: - '2020-10-01' strict-transport-security: @@ -131,7 +131,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_33b8cddecaab8b8bc36e90f58f844636 + - req_02ed729afc2d9083921e3fe5b7528550 status: code: 200 message: OK @@ -193,8 +193,8 @@ interactions: content-type: - application/json cookie: - - __cf_bm=iQaF937ylY7BvvBCyWYQoxiJwi1nBp5.LILrHLw1uno-1729619715-1.0.1.1-jS4Dz7yc_ud.hKZlJ_CAZkSQesqzVkfrA5F30zI7CtJsbEKyAiuVlpX0CPf816UtlhXQEW8T5nsc.UvnsCOzOw; - _cfuvid=wQzHCwLW6CPU768K_tlLklWp36I8zYCVJkKlAMtnMkk-1729619715162-0.0.1.1-604800000 + - __cf_bm=CVMxqIcHUHNjX56k1MKjj4MgiYNVAlg_B7yyVaP_z1o-1732569643-1.0.1.1-HOtZfXprHWr_DjtorQ_ZK6bbSmcOsBrphniRCaC9XQ2tTtO5JVpyDQK1HRFo3kUE9GEi9J.sR0_L6nBtXlGj8w; + _cfuvid=sqtPZaucqBJu1r4exJtYym3vbKmuuSO6o0np5VglPsw-1732569643935-0.0.1.1-604800000 host: - api.openai.com user-agent: @@ -220,19 +220,20 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA2xSQW7bMBC86xWLPVuBpTqR7VuAoGiBAmmLHhrEgUVTK2tdiSTIdZDA8N8Lyorl - ILnwMLMznB3ykAAgV7gE1I0S3bk2vf1xd19M/c3u4frhu/37Tet73qm7383r7fwXTqLCbnak5U11 - pW3nWhK25kRrT0ooumZFvrjJFkV23ROdraiNsq2TdGbTjg2n+TSfpdMizeaDurGsKeASHhMAgEN/ - xpymohdcwnTyhnQUgtoSLs9DAOhtGxFUIXAQZQQnI6mtETJ99LIsHw8rDKKEOjKywiWs8E9DoJVj - US3YGr56ZTQBB/ipPIerFU5ghZ5UsGYUnD3ioIKKPWkBT46EYy3RSRoCNrX1neoh5+0zV1QBm57r - k73IcMMz+Yp1nyk7PpVlebmEp3ofVCzS7Nt2wI/nVlq7dd5uwsCf8ZoNh2Z9Ch8bCGId9uwxAXjq - 29+/KxSdt52Ttdh/ZKJhUcxPfjg++sh+WQykWFHtiM+zYvKJ37oiUdyGi/dDrXRD1SidJhfLfbz0 - M4vTgmy2H1ySwQnDaxDq1jWbLXnn+fQjareezXOd56rYaEyOyX8AAAD//wMAUtzROh8DAAA= + H4sIAAAAAAAAA4xTwW6bQBC98xWjPZsIOzg43Fqp7a2y2iqqFEew3h1gWthd7Y6jpJb/vVrsGEdJ + pV44vDfv8eYN7BMAQVqUIFQnWQ2uTz/8lEZ9/NI9//mafVt9Wt61uxbvtt+3Oa3XYhYVdvsLFb+o + rpQdXI9M1hxp5VEyRtd5cb1Y3tze5PlIDFZjH2Wt4zS36UCG0kW2yNOsSOerk7qzpDCIEu4TAID9 + +Iw5jcYnUUI2e0EGDEG2KMrzEIDwto+IkCFQYGlYzCZSWcNoxuh1Xd/vNyKwZBzQ8EaUsBE/OgQl + HbHswTbw2UujECjAWnoKVxsxg43wKIM1k+DsEQclaPKoGDw6ZIq1RCfuEMg01g9yhJy3j6RRA5mR + G5M98ekNj+g1qTHT/PBQ1/XlEh6bXZCxSLPr+xN+OLfS29Z5uw0n/ow3ZCh01TF8bCCwdWJkDwnA + w9j+7lWhwnk7OK7Y/kYTDYtidfQT09En9vr2RLJl2U/4al7M3vGrNLKkPlzcTyipOtSTdDq23Gmy + F0RysfXbNO95Hzcn0/6P/UQohY5RV85jvMmrjacxj/Gf+NfYueUxsAjPgXGoGjIteufp+EU2rsqK + bLltVoXKRHJI/gIAAP//AwDgYzoinwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8d6b5b744e034367-EWR + - 8e84af32dc70c952-IAD Connection: - keep-alive Content-Encoding: @@ -240,7 +241,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 22 Oct 2024 17:55:16 GMT + - Mon, 25 Nov 2024 21:20:45 GMT Server: - cloudflare Transfer-Encoding: @@ -254,7 +255,7 @@ interactions: openai-organization: - datadog-staging openai-processing-ms: - - '749' + - '1168' openai-version: - '2020-10-01' strict-transport-security: @@ -272,35 +273,37 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_fbb01161a03eb6f478ff52314b72cfd6 + - req_702ebaa1edbab95fb42f52baa4b34661 status: code: 200 
message: OK - request: - body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id": - "6877142543397072040", "trace_id": "6717e70200000000a99ea8ad36f4f36d", "label": - "ragas_faithfulness", "metric_type": "score", "timestamp_ms": 1729619716093, - "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022", - "ml_app:unnamed-ml-app"]}]}}}' + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "7678809694384023494", "trace_id": "6744ea2b00000000995e7b2ceabfce01"}}, + "label": "ragas_faithfulness", "metric_type": "score", "timestamp_ms": 1732569645205, + "score_value": 1.0, "ml_app": "unnamed-ml-app", "tags": ["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122", + "ml_app:unnamed-ml-app"], "metadata": {"_dd.evaluation_kind": "faithfulness", + "_dd.evaluation_span": {"span_id": "5771061714047746387", "trace_id": "6744ea2b000000007099aeb477077763"}, + "_dd.faithfulness_disagreements": []}}]}}}' headers: Content-Type: - application/json DD-API-KEY: - XXXXXX method: POST - uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric + uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric response: body: - string: '{"data":{"id":"99fa371c-457c-4d2b-8d4c-61657e0ffd48","type":"evaluation_metric","attributes":{"metrics":[{"id":"CbapxUnzcX","trace_id":"6717e70200000000a99ea8ad36f4f36d","span_id":"6877142543397072040","timestamp_ms":1729619716093,"ml_app":"unnamed-ml-app","metric_type":"score","label":"ragas_faithfulness","score_value":1,"tags":["ddtrace.version:2.15.0.dev219+ge047e25bb.d20241022","ml_app:unnamed-ml-app"]}]}}}' + string: '{"data":{"id":"f1470aa7-b97f-4809-825d-6932af26a81c","type":"evaluation_metric","attributes":{"metrics":[{"id":"EPRU-72kfP","join_on":{"span":{"trace_id":"6744ea2b00000000995e7b2ceabfce01","span_id":"7678809694384023494"}},"timestamp_ms":1732569645205,"ml_app":"unnamed-ml-app","metric_type":"score","label":"ragas_faithfulness","score_value":1,"tags":["ddtrace.version:2.15.0.dev351+g152f3e3b6.d20241122","ml_app:unnamed-ml-app"],"metadata":{"_dd.evaluation_kind":"faithfulness","_dd.evaluation_span":{"span_id":"5771061714047746387","trace_id":"6744ea2b000000007099aeb477077763"},"_dd.faithfulness_disagreements":[]}}]}}}' headers: content-length: - - '414' + - '623' content-security-policy: - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com content-type: - application/vnd.api+json date: - - Tue, 22 Oct 2024 17:55:17 GMT + - Mon, 25 Nov 2024 21:20:45 GMT strict-transport-security: - max-age=31536000; includeSubDomains; preload vary: diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml similarity index 80% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml index 8efe7391c90..2100bb3d305 100644 --- a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml +++ 
b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_emits_traces.yaml @@ -51,7 +51,7 @@ interactions: host: - api.openai.com user-agent: - - OpenAI/Python 1.47.1 + - OpenAI/Python 1.52.0 x-stainless-arch: - arm64 x-stainless-async: @@ -61,7 +61,9 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.47.1 + - 1.52.0 + x-stainless-retry-count: + - '0' x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -71,19 +73,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//dJHBbtswEETv+gpiz1YgCY7k+JbCKAoEQYKih7SWIdH0SmJDkQR3jbYw - /O8FZcd2D73wMI8znF0eEiFA72ApQA2S1ehN+vipfFytqq+rt7eHX4v9+3NY/aDnL5y9DE/fYRYd - bvsTFX+47pQbvUHWzp6wCigZY2peFVWRLcryfgKj26GJtt5zOnfpqK1Oi6yYp1mV5ouze3BaIcFS - rBMhhDhMZ+xpd/gbliKbfSgjEskeYXm5JAQEZ6ICkkgTS8swu0LlLKOdqrdtuz7UQBgVhc0UX0/5 - ogbScabQEEvGES1TROsavg0olPSapRGuE5+DtAqFJvEqg6a7GjbHTdu2t48G7PYk4+B2b8xZP16m - MK73wW3pzC96p62moQkoydnYmNh5mOgxEWIzbWv/zwLABzd6bti9o42BZZaf8uD6SVdalGfIjqW5 - cRXV/1zNDllqQzc7h1NDbftrQnapOc0J9IcYx6bTtsfggz59QeebfLudl3lZdQ+QHJO/AAAA//8D - AL2Ti/mQAgAA + H4sIAAAAAAAAA4xSwY7aMBS85yusdyarkAKh3Payh6qVVitUVQWUGOclcevYrt9D7Rbx75UDS1h1 + K/Xiw8yb8cyzj4kQoGtYCVCdZNV7k95/kaZtnpZr3++f159+Pk4/L35P5cevPz6YJ5hEhdt/Q8Uv + qjvlem+QtbNnWgWUjNF1WrzL54v383w2EL2r0URZ6zmdubTXVqd5ls/SrEiny4u6c1ohwUpsEiGE + OA5nzGlr/AUrkU1ekB6JZIuwug4JAcGZiIAk0sTSMkxGUjnLaIfoVVVtjlsgjIjCcrDfDv5iC6Rj + p1ASS8YeLVOkNltYdyiU9JqlEa4RD0FahUKTeJRB090WdqddVVW3lwZsDiRjcXsw5oKfri2Ma31w + e7rwV7zRVlNXBpTkbExM7DwM7CkRYjds6/BqAeCD6z2X7L6jjYaLbHr2g/GRRjZfXEh2LM2NKi8m + b/iVNbLUhm72DUqqDutROj6OPNTa3RDJTeu/07zlfW6ubfs/9iOhFHrGuvQBa61eNx7HAsY//K+x + 65aHwEDPxNiXjbYtBh/0+Qc1vsyKbL5vloXKIDklfwAAAP//AwB8IvReTwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c856bee184d42d1-EWR + - 8e84ac4858349c52-IAD Connection: - keep-alive Content-Encoding: @@ -91,14 +93,14 @@ interactions: Content-Type: - application/json Date: - - Tue, 24 Sep 2024 20:11:06 GMT + - Mon, 25 Nov 2024 21:18:45 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=nMe4XLsotHph1aKmM6xJotYxeBsTIpCG1ULeQ2oiKLc-1727208666-1.0.1.1-eM1elzOCEnpbPLkOO61HSBvaeQPYHEyO4Ba3P2NsxkYV23Fybb7E8tIipei4YDbhyDiLXybnT7H0ETvjbsV89g; - path=/; expires=Tue, 24-Sep-24 20:41:06 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=9IkhXEbzhF0QSjoZOW.EVqEQoEHTaAK7pnQ6K1m4EfY-1732569525-1.0.1.1-8FRhFy6jBsuonirbdG9jJ_IHnhXUakqpLsEg10YYrkhce9PwlXOKXNA2hiZwNqpM3D2TP2X4eFcZJjdEZt6.qQ; + path=/; expires=Mon, 25-Nov-24 21:48:45 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=lKBj.JPFMKz3LiJyz12GeZI73UndAQfhN.5aqiwYPHA-1727208666121-0.0.1.1-604800000; + - _cfuvid=YGxjg63ZVaAJESO.Ouzjnkmhsg2izo9JySj6zJ3MRuc-1732569525322-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -106,10 +108,12 @@ interactions: - nosniff access-control-expose-headers: - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 openai-organization: - datadog-staging openai-processing-ms: - - '576' + - '469' openai-version: - '2020-10-01' strict-transport-security: @@ -127,7 +131,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_ef3f2830eaf13bceea5db3a7369affda + - req_5d6c0d3f36d4cba76fbfea5b6c9f63fe status: code: 200 message: OK @@ -189,12 +193,12 @@ interactions: content-type: - application/json cookie: - - __cf_bm=nMe4XLsotHph1aKmM6xJotYxeBsTIpCG1ULeQ2oiKLc-1727208666-1.0.1.1-eM1elzOCEnpbPLkOO61HSBvaeQPYHEyO4Ba3P2NsxkYV23Fybb7E8tIipei4YDbhyDiLXybnT7H0ETvjbsV89g; - 
_cfuvid=lKBj.JPFMKz3LiJyz12GeZI73UndAQfhN.5aqiwYPHA-1727208666121-0.0.1.1-604800000 + - __cf_bm=9IkhXEbzhF0QSjoZOW.EVqEQoEHTaAK7pnQ6K1m4EfY-1732569525-1.0.1.1-8FRhFy6jBsuonirbdG9jJ_IHnhXUakqpLsEg10YYrkhce9PwlXOKXNA2hiZwNqpM3D2TP2X4eFcZJjdEZt6.qQ; + _cfuvid=YGxjg63ZVaAJESO.Ouzjnkmhsg2izo9JySj6zJ3MRuc-1732569525322-0.0.1.1-604800000 host: - api.openai.com user-agent: - - OpenAI/Python 1.47.1 + - OpenAI/Python 1.52.0 x-stainless-arch: - arm64 x-stainless-async: @@ -204,7 +208,9 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.47.1 + - 1.52.0 + x-stainless-retry-count: + - '0' x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -214,19 +220,20 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//dFFBbtswELz7FQuerUByDcnRLS3aS4umSBugSBRINLWSNpFIglwHLgz/ - vaCsSOmhFx5mdoazs6cVgKBa5CBUJ1kNto9uPqY3n+/U3e39/WH49vzQyObn7fevx98v2adErIPC - 7J9R8ZvqSpnB9shk9IVWDiVjcE2yTbaJd2majsRgauyDrLUcbU00kKZoE2+2UZxFyW5Sd4YUepHD - 4woA4DS+Iaeu8ShyiNdvyIDeyxZFPg8BCGf6gAjpPXmWmsV6IZXRjHqMXlXV46kQniXjgJoLkUMh - fnUISlpi2YNp4IuTWiGQhx/Skb8qxBoK4VB6oxfB7BEGJdTkUDE4tMgUaglO3CGQbowb5AhZZ16p - xhpIj9yY7MjTD6/oalJjpuT8VFXV+yUcNgcvQ5H60PcTfp5b6U1rndn7iZ/xhjT5rryEDw14NlaM - 7HkF8DS2f/inUGGdGSyXbF5QB8Ms2138xHL0hf1wPZFsWPYLvkuy/6nKGllS79/dcKqXdLs4xHPM - cU/h/3jGoWxIt+iso8tJG1sm+/02TdKsuRar8+ovAAAA//8DADp8axngAgAA + H4sIAAAAAAAAA4xTwWrbQBC96yuGPVtBVuzK8a0YcimhLQRSiIO03h1Zk652l91xSDD+97KyYzk0 + hV50eG/e05s30j4DEKTFEoTqJKvem/zrL2m6h3v78Ganqzv5c7XaVIv+rrTfv3UzMUkKt3lGxe+q + K+V6b5DJ2SOtAkrG5Dqtrsv5l5t5OR+I3mk0Sbb1nM9c3pOlvCzKWV5U+XRxUneOFEaxhMcMAGA/ + PFNOq/FVLKGYvCM9xii3KJbnIQARnEmIkDFSZGlZTEZSOctoh+hN0zzu1yKyZOzR8losYS3uOwQl + PbE04Fq4DdIqBIrwQwaKV2sxgbUIKKOzo+DskQYlaAqoGAJ6ZEq1JCfuEMi2LvRygHxwL6RRA9mB + G5K98ukNLxg0qSHT9PDUNM3lEgHbXZSpSLsz5oQfzq0Yt/XBbeKJP+MtWYpdfQyfGojsvBjYQwbw + NLS/+1Co8MH1nmt2v9Emw6paHP3EePSRvb45kexYmhFfTKvJJ361RpZk4sX9hJKqQz1Kx2PLnSZ3 + QWQXW/+d5jPv4+Zkt/9jPxJKoWfUtQ+YbvJh43EsYPon/jV2bnkILOJbZOzrluwWgw90/CJbXxdV + Md+0i0oVIjtkfwAAAP//AwD0sdbanwMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c856bf38f3242d1-EWR + - 8e84ac4daf039c52-IAD Connection: - keep-alive Content-Encoding: @@ -234,7 +241,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 24 Sep 2024 20:11:06 GMT + - Mon, 25 Nov 2024 21:18:46 GMT Server: - cloudflare Transfer-Encoding: @@ -243,10 +250,12 @@ interactions: - nosniff access-control-expose-headers: - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 openai-organization: - datadog-staging openai-processing-ms: - - '523' + - '1256' openai-version: - '2020-10-01' strict-transport-security: @@ -264,7 +273,7 @@ interactions: x-ratelimit-reset-tokens: - 0s x-request-id: - - req_07733e2c20ff88f138f2ab4cd6a71cc6 + - req_a58af2c6e743ac15ac528fb6233d9436 status: code: 200 message: OK diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml 
b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_evaluators.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml diff --git a/tests/llmobs/test_llmobs.py b/tests/llmobs/test_llmobs.py index 1bae7efe9ed..6cf19fc3e2c 100644 --- a/tests/llmobs/test_llmobs.py +++ b/tests/llmobs/test_llmobs.py @@ -1,4 +1,3 @@ -import mock import pytest from ddtrace.ext import SpanTypes @@ -8,12 +7,6 @@ from tests.llmobs._utils import _expected_llmobs_llm_span_event -@pytest.fixture -def mock_logs(): - with mock.patch("ddtrace.llmobs._trace_processor.log") as mock_logs: - yield mock_logs - - class TestMLApp: @pytest.mark.parametrize("llmobs_env", [{"DD_LLMOBS_ML_APP": ""}]) def test_tag_defaults_to_env_var(self, tracer, llmobs_env, llmobs_events): @@ -228,19 +221,19 @@ def test_model_and_provider_are_set(tracer, llmobs_events): assert span_event["meta"]["model_provider"] == "model_provider" -def test_malformed_span_logs_error_instead_of_raising(mock_logs, tracer, llmobs_events): +def test_malformed_span_logs_error_instead_of_raising(tracer, llmobs_events, mock_llmobs_logs): """Test that a trying to create a span event from a malformed span will log an error instead of crashing.""" with tracer.trace("root_llm_span", span_type=SpanTypes.LLM) as llm_span: # span does not have SPAN_KIND tag pass - mock_logs.error.assert_called_once_with( - "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span + mock_llmobs_logs.error.assert_called_with( + "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span, exc_info=True ) assert len(llmobs_events) == 0 -def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events): - """Test that the LLMObsTraceProcessor only creates LLMObs span events for LLM span types.""" +def test_only_generate_span_events_from_llmobs_spans(tracer, llmobs_events): + """Test that we only generate LLMObs span events for LLM span types.""" with tracer.trace("root_llm_span", service="tests.llmobs", span_type=SpanTypes.LLM) as root_span: root_span._set_ctx_item(const.SPAN_KIND, "llm") with tracer.trace("child_span"): @@ -250,5 +243,5 @@ def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events): expected_grandchild_llmobs_span["parent_id"] = str(root_span.span_id) assert len(llmobs_events) == 2 - assert llmobs_events[0] == _expected_llmobs_llm_span_event(root_span, "llm") - assert llmobs_events[1] == expected_grandchild_llmobs_span + assert llmobs_events[1] == 
_expected_llmobs_llm_span_event(root_span, "llm") + assert llmobs_events[0] == expected_grandchild_llmobs_span diff --git a/tests/llmobs/test_llmobs_decorators.py b/tests/llmobs/test_llmobs_decorators.py index e94d72aec64..056de72ee96 100644 --- a/tests/llmobs/test_llmobs_decorators.py +++ b/tests/llmobs/test_llmobs_decorators.py @@ -19,7 +19,7 @@ def mock_logs(): yield mock_logs -def test_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): +def test_llm_decorator_with_llmobs_disabled_logs_warning(llmobs, mock_logs): for decorator_name, decorator in (("llm", llm), ("embedding", embedding)): @decorator( @@ -28,13 +28,13 @@ def test_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): def f(): pass - LLMObs.disable() + llmobs.disable() f() mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) mock_logs.reset_mock() -def test_non_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): +def test_non_llm_decorator_with_llmobs_disabled_logs_warning(llmobs, mock_logs): for decorator_name, decorator in ( ("task", task), ("workflow", workflow), @@ -47,53 +47,49 @@ def test_non_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): def f(): pass - LLMObs.disable() + llmobs.disable() f() mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) mock_logs.reset_mock() -def test_llm_decorator(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="test_model", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider", session_id="test_session_id" ) -def test_llm_decorator_no_model_name_sets_default(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_no_model_name_sets_default(llmobs, llmobs_events): @llm(model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="custom", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider", session_id="test_session_id" ) -def test_llm_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_default_kwargs(llmobs, llmobs_events): @llm def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="custom") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="custom" ) -def test_embedding_decorator(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator(llmobs, llmobs_events): @embedding( model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id" ) @@ -101,173 +97,157 @@ def f(): pass f() - span = 
LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="test_model", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider", session_id="test_session_id" ) -def test_embedding_decorator_no_model_name_sets_default(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator_no_model_name_sets_default(llmobs, llmobs_events): @embedding(model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="custom", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider", session_id="test_session_id" ) -def test_embedding_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator_default_kwargs(llmobs, llmobs_events): @embedding def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="custom") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="custom" ) -def test_retrieval_decorator(LLMObs, mock_llmobs_span_writer): +def test_retrieval_decorator(llmobs, llmobs_events): @retrieval(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "retrieval", session_id="test_session_id") -def test_retrieval_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_retrieval_decorator_default_kwargs(llmobs, llmobs_events): @retrieval() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "retrieval")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "retrieval") -def test_task_decorator(LLMObs, mock_llmobs_span_writer): +def test_task_decorator(llmobs, llmobs_events): @task(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id="test_session_id") -def test_task_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_task_decorator_default_kwargs(llmobs, llmobs_events): @task() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + span 
= llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_tool_decorator(LLMObs, mock_llmobs_span_writer): +def test_tool_decorator(llmobs, llmobs_events): @tool(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool", session_id="test_session_id") -def test_tool_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_tool_decorator_default_kwargs(llmobs, llmobs_events): @tool() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_workflow_decorator(LLMObs, mock_llmobs_span_writer): +def test_workflow_decorator(llmobs, llmobs_events): @workflow(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow", session_id="test_session_id") -def test_workflow_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_workflow_decorator_default_kwargs(llmobs, llmobs_events): @workflow() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_agent_decorator(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator(llmobs, llmobs_events): @agent(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent", session_id="test_session_id") -def test_agent_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator_default_kwargs(llmobs, llmobs_events): @agent() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_llm_decorator_with_error(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_with_error(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): raise ValueError("test_error") with pytest.raises(ValueError): f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - session_id="test_session_id", - 
error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + session_id="test_session_id", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_non_llm_decorators_with_error(LLMObs, mock_llmobs_span_writer): +def test_non_llm_decorators_with_error(llmobs, llmobs_events): for decorator_name, decorator in [("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)]: @decorator(name="test_function", session_id="test_session_id") @@ -276,23 +256,21 @@ def f(): with pytest.raises(ValueError): f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - session_id="test_session_id", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + session_id="test_session_id", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_llm_annotate(LLMObs, mock_llmobs_span_writer): +def test_llm_annotate(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): - LLMObs.annotate( + llmobs.annotate( parameters={"temperature": 0.9, "max_tokens": 50}, input_data=[{"content": "test_prompt"}], output_data=[{"content": "test_response"}], @@ -301,27 +279,25 @@ def f(): ) f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - input_messages=[{"content": "test_prompt"}], - output_messages=[{"content": "test_response"}], - parameters={"temperature": 0.9, "max_tokens": 50}, - token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, - tags={"custom_tag": "tag_value"}, - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + input_messages=[{"content": "test_prompt"}], + output_messages=[{"content": "test_response"}], + parameters={"temperature": 0.9, "max_tokens": 50}, + token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, + tags={"custom_tag": "tag_value"}, + session_id="test_session_id", ) -def test_llm_annotate_raw_string_io(LLMObs, mock_llmobs_span_writer): +def test_llm_annotate_raw_string_io(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): - LLMObs.annotate( + llmobs.annotate( parameters={"temperature": 0.9, "max_tokens": 50}, input_data="test_prompt", output_data="test_response", @@ -330,24 +306,22 @@ def f(): ) f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - 
input_messages=[{"content": "test_prompt"}], - output_messages=[{"content": "test_response"}], - parameters={"temperature": 0.9, "max_tokens": 50}, - token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, - tags={"custom_tag": "tag_value"}, - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + input_messages=[{"content": "test_prompt"}], + output_messages=[{"content": "test_response"}], + parameters={"temperature": 0.9, "max_tokens": 50}, + token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, + tags={"custom_tag": "tag_value"}, + session_id="test_session_id", ) -def test_non_llm_decorators_no_args(LLMObs, mock_llmobs_span_writer): +def test_non_llm_decorators_no_args(llmobs, llmobs_events): """Test that using the decorators without any arguments, i.e. @tool, works the same as @tool(...).""" for decorator_name, decorator in [ ("task", task), @@ -362,11 +336,11 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, decorator_name)) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event(span, decorator_name) -def test_agent_decorator_no_args(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator_no_args(llmobs, llmobs_events): """Test that using agent decorator without any arguments, i.e. @agent, works the same as @agent(...).""" @agent @@ -374,11 +348,11 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): +def test_ml_app_override(llmobs, llmobs_events): """Test that setting ml_app kwarg on the LLMObs decorators will override the DD_LLMOBS_ML_APP value.""" for decorator_name, decorator in [("task", task), ("workflow", workflow), ("tool", tool)]: @@ -387,9 +361,9 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, decorator_name, tags={"ml_app": "test_ml_app"}) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, decorator_name, tags={"ml_app": "test_ml_app"} ) @llm(model_name="test_model", ml_app="test_ml_app") @@ -397,11 +371,9 @@ def g(): pass g() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} ) @embedding(model_name="test_model", ml_app="test_ml_app") @@ -409,15 +381,13 @@ def h(): pass h() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] 
== _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} ) -async def test_non_llm_async_decorators(LLMObs, mock_llmobs_span_writer): +async def test_non_llm_async_decorators(llmobs, llmobs_events): """Test that decorators work with async functions.""" for decorator_name, decorator in [ ("task", task), @@ -432,11 +402,11 @@ async def f(): pass await f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, decorator_name)) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event(span, decorator_name) -async def test_llm_async_decorators(LLMObs, mock_llmobs_span_writer): +async def test_llm_async_decorators(llmobs, llmobs_events): """Test that decorators work with async functions.""" for decorator_name, decorator in [("llm", llm), ("embedding", embedding)]: @@ -445,15 +415,13 @@ async def f(): pass await f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, decorator_name, model_name="test_model", model_provider="test_provider" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, decorator_name, model_name="test_model", model_provider="test_provider" ) -def test_automatic_annotation_non_llm_decorators(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_non_llm_decorators(llmobs, llmobs_events): """Test that automatic input/output annotation works for non-LLM decorators.""" for decorator_name, decorator in (("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)): @@ -462,19 +430,17 @@ def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - input_value=str({"prompt": "test_prompt", "arg_2": "arg_2", "kwarg_2": 12345}), - output_value="test_prompt", - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + input_value=str({"prompt": "test_prompt", "arg_2": "arg_2", "kwarg_2": 12345}), + output_value="test_prompt", + session_id="test_session_id", ) -def test_automatic_annotation_retrieval_decorator(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_retrieval_decorator(llmobs, llmobs_events): """Test that automatic input annotation works for retrieval decorators.""" @retrieval(session_id="test_session_id") @@ -482,18 +448,16 @@ def test_retrieval(query, arg_2, kwarg_1=None, kwarg_2=None): return [{"name": "name", "id": "1234567890", "score": 0.9}] test_retrieval("test_query", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "retrieval", - input_value=str({"query": "test_query", "arg_2": "arg_2", "kwarg_2": 12345}), - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "retrieval", + input_value=str({"query": "test_query", "arg_2": "arg_2", "kwarg_2": 12345}), + session_id="test_session_id", ) -def test_automatic_annotation_off_non_llm_decorators(LLMObs, 
mock_llmobs_span_writer): +def test_automatic_annotation_off_non_llm_decorators(llmobs, llmobs_events): """Test disabling automatic input/output annotation for non-LLM decorators.""" for decorator_name, decorator in ( ("task", task), @@ -508,35 +472,33 @@ def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, decorator_name, session_id="test_session_id") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, decorator_name, session_id="test_session_id" ) -def test_automatic_annotation_off_if_manually_annotated(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_off_if_manually_annotated(llmobs, llmobs_events): """Test disabling automatic input/output annotation for non-LLM decorators.""" for decorator_name, decorator in (("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)): @decorator(name="test_function", session_id="test_session_id") def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): - LLMObs.annotate(input_data="my custom input", output_data="my custom output") + llmobs.annotate(input_data="my custom input", output_data="my custom output") return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - session_id="test_session_id", - input_value="my custom input", - output_value="my custom output", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + session_id="test_session_id", + input_value="my custom input", + output_value="my custom output", ) -def test_generator_sync(LLMObs, mock_llmobs_span_writer): +def test_generator_sync(llmobs, llmobs_events): """ Test that decorators work with generator functions. The span should finish after the generator is exhausted. @@ -556,7 +518,7 @@ def f(): for i in range(3): yield i - LLMObs.annotate( + llmobs.annotate( input_data="hello", output_data="world", ) @@ -566,7 +528,7 @@ def f(): assert e == i i += 1 - span = LLMObs._instance.tracer.pop()[0] + span = llmobs._instance.tracer.pop()[0] if decorator_name == "llm": expected_span_event = _expected_llmobs_llm_span_event( span, @@ -594,10 +556,10 @@ def f(): span, decorator_name, input_value="hello", output_value="world" ) - mock_llmobs_span_writer.enqueue.assert_called_with(expected_span_event) + assert llmobs_events[-1] == expected_span_event -async def test_generator_async(LLMObs, mock_llmobs_span_writer): +async def test_generator_async(llmobs, llmobs_events): """ Test that decorators work with generator functions. The span should finish after the generator is exhausted. 
@@ -617,7 +579,7 @@ async def f(): for i in range(3): yield i - LLMObs.annotate( + llmobs.annotate( input_data="hello", output_data="world", ) @@ -627,7 +589,7 @@ async def f(): assert e == i i += 1 - span = LLMObs._instance.tracer.pop()[0] + span = llmobs._instance.tracer.pop()[0] if decorator_name == "llm": expected_span_event = _expected_llmobs_llm_span_event( span, @@ -655,11 +617,11 @@ async def f(): span, decorator_name, input_value="hello", output_value="world" ) - mock_llmobs_span_writer.enqueue.assert_called_with(expected_span_event) + assert llmobs_events[-1] == expected_span_event -def test_generator_sync_with_llmobs_disabled(LLMObs, mock_logs): - LLMObs.disable() +def test_generator_sync_with_llmobs_disabled(llmobs, mock_logs): + llmobs.disable() @workflow() def f(): @@ -684,10 +646,11 @@ def g(): i += 1 mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) + llmobs.enable() -async def test_generator_async_with_llmobs_disabled(LLMObs, mock_logs): - LLMObs.disable() +async def test_generator_async_with_llmobs_disabled(llmobs, mock_logs): + llmobs.disable() @workflow() async def f(): @@ -712,9 +675,10 @@ async def g(): i += 1 mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) + llmobs.enable() -def test_generator_sync_finishes_span_on_error(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_finishes_span_on_error(llmobs, llmobs_events): """Tests that""" @workflow() @@ -728,19 +692,17 @@ def f(): for _ in f(): pass - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -async def test_generator_async_finishes_span_on_error(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_finishes_span_on_error(llmobs, llmobs_events): @workflow() async def f(): for i in range(3): @@ -752,19 +714,17 @@ async def f(): async for _ in f(): pass - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_generator_sync_send(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_send(llmobs, llmobs_events): @workflow() def f(): while True: @@ -780,16 +740,11 @@ def f(): assert gen.send(4) == 16 gen.close() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - ) - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -async def test_generator_async_send(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_send(llmobs, llmobs_events): @workflow() async def f(): while True: @@ -805,16 
+760,11 @@ async def f(): await gen.aclose() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - ) - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_generator_sync_throw(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_throw(llmobs, llmobs_events): @workflow() def f(): for i in range(3): @@ -825,19 +775,17 @@ def f(): next(gen) gen.throw(ValueError("test_error")) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -async def test_generator_async_throw(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_throw(llmobs, llmobs_events): @workflow() async def f(): for i in range(3): @@ -848,19 +796,17 @@ async def f(): await gen.asend(None) await gen.athrow(ValueError("test_error")) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_generator_exit_exception_sync(LLMObs, mock_llmobs_span_writer): +def test_generator_exit_exception_sync(llmobs, llmobs_events): @workflow() def get_next_element(alist): for element in alist: @@ -873,14 +819,12 @@ def get_next_element(alist): if element == 5: break - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - input_value=str({"alist": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}), - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + input_value=str({"alist": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}), + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) diff --git a/tests/llmobs/test_llmobs_eval_metric_writer.py b/tests/llmobs/test_llmobs_eval_metric_writer.py index 2b8341e1616..eb168ef5a00 100644 --- a/tests/llmobs/test_llmobs_eval_metric_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_writer.py @@ -7,15 +7,19 @@ from ddtrace.llmobs._writer import LLMObsEvalMetricWriter -INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric" +INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric" DD_SITE = "datad0g.com" dd_api_key = os.getenv("DD_API_KEY", default="") def _categorical_metric_event(): return { - "span_id": "12345678901", - "trace_id": "98765432101", + 
"join_on": { + "span": { + "span_id": "12345678901", + "trace_id": "98765432101", + }, + }, "metric_type": "categorical", "categorical_value": "very", "label": "toxicity", @@ -26,8 +30,12 @@ def _categorical_metric_event(): def _score_metric_event(): return { - "span_id": "12345678902", - "trace_id": "98765432102", + "join_on": { + "span": { + "span_id": "12345678902", + "trace_id": "98765432102", + }, + }, "metric_type": "score", "label": "sentiment", "score_value": 0.9, @@ -69,6 +77,18 @@ def test_send_metric_bad_api_key(mock_writer_logs): ) +@pytest.mark.vcr_logs +def test_send_metric_no_api_key(mock_writer_logs): + llmobs_eval_metric_writer = LLMObsEvalMetricWriter(site="datad0g.com", api_key="", interval=1000, timeout=1) + llmobs_eval_metric_writer.start() + llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.periodic() + mock_writer_logs.warning.assert_called_with( + "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. ", + "Ensure this configuration is set before running your application.", + ) + + @pytest.mark.vcr_logs def test_send_categorical_metric(mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(site="datad0g.com", api_key=dd_api_key, interval=1000, timeout=1) @@ -125,6 +145,18 @@ def test_send_multiple_events(mock_writer_logs): def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): + env = os.environ.copy() + pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] + if "PYTHONPATH" in env: + pypath.append(env["PYTHONPATH"]) + env.update( + { + "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), + "DD_SITE": "datad0g.com", + "PYTHONPATH": ":".join(pypath), + "DD_LLMOBS_ML_APP": "unnamed-ml-app", + } + ) out, err, status, pid = run_python_code_in_subprocess( """ import atexit @@ -144,6 +176,7 @@ def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): llmobs_eval_metric_writer.start() llmobs_eval_metric_writer.enqueue(_score_metric_event()) """, + env=env, ) assert status == 0, err assert out == b"" diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 7ee7d510276..40c9fb5bd2b 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -22,7 +22,7 @@ def test_evaluator_runner_start(mock_evaluator_logs): evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=mock.MagicMock())) evaluator_runner.start() - mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r to %r", "EvaluatorRunner")]) + mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r", "EvaluatorRunner")]) def test_evaluator_runner_buffer_limit(mock_evaluator_logs): @@ -34,9 +34,9 @@ def test_evaluator_runner_buffer_limit(mock_evaluator_logs): ) -def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): - evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) +def test_evaluator_runner_periodic_enqueues_eval_metric(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=llmobs)) evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) 
evaluator_runner.periodic() mock_llmobs_eval_metric_writer.enqueue.assert_called_once_with( @@ -45,9 +45,9 @@ def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval @pytest.mark.vcr_logs -def test_evaluator_runner_timed_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): - evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) +def test_evaluator_runner_timed_enqueues_eval_metric(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=llmobs)) evaluator_runner.start() evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) @@ -64,15 +64,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] if "PYTHONPATH" in env: pypath.append(env["PYTHONPATH"]) - env.update( - { - "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), - "DD_SITE": "datad0g.com", - "PYTHONPATH": ":".join(pypath), - "DD_LLMOBS_ML_APP": "unnamed-ml-app", - "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", - } - ) + env.update({"PYTHONPATH": ":".join(pypath), "_DD_LLMOBS_EVALUATOR_INTERVAL": "5"}) out, err, status, pid = run_python_code_in_subprocess( """ import os @@ -87,7 +79,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces ctx = logs_vcr.use_cassette("tests.llmobs.test_llmobs_evaluator_runner.send_score_metric.yaml") ctx.__enter__() atexit.register(lambda: ctx.__exit__()) -LLMObs.enable() +LLMObs.enable(api_key="dummy-api-key", site="datad0g.com", ml_app="unnamed-ml-app") LLMObs._instance._evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) LLMObs._instance._evaluator_runner.start() LLMObs._instance._evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, None) @@ -99,6 +91,12 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces assert err == b"" +def test_evaluator_runner_unsupported_evaluator(): + with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}): + with pytest.raises(ValueError): + EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) + + def test_evaluator_runner_sampler_single_rule(monkeypatch): monkeypatch.setenv( EvaluatorRunnerSampler.SAMPLING_RULES_ENV_VAR, diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_evaluators.py similarity index 76% rename from tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py rename to tests/llmobs/test_llmobs_ragas_evaluators.py index 1f78b538f24..251b2642040 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_evaluators.py @@ -6,36 +6,39 @@ from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator from ddtrace.span import Span from tests.llmobs._utils import _expected_llmobs_llm_span_event -from tests.llmobs._utils import _expected_ragas_spans +from tests.llmobs._utils import _expected_ragas_faithfulness_spans from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_messages from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt +pytest.importorskip("ragas", reason="Tests require ragas to be available on user env") + + def _llm_span_without_io(): return _expected_llmobs_llm_span_event(Span("dummy")) -def 
test_ragas_evaluator_init(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - assert rf_evaluator.llmobs_service == LLMObs +def test_ragas_evaluator_init(ragas, llmobs): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) + assert rf_evaluator.llmobs_service == llmobs assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() -def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_ragas_dependencies_not_present, ragas): +def test_ragas_faithfulness_throws_if_dependencies_not_present(llmobs, mock_ragas_dependencies_not_present, ragas): with pytest.raises(NotImplementedError, match="Failed to load dependencies for `ragas_faithfulness` evaluator"): - RagasFaithfulnessEvaluator(LLMObs) + RagasFaithfulnessEvaluator(llmobs) -def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) +def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, llmobs): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) assert failure_msg == "fail_extract_faithfulness_inputs" assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 def test_ragas_faithfulness_has_modified_faithfulness_instance( - ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs + ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, llmobs ): """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" from ragas.llms import BaseRagasLLM @@ -53,7 +56,7 @@ def agenerate_text(self) -> str: faithfulness.llm = FirstDummyLLM() - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" @@ -74,9 +77,9 @@ def agenerate_text(self, statements) -> str: @pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit_evaluation): +def test_ragas_faithfulness_submits_evaluation(ragas, llmobs, mock_llmobs_submit_evaluation): """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() rf_evaluator.run_and_submit_evaluation(llm_span) rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( @@ -101,10 +104,10 @@ def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit @pytest.mark.vcr_logs def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( - ragas, LLMObs, mock_llmobs_submit_evaluation + ragas, llmobs, mock_llmobs_submit_evaluation ): """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _llm_span_with_expected_ragas_inputs_in_messages() rf_evaluator.run_and_submit_evaluation(llm_span) rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( @@ -128,9 +131,9 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages @pytest.mark.vcr_logs -def 
test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, LLMObs, mock_llmobs_submit_evaluation): +def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, llmobs, mock_llmobs_submit_evaluation): """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _expected_llmobs_llm_span_event( Span("dummy"), prompt={ @@ -167,19 +170,17 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, L @pytest.mark.vcr_logs -def test_ragas_faithfulness_emits_traces(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) +def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - assert rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_count == 7 - calls = rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_args_list - - spans = [call[0][0] for call in calls] - + ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] + ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) + assert len(ragas_spans) == 7 # check name, io, span kinds match - assert spans == _expected_ragas_spans() + assert ragas_spans == _expected_ragas_faithfulness_spans() # verify the trace structure - root_span = spans[0] + root_span = ragas_spans[0] root_span_id = root_span["span_id"] assert root_span["parent_id"] == "undefined" assert root_span["meta"] is not None @@ -187,16 +188,15 @@ def test_ragas_faithfulness_emits_traces(ragas, LLMObs): assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) assert isinstance(root_span["meta"]["metadata"]["statements"], list) root_span_trace_id = root_span["trace_id"] - for child_span in spans[1:]: + for child_span in ragas_spans[1:]: assert child_span["trace_id"] == root_span_trace_id - assert spans[1]["parent_id"] == root_span_id # input extraction (task) - assert spans[2]["parent_id"] == root_span_id # create statements (workflow) - assert spans[4]["parent_id"] == root_span_id # create verdicts (workflow) - assert spans[6]["parent_id"] == root_span_id # create score (task) - - assert spans[3]["parent_id"] == spans[2]["span_id"] # create statements prompt (task) - assert spans[5]["parent_id"] == spans[4]["span_id"] # create verdicts prompt (task) + assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) + assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) + assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) + assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) + assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) + assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess): @@ -206,14 +206,11 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log pypath.append(env["PYTHONPATH"]) env.update( { - "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), - "DD_SITE": "datad0g.com", "PYTHONPATH": ":".join(pypath), "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"), - "DD_LLMOBS_ML_APP": 
"unnamed-ml-app", "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", - "DD_LLMOBS_AGENTLESS_ENABLED": "true", + "DD_TRACE_ENABLED": "0", } ) out, err, status, pid = run_python_code_in_subprocess( @@ -228,20 +225,14 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log from tests.llmobs._utils import logs_vcr ctx = logs_vcr.use_cassette( - "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml" + "tests.llmobs.test_llmobs_ragas_evaluators.emits_traces_and_evaluations_on_exit.yaml" ) ctx.__enter__() atexit.register(lambda: ctx.__exit__()) -with mock.patch( - "ddtrace.internal.writer.HTTPWriter._send_payload", - return_value=Response( - status=200, - body="{}", - ), -): - LLMObs.enable() +with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload", return_value=Response(status=200, body="{}")): + LLMObs.enable(api_key="dummy-api-key", site="datad0g.com", ml_app="unnamed-ml-app", agentless_enabled=True) LLMObs._instance._evaluator_runner.enqueue(_llm_span_with_expected_ragas_inputs_in_messages(), None) -""", + """, env=env, ) assert status == 0, err diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 98748250c3a..dad6accdcfb 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -1,4 +1,5 @@ import os +import re import threading import time @@ -7,9 +8,7 @@ import ddtrace from ddtrace._trace.context import Context -from ddtrace._trace.span import Span from ddtrace.ext import SpanTypes -from ddtrace.filters import TraceFilter from ddtrace.internal.service import ServiceStatus from ddtrace.llmobs import LLMObs as llmobs_service from ddtrace.llmobs._constants import INPUT_DOCUMENTS @@ -31,7 +30,8 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._llmobs import SUPPORTED_LLMOBS_INTEGRATIONS -from ddtrace.llmobs._llmobs import LLMObsTraceProcessor +from ddtrace.llmobs._writer import LLMObsAgentlessEventClient +from ddtrace.llmobs._writer import LLMObsProxiedEventClient from ddtrace.llmobs.utils import Prompt from tests.llmobs._utils import _expected_llmobs_eval_metric_event from tests.llmobs._utils import _expected_llmobs_llm_span_event @@ -41,23 +41,16 @@ from tests.utils import override_global_config -@pytest.fixture -def mock_logs(): - with mock.patch("ddtrace.llmobs._llmobs.log") as mock_logs: - yield mock_logs +RAGAS_AVAILABLE = os.getenv("RAGAS_AVAILABLE", False) def run_llmobs_trace_filter(dummy_tracer): - for trace_filter in dummy_tracer._filters: - if isinstance(trace_filter, LLMObsTraceProcessor): - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span.set_tag_str(SPAN_KIND, "llm") - trace1 = [root_llm_span] - return trace_filter.process_trace(trace1) - raise ValueError("LLMObsTraceProcessor not found in tracer filters.") + with dummy_tracer.trace("span1", span_type=SpanTypes.LLM) as span: + span.set_tag_str(SPAN_KIND, "llm") + return dummy_tracer._writer.pop() -def test_service_enable(): +def test_service_enable_proxy_default(): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -65,22 +58,22 @@ def test_service_enable(): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for 
tracer_filter in dummy_tracer._filters) + assert isinstance(llmobs_instance._llmobs_span_writer._clients[0], LLMObsProxiedEventClient) assert run_llmobs_trace_filter(dummy_tracer) is not None llmobs_service.disable() -def test_service_enable_with_apm_disabled(monkeypatch): - with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): +def test_enable_agentless(): + with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer, agentless_enabled=True) llmobs_instance = llmobs_service._instance assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) - assert run_llmobs_trace_filter(dummy_tracer) is None + assert isinstance(llmobs_instance._llmobs_span_writer._clients[0], LLMObsAgentlessEventClient) + assert run_llmobs_trace_filter(dummy_tracer) is not None llmobs_service.disable() @@ -118,7 +111,7 @@ def test_service_enable_no_ml_app_specified(): assert llmobs_service._instance._evaluator_runner.status.value == "stopped" -def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs): +def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() monkeypatch.setenv("DD_LLMOBS_APP_NAME", "test_ml_app") @@ -126,11 +119,13 @@ def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs): assert llmobs_service.enabled is True assert llmobs_service._instance._llmobs_eval_metric_writer.status.value == "running" assert llmobs_service._instance._llmobs_span_writer.status.value == "running" - mock_logs.warning.assert_called_once_with("`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead.") + mock_llmobs_logs.warning.assert_called_once_with( + "`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead." 
+ ) llmobs_service.disable() -def test_service_enable_already_enabled(mock_logs): +def test_service_enable_already_enabled(mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -139,9 +134,8 @@ def test_service_enable_already_enabled(mock_logs): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) llmobs_service.disable() - mock_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) + mock_llmobs_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) @mock.patch("ddtrace.llmobs._llmobs.patch") @@ -203,107 +197,83 @@ def test_service_enable_does_not_override_global_patch_config(mock_tracer_patch, llmobs_service.disable() -def test_start_span_while_disabled_logs_warning(LLMObs, mock_logs): - LLMObs.disable() - _ = LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.tool(name="test_tool") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.task(name="test_task") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.workflow(name="test_workflow") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.agent(name="test_agent") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - - -def test_start_span_uses_kind_as_default_name(LLMObs): - with LLMObs.llm(model_name="test_model", model_provider="test_provider") as span: +def test_start_span_while_disabled_logs_warning(llmobs, mock_llmobs_logs): + llmobs.disable() + _ = llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.tool(name="test_tool") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.task(name="test_task") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.workflow(name="test_workflow") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.agent(name="test_agent") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + + +def test_start_span_uses_kind_as_default_name(llmobs): + with llmobs.llm(model_name="test_model", model_provider="test_provider") as span: assert span.name == "llm" - with LLMObs.tool() as span: + with llmobs.tool() as span: assert span.name == "tool" - with LLMObs.task() as span: + with llmobs.task() as span: assert span.name == "task" - with LLMObs.workflow() as span: + with llmobs.workflow() as span: assert span.name == "workflow" - with LLMObs.agent() as span: + with llmobs.agent() as span: assert span.name == "agent" -def test_start_span_with_session_id(LLMObs): - with LLMObs.llm(model_name="test_model", session_id="test_session_id") as span: +def test_start_span_with_session_id(llmobs): + with 
llmobs.llm(model_name="test_model", session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.tool(session_id="test_session_id") as span: + with llmobs.tool(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.task(session_id="test_session_id") as span: + with llmobs.task(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.workflow(session_id="test_session_id") as span: + with llmobs.workflow(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.agent(session_id="test_session_id") as span: + with llmobs.agent(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" -def test_session_id_becomes_top_level_field(LLMObs, mock_llmobs_span_writer): - session_id = "test_session_id" - with LLMObs.task(session_id=session_id) as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) - - -def test_session_id_becomes_top_level_field_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): +def test_session_id_becomes_top_level_field(llmobs, llmobs_events): session_id = "test_session_id" - with AgentlessLLMObs.task(session_id=session_id) as span: + with llmobs.task(session_id=session_id) as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) - + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) -def test_llm_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - assert span.name == "test_llm_call" - assert span.resource == "llm" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "llm" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") - ) - -def test_llm_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span(llmobs, llmobs_events): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "llm" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider" ) -def test_llm_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.llm(name="test_llm_call", 
model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider" ) -def test_default_model_provider_set_to_custom(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call") as span: +def test_default_model_provider_set_to_custom(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" @@ -312,88 +282,57 @@ def test_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_tool_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.tool(name="test_tool") as span: - assert span.name == "test_tool" - assert span.resource == "tool" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_tool_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.tool(name="test_tool") as span: +def test_tool_span(llmobs, llmobs_events): + with llmobs.tool(name="test_tool") as span: assert span.name == "test_tool" assert span.resource == "tool" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_task_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task") as span: - assert span.name == "test_task" - assert span.resource == "task" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_task_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task") as span: +def test_task_span(llmobs, llmobs_events): + with llmobs.task(name="test_task") as span: assert span.name == "test_task" assert span.resource == "task" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) - - -def test_workflow_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.workflow(name="test_workflow") as span: - assert span.name == "test_workflow" - assert span.resource == "workflow" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_workflow_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.workflow(name="test_workflow") as span: +def test_workflow_span(llmobs, llmobs_events): + with llmobs.workflow(name="test_workflow") as span: assert span.name == "test_workflow" assert span.resource == "workflow" assert span.span_type == "llm" assert 
span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_agent_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.agent(name="test_agent") as span: +def test_agent_span(llmobs, llmobs_events): + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span.resource == "agent" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_agent_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.agent(name="test_agent") as span: - assert span.name == "test_agent" - assert span.resource == "agent" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) - - -def test_embedding_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(name="test_embedding", model_provider="test_provider") as span: +def test_embedding_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.embedding(name="test_embedding", model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider" ) -def test_embedding_default_model_provider_set_to_custom(LLMObs): - with LLMObs.embedding(model_name="test_model", name="test_embedding") as span: +def test_embedding_default_model_provider_set_to_custom(llmobs): + with llmobs.embedding(model_name="test_model", name="test_embedding") as span: assert span.name == "test_embedding" assert span.resource == "embedding" assert span.span_type == "llm" @@ -402,198 +341,182 @@ def test_embedding_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_embedding_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: - assert span.name == "test_embedding" - assert span.resource == "embedding" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "embedding" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") - ) - - -def test_embedding_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.embedding( - model_name="test_model", name="test_embedding", model_provider="test_provider" - ) as span: +def test_embedding_span(llmobs, llmobs_events): + with llmobs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: assert span.name == "test_embedding" assert span.resource == 
"embedding" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "embedding" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider" ) -def test_annotate_no_active_span_logs_warning(LLMObs, mock_logs): - LLMObs.annotate(parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_annotate_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + llmobs.annotate(parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_annotate_non_llm_span_logs_warning(LLMObs, mock_logs): +def test_annotate_non_llm_span_logs_warning(llmobs, mock_llmobs_logs): dummy_tracer = DummyTracer() with dummy_tracer.trace("root") as non_llmobs_span: - LLMObs.annotate(span=non_llmobs_span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.annotate(span=non_llmobs_span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_annotate_finished_span_does_nothing(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_annotate_finished_span_does_nothing(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: pass - LLMObs.annotate(span=span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Cannot annotate a finished span.") + llmobs.annotate(span=span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Cannot annotate a finished span.") -def test_annotate_parameters(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) +def test_annotate_parameters(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) assert span._get_ctx_item(INPUT_PARAMETERS) == {"temperature": 0.9, "max_tokens": 50} - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "Setting parameters is deprecated, please set parameters and other metadata as tags instead." 
) -def test_annotate_metadata(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) +def test_annotate_metadata(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) assert span._get_ctx_item(METADATA) == {"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3} -def test_annotate_metadata_wrong_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata="wrong_metadata") +def test_annotate_metadata_wrong_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata="wrong_metadata") assert span._get_ctx_item(METADATA) is None - mock_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() -def test_annotate_tag(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) +def test_annotate_tag(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) assert span._get_ctx_item(TAGS) == {"test_tag_name": "test_tag_value", "test_numeric_tag": 10} -def test_annotate_tag_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags=12345) +def test_annotate_tag_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags=12345) assert span._get_ctx_item(TAGS) is None - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_tags must be a dictionary of string key - primitive value pairs." 
) -def test_annotate_input_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, input_data="test_input") +def test_annotate_input_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, input_data="test_input") assert llm_span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input"}] - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data="test_input") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data="test_input") assert task_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data="test_input") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data="test_input") assert tool_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data="test_input") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data="test_input") assert workflow_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data="test_input") + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data="test_input") assert retrieval_span._get_ctx_item(INPUT_VALUE) == "test_input" -def test_annotate_numeric_io(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=0, output_data=0) +def test_annotate_numeric_io(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=0, output_data=0) assert task_span._get_ctx_item(INPUT_VALUE) == "0" assert task_span._get_ctx_item(OUTPUT_VALUE) == "0" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=1.23, output_data=1.23) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=1.23, output_data=1.23) assert task_span._get_ctx_item(INPUT_VALUE) == "1.23" assert task_span._get_ctx_item(OUTPUT_VALUE) == "1.23" -def test_annotate_input_serializable_value(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=["test_input"]) +def test_annotate_input_serializable_value(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=["test_input"]) assert task_span._get_ctx_item(INPUT_VALUE) == str(["test_input"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data={"test_input": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data={"test_input": "hello world"}) assert tool_span._get_ctx_item(INPUT_VALUE) == str({"test_input": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data=("asd", 123)) assert workflow_span._get_ctx_item(INPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert 
agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) assert retrieval_span._get_ctx_item(INPUT_VALUE) == str([0, 1, 2, 3, 4]) -def test_annotate_input_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) +def test_annotate_input_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) assert span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input", "role": "human"}] -def test_annotate_input_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": object()}]) +def test_annotate_input_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": object()}]) assert span._get_ctx_item(INPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) -def test_llmobs_annotate_incorrect_message_content_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) +def test_llmobs_annotate_incorrect_message_content_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_document_str(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data="test_document_text") +def test_annotate_document_str(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data="test_document_text") documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data="test_document_text") + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data="test_document_text") documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_dict(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": "test_document_text"}) +def test_annotate_document_dict(llmobs): + 
with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": "test_document_text"}) documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data={"text": "test_document_text"}) + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data={"text": "test_document_text"}) documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_list(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_document_list(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -605,8 +528,8 @@ def test_annotate_document_list(LLMObs): assert documents[1]["name"] == "name" assert documents[1]["id"] == "id" assert documents[1]["score"] == 0.9 - with LLMObs.retrieval() as span: - LLMObs.annotate( + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -620,129 +543,131 @@ def test_annotate_document_list(LLMObs): assert documents[1]["score"] == 0.9 -def test_annotate_incorrect_document_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": 123}) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_document_no_text_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_incorrect_document_field_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input 
documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_incorrect_document_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": 123}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_document_no_text_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_incorrect_document_field_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate( + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - 
mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) -def test_annotate_output_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data="test_output") +def test_annotate_output_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data="test_output") assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output"}] - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data="test_output") + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data="test_output") assert embedding_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data="test_output") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data="test_output") assert task_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data="test_output") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data="test_output") assert tool_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data="test_output") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data="test_output") assert workflow_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_serializable_value(LLMObs): - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) +def test_annotate_output_serializable_value(llmobs): + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) assert embedding_span._get_ctx_item(OUTPUT_VALUE) == str([[0, 1, 2, 3], [4, 5, 6, 7]]) - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data=["test_output"]) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data=["test_output"]) assert task_span._get_ctx_item(OUTPUT_VALUE) == str(["test_output"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data={"test_output": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data={"test_output": "hello world"}) assert tool_span._get_ctx_item(OUTPUT_VALUE) == str({"test_output": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data=("asd", 123)) assert workflow_span._get_ctx_item(OUTPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert 
agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) +def test_annotate_output_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output", "role": "human"}] -def test_annotate_output_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": object()}]) +def test_annotate_output_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": object()}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_metrics(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) +def test_annotate_metrics(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) assert span._get_ctx_item(METRICS) == {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} -def test_annotate_metrics_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, metrics=12345) +def test_annotate_metrics_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, metrics=12345) assert llm_span._get_ctx_item(METRICS) is None - mock_logs.warning.assert_called_once_with("metrics must be a dictionary of string key - numeric value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "metrics must be a dictionary of string key - numeric value pairs." 
+ ) + mock_llmobs_logs.reset_mock() -def test_annotate_prompt_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -761,9 +686,9 @@ def test_annotate_prompt_dict(LLMObs): } -def test_annotate_prompt_dict_with_context_var_keys(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict_with_context_var_keys(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -784,9 +709,9 @@ def test_annotate_prompt_dict_with_context_var_keys(LLMObs): } -def test_annotate_prompt_typed_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_typed_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt=Prompt( template="{var1} {var3}", @@ -807,47 +732,30 @@ def test_annotate_prompt_typed_dict(LLMObs): } -def test_annotate_prompt_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, prompt="prompt") +def test_annotate_prompt_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, prompt="prompt") assert span._get_ctx_item(INPUT_PROMPT) is None - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() - LLMObs.annotate(span=span, prompt={"template": 1}) - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() + llmobs.annotate(span=span, prompt={"template": 1}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() -def test_span_error_sets_error(LLMObs, mock_llmobs_span_writer): +def test_span_error_sets_error(llmobs, llmobs_events): with pytest.raises(ValueError): - with LLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: + with llmobs.llm(model_name="test_model", model_provider="test_model_provider") as span: raise ValueError("test error message") - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) - ) - - -def test_span_error_sets_error_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with pytest.raises(ValueError): - with AgentlessLLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: - raise ValueError("test error message") - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + model_name="test_model", + model_provider="test_model_provider", + error="builtins.ValueError", + 
error_message="test error message", + error_stack=span.get_tag("error.stack"), ) @@ -855,218 +763,142 @@ def test_span_error_sets_error_agentless(AgentlessLLMObs, mock_llmobs_span_agent "ddtrace_global_config", [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_tags(ddtrace_global_config, LLMObs, mock_llmobs_span_writer, monkeypatch): - with LLMObs.task(name="test_task") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) - ) - - -@pytest.mark.parametrize( - "ddtrace_global_config", - [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], -) -def test_tags_agentless(ddtrace_global_config, AgentlessLLMObs, mock_llmobs_span_agentless_writer, monkeypatch): - with AgentlessLLMObs.task(name="test_task") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) - ) - - -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with LLMObs.tool(name="test_tool", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with LLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with LLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with LLMObs.workflow(name="test_workflow", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with LLMObs.agent(name="test_agent", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with LLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: +def test_tags(ddtrace_global_config, llmobs, llmobs_events, monkeypatch): + with llmobs.task(name="test_task") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "task", + tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, ) -def test_ml_app_override_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task", ml_app="test_app") as span: +def test_ml_app_override(llmobs, llmobs_events): + with llmobs.task(name="test_task", 
ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.tool(name="test_tool", ml_app="test_app") as span: + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) + with llmobs.tool(name="test_tool", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: + assert len(llmobs_events) == 2 + assert llmobs_events[1] == _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) + with llmobs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 3 + assert llmobs_events[2] == _expected_llmobs_llm_span_event( + span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with AgentlessLLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: + with llmobs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 4 + assert llmobs_events[3] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with AgentlessLLMObs.workflow(name="test_workflow", ml_app="test_app") as span: + with llmobs.workflow(name="test_workflow", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.agent(name="test_agent", ml_app="test_app") as span: + assert len(llmobs_events) == 5 + assert llmobs_events[4] == _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) + with llmobs.agent(name="test_agent", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: + assert len(llmobs_events) == 6 + assert llmobs_events[5] == _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) + with llmobs.retrieval(name="test_retrieval", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) - ) + assert len(llmobs_events) == 7 + assert llmobs_events[6] == _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) -def test_export_span_specified_span_is_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.export_span(span="asd") - mock_logs.warning.assert_called_once_with("Failed to export span. 
Span must be a valid Span object.") +def test_export_span_specified_span_is_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span(span="asd") + mock_llmobs_logs.warning.assert_called_once_with("Failed to export span. Span must be a valid Span object.") -def test_export_span_specified_span_is_not_llmobs_span_raises_warning(LLMObs, mock_logs): +def test_export_span_specified_span_is_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): with DummyTracer().trace("non_llmobs_span") as span: - LLMObs.export_span(span=span) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.export_span(span=span) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_specified_span_returns_span_context(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span(span=span) +def test_export_span_specified_span_returns_span_context(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span(span=span) assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_export_span_no_specified_span_no_active_span_raises_warning(LLMObs, mock_logs): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_export_span_no_specified_span_no_active_span_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_export_span_active_span_not_llmobs_span_raises_warning(LLMObs, mock_logs): - with LLMObs._instance.tracer.trace("non_llmobs_span"): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") +def test_export_span_active_span_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): + with llmobs._instance.tracer.trace("non_llmobs_span"): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_no_specified_span_returns_exported_active_span(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span() +def test_export_span_no_specified_span_returns_exported_active_span(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span() assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_submit_evaluation_llmobs_disabled_raises_warning(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.submit_evaluation( - span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" - ) - mock_logs.warning.assert_called_once_with( - "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." 
- ) - - -def test_submit_evaluation_no_api_key_raises_warning(AgentlessLLMObs, mock_logs): +def test_submit_evaluation_no_api_key_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="")): - AgentlessLLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_ml_app_raises_warning(LLMObs, mock_logs): +def test_submit_evaluation_ml_app_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_llmobs_ml_app="")): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_span_context_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( +def test_submit_evaluation_span_context_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_context must be a dictionary containing both span_id and trace_id keys. " "LLMObs.export_span() can be used to generate this dictionary from a given span." ) -def test_submit_evaluation_empty_span_or_trace_id_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_span_or_trace_id_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." ) - mock_logs.reset_mock() - LLMObs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." 
) -def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_timestamp_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", @@ -1074,35 +906,35 @@ def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): ml_app="dummy", timestamp_ms="invalid", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent" ) -def test_submit_evaluation_empty_label_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_label_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") + mock_llmobs_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") -def test_submit_evaluation_incorrect_metric_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_metric_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") - mock_logs.reset_mock() - LLMObs.submit_evaluation( + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") -def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_numerical_value_raises_unsupported_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call( "The evaluation metric type 'numerical' is unsupported. Use 'score' instead. 
" @@ -1112,44 +944,44 @@ def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mo ) -def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call("value must be an integer or float for a score metric."), ] ) -def test_submit_evaluation_incorrect_score_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_score_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="score", value="high" ) - mock_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") + mock_llmobs_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") -def test_submit_evaluation_invalid_tags_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_tags_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", tags=["invalid"], ) - mock_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") + mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") -def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_metadata_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", metadata=1, ) - mock_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") + mock_llmobs_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") @pytest.mark.parametrize( @@ -1157,9 +989,9 @@ def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): [dict(_llmobs_ml_app="test_app_name")], ) def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( - LLMObs, mock_logs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1167,8 +999,10 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( tags={1: 2, "foo": "bar"}, ml_app="dummy", ) - mock_logs.warning.assert_called_once_with("Failed to parse tags. Tags for evaluation metrics must be strings.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "Failed to parse tags. Tags for evaluation metrics must be strings." 
+ ) + mock_llmobs_logs.reset_mock() mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( ml_app="dummy", @@ -1186,8 +1020,8 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_tags(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1212,8 +1046,8 @@ def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_with_metadata_enqueues_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1235,7 +1069,7 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) ) mock_llmobs_eval_metric_writer.reset() - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1257,8 +1091,8 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) -def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_categorical_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1276,9 +1110,9 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="toxicity", metric_type="categorical", value="high", @@ -1296,8 +1130,8 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) -def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_score_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="sentiment", metric_type="score", @@ -1310,9 +1144,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" + with llmobs.llm(model_name="test_model", name="test_llm_call", 
model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" ) mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( @@ -1327,9 +1161,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metric( - LLMObs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", @@ -1342,9 +1176,9 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="token_count", metric_type="numerical", value=35, @@ -1362,148 +1196,148 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) -def test_flush_calls_periodic_agentless( - AgentlessLLMObs, mock_llmobs_span_agentless_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner -): - AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_called_once() - mock_llmobs_eval_metric_writer.periodic.assert_called_once() - mock_llmobs_evaluator_runner.periodic.assert_called_once() - - def test_flush_does_not_call_periodic_when_llmobs_is_disabled( - LLMObs, - mock_llmobs_span_writer, + llmobs, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, - mock_logs, - disabled_llmobs, + mock_llmobs_logs, ): - LLMObs.flush() - mock_llmobs_span_writer.periodic.assert_not_called() + llmobs.enabled = False + llmobs.flush() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] ) -def test_flush_does_not_call_periodic_when_llmobs_is_disabled_agentless( - AgentlessLLMObs, - mock_llmobs_span_agentless_writer, - mock_llmobs_eval_metric_writer, - mock_llmobs_evaluator_runner, - mock_logs, - disabled_llmobs, -): - AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_not_called() - mock_llmobs_eval_metric_writer.periodic.assert_not_called() - mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( - [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] - ) - - -def test_inject_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with( +def test_inject_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.inject_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be injected." 
) assert headers == {} -def test_inject_distributed_headers_not_dict_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers("not a dictionary", span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") +def test_inject_distributed_headers_not_dict_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers("not a dictionary", span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == "not a dictionary" - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(123, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(123, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == 123 - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(None, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(None, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers is None -def test_inject_distributed_headers_no_active_span_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with("No span provided and no currently active span found.") +def test_inject_distributed_headers_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no currently active span found.") assert headers == {} -def test_inject_distributed_headers_span_calls_httppropagator_inject(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_span_calls_httppropagator_inject(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.propagation.http.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=span) + llmobs.inject_distributed_headers({}, span=span) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_inject_distributed_headers_current_active_span_injected(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_current_active_span_injected(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=None) + llmobs.inject_distributed_headers({}, span=None) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_activate_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.activate_distributed_headers({}) - mock_logs.warning.assert_called_once_with( +def test_activate_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + llmobs.activate_distributed_headers({}) + mock_llmobs_logs.warning.assert_called_once_with( 
"LLMObs.activate_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be activated." ) -def test_activate_distributed_headers_calls_httppropagator_extract(LLMObs, mock_logs): +def test_activate_distributed_headers_calls_httppropagator_extract(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_extract.assert_called_once_with({}) -def test_activate_distributed_headers_no_trace_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_trace_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(span_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_span_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_span_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(trace_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456") mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") mock_activate.assert_called_once_with(dummy_context) -def test_activate_distributed_headers_activates_context(LLMObs, mock_logs): +def test_activate_distributed_headers_activates_context(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456", meta={PROPAGATED_PARENT_ID_KEY: "789"}) mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_activate.assert_called_once_with(dummy_context) +def test_listener_hooks_enqueue_correct_writer(run_python_code_in_subprocess): + """ + Regression test that 
ensures that listener hooks enqueue span events to the correct writer, + not the default writer created at startup. + """ + env = os.environ.copy() + pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] + if "PYTHONPATH" in env: + pypath.append(env["PYTHONPATH"]) + env.update({"PYTHONPATH": ":".join(pypath), "DD_TRACE_ENABLED": "0"}) + out, err, status, pid = run_python_code_in_subprocess( + """ +from ddtrace.llmobs import LLMObs + +LLMObs.enable(ml_app="repro-issue", agentless_enabled=True, api_key="foobar.baz", site="datad0g.com") +with LLMObs.agent("dummy"): + pass +""", + env=env, + ) + assert status == 0, err + assert out == b"" + agentless_writer_log = b"failed to send traces to intake at https://llmobs-intake.datad0g.com/api/v2/llmobs: HTTP error status 403, reason Forbidden\n" # noqa: E501 + agent_proxy_log = b"failed to send, dropping 1 traces to intake at http://localhost:8126/evp_proxy/v2/api/v2/llmobs after 5 retries" # noqa: E501 + assert err == agentless_writer_log + assert agent_proxy_log not in err + + def test_llmobs_fork_recreates_and_restarts_span_writer(): """Test that forking a process correctly recreates and restarts the LLMObsSpanWriter.""" with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"): @@ -1514,16 +1348,10 @@ def test_llmobs_fork_recreates_and_restarts_span_writer(): if pid: # parent assert llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._llmobs_span_writer == original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._llmobs_span_writer != original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1569,18 +1397,10 @@ def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluato if pid: # parent assert llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._evaluator_runner == original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._evaluator_runner != original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1667,42 +1487,6 @@ def test_llmobs_fork_evaluator_runner_run(monkeypatch): llmobs_service.disable() -def test_llmobs_fork_custom_filter(monkeypatch): - """Test that forking a process correctly keeps any custom filters.""" - - class CustomFilter(TraceFilter): - def process_trace(self, trace): - return trace - - monkeypatch.setenv("_DD_LLMOBS_WRITER_INTERVAL", 5.0) - with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"): - tracer = DummyTracer() - custom_filter = CustomFilter() - tracer.configure(settings={"FILTERS": [custom_filter]}) - 
llmobs_service.enable(_tracer=tracer, ml_app="test_app") - assert custom_filter in llmobs_service._instance.tracer._filters - pid = os.fork() - if pid: # parent - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - else: # child - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - llmobs_service.disable() - os._exit(12) - - _, status = os.waitpid(pid, 0) - exit_code = os.WEXITSTATUS(status) - assert exit_code == 12 - llmobs_service.disable() - - def test_llmobs_fork_disabled(monkeypatch): """Test that after being disabled the service remains disabled when forking""" monkeypatch.setenv("DD_LLMOBS_ENABLED", "0") @@ -1746,46 +1530,46 @@ def test_llmobs_fork_disabled_then_enabled(monkeypatch): svc.disable() -def test_llmobs_with_evaluator_runner(LLMObs, mock_llmobs_evaluator_runner): - with LLMObs.llm(model_name="test_model"): +def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner): + with llmobs.llm(model_name="test_model"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 1 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 1 -def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): +def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.workflow(name="test"): +def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.workflow(name="test"): pass - with LLMObs.agent(name="test"): + with llmobs.agent(name="test"): pass - with LLMObs.task(name="test"): + with llmobs.task(name="test"): pass - with LLMObs.embedding(model_name="test"): + with llmobs.embedding(model_name="test"): pass - with LLMObs.retrieval(name="test"): + with llmobs.retrieval(name="test"): pass - with LLMObs.tool(name="test"): + with llmobs.tool(name="test"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_annotation_context_modifies_span_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_modifies_span_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -def test_annotation_context_modifies_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") 
as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1793,80 +1577,80 @@ def test_annotation_context_modifies_prompt(LLMObs): } -def test_annotation_context_modifies_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -def test_annotation_context_finished_context_does_not_modify_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): +def test_annotation_context_finished_context_does_not_modify_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -def test_annotation_context_finished_context_does_not_modify_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): +def test_annotation_context_finished_context_does_not_modify_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -def test_annotation_context_finished_context_does_not_modify_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): +def test_annotation_context_finished_context_does_not_modify_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -def test_annotation_context_nested(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested(llmobs): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} -def test_annotation_context_nested_overrides_name(LLMObs): - with LLMObs.annotation_context(name="unexpected"): - with LLMObs.annotation_context(name="expected"): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested_overrides_name(llmobs): + with llmobs.annotation_context(name="unexpected"): + with llmobs.annotation_context(name="expected"): + with llmobs.agent(name="test_agent") as span: assert span.name == "expected" -def test_annotation_context_nested_maintains_trace_structure(LLMObs, mock_llmobs_span_writer): +def test_annotation_context_nested_maintains_trace_structure(llmobs, llmobs_events): """This test makes sure starting/stopping annotation contexts do not modify the llmobs trace structure""" - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span") as parent_span: - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.workflow(name="child_span") as child_span: + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span") as parent_span: + with llmobs.annotation_context(tags={"foo": 
"baz"}): + with llmobs.workflow(name="child_span") as child_span: assert child_span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} assert parent_span._get_ctx_item(TAGS) == {"foo": "bar", "boo": "bar"} - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - parent_span, child_span = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] + assert len(llmobs_events) == 2 + parent_span, child_span = llmobs_events[1], llmobs_events[0] assert child_span["trace_id"] == parent_span["trace_id"] assert child_span["span_id"] != parent_span["span_id"] assert child_span["parent_id"] == parent_span["span_id"] assert parent_span["parent_id"] == "undefined" - mock_llmobs_span_writer.reset_mock() - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span"): +def test_annotation_context_separate_traces_maintained(llmobs, llmobs_events): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span"): pass - with LLMObs.workflow(name="child_span"): + with llmobs.workflow(name="child_span"): pass - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - trace_one, trace_two = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] - assert trace_one["trace_id"] != trace_two["trace_id"] - assert trace_one["span_id"] != trace_two["span_id"] - assert trace_two["parent_id"] == "undefined" - assert trace_one["parent_id"] == "undefined" + assert len(llmobs_events) == 2 + agent_span, workflow_span = llmobs_events[1], llmobs_events[0] + assert agent_span["trace_id"] != workflow_span["trace_id"] + assert agent_span["span_id"] != workflow_span["span_id"] + assert workflow_span["parent_id"] == "undefined" + assert agent_span["parent_id"] == "undefined" -def test_annotation_context_only_applies_to_local_context(LLMObs): +def test_annotation_context_only_applies_to_local_context(llmobs): """ tests that annotation contexts only apply to spans belonging to the same trace context and not globally to all spans. 
@@ -1882,8 +1666,8 @@ def test_annotation_context_only_applies_to_local_context(LLMObs): def context_one(): nonlocal agent_has_correct_name nonlocal agent_has_correct_tags - with LLMObs.annotation_context(name="expected_agent", tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: + with llmobs.annotation_context(name="expected_agent", tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: event.wait() agent_has_correct_tags = span._get_ctx_item(TAGS) == {"foo": "bar"} agent_has_correct_name = span.name == "expected_agent" @@ -1892,9 +1676,9 @@ def context_one(): def context_two(): nonlocal tool_has_correct_name nonlocal tool_does_not_have_tags - with LLMObs.agent(name="test_agent"): - with LLMObs.annotation_context(name="expected_tool"): - with LLMObs.tool(name="test_tool") as tool_span: + with llmobs.agent(name="test_agent"): + with llmobs.annotation_context(name="expected_tool"): + with llmobs.tool(name="test_tool") as tool_span: event.wait() tool_does_not_have_tags = tool_span._get_ctx_item(TAGS) is None tool_has_correct_name = tool_span.name == "expected_tool" @@ -1904,7 +1688,7 @@ def context_two(): thread_one.start() thread_two.start() - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span._get_ctx_item(TAGS) is None @@ -1920,15 +1704,15 @@ def context_two(): assert tool_does_not_have_tags is True -async def test_annotation_context_async_modifies_span_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_modifies_span_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -async def test_annotation_context_async_modifies_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1936,41 +1720,42 @@ async def test_annotation_context_async_modifies_prompt(LLMObs): } -async def test_annotation_context_async_modifies_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -async def test_annotation_context_async_finished_context_does_not_modify_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): +async def test_annotation_context_async_finished_context_does_not_modify_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -async def test_annotation_context_async_finished_context_does_not_modify_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): +async def 
test_annotation_context_async_finished_context_does_not_modify_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -async def test_annotation_context_finished_context_async_does_not_modify_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): +async def test_annotation_context_finished_context_async_does_not_modify_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -async def test_annotation_context_async_nested(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - async with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_nested(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + async with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} def test_service_enable_starts_evaluator_runner_when_evaluators_exist(): + pytest.importorskip("ragas") with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")): dummy_tracer = DummyTracer() @@ -1994,3 +1779,293 @@ def test_service_enable_does_not_start_evaluator_runner(): assert llmobs_service._instance._llmobs_span_writer.status.value == "running" assert llmobs_service._instance._evaluator_runner.status.value == "stopped" llmobs_service.disable() + + +def test_submit_evaluation_llmobs_disabled_raises_debug(llmobs, mock_llmobs_logs): + llmobs.disable() + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation( + span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" + ) + mock_llmobs_logs.debug.assert_called_once_with( + "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." + ) + + +def test_submit_evaluation_for_no_ml_app_raises_warning(llmobs, mock_llmobs_logs): + with override_global_config(dict(_llmobs_ml_app="")): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + ) + mock_llmobs_logs.warning.assert_called_once_with( + "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " + "Ensure this configuration is set before running your application." + ) + + +def test_submit_evaluation_for_span_incorrect_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=re.escape( + ( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." 
+ ) + ), + ): + llmobs.submit_evaluation_for(span="asd", label="toxicity", metric_type="categorical", value="high") + + +def test_submit_evaluation_for_span_with_tag_value_incorrect_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value="asd", label="toxicity", metric_type="categorical", value="high" + ) + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_key": "hi", "tag_value": 1}, + label="toxicity", + metric_type="categorical", + value="high", + ) + + +def test_submit_evaluation_for_empty_span_or_trace_id_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=re.escape( + ( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." + ) + ), + ): + llmobs.submit_evaluation_for( + span={"trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" + ) + with pytest.raises( + TypeError, + match=re.escape( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." + ), + ): + llmobs.submit_evaluation_for(span={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") + + +def test_submit_evaluation_for_span_with_tag_value_empty_key_or_val_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + TypeError, + match=r"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values", + ): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_value": "123"}, label="toxicity", metric_type="categorical", value="high" + ) + + +def test_submit_evaluation_for_invalid_timestamp_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises( + ValueError, match="timestamp_ms must be a non-negative integer. 
Evaluation metric data will not be sent" + ): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="", + metric_type="categorical", + value="high", + ml_app="dummy", + timestamp_ms="invalid", + ) + + +def test_submit_evaluation_for_empty_label_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(ValueError, match="label must be the specified name of the evaluation metric."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", value="high" + ) + + +def test_submit_evaluation_for_incorrect_metric_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(ValueError, match="metric_type must be one of 'categorical' or 'score'."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high" + ) + with pytest.raises(ValueError, match="metric_type must be one of 'categorical' or 'score'."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high" + ) + + +def test_submit_evaluation_for_incorrect_score_value_type_raises_error(llmobs, mock_llmobs_logs): + with pytest.raises(TypeError, match="value must be an integer or float for a score metric."): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="score", value="high" + ) + + +def test_submit_evaluation_for_invalid_tags_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags=["invalid"], + ) + mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") + + +@pytest.mark.parametrize( + "ddtrace_global_config", + [dict(_llmobs_ml_app="test_app_name")], +) +def test_submit_evaluation_for_non_string_tags_raises_warning_but_still_submits( + llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer +): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags={1: 2, "foo": "bar"}, + ml_app="dummy", + ) + mock_llmobs_logs.warning.assert_called_once_with( + "Failed to parse tags. Tags for evaluation metrics must be strings." 
+ ) + mock_llmobs_logs.reset_mock() + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:dummy", "foo:bar"], + ) + ) + + +@pytest.mark.parametrize( + "ddtrace_global_config", + [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], +) +def test_submit_evaluation_for_metric_tags(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"}, + ml_app="ml_app_override", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="ml_app_override", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"], + ) + ) + + +def test_submit_evaluation_for_span_with_tag_value_enqueues_writer_with_categorical_metric( + llmobs, mock_llmobs_eval_metric_writer +): + llmobs.submit_evaluation_for( + span_with_tag_value={"tag_key": "tag_key", "tag_value": "tag_val"}, + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + tag_key="tag_key", + tag_value="tag_val", + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + + +def test_submit_evaluation_for_enqueues_writer_with_categorical_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id="123", + trace_id="456", + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + mock_llmobs_eval_metric_writer.reset_mock() + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation_for( + span=llmobs.export_span(span), + label="toxicity", + metric_type="categorical", + value="high", + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + ml_app="dummy", + span_id=str(span.span_id), + trace_id="{:x}".format(span.trace_id), + label="toxicity", + metric_type="categorical", + categorical_value="high", + ) + ) + + +def test_submit_evaluation_for_enqueues_writer_with_score_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation_for( + span={"span_id": "123", "trace_id": "456"}, + label="sentiment", + metric_type="score", + value=0.9, + ml_app="dummy", + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + span_id="123", trace_id="456", label="sentiment", metric_type="score", score_value=0.9, ml_app="dummy" + ) + ) + mock_llmobs_eval_metric_writer.reset_mock() + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation_for( + 
span=llmobs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" + ) + mock_llmobs_eval_metric_writer.enqueue.assert_called_with( + _expected_llmobs_eval_metric_event( + span_id=str(span.span_id), + trace_id="{:x}".format(span.trace_id), + label="sentiment", + metric_type="score", + score_value=0.9, + ml_app="dummy", + ) + ) diff --git a/tests/llmobs/test_llmobs_span_agent_writer.py b/tests/llmobs/test_llmobs_span_agent_writer.py index 76fe0f21aef..d16bb9f0e2c 100644 --- a/tests/llmobs/test_llmobs_span_agent_writer.py +++ b/tests/llmobs/test_llmobs_span_agent_writer.py @@ -44,7 +44,8 @@ def test_flush_queue_when_event_cause_queue_to_exceed_payload_limit( [ mock.call("flushing queue because queuing next event will exceed EVP payload limit"), mock.call("encode %d LLMObs span events to be sent", 5), - ] + ], + any_order=True, ) diff --git a/tests/llmobs/test_llmobs_span_agentless_writer.py b/tests/llmobs/test_llmobs_span_agentless_writer.py index 4882f3553d8..4a54faf130d 100644 --- a/tests/llmobs/test_llmobs_span_agentless_writer.py +++ b/tests/llmobs/test_llmobs_span_agentless_writer.py @@ -75,26 +75,25 @@ def test_truncating_oversized_events(mock_writer_logs, mock_http_writer_send_pay ) -def test_send_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_chat_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() +@mock.patch("ddtrace.internal.writer.writer.log") def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put_response_forbidden): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) @@ -109,7 +108,7 @@ def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put ) -def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_timed_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -122,10 +121,9 @@ def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_wr llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to 
be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_multiple_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -135,10 +133,9 @@ def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 2)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): +def test_send_on_exit(run_python_code_in_subprocess): env = os.environ.copy() pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] if "PYTHONPATH" in env: diff --git a/tests/llmobs/test_llmobs_trace_processor.py b/tests/llmobs/test_llmobs_trace_processor.py deleted file mode 100644 index b55286d49c8..00000000000 --- a/tests/llmobs/test_llmobs_trace_processor.py +++ /dev/null @@ -1,36 +0,0 @@ -import mock - -from ddtrace._trace.span import Span -from ddtrace.ext import SpanTypes -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor -from tests.utils import override_global_config - - -def test_processor_returns_all_traces_by_default(): - """Test that the LLMObsTraceProcessor returns all traces by default.""" - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_all_traces_if_not_agentless(): - """Test that the LLMObsTraceProcessor returns all traces if DD_LLMOBS_AGENTLESS_ENABLED is not set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=False)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_none_in_agentless_mode(): - """Test that the LLMObsTraceProcessor returns None if DD_LLMOBS_AGENTLESS_ENABLED is set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=True)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) is None diff --git a/tests/llmobs/test_propagation.py b/tests/llmobs/test_propagation.py index d892c6b98a2..e3ab9c80d66 100644 --- a/tests/llmobs/test_propagation.py +++ b/tests/llmobs/test_propagation.py @@ -157,39 +157,39 @@ def test_no_llmobs_parent_id_propagated_if_no_llmobs_spans(run_python_code_in_su assert _get_llmobs_parent_id(span) == "undefined" -def test_inject_distributed_headers_simple(LLMObs): +def test_inject_distributed_headers_simple(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM) as root_span: - request_headers = 
LLMObs.inject_distributed_headers({}, span=root_span) + request_headers = llmobs.inject_distributed_headers({}, span=root_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_nested_llmobs_non_llmobs(LLMObs): +def test_inject_distributed_headers_nested_llmobs_non_llmobs(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM): with dummy_tracer.trace("Non-LLMObs span") as child_span: - request_headers = LLMObs.inject_distributed_headers({}, span=child_span) + request_headers = llmobs.inject_distributed_headers({}, span=child_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_non_llmobs_root_span(LLMObs): +def test_inject_distributed_headers_non_llmobs_root_span(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("Non-LLMObs span"): with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM) as child_span: - request_headers = LLMObs.inject_distributed_headers({}, span=child_span) + request_headers = llmobs.inject_distributed_headers({}, span=child_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_nested_llmobs_spans(LLMObs): +def test_inject_distributed_headers_nested_llmobs_spans(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM): with dummy_tracer.trace("LLMObs child span", span_type=SpanTypes.LLM): with dummy_tracer.trace("Last LLMObs child span", span_type=SpanTypes.LLM) as last_llmobs_span: - request_headers = LLMObs.inject_distributed_headers({}, span=last_llmobs_span) + request_headers = llmobs.inject_distributed_headers({}, span=last_llmobs_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID is propagated in the headers in a simple distributed scenario. Service A (subprocess) has a root LLMObs span and a non-LLMObs child span. Service B (outside subprocess) has a LLMObs span. @@ -216,16 +216,15 @@ def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple( env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) - with LLMObs.workflow("LLMObs span") as span: + llmobs.activate_distributed_headers(headers) + with llmobs.workflow("LLMObs span") as span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == headers["_DD_LLMOBS_SPAN_ID"] -def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID is propagated in the headers in a more complex trace. Service A (subprocess) has a root LLMObs span and a non-LLMObs child span. Service B (outside subprocess) has a non-LLMObs local root span and a LLMObs child span. 
@@ -252,19 +251,18 @@ def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_pyt env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) + llmobs.activate_distributed_headers(headers) dummy_tracer = DummyTracer() with dummy_tracer.trace("Non-LLMObs span") as span: - with LLMObs.llm(model_name="llm_model", name="LLMObs span") as llm_span: + with llmobs.llm(model_name="llm_model", name="LLMObs span") as llm_span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == headers["_DD_LLMOBS_SPAN_ID"] assert _get_llmobs_parent_id(llm_span) == headers["_DD_LLMOBS_SPAN_ID"] -def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID (None) is extracted from the headers in a simple distributed scenario. Service A (subprocess) has spans, but none are LLMObs spans. Service B (outside subprocess) has a LLMObs span. @@ -289,10 +287,9 @@ def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_ env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) - with LLMObs.task("LLMObs span") as span: + llmobs.activate_distributed_headers(headers) + with llmobs.task("LLMObs span") as span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == "undefined" diff --git a/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json b/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json new file mode 100644 index 00000000000..fe7c9e3b0f2 --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json @@ -0,0 +1,53 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createChatCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "67741fca00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/chat/completions", + "openai.request.messages.0.content": "Who won the world series in 2020?", + "openai.request.messages.0.name": "", + "openai.request.messages.0.role": "user", + "openai.request.method": "POST", + "openai.request.model": "gpt-3.5-turbo", + "openai.request.n": "None", + "openai.request.stream": "True", + "openai.request.user": "ddtrace-test", + "openai.response.choices.0.finish_reason": "stop", + "openai.response.choices.0.message.content": "The Los Angeles Dodgers won the World Series in 2020.", + "openai.response.choices.0.message.role": "assistant", + "openai.response.model": "gpt-3.5-turbo-0301", + "openai.user.api_key": "sk-...key>", + "runtime-id": "d174f65e33314f43ad1de8cf0a5ca4e0" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + 
"_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 0, + "openai.response.completion_tokens_estimated": 0, + "openai.response.usage.completion_tokens": 19, + "openai.response.usage.prompt_tokens": 17, + "openai.response.usage.total_tokens": 36, + "process_id": 22982 + }, + "duration": 29869000, + "start": 1735663562179157000 + }]] diff --git a/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json b/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json new file mode 100644 index 00000000000..7cf644cfb3d --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json @@ -0,0 +1,49 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6774231f00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/completions", + "openai.request.method": "POST", + "openai.request.model": "ada", + "openai.request.n": "None", + "openai.request.prompt.0": "Hello world", + "openai.request.stream": "True", + "openai.response.choices.0.finish_reason": "length", + "openai.response.choices.0.text": "! ... A page layouts page drawer? ... Interesting. The \"Tools\" is", + "openai.response.model": "ada", + "openai.user.api_key": "sk-...key>", + "runtime-id": "11872c9ca653441db861b108a4f795eb" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 0, + "openai.response.completion_tokens_estimated": 0, + "openai.response.usage.completion_tokens": 2, + "openai.response.usage.prompt_tokens": 2, + "openai.response.usage.total_tokens": 4, + "process_id": 27488 + }, + "duration": 28739000, + "start": 1735664415266386000 + }]] diff --git a/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json b/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json new file mode 100644 index 00000000000..445dc39db98 --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json @@ -0,0 +1,49 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "677c221c00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/completions", + "openai.request.method": "POST", + "openai.request.model": "ada", + "openai.request.n": "None", + "openai.request.prompt.0": "Hello world", + 
"openai.request.stream": "True", + "openai.response.choices.0.finish_reason": "length", + "openai.response.choices.0.text": "! ... A page layouts page drawer? ... Interesting. The \"Tools\" is", + "openai.response.model": "ada", + "openai.user.api_key": "sk-...key>", + "runtime-id": "24f8e851c87e4f758c73d6acd0aaf82b" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 1, + "openai.response.completion_tokens_estimated": 1, + "openai.response.usage.completion_tokens": 16, + "openai.response.usage.prompt_tokens": 2, + "openai.response.usage.total_tokens": 18, + "process_id": 47101 + }, + "duration": 37957000, + "start": 1736188444222291000 + }]] diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py index 9579f35ce04..e15c3be08b9 100644 --- a/tests/telemetry/test_writer.py +++ b/tests/telemetry/test_writer.py @@ -651,7 +651,7 @@ def test_send_failing_request(mock_status, telemetry_writer): telemetry_writer.periodic(force_flush=True) # asserts unsuccessful status code was logged log.debug.assert_called_with( - "failed to send telemetry to %s. response: %s", + "Failed to send Instrumentation Telemetry to %s. response: %s", telemetry_writer._client.url, mock_status, ) diff --git a/tests/tracer/test_propagation.py b/tests/tracer/test_propagation.py index 61fec650a70..0d4c5d7c01d 100644 --- a/tests/tracer/test_propagation.py +++ b/tests/tracer/test_propagation.py @@ -1888,6 +1888,14 @@ def test_extract_tracecontext(headers, expected_context): B3_SINGLE_HEADERS_VALID, CONTEXT_EMPTY, ), + ( + "baggage_case_insensitive", + None, + {"BAgGage": "key1=val1,key2=val2"}, + { + "baggage": {"key1": "val1", "key2": "val2"}, + }, + ), # All valid headers ( "valid_all_headers_default_style", @@ -2278,14 +2286,14 @@ def test_propagation_extract_w_config(name, styles, headers, expected_context, r overrides = {} if styles is not None: overrides["_propagation_style_extract"] = styles - with override_global_config(overrides): - context = HTTPPropagator.extract(headers) - if not expected_context.get("tracestate"): - assert context == Context(**expected_context) - else: - copied_expectation = expected_context.copy() - tracestate = copied_expectation.pop("tracestate") - assert context == Context(**copied_expectation, meta={"tracestate": tracestate}) + with override_global_config(overrides): + context = HTTPPropagator.extract(headers) + if not expected_context.get("tracestate"): + assert context == Context(**expected_context) + else: + copied_expectation = expected_context.copy() + tracestate = copied_expectation.pop("tracestate") + assert context == Context(**copied_expectation, meta={"tracestate": tracestate}) EXTRACT_OVERRIDE_FIXTURES = [