diff --git a/.riot/requirements/16562eb.txt b/.riot/requirements/16562eb.txt new file mode 100644 index 00000000000..e2aac88c146 --- /dev/null +++ b/.riot/requirements/16562eb.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.7 +# by the following command: +# +# pip-compile --allow-unsafe --config=pyproject.toml --no-annotate --resolver=backtracking .riot/requirements/16562eb.in +# +attrs==24.2.0 +coverage[toml]==7.2.7 +exceptiongroup==1.2.2 +hypothesis==6.45.0 +idna==3.10 +importlib-metadata==6.7.0 +iniconfig==2.0.0 +mock==5.1.0 +multidict==6.0.5 +opentracing==2.4.0 +packaging==24.0 +pluggy==1.2.0 +pytest==7.4.4 +pytest-asyncio==0.21.1 +pytest-cov==4.1.0 +pytest-mock==3.11.1 +pyyaml==6.0.1 +six==1.17.0 +sortedcontainers==2.4.0 +tomli==2.0.1 +typing-extensions==4.7.1 +urllib3==1.26.20 +vcrpy==4.4.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.15.0 diff --git a/ddtrace/_trace/tracer.py b/ddtrace/_trace/tracer.py index 02d5fed7626..af9b09d3e02 100644 --- a/ddtrace/_trace/tracer.py +++ b/ddtrace/_trace/tracer.py @@ -41,6 +41,7 @@ from ddtrace.internal.atexit import register_on_exit_signal from ddtrace.internal.constants import SAMPLING_DECISION_TRACE_TAG_KEY from ddtrace.internal.constants import SPAN_API_DATADOG +from ddtrace.internal.core import dispatch from ddtrace.internal.dogstatsd import get_dogstatsd_client from ddtrace.internal.logger import get_logger from ddtrace.internal.peer_service.processor import PeerServiceProcessor @@ -849,7 +850,7 @@ def _start_span( for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors): p.on_span_start(span) self._hooks.emit(self.__class__.start_span, span) - + dispatch("trace.span_start", (span,)) return span start_span = _start_span @@ -866,6 +867,8 @@ def _on_span_finish(self, span: Span) -> None: for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors): p.on_span_finish(span) + dispatch("trace.span_finish", (span,)) + if log.isEnabledFor(logging.DEBUG): log.debug("finishing span %s (enabled:%s)", span._pprint(), self.enabled) diff --git a/ddtrace/contrib/internal/openai/_endpoint_hooks.py b/ddtrace/contrib/internal/openai/_endpoint_hooks.py index 73a2b2511c9..979e1774a8a 100644 --- a/ddtrace/contrib/internal/openai/_endpoint_hooks.py +++ b/ddtrace/contrib/internal/openai/_endpoint_hooks.py @@ -255,6 +255,14 @@ def _record_request(self, pin, integration, span, args, kwargs): span.set_tag_str("openai.request.messages.%d.content" % idx, integration.trunc(str(content))) span.set_tag_str("openai.request.messages.%d.role" % idx, str(role)) span.set_tag_str("openai.request.messages.%d.name" % idx, str(name)) + if parse_version(OPENAI_VERSION) >= (1, 26) and kwargs.get("stream"): + if kwargs.get("stream_options", {}).get("include_usage", None) is not None: + # Only perform token chunk auto-extraction if this option is not explicitly set + return + span._set_ctx_item("_dd.auto_extract_token_chunk", True) + stream_options = kwargs.get("stream_options", {}) + stream_options["include_usage"] = True + kwargs["stream_options"] = stream_options def _record_response(self, pin, integration, span, args, kwargs, resp, error): resp = super()._record_response(pin, integration, span, args, kwargs, resp, error) diff --git a/ddtrace/contrib/internal/openai/utils.py b/ddtrace/contrib/internal/openai/utils.py index d967383e366..f5dfc10efef 100644 --- a/ddtrace/contrib/internal/openai/utils.py +++ b/ddtrace/contrib/internal/openai/utils.py @@ -48,11 +48,28 @@ def 
__exit__(self, exc_type, exc_val, exc_tb): self.__wrapped__.__exit__(exc_type, exc_val, exc_tb) def __iter__(self): - return self + exception_raised = False + try: + for chunk in self.__wrapped__: + self._extract_token_chunk(chunk) + yield chunk + _loop_handler(self._dd_span, chunk, self._streamed_chunks) + except Exception: + self._dd_span.set_exc_info(*sys.exc_info()) + exception_raised = True + raise + finally: + if not exception_raised: + _process_finished_stream( + self._dd_integration, self._dd_span, self._kwargs, self._streamed_chunks, self._is_completion + ) + self._dd_span.finish() + self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) def __next__(self): try: chunk = self.__wrapped__.__next__() + self._extract_token_chunk(chunk) _loop_handler(self._dd_span, chunk, self._streamed_chunks) return chunk except StopIteration: @@ -68,6 +85,22 @@ def __next__(self): self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) raise + def _extract_token_chunk(self, chunk): + """Attempt to extract the token chunk (last chunk in the stream) from the streamed response.""" + if not self._dd_span._get_ctx_item("_dd.auto_extract_token_chunk"): + return + choice = getattr(chunk, "choices", [None])[0] + if not getattr(choice, "finish_reason", None): + # Only the second-last chunk in the stream with token usage enabled will have finish_reason set + return + try: + # User isn't expecting last token chunk to be present since it's not part of the default streamed response, + # so we consume it and extract the token usage metadata before it reaches the user. + usage_chunk = self.__wrapped__.__next__() + self._streamed_chunks[0].insert(0, usage_chunk) + except (StopIteration, GeneratorExit): + return + class TracedOpenAIAsyncStream(BaseTracedOpenAIStream): async def __aenter__(self): @@ -77,12 +110,29 @@ async def __aenter__(self): async def __aexit__(self, exc_type, exc_val, exc_tb): await self.__wrapped__.__aexit__(exc_type, exc_val, exc_tb) - def __aiter__(self): - return self + async def __aiter__(self): + exception_raised = False + try: + async for chunk in self.__wrapped__: + await self._extract_token_chunk(chunk) + yield chunk + _loop_handler(self._dd_span, chunk, self._streamed_chunks) + except Exception: + self._dd_span.set_exc_info(*sys.exc_info()) + exception_raised = True + raise + finally: + if not exception_raised: + _process_finished_stream( + self._dd_integration, self._dd_span, self._kwargs, self._streamed_chunks, self._is_completion + ) + self._dd_span.finish() + self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) async def __anext__(self): try: chunk = await self.__wrapped__.__anext__() + await self._extract_token_chunk(chunk) _loop_handler(self._dd_span, chunk, self._streamed_chunks) return chunk except StopAsyncIteration: @@ -98,6 +148,19 @@ async def __anext__(self): self._dd_integration.metric(self._dd_span, "dist", "request.duration", self._dd_span.duration_ns) raise + async def _extract_token_chunk(self, chunk): + """Attempt to extract the token chunk (last chunk in the stream) from the streamed response.""" + if not self._dd_span._get_ctx_item("_dd.auto_extract_token_chunk"): + return + choice = getattr(chunk, "choices", [None])[0] + if not getattr(choice, "finish_reason", None): + return + try: + usage_chunk = await self.__wrapped__.__anext__() + self._streamed_chunks[0].insert(0, usage_chunk) + except (StopAsyncIteration, GeneratorExit): + return + 
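# --- illustrative sketch, not part of the patch ---
# The _extract_token_chunk methods above rely on a "consume-ahead" pattern: once a chunk
# carrying a finish_reason is seen, the very next chunk (when include_usage is enabled)
# holds only token usage, so the wrapper swallows it instead of yielding it to the caller.
# A minimal stand-alone version of that pattern; the `choices`, `finish_reason`, and
# usage-chunk shape are assumptions mirroring the OpenAI stream objects.
def _iter_hiding_usage_chunk(stream, on_usage):
    it = iter(stream)
    for chunk in it:
        yield chunk
        choices = getattr(chunk, "choices", None) or [None]
        if getattr(choices[0], "finish_reason", None):
            try:
                # Trailing usage-only chunk is recorded but never reaches the caller.
                on_usage(next(it))
            except StopIteration:
                pass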
def _compute_token_count(content, model): # type: (Union[str, List[int]], Optional[str]) -> Tuple[bool, int] diff --git a/ddtrace/internal/debug.py b/ddtrace/internal/debug.py index 4d533b604b6..c33ff5ad46d 100644 --- a/ddtrace/internal/debug.py +++ b/ddtrace/internal/debug.py @@ -117,8 +117,8 @@ def collect(tracer): from ddtrace._trace.tracer import log return dict( - # Timestamp UTC ISO 8601 - date=datetime.datetime.utcnow().isoformat(), + # Timestamp UTC ISO 8601 with the trailing +00:00 removed + date=datetime.datetime.now(datetime.timezone.utc).isoformat()[0:-6], # eg. "Linux", "Darwin" os_name=platform.system(), # eg. 12.5.0 diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 49815151118..cd4069b4094 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -3,7 +3,9 @@ import time from typing import Any from typing import Dict +from typing import List from typing import Optional +from typing import Tuple from typing import Union import ddtrace @@ -11,8 +13,12 @@ from ddtrace import config from ddtrace import patch from ddtrace._trace.context import Context +from ddtrace.constants import ERROR_MSG +from ddtrace.constants import ERROR_STACK +from ddtrace.constants import ERROR_TYPE from ddtrace.ext import SpanTypes from ddtrace.internal import atexit +from ddtrace.internal import core from ddtrace.internal import forksafe from ddtrace.internal._rand import rand64bits from ddtrace.internal.compat import ensure_text @@ -24,6 +30,7 @@ from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT from ddtrace.internal.utils.formats import asbool from ddtrace.internal.utils.formats import parse_tags_str +from ddtrace.llmobs import _constants as constants from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID from ddtrace.llmobs._constants import INPUT_DOCUMENTS from ddtrace.llmobs._constants import INPUT_MESSAGES @@ -45,11 +52,11 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._evaluators.runner import EvaluatorRunner -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor from ddtrace.llmobs._utils import AnnotationContext from ddtrace.llmobs._utils import _get_llmobs_parent_id from ddtrace.llmobs._utils import _get_ml_app from ddtrace.llmobs._utils import _get_session_id +from ddtrace.llmobs._utils import _get_span_name from ddtrace.llmobs._utils import _inject_llmobs_parent_id from ddtrace.llmobs._utils import safe_json from ddtrace.llmobs._utils import validate_prompt @@ -81,34 +88,157 @@ class LLMObs(Service): def __init__(self, tracer=None): super(LLMObs, self).__init__() self.tracer = tracer or ddtrace.tracer - self._llmobs_span_writer = None - self._llmobs_span_writer = LLMObsSpanWriter( is_agentless=config._llmobs_agentless_enabled, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._llmobs_eval_metric_writer = LLMObsEvalMetricWriter( site=config._dd_site, api_key=config._dd_api_key, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._evaluator_runner = EvaluatorRunner( interval=float(os.getenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 1.0)), llmobs_service=self, ) - self._trace_processor = LLMObsTraceProcessor(self._llmobs_span_writer, self._evaluator_runner) forksafe.register(self._child_after_fork) self._annotations = [] self._annotation_context_lock = forksafe.RLock() - 
self.tracer.on_start_span(self._do_annotations) - def _do_annotations(self, span): + # Register hooks for span events + core.on("trace.span_start", self._do_annotations) + core.on("trace.span_finish", self._on_span_finish) + + def _on_span_finish(self, span): + if self.enabled and span.span_type == SpanTypes.LLM: + self._submit_llmobs_span(span) + + def _submit_llmobs_span(self, span: Span) -> None: + """Generate and submit an LLMObs span event to be sent to LLMObs.""" + span_event = None + is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" + is_ragas_integration_span = False + try: + span_event, is_ragas_integration_span = self._llmobs_span_event(span) + self._llmobs_span_writer.enqueue(span_event) + except (KeyError, TypeError): + log.error( + "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True + ) + finally: + if not span_event or not is_llm_span or is_ragas_integration_span: + return + if self._evaluator_runner: + self._evaluator_runner.enqueue(span_event, span) + + @classmethod + def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: + """Span event object structure.""" + span_kind = span._get_ctx_item(SPAN_KIND) + if not span_kind: + raise KeyError("Span kind not found in span context") + meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} + if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: + meta["model_name"] = span._get_ctx_item(MODEL_NAME) + meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() + meta["metadata"] = span._get_ctx_item(METADATA) or {} + if span._get_ctx_item(INPUT_PARAMETERS): + meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) + if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: + meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) + if span._get_ctx_item(INPUT_VALUE) is not None: + meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) + if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: + meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) + if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: + meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) + if span._get_ctx_item(OUTPUT_VALUE) is not None: + meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) + if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: + meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) + if span._get_ctx_item(INPUT_PROMPT) is not None: + prompt_json_str = span._get_ctx_item(INPUT_PROMPT) + if span_kind != "llm": + log.warning( + "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." 
+ ) + else: + meta["input"]["prompt"] = prompt_json_str + if span.error: + meta.update( + { + ERROR_MSG: span.get_tag(ERROR_MSG), + ERROR_STACK: span.get_tag(ERROR_STACK), + ERROR_TYPE: span.get_tag(ERROR_TYPE), + } + ) + if not meta["input"]: + meta.pop("input") + if not meta["output"]: + meta.pop("output") + metrics = span._get_ctx_item(METRICS) or {} + ml_app = _get_ml_app(span) + + is_ragas_integration_span = False + + if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX): + is_ragas_integration_span = True + + span._set_ctx_item(ML_APP, ml_app) + parent_id = str(_get_llmobs_parent_id(span) or "undefined") + + llmobs_span_event = { + "trace_id": "{:x}".format(span.trace_id), + "span_id": str(span.span_id), + "parent_id": parent_id, + "name": _get_span_name(span), + "start_ns": span.start_ns, + "duration": span.duration_ns, + "status": "error" if span.error else "ok", + "meta": meta, + "metrics": metrics, + } + session_id = _get_session_id(span) + if session_id is not None: + span._set_ctx_item(SESSION_ID, session_id) + llmobs_span_event["session_id"] = session_id + + llmobs_span_event["tags"] = cls._llmobs_tags( + span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span + ) + return llmobs_span_event, is_ragas_integration_span + + @staticmethod + def _llmobs_tags( + span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False + ) -> List[str]: + tags = { + "version": config.version or "", + "env": config.env or "", + "service": span.service or "", + "source": "integration", + "ml_app": ml_app, + "ddtrace.version": ddtrace.__version__, + "language": "python", + "error": span.error, + } + err_type = span.get_tag(ERROR_TYPE) + if err_type: + tags["error_type"] = err_type + if session_id: + tags["session_id"] = session_id + if is_ragas_integration_span: + tags[constants.RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" + existing_tags = span._get_ctx_item(TAGS) + if existing_tags is not None: + tags.update(existing_tags) + return ["{}:{}".format(k, v) for k, v in tags.items()] + + def _do_annotations(self, span: Span) -> None: # get the current span context # only do the annotations if it matches the context if span.span_type != SpanTypes.LLM: # do this check to avoid the warning log in `annotate` @@ -120,20 +250,14 @@ def _do_annotations(self, span): if current_context_id == context_id: self.annotate(span, **annotation_kwargs) - def _child_after_fork(self): + def _child_after_fork(self) -> None: self._llmobs_span_writer = self._llmobs_span_writer.recreate() self._llmobs_eval_metric_writer = self._llmobs_eval_metric_writer.recreate() self._evaluator_runner = self._evaluator_runner.recreate() - self._trace_processor._span_writer = self._llmobs_span_writer - self._trace_processor._evaluator_runner = self._evaluator_runner if self.enabled: self._start_service() def _start_service(self) -> None: - tracer_filters = self.tracer._filters - if not any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in tracer_filters): - tracer_filters += [self._trace_processor] - self.tracer.configure(settings={"FILTERS": tracer_filters}) try: self._llmobs_span_writer.start() self._llmobs_eval_metric_writer.start() @@ -160,11 +284,7 @@ def _stop_service(self) -> None: except ServiceStatusError: log.debug("Error stopping LLMObs writers") - try: - forksafe.unregister(self._child_after_fork) - self.tracer.shutdown() - except Exception: - log.warning("Failed to shutdown tracer", exc_info=True) + forksafe.unregister(self._child_after_fork) @classmethod 
def enable( @@ -265,7 +385,6 @@ def disable(cls) -> None: cls._instance.stop() cls.enabled = False - cls._instance.tracer.deregister_on_start_span(cls._instance._do_annotations) telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, False) log.debug("%s disabled", cls.__name__) diff --git a/ddtrace/llmobs/_trace_processor.py b/ddtrace/llmobs/_trace_processor.py deleted file mode 100644 index 231d53d7626..00000000000 --- a/ddtrace/llmobs/_trace_processor.py +++ /dev/null @@ -1,177 +0,0 @@ -from typing import Any -from typing import Dict -from typing import List -from typing import Optional -from typing import Tuple - -import ddtrace -from ddtrace import Span -from ddtrace import config -from ddtrace._trace.processor import TraceProcessor -from ddtrace.constants import ERROR_MSG -from ddtrace.constants import ERROR_STACK -from ddtrace.constants import ERROR_TYPE -from ddtrace.ext import SpanTypes -from ddtrace.internal.logger import get_logger -from ddtrace.llmobs._constants import INPUT_DOCUMENTS -from ddtrace.llmobs._constants import INPUT_MESSAGES -from ddtrace.llmobs._constants import INPUT_PARAMETERS -from ddtrace.llmobs._constants import INPUT_PROMPT -from ddtrace.llmobs._constants import INPUT_VALUE -from ddtrace.llmobs._constants import METADATA -from ddtrace.llmobs._constants import METRICS -from ddtrace.llmobs._constants import ML_APP -from ddtrace.llmobs._constants import MODEL_NAME -from ddtrace.llmobs._constants import MODEL_PROVIDER -from ddtrace.llmobs._constants import OUTPUT_DOCUMENTS -from ddtrace.llmobs._constants import OUTPUT_MESSAGES -from ddtrace.llmobs._constants import OUTPUT_VALUE -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX -from ddtrace.llmobs._constants import RUNNER_IS_INTEGRATION_SPAN_TAG -from ddtrace.llmobs._constants import SESSION_ID -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._constants import TAGS -from ddtrace.llmobs._utils import _get_llmobs_parent_id -from ddtrace.llmobs._utils import _get_ml_app -from ddtrace.llmobs._utils import _get_session_id -from ddtrace.llmobs._utils import _get_span_name -from ddtrace.llmobs._utils import safe_json - - -log = get_logger(__name__) - - -class LLMObsTraceProcessor(TraceProcessor): - """ - Processor that extracts LLM-type spans in a trace to submit as separate LLMObs span events to LLM Observability. 
- """ - - def __init__(self, llmobs_span_writer, evaluator_runner=None): - self._span_writer = llmobs_span_writer - self._evaluator_runner = evaluator_runner - - def process_trace(self, trace: List[Span]) -> Optional[List[Span]]: - if not trace: - return None - for span in trace: - if span.span_type == SpanTypes.LLM: - self.submit_llmobs_span(span) - return None if config._llmobs_agentless_enabled else trace - - def submit_llmobs_span(self, span: Span) -> None: - """Generate and submit an LLMObs span event to be sent to LLMObs.""" - span_event = None - is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" - is_ragas_integration_span = False - try: - span_event, is_ragas_integration_span = self._llmobs_span_event(span) - self._span_writer.enqueue(span_event) - except (KeyError, TypeError): - log.error("Error generating LLMObs span event for span %s, likely due to malformed span", span) - finally: - if not span_event or not is_llm_span or is_ragas_integration_span: - return - if self._evaluator_runner: - self._evaluator_runner.enqueue(span_event, span) - - def _llmobs_span_event(self, span: Span) -> Tuple[Dict[str, Any], bool]: - """Span event object structure.""" - span_kind = span._get_ctx_item(SPAN_KIND) - if not span_kind: - raise KeyError("Span kind not found in span context") - meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} - if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: - meta["model_name"] = span._get_ctx_item(MODEL_NAME) - meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() - meta["metadata"] = span._get_ctx_item(METADATA) or {} - if span._get_ctx_item(INPUT_PARAMETERS): - meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) - if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: - meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) - if span._get_ctx_item(INPUT_VALUE) is not None: - meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) - if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: - meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) - if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: - meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) - if span._get_ctx_item(OUTPUT_VALUE) is not None: - meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) - if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: - meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) - if span._get_ctx_item(INPUT_PROMPT) is not None: - prompt_json_str = span._get_ctx_item(INPUT_PROMPT) - if span_kind != "llm": - log.warning( - "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." 
- ) - else: - meta["input"]["prompt"] = prompt_json_str - if span.error: - meta.update( - { - ERROR_MSG: span.get_tag(ERROR_MSG), - ERROR_STACK: span.get_tag(ERROR_STACK), - ERROR_TYPE: span.get_tag(ERROR_TYPE), - } - ) - if not meta["input"]: - meta.pop("input") - if not meta["output"]: - meta.pop("output") - metrics = span._get_ctx_item(METRICS) or {} - ml_app = _get_ml_app(span) - - is_ragas_integration_span = False - - if ml_app.startswith(RAGAS_ML_APP_PREFIX): - is_ragas_integration_span = True - - span._set_ctx_item(ML_APP, ml_app) - parent_id = str(_get_llmobs_parent_id(span) or "undefined") - - llmobs_span_event = { - "trace_id": "{:x}".format(span.trace_id), - "span_id": str(span.span_id), - "parent_id": parent_id, - "name": _get_span_name(span), - "start_ns": span.start_ns, - "duration": span.duration_ns, - "status": "error" if span.error else "ok", - "meta": meta, - "metrics": metrics, - } - session_id = _get_session_id(span) - if session_id is not None: - span._set_ctx_item(SESSION_ID, session_id) - llmobs_span_event["session_id"] = session_id - - llmobs_span_event["tags"] = self._llmobs_tags( - span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span - ) - return llmobs_span_event, is_ragas_integration_span - - @staticmethod - def _llmobs_tags( - span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False - ) -> List[str]: - tags = { - "version": config.version or "", - "env": config.env or "", - "service": span.service or "", - "source": "integration", - "ml_app": ml_app, - "ddtrace.version": ddtrace.__version__, - "language": "python", - "error": span.error, - } - err_type = span.get_tag(ERROR_TYPE) - if err_type: - tags["error_type"] = err_type - if session_id: - tags["session_id"] = session_id - if is_ragas_integration_span: - tags[RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" - existing_tags = span._get_ctx_item(TAGS) - if existing_tags is not None: - tags.update(existing_tags) - return ["{}:{}".format(k, v) for k, v in tags.items()] diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index c1b1c4a776c..dd616db8bef 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -135,9 +135,12 @@ def _get_ml_app(span: Span) -> str: ml_app = span._get_ctx_item(ML_APP) if ml_app: return ml_app - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - ml_app = nearest_llmobs_ancestor._get_ctx_item(ML_APP) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + ml_app = llmobs_parent._get_ctx_item(ML_APP) + if ml_app is not None: + return ml_app + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return ml_app or config._llmobs_ml_app or "unknown-ml-app" @@ -149,9 +152,12 @@ def _get_session_id(span: Span) -> Optional[str]: session_id = span._get_ctx_item(SESSION_ID) if session_id: return session_id - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - session_id = nearest_llmobs_ancestor._get_ctx_item(SESSION_ID) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + session_id = llmobs_parent._get_ctx_item(SESSION_ID) + if session_id is not None: + return session_id + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return session_id diff --git a/ddtrace/profiling/exporter/http.py b/ddtrace/profiling/exporter/http.py index 6700e584ade..b4ec6994d72 100644 --- a/ddtrace/profiling/exporter/http.py +++ b/ddtrace/profiling/exporter/http.py @@ -220,8 +220,18 @@ 
def export( "family": "python", "attachments": [item["filename"].decode("utf-8") for item in data], "tags_profiler": self._get_tags(service), - "start": (datetime.datetime.utcfromtimestamp(start_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"), - "end": (datetime.datetime.utcfromtimestamp(end_time_ns / 1e9).replace(microsecond=0).isoformat() + "Z"), + "start": ( + datetime.datetime.fromtimestamp(start_time_ns / 1e9, tz=datetime.timezone.utc) + .replace(microsecond=0) + .isoformat()[0:-6] # removes the trailing +00:00 portion of the time + + "Z" + ), + "end": ( + datetime.datetime.fromtimestamp(end_time_ns / 1e9, tz=datetime.timezone.utc) + .replace(microsecond=0) + .isoformat()[0:-6] # removes the trailing +00:00 portion of the time + + "Z" + ), } # type: Dict[str, Any] if self.endpoint_call_counter_span_processor is not None: diff --git a/ddtrace/propagation/http.py b/ddtrace/propagation/http.py index a1664664ace..563ee838d84 100644 --- a/ddtrace/propagation/http.py +++ b/ddtrace/propagation/http.py @@ -101,6 +101,7 @@ def _possible_header(header): _POSSIBLE_HTTP_HEADER_B3_FLAGS = _possible_header(_HTTP_HEADER_B3_FLAGS) _POSSIBLE_HTTP_HEADER_TRACEPARENT = _possible_header(_HTTP_HEADER_TRACEPARENT) _POSSIBLE_HTTP_HEADER_TRACESTATE = _possible_header(_HTTP_HEADER_TRACESTATE) +_POSSIBLE_HTTP_BAGGAGE_HEADER = _possible_header(_HTTP_HEADER_BAGGAGE) # https://www.w3.org/TR/trace-context/#traceparent-header-field-values @@ -937,7 +938,7 @@ def _inject(span_context: Context, headers: Dict[str, str]) -> None: @staticmethod def _extract(headers: Dict[str, str]) -> Context: - header_value = headers.get(_HTTP_HEADER_BAGGAGE) + header_value = _extract_header_value(_POSSIBLE_HTTP_BAGGAGE_HEADER, headers) if not header_value: return Context(baggage={}) diff --git a/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml b/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml new file mode 100644 index 00000000000..ad0eacb28e8 --- /dev/null +++ b/releasenotes/notes/case_insensitive_baggage_header_extraction-63167c492474da6f.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + tracer: This fix resolves an issue where baggage header extraction was case sensitive and didn't accept the header prepended with HTTP. + Now the baggage header will be extracted regardless of casing and the HTTP format. + diff --git a/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml b/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml new file mode 100644 index 00000000000..afaf95876d5 --- /dev/null +++ b/releasenotes/notes/feat-openai-streamed-chunk-auto-extract-4cbaea8870b1df13.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + openai: Introduces automatic extraction of token usage from streamed chat completions. + Unless ``stream_options: {"include_usage": False}`` is explicitly set on your streamed chat completion request, + the OpenAI integration will add ``stream_options: {"include_usage": True}`` to your request and automatically extract the token usage chunk from the streamed response. 
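A hedged usage sketch of the behavior described in the release note above (the model name, prompt, and loop body are placeholders, and ``openai >= 1.26`` is assumed):

import openai

client = openai.OpenAI()

# Default: with the integration enabled, stream_options={"include_usage": True} is added
# to the request and the trailing usage-only chunk is consumed by the integration, so the
# chunks iterated here look the same as they would without the option.
for chunk in client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
):
    ...

# Opt out: passing include_usage explicitly leaves the request and the streamed
# response untouched by the automatic extraction.
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    stream_options={"include_usage": False},
)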
diff --git a/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml new file mode 100644 index 00000000000..5912a415022 --- /dev/null +++ b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + LLM Observability: Resolves an issue where configuring custom trace filters/processors onto the tracer would disable LLM Observability. + Note that if LLM Observability is enabled in agentless mode, writing APM traces must be explicitly disabled by setting `DD_TRACE_ENABLED=0`. diff --git a/riotfile.py b/riotfile.py index 0d9f66ca925..0398175d930 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2958,8 +2958,8 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT name="llmobs", command="pytest {cmdargs} tests/llmobs", pkgs={"vcrpy": latest, "pytest-asyncio": "==0.21.1"}, - pys=select_pys(min_version="3.7"), venvs=[ + Venv(pys="3.7"), Venv(pys=select_pys(min_version="3.8"), pkgs={"ragas": "==0.1.21", "langchain": latest}), ], ), diff --git a/tests/appsec/iast_packages/packages/pkg_pyjwt.py b/tests/appsec/iast_packages/packages/pkg_pyjwt.py index 4712f6cee0f..ec43d8a17d2 100644 --- a/tests/appsec/iast_packages/packages/pkg_pyjwt.py +++ b/tests/appsec/iast_packages/packages/pkg_pyjwt.py @@ -3,6 +3,7 @@ https://pypi.org/project/PyJWT/ """ + import datetime from flask import Blueprint @@ -25,7 +26,10 @@ def pkg_pyjwt_view(): secret_key = "your-256-bit-secret" user_payload = request.args.get("package_param", "default-user") - payload = {"user": user_payload, "exp": datetime.datetime.utcnow() + datetime.timedelta(seconds=30)} + payload = { + "user": user_payload, + "exp": datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(seconds=30), + } try: # Encode the payload to create a JWT diff --git a/tests/contrib/openai/test_openai_llmobs.py b/tests/contrib/openai/test_openai_llmobs.py index a1a2b93a5ca..a145877c8c8 100644 --- a/tests/contrib/openai/test_openai_llmobs.py +++ b/tests/contrib/openai/test_openai_llmobs.py @@ -518,11 +518,17 @@ async def test_chat_completion_azure_async( ) ) - def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer): + @pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" + ) + def test_chat_completion_stream_explicit_no_tokens( + self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer + ): """Ensure llmobs records are emitted for chat completion endpoints when configured. Also ensure the llmobs records have the correct tagging including trace/span ID for trace correlation. """ + with get_openai_vcr(subdirectory_name="v1").use_cassette("chat_completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: with mock.patch("ddtrace.contrib.internal.openai.utils._est_tokens") as mock_est: @@ -534,7 +540,11 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs expected_completion = "The Los Angeles Dodgers won the World Series in 2020." 
client = openai.OpenAI() resp = client.chat.completions.create( - model=model, messages=input_messages, stream=True, user="ddtrace-test" + model=model, + messages=input_messages, + stream=True, + user="ddtrace-test", + stream_options={"include_usage": False}, ) for chunk in resp: resp_model = chunk.model @@ -547,7 +557,7 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs model_provider="openai", input_messages=input_messages, output_messages=[{"content": expected_completion, "role": "assistant"}], - metadata={"stream": True, "user": "ddtrace-test"}, + metadata={"stream": True, "stream_options": {"include_usage": False}, "user": "ddtrace-test"}, token_metrics={"input_tokens": 8, "output_tokens": 8, "total_tokens": 16}, tags={"ml_app": "", "service": "tests.contrib.openai"}, ) @@ -557,20 +567,14 @@ def test_chat_completion_stream(self, openai, ddtrace_global_config, mock_llmobs parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available in 1.26.0+" ) def test_chat_completion_stream_tokens(self, openai, ddtrace_global_config, mock_llmobs_writer, mock_tracer): - """ - Ensure llmobs records are emitted for chat completion endpoints when configured - with the `stream_options={"include_usage": True}`. - Also ensure the llmobs records have the correct tagging including trace/span ID for trace correlation. - """ + """Assert that streamed token chunk extraction logic works when options are not explicitly passed from user.""" with get_openai_vcr(subdirectory_name="v1").use_cassette("chat_completion_streamed_tokens.yaml"): model = "gpt-3.5-turbo" resp_model = model input_messages = [{"role": "user", "content": "Who won the world series in 2020?"}] expected_completion = "The Los Angeles Dodgers won the World Series in 2020." client = openai.OpenAI() - resp = client.chat.completions.create( - model=model, messages=input_messages, stream=True, stream_options={"include_usage": True} - ) + resp = client.chat.completions.create(model=model, messages=input_messages, stream=True) for chunk in resp: resp_model = chunk.model span = mock_tracer.pop_traces()[0][0] @@ -671,7 +675,6 @@ def test_chat_completion_tool_call_stream(self, openai, ddtrace_global_config, m messages=[{"role": "user", "content": chat_completion_input_description}], user="ddtrace-test", stream=True, - stream_options={"include_usage": True}, ) for chunk in resp: resp_model = chunk.model diff --git a/tests/contrib/openai/test_openai_v1.py b/tests/contrib/openai/test_openai_v1.py index f13de144fc5..91737d9e5eb 100644 --- a/tests/contrib/openai/test_openai_v1.py +++ b/tests/contrib/openai/test_openai_v1.py @@ -921,128 +921,78 @@ def test_span_finish_on_stream_error(openai, openai_vcr, snapshot_tracer): ) -def test_completion_stream(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.snapshot +@pytest.mark.skipif(TIKTOKEN_AVAILABLE, reason="This test estimates token counts") +def test_completion_stream_est_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.OpenAI() resp = client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) - chunks = [c for c in resp] - - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion + _ = [c for c in resp] - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="This test computes token counts using tiktoken") +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +def test_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): + with openai_vcr.use_cassette("completion_streamed.yaml"): + with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: + mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] + client = openai.OpenAI() + resp = client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) + _ = [c for c in resp] -async def test_completion_async_stream(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.skipif(not TIKTOKEN_AVAILABLE, reason="This test computes token counts using tiktoken") +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +async def test_completion_async_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.AsyncOpenAI() - resp = await client.completions.create(model="ada", prompt="Hello world", stream=True) - chunks = [c async for c in resp] - - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion - - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls + resp = await client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) + _ = [c async for c in resp] @pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 6, 0), + parse_version(openai_module.version.VERSION) < (1, 6, 0) or not TIKTOKEN_AVAILABLE, reason="Streamed response context managers are only available v1.6.0+", ) -def test_completion_stream_context_manager(openai, openai_vcr, mock_metrics, mock_tracer): +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_completion_stream") +def test_completion_stream_context_manager(openai, openai_vcr, mock_metrics, snapshot_tracer): with openai_vcr.use_cassette("completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2] - expected_completion = '! ... A page layouts page drawer? ... Interesting. 
The "Tools" is' client = openai.OpenAI() with client.completions.create(model="ada", prompt="Hello world", stream=True, n=None) as resp: - chunks = [c for c in resp] + _ = [c for c in resp] - completion = "".join([c.choices[0].text for c in chunks]) - assert completion == expected_completion - - traces = mock_tracer.pop_traces() - assert len(traces) == 1 - assert len(traces[0]) == 1 - assert traces[0][0].get_tag("openai.response.choices.0.text") == expected_completion - assert traces[0][0].get_tag("openai.response.choices.0.finish_reason") == "length" - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:ada", - "model:ada", - "openai.request.endpoint:/v1/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - "openai.estimated:true", - ] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", 2, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" +) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") +def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): + """Assert that streamed token chunk extraction logic works automatically.""" + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): + with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: + mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] + client = openai.OpenAI() + resp = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Who won the world series in 2020?"}], + stream=True, + user="ddtrace-test", + n=None, + ) + _ = [c for c in resp] -def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26), reason="Stream options only available openai >= 1.26" +) +def test_chat_completion_stream_explicit_no_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): + """Assert that streamed token chunk extraction logic is avoided if explicitly set to False by the user.""" with openai_vcr.use_cassette("chat_completion_streamed.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] @@ -1054,20 +1004,16 @@ def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_trace {"role": "user", "content": "Who won the world series in 2020?"}, ], stream=True, + stream_options={"include_usage": False}, user="ddtrace-test", n=None, ) - prompt_tokens = 8 span = snapshot_tracer.current_span() chunks = [c for c in resp] assert len(chunks) == 15 completion = "".join([c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None]) assert completion == expected_completion - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") 
== "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - expected_tags = [ "version:", "env:", @@ -1087,16 +1033,19 @@ def test_chat_completion_stream(openai, openai_vcr, mock_metrics, snapshot_trace expected_tags += ["openai.estimated:true"] if TIKTOKEN_AVAILABLE: expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls + assert mock.call.distribution("tokens.prompt", 8, tags=expected_tags) in mock_metrics.mock_calls assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls +@pytest.mark.skipif( + parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available in 1.26.0+" +) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") async def test_chat_completion_async_stream(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed.yaml"): + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." client = openai.AsyncOpenAI() resp = await client.chat.completions.create( model="gpt-3.5-turbo", @@ -1104,99 +1053,21 @@ async def test_chat_completion_async_stream(openai, openai_vcr, mock_metrics, sn {"role": "user", "content": "Who won the world series in 2020?"}, ], stream=True, + n=None, user="ddtrace-test", ) - prompt_tokens = 8 - span = snapshot_tracer.current_span() - chunks = [c async for c in resp] - assert len(chunks) == 15 - completion = "".join([c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None]) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - expected_tags += ["openai.estimated:true"] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - - -@pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 26, 0), reason="Streamed tokens available 
in 1.26.0+" -) -def test_chat_completion_stream_tokens(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." - client = openai.OpenAI() - resp = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Who won the world series in 2020?"}], - stream=True, - user="ddtrace-test", - n=None, - stream_options={"include_usage": True}, - ) - span = snapshot_tracer.current_span() - chunks = [c for c in resp] - completion = "".join( - [c.choices[0].delta.content for c in chunks if c.choices and c.choices[0].delta.content is not None] - ) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.prompt", 17, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", 19, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", 36, tags=expected_tags) in mock_metrics.mock_calls + _ = [c async for c in resp] @pytest.mark.skipif( - parse_version(openai_module.version.VERSION) < (1, 6, 0), - reason="Streamed response context managers are only available v1.6.0+", + parse_version(openai_module.version.VERSION) < (1, 26, 0), + reason="Streamed response context managers are only available v1.6.0+, tokens available 1.26.0+", ) +@pytest.mark.snapshot(token="tests.contrib.openai.test_openai.test_chat_completion_stream") async def test_chat_completion_async_stream_context_manager(openai, openai_vcr, mock_metrics, snapshot_tracer): - with openai_vcr.use_cassette("chat_completion_streamed.yaml"): + with openai_vcr.use_cassette("chat_completion_streamed_tokens.yaml"): with mock.patch("ddtrace.contrib.internal.openai.utils.encoding_for_model", create=True) as mock_encoding: mock_encoding.return_value.encode.side_effect = lambda x: [1, 2, 3, 4, 5, 6, 7, 8] - expected_completion = "The Los Angeles Dodgers won the World Series in 2020." 
client = openai.AsyncOpenAI() async with await client.chat.completions.create( model="gpt-3.5-turbo", @@ -1207,41 +1078,7 @@ async def test_chat_completion_async_stream_context_manager(openai, openai_vcr, user="ddtrace-test", n=None, ) as resp: - prompt_tokens = 8 - span = snapshot_tracer.current_span() - chunks = [c async for c in resp] - assert len(chunks) == 15 - completion = "".join( - [c.choices[0].delta.content for c in chunks if c.choices[0].delta.content is not None] - ) - assert completion == expected_completion - - assert span.get_tag("openai.response.choices.0.message.content") == expected_completion - assert span.get_tag("openai.response.choices.0.message.role") == "assistant" - assert span.get_tag("openai.response.choices.0.finish_reason") == "stop" - - expected_tags = [ - "version:", - "env:", - "service:tests.contrib.openai", - "openai.request.model:gpt-3.5-turbo", - "model:gpt-3.5-turbo", - "openai.request.endpoint:/v1/chat/completions", - "openai.request.method:POST", - "openai.organization.id:", - "openai.organization.name:datadog-4", - "openai.user.api_key:sk-...key>", - "error:0", - ] - assert mock.call.distribution("request.duration", span.duration_ns, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.requests", 3000, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.gauge("ratelimit.remaining.requests", 2999, tags=expected_tags) in mock_metrics.mock_calls - expected_tags += ["openai.estimated:true"] - if TIKTOKEN_AVAILABLE: - expected_tags = expected_tags[:-1] - assert mock.call.distribution("tokens.prompt", prompt_tokens, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.completion", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls - assert mock.call.distribution("tokens.total", mock.ANY, tags=expected_tags) in mock_metrics.mock_calls + _ = [c async for c in resp] @pytest.mark.snapshot( diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index a7d467b3985..5a63b7e2b8f 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -31,26 +31,6 @@ def pytest_configure(config): config.addinivalue_line("markers", "vcr_logs: mark test to use recorded request/responses") -@pytest.fixture -def mock_llmobs_span_writer(): - patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter") - LLMObsSpanWriterMock = patcher.start() - m = mock.MagicMock() - LLMObsSpanWriterMock.return_value = m - yield m - patcher.stop() - - -@pytest.fixture -def mock_llmobs_span_agentless_writer(): - patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter") - LLMObsSpanWriterMock = patcher.start() - m = mock.MagicMock() - LLMObsSpanWriterMock.return_value = m - yield m - patcher.stop() - - @pytest.fixture def mock_llmobs_eval_metric_writer(): patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsEvalMetricWriter") @@ -85,10 +65,7 @@ def mock_llmobs_submit_evaluation(): def mock_http_writer_send_payload_response(): with mock.patch( "ddtrace.internal.writer.HTTPWriter._send_payload", - return_value=Response( - status=200, - body="{}", - ), + return_value=Response(status=200, body="{}"), ): yield @@ -124,9 +101,10 @@ def mock_evaluator_sampler_logs(): @pytest.fixture -def mock_http_writer_logs(): - with mock.patch("ddtrace.internal.writer.writer.log") as m: +def mock_llmobs_logs(): + with mock.patch("ddtrace.llmobs._llmobs.log") as m: yield m + m.reset_mock() @pytest.fixture @@ -139,44 +117,6 @@ def default_global_config(): return {"_dd_api_key": "", "_llmobs_ml_app": 
"unnamed-ml-app"} -@pytest.fixture -def LLMObs( - mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, ddtrace_global_config -): - global_config = default_global_config() - global_config.update(ddtrace_global_config) - with override_global_config(global_config): - dummy_tracer = DummyTracer() - llmobs_service.enable(_tracer=dummy_tracer) - yield llmobs_service - llmobs_service.disable() - - -@pytest.fixture -def AgentlessLLMObs( - mock_llmobs_span_agentless_writer, - mock_llmobs_eval_metric_writer, - mock_llmobs_evaluator_runner, - ddtrace_global_config, -): - global_config = default_global_config() - global_config.update(ddtrace_global_config) - global_config.update(dict(_llmobs_agentless_enabled=True)) - with override_global_config(global_config): - dummy_tracer = DummyTracer() - llmobs_service.enable(_tracer=dummy_tracer) - yield llmobs_service - llmobs_service.disable() - - -@pytest.fixture -def disabled_llmobs(): - prev = llmobs_service.enabled - llmobs_service.enabled = False - yield - llmobs_service.enabled = prev - - @pytest.fixture def mock_ragas_dependencies_not_present(): import ragas @@ -189,18 +129,22 @@ def mock_ragas_dependencies_not_present(): @pytest.fixture -def ragas(mock_llmobs_span_writer, mock_llmobs_eval_metric_writer): +def ragas(mock_llmobs_eval_metric_writer): with override_global_config(dict(_dd_api_key="")): - import ragas - + try: + import ragas + except ImportError: + pytest.skip("Ragas not installed") with override_env(dict(OPENAI_API_KEY=os.getenv("OPENAI_API_KEY", ""))): yield ragas @pytest.fixture def reset_ragas_faithfulness_llm(): - import ragas - + try: + import ragas + except ImportError: + pytest.skip("Ragas not installed") previous_llm = ragas.metrics.faithfulness.llm yield ragas.metrics.faithfulness.llm = previous_llm @@ -243,16 +187,25 @@ def llmobs_span_writer(): @pytest.fixture -def llmobs(monkeypatch, tracer, llmobs_env, llmobs_span_writer): +def llmobs( + ddtrace_global_config, + monkeypatch, + tracer, + llmobs_env, + llmobs_span_writer, + mock_llmobs_eval_metric_writer, + mock_llmobs_evaluator_runner, +): for env, val in llmobs_env.items(): monkeypatch.setenv(env, val) - + global_config = default_global_config() + global_config.update(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))) + global_config.update(ddtrace_global_config) # TODO: remove once rest of tests are moved off of global config tampering - with override_global_config(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))): + with override_global_config(global_config): llmobs_service.enable(_tracer=tracer) llmobs_service._instance._llmobs_span_writer = llmobs_span_writer - llmobs_service._instance._trace_processor._span_writer = llmobs_span_writer - yield llmobs + yield llmobs_service llmobs_service.disable() diff --git a/tests/llmobs/test_llmobs.py b/tests/llmobs/test_llmobs.py index 1bae7efe9ed..6cf19fc3e2c 100644 --- a/tests/llmobs/test_llmobs.py +++ b/tests/llmobs/test_llmobs.py @@ -1,4 +1,3 @@ -import mock import pytest from ddtrace.ext import SpanTypes @@ -8,12 +7,6 @@ from tests.llmobs._utils import _expected_llmobs_llm_span_event -@pytest.fixture -def mock_logs(): - with mock.patch("ddtrace.llmobs._trace_processor.log") as mock_logs: - yield mock_logs - - class TestMLApp: @pytest.mark.parametrize("llmobs_env", [{"DD_LLMOBS_ML_APP": ""}]) def test_tag_defaults_to_env_var(self, tracer, llmobs_env, llmobs_events): @@ -228,19 +221,19 @@ def test_model_and_provider_are_set(tracer, llmobs_events): assert 
span_event["meta"]["model_provider"] == "model_provider" -def test_malformed_span_logs_error_instead_of_raising(mock_logs, tracer, llmobs_events): +def test_malformed_span_logs_error_instead_of_raising(tracer, llmobs_events, mock_llmobs_logs): """Test that a trying to create a span event from a malformed span will log an error instead of crashing.""" with tracer.trace("root_llm_span", span_type=SpanTypes.LLM) as llm_span: # span does not have SPAN_KIND tag pass - mock_logs.error.assert_called_once_with( - "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span + mock_llmobs_logs.error.assert_called_with( + "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span, exc_info=True ) assert len(llmobs_events) == 0 -def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events): - """Test that the LLMObsTraceProcessor only creates LLMObs span events for LLM span types.""" +def test_only_generate_span_events_from_llmobs_spans(tracer, llmobs_events): + """Test that we only generate LLMObs span events for LLM span types.""" with tracer.trace("root_llm_span", service="tests.llmobs", span_type=SpanTypes.LLM) as root_span: root_span._set_ctx_item(const.SPAN_KIND, "llm") with tracer.trace("child_span"): @@ -250,5 +243,5 @@ def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events): expected_grandchild_llmobs_span["parent_id"] = str(root_span.span_id) assert len(llmobs_events) == 2 - assert llmobs_events[0] == _expected_llmobs_llm_span_event(root_span, "llm") - assert llmobs_events[1] == expected_grandchild_llmobs_span + assert llmobs_events[1] == _expected_llmobs_llm_span_event(root_span, "llm") + assert llmobs_events[0] == expected_grandchild_llmobs_span diff --git a/tests/llmobs/test_llmobs_decorators.py b/tests/llmobs/test_llmobs_decorators.py index e94d72aec64..056de72ee96 100644 --- a/tests/llmobs/test_llmobs_decorators.py +++ b/tests/llmobs/test_llmobs_decorators.py @@ -19,7 +19,7 @@ def mock_logs(): yield mock_logs -def test_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): +def test_llm_decorator_with_llmobs_disabled_logs_warning(llmobs, mock_logs): for decorator_name, decorator in (("llm", llm), ("embedding", embedding)): @decorator( @@ -28,13 +28,13 @@ def test_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): def f(): pass - LLMObs.disable() + llmobs.disable() f() mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) mock_logs.reset_mock() -def test_non_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): +def test_non_llm_decorator_with_llmobs_disabled_logs_warning(llmobs, mock_logs): for decorator_name, decorator in ( ("task", task), ("workflow", workflow), @@ -47,53 +47,49 @@ def test_non_llm_decorator_with_llmobs_disabled_logs_warning(LLMObs, mock_logs): def f(): pass - LLMObs.disable() + llmobs.disable() f() mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) mock_logs.reset_mock() -def test_llm_decorator(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="test_model", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert 
llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider", session_id="test_session_id" ) -def test_llm_decorator_no_model_name_sets_default(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_no_model_name_sets_default(llmobs, llmobs_events): @llm(model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="custom", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider", session_id="test_session_id" ) -def test_llm_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_default_kwargs(llmobs, llmobs_events): @llm def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="custom") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="custom" ) -def test_embedding_decorator(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator(llmobs, llmobs_events): @embedding( model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id" ) @@ -101,173 +97,157 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="test_model", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider", session_id="test_session_id" ) -def test_embedding_decorator_no_model_name_sets_default(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator_no_model_name_sets_default(llmobs, llmobs_events): @embedding(model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="custom", model_provider="test_provider", session_id="test_session_id" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider", session_id="test_session_id" ) -def test_embedding_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_embedding_decorator_default_kwargs(llmobs, llmobs_events): @embedding def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="custom") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="custom" ) -def test_retrieval_decorator(LLMObs, mock_llmobs_span_writer): +def test_retrieval_decorator(llmobs, llmobs_events): @retrieval(name="test_function", 
session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "retrieval", session_id="test_session_id") -def test_retrieval_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_retrieval_decorator_default_kwargs(llmobs, llmobs_events): @retrieval() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "retrieval")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "retrieval") -def test_task_decorator(LLMObs, mock_llmobs_span_writer): +def test_task_decorator(llmobs, llmobs_events): @task(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id="test_session_id") -def test_task_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_task_decorator_default_kwargs(llmobs, llmobs_events): @task() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_tool_decorator(LLMObs, mock_llmobs_span_writer): +def test_tool_decorator(llmobs, llmobs_events): @tool(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool", session_id="test_session_id") -def test_tool_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_tool_decorator_default_kwargs(llmobs, llmobs_events): @tool() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_workflow_decorator(LLMObs, mock_llmobs_span_writer): +def test_workflow_decorator(llmobs, llmobs_events): @workflow(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow", session_id="test_session_id") -def test_workflow_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_workflow_decorator_default_kwargs(llmobs, llmobs_events): @workflow() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - 
mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_agent_decorator(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator(llmobs, llmobs_events): @agent(name="test_function", session_id="test_session_id") def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", session_id="test_session_id") - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent", session_id="test_session_id") -def test_agent_decorator_default_kwargs(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator_default_kwargs(llmobs, llmobs_events): @agent() def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_llm_decorator_with_error(LLMObs, mock_llmobs_span_writer): +def test_llm_decorator_with_error(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): raise ValueError("test_error") with pytest.raises(ValueError): f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - session_id="test_session_id", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + session_id="test_session_id", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_non_llm_decorators_with_error(LLMObs, mock_llmobs_span_writer): +def test_non_llm_decorators_with_error(llmobs, llmobs_events): for decorator_name, decorator in [("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)]: @decorator(name="test_function", session_id="test_session_id") @@ -276,23 +256,21 @@ def f(): with pytest.raises(ValueError): f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - session_id="test_session_id", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + session_id="test_session_id", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_llm_annotate(LLMObs, mock_llmobs_span_writer): +def test_llm_annotate(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): - LLMObs.annotate( + llmobs.annotate( parameters={"temperature": 0.9, "max_tokens": 50}, input_data=[{"content": 
"test_prompt"}], output_data=[{"content": "test_response"}], @@ -301,27 +279,25 @@ def f(): ) f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - input_messages=[{"content": "test_prompt"}], - output_messages=[{"content": "test_response"}], - parameters={"temperature": 0.9, "max_tokens": 50}, - token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, - tags={"custom_tag": "tag_value"}, - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + input_messages=[{"content": "test_prompt"}], + output_messages=[{"content": "test_response"}], + parameters={"temperature": 0.9, "max_tokens": 50}, + token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, + tags={"custom_tag": "tag_value"}, + session_id="test_session_id", ) -def test_llm_annotate_raw_string_io(LLMObs, mock_llmobs_span_writer): +def test_llm_annotate_raw_string_io(llmobs, llmobs_events): @llm(model_name="test_model", model_provider="test_provider", name="test_function", session_id="test_session_id") def f(): - LLMObs.annotate( + llmobs.annotate( parameters={"temperature": 0.9, "max_tokens": 50}, input_data="test_prompt", output_data="test_response", @@ -330,24 +306,22 @@ def f(): ) f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - "llm", - model_name="test_model", - model_provider="test_provider", - input_messages=[{"content": "test_prompt"}], - output_messages=[{"content": "test_response"}], - parameters={"temperature": 0.9, "max_tokens": 50}, - token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, - tags={"custom_tag": "tag_value"}, - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + "llm", + model_name="test_model", + model_provider="test_provider", + input_messages=[{"content": "test_prompt"}], + output_messages=[{"content": "test_response"}], + parameters={"temperature": 0.9, "max_tokens": 50}, + token_metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, + tags={"custom_tag": "tag_value"}, + session_id="test_session_id", ) -def test_non_llm_decorators_no_args(LLMObs, mock_llmobs_span_writer): +def test_non_llm_decorators_no_args(llmobs, llmobs_events): """Test that using the decorators without any arguments, i.e. @tool, works the same as @tool(...).""" for decorator_name, decorator in [ ("task", task), @@ -362,11 +336,11 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, decorator_name)) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event(span, decorator_name) -def test_agent_decorator_no_args(LLMObs, mock_llmobs_span_writer): +def test_agent_decorator_no_args(llmobs, llmobs_events): """Test that using agent decorator without any arguments, i.e. 
@agent, works the same as @agent(...).""" @agent @@ -374,11 +348,11 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): +def test_ml_app_override(llmobs, llmobs_events): """Test that setting ml_app kwarg on the LLMObs decorators will override the DD_LLMOBS_ML_APP value.""" for decorator_name, decorator in [("task", task), ("workflow", workflow), ("tool", tool)]: @@ -387,9 +361,9 @@ def f(): pass f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, decorator_name, tags={"ml_app": "test_ml_app"}) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, decorator_name, tags={"ml_app": "test_ml_app"} ) @llm(model_name="test_model", ml_app="test_ml_app") @@ -397,11 +371,9 @@ def g(): pass g() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} ) @embedding(model_name="test_model", ml_app="test_ml_app") @@ -409,15 +381,13 @@ def h(): pass h() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="custom", tags={"ml_app": "test_ml_app"} ) -async def test_non_llm_async_decorators(LLMObs, mock_llmobs_span_writer): +async def test_non_llm_async_decorators(llmobs, llmobs_events): """Test that decorators work with async functions.""" for decorator_name, decorator in [ ("task", task), @@ -432,11 +402,11 @@ async def f(): pass await f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, decorator_name)) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event(span, decorator_name) -async def test_llm_async_decorators(LLMObs, mock_llmobs_span_writer): +async def test_llm_async_decorators(llmobs, llmobs_events): """Test that decorators work with async functions.""" for decorator_name, decorator in [("llm", llm), ("embedding", embedding)]: @@ -445,15 +415,13 @@ async def f(): pass await f() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, decorator_name, model_name="test_model", model_provider="test_provider" - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_llm_span_event( + span, decorator_name, model_name="test_model", model_provider="test_provider" ) -def test_automatic_annotation_non_llm_decorators(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_non_llm_decorators(llmobs, 
llmobs_events): """Test that automatic input/output annotation works for non-LLM decorators.""" for decorator_name, decorator in (("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)): @@ -462,19 +430,17 @@ def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - input_value=str({"prompt": "test_prompt", "arg_2": "arg_2", "kwarg_2": 12345}), - output_value="test_prompt", - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + input_value=str({"prompt": "test_prompt", "arg_2": "arg_2", "kwarg_2": 12345}), + output_value="test_prompt", + session_id="test_session_id", ) -def test_automatic_annotation_retrieval_decorator(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_retrieval_decorator(llmobs, llmobs_events): """Test that automatic input annotation works for retrieval decorators.""" @retrieval(session_id="test_session_id") @@ -482,18 +448,16 @@ def test_retrieval(query, arg_2, kwarg_1=None, kwarg_2=None): return [{"name": "name", "id": "1234567890", "score": 0.9}] test_retrieval("test_query", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "retrieval", - input_value=str({"query": "test_query", "arg_2": "arg_2", "kwarg_2": 12345}), - session_id="test_session_id", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "retrieval", + input_value=str({"query": "test_query", "arg_2": "arg_2", "kwarg_2": 12345}), + session_id="test_session_id", ) -def test_automatic_annotation_off_non_llm_decorators(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_off_non_llm_decorators(llmobs, llmobs_events): """Test disabling automatic input/output annotation for non-LLM decorators.""" for decorator_name, decorator in ( ("task", task), @@ -508,35 +472,33 @@ def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, decorator_name, session_id="test_session_id") + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, decorator_name, session_id="test_session_id" ) -def test_automatic_annotation_off_if_manually_annotated(LLMObs, mock_llmobs_span_writer): +def test_automatic_annotation_off_if_manually_annotated(llmobs, llmobs_events): """Test disabling automatic input/output annotation for non-LLM decorators.""" for decorator_name, decorator in (("task", task), ("workflow", workflow), ("tool", tool), ("agent", agent)): @decorator(name="test_function", session_id="test_session_id") def f(prompt, arg_2, kwarg_1=None, kwarg_2=None): - LLMObs.annotate(input_data="my custom input", output_data="my custom output") + llmobs.annotate(input_data="my custom input", output_data="my custom output") return prompt f("test_prompt", "arg_2", kwarg_2=12345) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - decorator_name, - 
session_id="test_session_id", - input_value="my custom input", - output_value="my custom output", - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[-1] == _expected_llmobs_non_llm_span_event( + span, + decorator_name, + session_id="test_session_id", + input_value="my custom input", + output_value="my custom output", ) -def test_generator_sync(LLMObs, mock_llmobs_span_writer): +def test_generator_sync(llmobs, llmobs_events): """ Test that decorators work with generator functions. The span should finish after the generator is exhausted. @@ -556,7 +518,7 @@ def f(): for i in range(3): yield i - LLMObs.annotate( + llmobs.annotate( input_data="hello", output_data="world", ) @@ -566,7 +528,7 @@ def f(): assert e == i i += 1 - span = LLMObs._instance.tracer.pop()[0] + span = llmobs._instance.tracer.pop()[0] if decorator_name == "llm": expected_span_event = _expected_llmobs_llm_span_event( span, @@ -594,10 +556,10 @@ def f(): span, decorator_name, input_value="hello", output_value="world" ) - mock_llmobs_span_writer.enqueue.assert_called_with(expected_span_event) + assert llmobs_events[-1] == expected_span_event -async def test_generator_async(LLMObs, mock_llmobs_span_writer): +async def test_generator_async(llmobs, llmobs_events): """ Test that decorators work with generator functions. The span should finish after the generator is exhausted. @@ -617,7 +579,7 @@ async def f(): for i in range(3): yield i - LLMObs.annotate( + llmobs.annotate( input_data="hello", output_data="world", ) @@ -627,7 +589,7 @@ async def f(): assert e == i i += 1 - span = LLMObs._instance.tracer.pop()[0] + span = llmobs._instance.tracer.pop()[0] if decorator_name == "llm": expected_span_event = _expected_llmobs_llm_span_event( span, @@ -655,11 +617,11 @@ async def f(): span, decorator_name, input_value="hello", output_value="world" ) - mock_llmobs_span_writer.enqueue.assert_called_with(expected_span_event) + assert llmobs_events[-1] == expected_span_event -def test_generator_sync_with_llmobs_disabled(LLMObs, mock_logs): - LLMObs.disable() +def test_generator_sync_with_llmobs_disabled(llmobs, mock_logs): + llmobs.disable() @workflow() def f(): @@ -684,10 +646,11 @@ def g(): i += 1 mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) + llmobs.enable() -async def test_generator_async_with_llmobs_disabled(LLMObs, mock_logs): - LLMObs.disable() +async def test_generator_async_with_llmobs_disabled(llmobs, mock_logs): + llmobs.disable() @workflow() async def f(): @@ -712,9 +675,10 @@ async def g(): i += 1 mock_logs.warning.assert_called_with(SPAN_START_WHILE_DISABLED_WARNING) + llmobs.enable() -def test_generator_sync_finishes_span_on_error(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_finishes_span_on_error(llmobs, llmobs_events): """Tests that""" @workflow() @@ -728,19 +692,17 @@ def f(): for _ in f(): pass - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -async def test_generator_async_finishes_span_on_error(LLMObs, mock_llmobs_span_writer): +async def 
test_generator_async_finishes_span_on_error(llmobs, llmobs_events): @workflow() async def f(): for i in range(3): @@ -752,19 +714,17 @@ async def f(): async for _ in f(): pass - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_generator_sync_send(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_send(llmobs, llmobs_events): @workflow() def f(): while True: @@ -780,16 +740,11 @@ def f(): assert gen.send(4) == 16 gen.close() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - ) - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -async def test_generator_async_send(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_send(llmobs, llmobs_events): @workflow() async def f(): while True: @@ -805,16 +760,11 @@ async def f(): await gen.aclose() - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - ) - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_generator_sync_throw(LLMObs, mock_llmobs_span_writer): +def test_generator_sync_throw(llmobs, llmobs_events): @workflow() def f(): for i in range(3): @@ -825,19 +775,17 @@ def f(): next(gen) gen.throw(ValueError("test_error")) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -async def test_generator_async_throw(LLMObs, mock_llmobs_span_writer): +async def test_generator_async_throw(llmobs, llmobs_events): @workflow() async def f(): for i in range(3): @@ -848,19 +796,17 @@ async def f(): await gen.asend(None) await gen.athrow(ValueError("test_error")) - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) -def test_generator_exit_exception_sync(LLMObs, mock_llmobs_span_writer): +def test_generator_exit_exception_sync(llmobs, llmobs_events): @workflow() def get_next_element(alist): for element in 
alist: @@ -873,14 +819,12 @@ def get_next_element(alist): if element == 5: break - span = LLMObs._instance.tracer.pop()[0] - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "workflow", - input_value=str({"alist": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}), - error=span.get_tag("error.type"), - error_message=span.get_tag("error.message"), - error_stack=span.get_tag("error.stack"), - ) + span = llmobs._instance.tracer.pop()[0] + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "workflow", + input_value=str({"alist": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}), + error=span.get_tag("error.type"), + error_message=span.get_tag("error.message"), + error_stack=span.get_tag("error.stack"), ) diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 99a097cdb10..eb0be25c91b 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -34,9 +34,9 @@ def test_evaluator_runner_buffer_limit(mock_evaluator_logs): ) -def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): - evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) +def test_evaluator_runner_periodic_enqueues_eval_metric(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=llmobs)) evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) evaluator_runner.periodic() mock_llmobs_eval_metric_writer.enqueue.assert_called_once_with( @@ -45,9 +45,9 @@ def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval @pytest.mark.vcr_logs -def test_evaluator_runner_timed_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): - evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) +def test_evaluator_runner_timed_enqueues_eval_metric(llmobs, mock_llmobs_eval_metric_writer): + evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=llmobs) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=llmobs)) evaluator_runner.start() evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) diff --git a/tests/llmobs/test_llmobs_ragas_evaluators.py b/tests/llmobs/test_llmobs_ragas_evaluators.py index 14e71764caa..9df6c392470 100644 --- a/tests/llmobs/test_llmobs_ragas_evaluators.py +++ b/tests/llmobs/test_llmobs_ragas_evaluators.py @@ -11,31 +11,34 @@ from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt +pytest.importorskip("ragas", reason="Tests require ragas to be available on user env") + + def _llm_span_without_io(): return _expected_llmobs_llm_span_event(Span("dummy")) -def test_ragas_faithfulness_evaluator_init(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - assert rf_evaluator.llmobs_service == LLMObs +def test_ragas_evaluator_init(ragas, llmobs): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) + assert rf_evaluator.llmobs_service == llmobs assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() -def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_ragas_dependencies_not_present, 
ragas): +def test_ragas_faithfulness_throws_if_dependencies_not_present(llmobs, mock_ragas_dependencies_not_present, ragas): with pytest.raises(NotImplementedError, match="Failed to load dependencies for `ragas_faithfulness` evaluator"): - RagasFaithfulnessEvaluator(LLMObs) + RagasFaithfulnessEvaluator(llmobs) -def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) +def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, llmobs): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) assert failure_msg == "fail_extract_faithfulness_inputs" assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 def test_ragas_faithfulness_has_modified_faithfulness_instance( - ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs + ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, llmobs ): """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" from ragas.llms import BaseRagasLLM @@ -53,7 +56,7 @@ def agenerate_text(self) -> str: faithfulness.llm = FirstDummyLLM() - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" @@ -74,9 +77,9 @@ def agenerate_text(self, statements) -> str: @pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit_evaluation): +def test_ragas_faithfulness_submits_evaluation(ragas, llmobs, mock_llmobs_submit_evaluation): """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() rf_evaluator.run_and_submit_evaluation(llm_span) rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( @@ -101,10 +104,10 @@ def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit @pytest.mark.vcr_logs def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( - ragas, LLMObs, mock_llmobs_submit_evaluation + ragas, llmobs, mock_llmobs_submit_evaluation ): """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _llm_span_with_expected_ragas_inputs_in_messages() rf_evaluator.run_and_submit_evaluation(llm_span) rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( @@ -128,9 +131,9 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages @pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, LLMObs, mock_llmobs_submit_evaluation): +def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, llmobs, mock_llmobs_submit_evaluation): """Test that evaluation is submitted for a valid llm span where the inputs are stored under custom prompt keys""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) llm_span = _expected_llmobs_llm_span_event( Span("dummy"), prompt={ @@ -167,19 +170,17 @@ def
test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, L @pytest.mark.vcr_logs -def test_ragas_faithfulness_emits_traces(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) +def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - assert rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_count == 7 - calls = rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_args_list - - spans = [call[0][0] for call in calls] - + ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] + ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) + assert len(ragas_spans) == 7 # check name, io, span kinds match - assert spans == _expected_ragas_faithfulness_spans() + assert ragas_spans == _expected_ragas_faithfulness_spans() # verify the trace structure - root_span = spans[0] + root_span = ragas_spans[0] root_span_id = root_span["span_id"] assert root_span["parent_id"] == "undefined" assert root_span["meta"] is not None @@ -187,16 +188,15 @@ def test_ragas_faithfulness_emits_traces(ragas, LLMObs): assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) assert isinstance(root_span["meta"]["metadata"]["statements"], list) root_span_trace_id = root_span["trace_id"] - for child_span in spans[1:]: + for child_span in ragas_spans[1:]: assert child_span["trace_id"] == root_span_trace_id - assert spans[1]["parent_id"] == root_span_id # input extraction (task) - assert spans[2]["parent_id"] == root_span_id # create statements (workflow) - assert spans[4]["parent_id"] == root_span_id # create verdicts (workflow) - assert spans[6]["parent_id"] == root_span_id # create score (task) - - assert spans[3]["parent_id"] == spans[2]["span_id"] # create statements prompt (task) - assert spans[5]["parent_id"] == spans[4]["span_id"] # create verdicts prompt (task) + assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) + assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) + assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) + assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) + assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) + assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess): @@ -213,7 +213,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log "DD_LLMOBS_ML_APP": "unnamed-ml-app", "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", - "DD_LLMOBS_AGENTLESS_ENABLED": "true", + "DD_LLMOBS_AGENTLESS_ENABLED": "1", } ) out, err, status, pid = run_python_code_in_subprocess( @@ -241,7 +241,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log ): LLMObs.enable() LLMObs._instance._evaluator_runner.enqueue(_llm_span_with_expected_ragas_inputs_in_messages(), None) -""", + """, env=env, ) assert status == 0, err diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 98748250c3a..69ebb216d7e 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -7,9 +7,7 @@ import ddtrace from 
ddtrace._trace.context import Context -from ddtrace._trace.span import Span from ddtrace.ext import SpanTypes -from ddtrace.filters import TraceFilter from ddtrace.internal.service import ServiceStatus from ddtrace.llmobs import LLMObs as llmobs_service from ddtrace.llmobs._constants import INPUT_DOCUMENTS @@ -31,7 +29,8 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._llmobs import SUPPORTED_LLMOBS_INTEGRATIONS -from ddtrace.llmobs._llmobs import LLMObsTraceProcessor +from ddtrace.llmobs._writer import LLMObsAgentlessEventClient +from ddtrace.llmobs._writer import LLMObsProxiedEventClient from ddtrace.llmobs.utils import Prompt from tests.llmobs._utils import _expected_llmobs_eval_metric_event from tests.llmobs._utils import _expected_llmobs_llm_span_event @@ -41,23 +40,16 @@ from tests.utils import override_global_config -@pytest.fixture -def mock_logs(): - with mock.patch("ddtrace.llmobs._llmobs.log") as mock_logs: - yield mock_logs +RAGAS_AVAILABLE = os.getenv("RAGAS_AVAILABLE", False) def run_llmobs_trace_filter(dummy_tracer): - for trace_filter in dummy_tracer._filters: - if isinstance(trace_filter, LLMObsTraceProcessor): - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span.set_tag_str(SPAN_KIND, "llm") - trace1 = [root_llm_span] - return trace_filter.process_trace(trace1) - raise ValueError("LLMObsTraceProcessor not found in tracer filters.") + with dummy_tracer.trace("span1", span_type=SpanTypes.LLM) as span: + span.set_tag_str(SPAN_KIND, "llm") + return dummy_tracer._writer.pop() -def test_service_enable(): +def test_service_enable_proxy_default(): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -65,22 +57,22 @@ def test_service_enable(): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) + assert isinstance(llmobs_instance._llmobs_span_writer._clients[0], LLMObsProxiedEventClient) assert run_llmobs_trace_filter(dummy_tracer) is not None llmobs_service.disable() -def test_service_enable_with_apm_disabled(monkeypatch): - with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): +def test_enable_agentless(): + with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer, agentless_enabled=True) llmobs_instance = llmobs_service._instance assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) - assert run_llmobs_trace_filter(dummy_tracer) is None + assert isinstance(llmobs_instance._llmobs_span_writer._clients[0], LLMObsAgentlessEventClient) + assert run_llmobs_trace_filter(dummy_tracer) is not None llmobs_service.disable() @@ -118,7 +110,7 @@ def test_service_enable_no_ml_app_specified(): assert llmobs_service._instance._evaluator_runner.status.value == "stopped" -def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs): +def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() monkeypatch.setenv("DD_LLMOBS_APP_NAME", 
"test_ml_app") @@ -126,11 +118,13 @@ def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs): assert llmobs_service.enabled is True assert llmobs_service._instance._llmobs_eval_metric_writer.status.value == "running" assert llmobs_service._instance._llmobs_span_writer.status.value == "running" - mock_logs.warning.assert_called_once_with("`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead.") + mock_llmobs_logs.warning.assert_called_once_with( + "`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead." + ) llmobs_service.disable() -def test_service_enable_already_enabled(mock_logs): +def test_service_enable_already_enabled(mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -139,9 +133,8 @@ def test_service_enable_already_enabled(mock_logs): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) llmobs_service.disable() - mock_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) + mock_llmobs_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) @mock.patch("ddtrace.llmobs._llmobs.patch") @@ -203,107 +196,83 @@ def test_service_enable_does_not_override_global_patch_config(mock_tracer_patch, llmobs_service.disable() -def test_start_span_while_disabled_logs_warning(LLMObs, mock_logs): - LLMObs.disable() - _ = LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.tool(name="test_tool") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.task(name="test_task") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.workflow(name="test_workflow") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.agent(name="test_agent") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - - -def test_start_span_uses_kind_as_default_name(LLMObs): - with LLMObs.llm(model_name="test_model", model_provider="test_provider") as span: +def test_start_span_while_disabled_logs_warning(llmobs, mock_llmobs_logs): + llmobs.disable() + _ = llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.tool(name="test_tool") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.task(name="test_task") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.workflow(name="test_workflow") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.agent(name="test_agent") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + + +def test_start_span_uses_kind_as_default_name(llmobs): + with llmobs.llm(model_name="test_model", model_provider="test_provider") as span: assert span.name == "llm" - with 
LLMObs.tool() as span: + with llmobs.tool() as span: assert span.name == "tool" - with LLMObs.task() as span: + with llmobs.task() as span: assert span.name == "task" - with LLMObs.workflow() as span: + with llmobs.workflow() as span: assert span.name == "workflow" - with LLMObs.agent() as span: + with llmobs.agent() as span: assert span.name == "agent" -def test_start_span_with_session_id(LLMObs): - with LLMObs.llm(model_name="test_model", session_id="test_session_id") as span: +def test_start_span_with_session_id(llmobs): + with llmobs.llm(model_name="test_model", session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.tool(session_id="test_session_id") as span: + with llmobs.tool(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.task(session_id="test_session_id") as span: + with llmobs.task(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.workflow(session_id="test_session_id") as span: + with llmobs.workflow(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.agent(session_id="test_session_id") as span: + with llmobs.agent(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" -def test_session_id_becomes_top_level_field(LLMObs, mock_llmobs_span_writer): +def test_session_id_becomes_top_level_field(llmobs, llmobs_events): session_id = "test_session_id" - with LLMObs.task(session_id=session_id) as span: + with llmobs.task(session_id=session_id) as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) - - -def test_session_id_becomes_top_level_field_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - session_id = "test_session_id" - with AgentlessLLMObs.task(session_id=session_id) as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) -def test_llm_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span(llmobs, llmobs_events): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "llm" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider" ) -def test_llm_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - assert span.name == "test_llm_call" - assert span.resource == "llm" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "llm" - assert 
span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") - ) - - -def test_llm_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.llm(name="test_llm_call", model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider" ) -def test_default_model_provider_set_to_custom(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call") as span: +def test_default_model_provider_set_to_custom(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" @@ -312,88 +281,57 @@ def test_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_tool_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.tool(name="test_tool") as span: - assert span.name == "test_tool" - assert span.resource == "tool" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_tool_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.tool(name="test_tool") as span: +def test_tool_span(llmobs, llmobs_events): + with llmobs.tool(name="test_tool") as span: assert span.name == "test_tool" assert span.resource == "tool" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_task_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task") as span: - assert span.name == "test_task" - assert span.resource == "task" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_task_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task") as span: +def test_task_span(llmobs, llmobs_events): + with llmobs.task(name="test_task") as span: assert span.name == "test_task" assert span.resource == "task" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_workflow_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.workflow(name="test_workflow") as span: +def test_workflow_span(llmobs, llmobs_events): + with 
llmobs.workflow(name="test_workflow") as span: assert span.name == "test_workflow" assert span.resource == "workflow" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_workflow_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.workflow(name="test_workflow") as span: - assert span.name == "test_workflow" - assert span.resource == "workflow" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) - - -def test_agent_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.agent(name="test_agent") as span: +def test_agent_span(llmobs, llmobs_events): + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span.resource == "agent" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_agent_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.agent(name="test_agent") as span: - assert span.name == "test_agent" - assert span.resource == "agent" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) - - -def test_embedding_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(name="test_embedding", model_provider="test_provider") as span: +def test_embedding_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.embedding(name="test_embedding", model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider" ) -def test_embedding_default_model_provider_set_to_custom(LLMObs): - with LLMObs.embedding(model_name="test_model", name="test_embedding") as span: +def test_embedding_default_model_provider_set_to_custom(llmobs): + with llmobs.embedding(model_name="test_model", name="test_embedding") as span: assert span.name == "test_embedding" assert span.resource == "embedding" assert span.span_type == "llm" @@ -402,198 +340,182 @@ def test_embedding_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_embedding_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: +def test_embedding_span(llmobs, llmobs_events): + with llmobs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: assert span.name == "test_embedding" assert span.resource == "embedding" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "embedding" 
assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider" ) -def test_embedding_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.embedding( - model_name="test_model", name="test_embedding", model_provider="test_provider" - ) as span: - assert span.name == "test_embedding" - assert span.resource == "embedding" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "embedding" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") - ) +def test_annotate_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + llmobs.annotate(parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_annotate_no_active_span_logs_warning(LLMObs, mock_logs): - LLMObs.annotate(parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") - - -def test_annotate_non_llm_span_logs_warning(LLMObs, mock_logs): +def test_annotate_non_llm_span_logs_warning(llmobs, mock_llmobs_logs): dummy_tracer = DummyTracer() with dummy_tracer.trace("root") as non_llmobs_span: - LLMObs.annotate(span=non_llmobs_span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.annotate(span=non_llmobs_span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_annotate_finished_span_does_nothing(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_annotate_finished_span_does_nothing(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: pass - LLMObs.annotate(span=span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Cannot annotate a finished span.") + llmobs.annotate(span=span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Cannot annotate a finished span.") -def test_annotate_parameters(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) +def test_annotate_parameters(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) assert span._get_ctx_item(INPUT_PARAMETERS) == {"temperature": 0.9, "max_tokens": 50} - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "Setting parameters is deprecated, please set parameters and other metadata as tags instead." 
) -def test_annotate_metadata(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) +def test_annotate_metadata(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) assert span._get_ctx_item(METADATA) == {"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3} -def test_annotate_metadata_wrong_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata="wrong_metadata") +def test_annotate_metadata_wrong_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata="wrong_metadata") assert span._get_ctx_item(METADATA) is None - mock_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() -def test_annotate_tag(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) +def test_annotate_tag(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) assert span._get_ctx_item(TAGS) == {"test_tag_name": "test_tag_value", "test_numeric_tag": 10} -def test_annotate_tag_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags=12345) +def test_annotate_tag_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags=12345) assert span._get_ctx_item(TAGS) is None - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_tags must be a dictionary of string key - primitive value pairs." 
) -def test_annotate_input_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, input_data="test_input") +def test_annotate_input_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, input_data="test_input") assert llm_span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input"}] - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data="test_input") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data="test_input") assert task_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data="test_input") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data="test_input") assert tool_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data="test_input") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data="test_input") assert workflow_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data="test_input") + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data="test_input") assert retrieval_span._get_ctx_item(INPUT_VALUE) == "test_input" -def test_annotate_numeric_io(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=0, output_data=0) +def test_annotate_numeric_io(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=0, output_data=0) assert task_span._get_ctx_item(INPUT_VALUE) == "0" assert task_span._get_ctx_item(OUTPUT_VALUE) == "0" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=1.23, output_data=1.23) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=1.23, output_data=1.23) assert task_span._get_ctx_item(INPUT_VALUE) == "1.23" assert task_span._get_ctx_item(OUTPUT_VALUE) == "1.23" -def test_annotate_input_serializable_value(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=["test_input"]) +def test_annotate_input_serializable_value(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=["test_input"]) assert task_span._get_ctx_item(INPUT_VALUE) == str(["test_input"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data={"test_input": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data={"test_input": "hello world"}) assert tool_span._get_ctx_item(INPUT_VALUE) == str({"test_input": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data=("asd", 123)) assert workflow_span._get_ctx_item(INPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert 
agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) assert retrieval_span._get_ctx_item(INPUT_VALUE) == str([0, 1, 2, 3, 4]) -def test_annotate_input_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) +def test_annotate_input_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) assert span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input", "role": "human"}] -def test_annotate_input_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": object()}]) +def test_annotate_input_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": object()}]) assert span._get_ctx_item(INPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) -def test_llmobs_annotate_incorrect_message_content_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) +def test_llmobs_annotate_incorrect_message_content_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_document_str(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data="test_document_text") +def test_annotate_document_str(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data="test_document_text") documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data="test_document_text") + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data="test_document_text") documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_dict(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": "test_document_text"}) +def test_annotate_document_dict(llmobs): + 
with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": "test_document_text"}) documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data={"text": "test_document_text"}) + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data={"text": "test_document_text"}) documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_list(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_document_list(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -605,8 +527,8 @@ def test_annotate_document_list(LLMObs): assert documents[1]["name"] == "name" assert documents[1]["id"] == "id" assert documents[1]["score"] == 0.9 - with LLMObs.retrieval() as span: - LLMObs.annotate( + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -620,129 +542,131 @@ def test_annotate_document_list(LLMObs): assert documents[1]["score"] == 0.9 -def test_annotate_incorrect_document_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": 123}) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_document_no_text_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_incorrect_document_field_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input 
documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_incorrect_document_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": 123}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_document_no_text_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_incorrect_document_field_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate( + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - 
mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) -def test_annotate_output_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data="test_output") +def test_annotate_output_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data="test_output") assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output"}] - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data="test_output") + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data="test_output") assert embedding_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data="test_output") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data="test_output") assert task_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data="test_output") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data="test_output") assert tool_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data="test_output") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data="test_output") assert workflow_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_serializable_value(LLMObs): - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) +def test_annotate_output_serializable_value(llmobs): + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) assert embedding_span._get_ctx_item(OUTPUT_VALUE) == str([[0, 1, 2, 3], [4, 5, 6, 7]]) - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data=["test_output"]) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data=["test_output"]) assert task_span._get_ctx_item(OUTPUT_VALUE) == str(["test_output"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data={"test_output": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data={"test_output": "hello world"}) assert tool_span._get_ctx_item(OUTPUT_VALUE) == str({"test_output": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data=("asd", 123)) assert workflow_span._get_ctx_item(OUTPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert 
agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) +def test_annotate_output_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output", "role": "human"}] -def test_annotate_output_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": object()}]) +def test_annotate_output_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": object()}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_metrics(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) +def test_annotate_metrics(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) assert span._get_ctx_item(METRICS) == {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} -def test_annotate_metrics_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, metrics=12345) +def test_annotate_metrics_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, metrics=12345) assert llm_span._get_ctx_item(METRICS) is None - mock_logs.warning.assert_called_once_with("metrics must be a dictionary of string key - numeric value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "metrics must be a dictionary of string key - numeric value pairs." 
+ ) + mock_llmobs_logs.reset_mock() -def test_annotate_prompt_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -761,9 +685,9 @@ def test_annotate_prompt_dict(LLMObs): } -def test_annotate_prompt_dict_with_context_var_keys(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict_with_context_var_keys(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -784,9 +708,9 @@ def test_annotate_prompt_dict_with_context_var_keys(LLMObs): } -def test_annotate_prompt_typed_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_typed_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt=Prompt( template="{var1} {var3}", @@ -807,63 +731,30 @@ def test_annotate_prompt_typed_dict(LLMObs): } -def test_annotate_prompt_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, prompt="prompt") +def test_annotate_prompt_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, prompt="prompt") assert span._get_ctx_item(INPUT_PROMPT) is None - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() - - LLMObs.annotate(span=span, prompt={"template": 1}) - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, prompt={"template": 1}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() -def test_span_error_sets_error(LLMObs, mock_llmobs_span_writer): - with pytest.raises(ValueError): - with LLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: - raise ValueError("test error message") - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) - ) - -def test_span_error_sets_error_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): +def test_span_error_sets_error(llmobs, llmobs_events): with pytest.raises(ValueError): - with AgentlessLLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: + with llmobs.llm(model_name="test_model", model_provider="test_model_provider") as span: raise ValueError("test error message") - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) - ) - - -@pytest.mark.parametrize( - "ddtrace_global_config", - [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], -) -def test_tags(ddtrace_global_config, LLMObs, 
mock_llmobs_span_writer, monkeypatch): - with LLMObs.task(name="test_task") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + model_name="test_model", + model_provider="test_model_provider", + error="builtins.ValueError", + error_message="test error message", + error_stack=span.get_tag("error.stack"), ) @@ -871,202 +762,152 @@ def test_tags(ddtrace_global_config, LLMObs, mock_llmobs_span_writer, monkeypatc "ddtrace_global_config", [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_tags_agentless(ddtrace_global_config, AgentlessLLMObs, mock_llmobs_span_agentless_writer, monkeypatch): - with AgentlessLLMObs.task(name="test_task") as span: +def test_tags(ddtrace_global_config, llmobs, llmobs_events, monkeypatch): + with llmobs.task(name="test_task") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "task", + tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, ) -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task", ml_app="test_app") as span: +def test_ml_app_override(llmobs, llmobs_events): + with llmobs.task(name="test_task", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with LLMObs.tool(name="test_tool", ml_app="test_app") as span: + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) + with llmobs.tool(name="test_tool", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with LLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: + assert len(llmobs_events) == 2 + assert llmobs_events[1] == _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) + with llmobs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 3 + assert llmobs_events[2] == _expected_llmobs_llm_span_event( + span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with LLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: + with llmobs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 4 + assert llmobs_events[3] == _expected_llmobs_llm_span_event( 
+ span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with LLMObs.workflow(name="test_workflow", ml_app="test_app") as span: + with llmobs.workflow(name="test_workflow", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with LLMObs.agent(name="test_agent", ml_app="test_app") as span: + assert len(llmobs_events) == 5 + assert llmobs_events[4] == _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) + with llmobs.agent(name="test_agent", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with LLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: + assert len(llmobs_events) == 6 + assert llmobs_events[5] == _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) + with llmobs.retrieval(name="test_retrieval", ml_app="test_app") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) - ) - - -def test_ml_app_override_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.tool(name="test_tool", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with AgentlessLLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with AgentlessLLMObs.workflow(name="test_workflow", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.agent(name="test_agent", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) - ) + assert len(llmobs_events) == 7 + assert llmobs_events[6] == _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) -def test_export_span_specified_span_is_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.export_span(span="asd") - mock_logs.warning.assert_called_once_with("Failed to export span. 
Span must be a valid Span object.") +def test_export_span_specified_span_is_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span(span="asd") + mock_llmobs_logs.warning.assert_called_once_with("Failed to export span. Span must be a valid Span object.") -def test_export_span_specified_span_is_not_llmobs_span_raises_warning(LLMObs, mock_logs): +def test_export_span_specified_span_is_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): with DummyTracer().trace("non_llmobs_span") as span: - LLMObs.export_span(span=span) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.export_span(span=span) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_specified_span_returns_span_context(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span(span=span) +def test_export_span_specified_span_returns_span_context(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span(span=span) assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_export_span_no_specified_span_no_active_span_raises_warning(LLMObs, mock_logs): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_export_span_no_specified_span_no_active_span_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_export_span_active_span_not_llmobs_span_raises_warning(LLMObs, mock_logs): - with LLMObs._instance.tracer.trace("non_llmobs_span"): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") +def test_export_span_active_span_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): + with llmobs._instance.tracer.trace("non_llmobs_span"): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_no_specified_span_returns_exported_active_span(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span() +def test_export_span_no_specified_span_returns_exported_active_span(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span() assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_submit_evaluation_llmobs_disabled_raises_warning(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.submit_evaluation( +def test_submit_evaluation_llmobs_disabled_raises_warning(llmobs, mock_llmobs_logs): + llmobs.disable() + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." 
) -def test_submit_evaluation_no_api_key_raises_warning(AgentlessLLMObs, mock_logs): +def test_submit_evaluation_no_api_key_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="")): - AgentlessLLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_ml_app_raises_warning(LLMObs, mock_logs): +def test_submit_evaluation_ml_app_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_llmobs_ml_app="")): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_span_context_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( +def test_submit_evaluation_span_context_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_context must be a dictionary containing both span_id and trace_id keys. " "LLMObs.export_span() can be used to generate this dictionary from a given span." ) -def test_submit_evaluation_empty_span_or_trace_id_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_span_or_trace_id_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." ) - mock_logs.reset_mock() - LLMObs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." 
) -def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_timestamp_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", @@ -1074,35 +915,35 @@ def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): ml_app="dummy", timestamp_ms="invalid", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent" ) -def test_submit_evaluation_empty_label_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_label_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") + mock_llmobs_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") -def test_submit_evaluation_incorrect_metric_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_metric_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") - mock_logs.reset_mock() - LLMObs.submit_evaluation( + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") -def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_numerical_value_raises_unsupported_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call( "The evaluation metric type 'numerical' is unsupported. Use 'score' instead. 
" @@ -1112,44 +953,44 @@ def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mo ) -def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call("value must be an integer or float for a score metric."), ] ) -def test_submit_evaluation_incorrect_score_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_score_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="score", value="high" ) - mock_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") + mock_llmobs_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") -def test_submit_evaluation_invalid_tags_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_tags_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", tags=["invalid"], ) - mock_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") + mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") -def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_metadata_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", metadata=1, ) - mock_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") + mock_llmobs_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") @pytest.mark.parametrize( @@ -1157,9 +998,9 @@ def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): [dict(_llmobs_ml_app="test_app_name")], ) def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( - LLMObs, mock_logs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1167,8 +1008,10 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( tags={1: 2, "foo": "bar"}, ml_app="dummy", ) - mock_logs.warning.assert_called_once_with("Failed to parse tags. Tags for evaluation metrics must be strings.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "Failed to parse tags. Tags for evaluation metrics must be strings." 
+ ) + mock_llmobs_logs.reset_mock() mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( ml_app="dummy", @@ -1186,8 +1029,8 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_tags(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1212,8 +1055,8 @@ def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_with_metadata_enqueues_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1235,7 +1078,7 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) ) mock_llmobs_eval_metric_writer.reset() - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1257,8 +1100,8 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) -def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_categorical_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1276,9 +1119,9 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="toxicity", metric_type="categorical", value="high", @@ -1296,8 +1139,8 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) -def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_score_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="sentiment", metric_type="score", @@ -1310,9 +1153,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" + with llmobs.llm(model_name="test_model", name="test_llm_call", 
model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" ) mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( @@ -1327,9 +1170,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metric( - LLMObs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", @@ -1342,9 +1185,9 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="token_count", metric_type="numerical", value=35, @@ -1362,144 +1205,116 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) -def test_flush_calls_periodic_agentless( - AgentlessLLMObs, mock_llmobs_span_agentless_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner -): - AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_called_once() - mock_llmobs_eval_metric_writer.periodic.assert_called_once() - mock_llmobs_evaluator_runner.periodic.assert_called_once() - - def test_flush_does_not_call_periodic_when_llmobs_is_disabled( - LLMObs, - mock_llmobs_span_writer, + llmobs, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, - mock_logs, - disabled_llmobs, + mock_llmobs_logs, ): - LLMObs.flush() - mock_llmobs_span_writer.periodic.assert_not_called() + llmobs.enabled = False + llmobs.flush() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] ) -def test_flush_does_not_call_periodic_when_llmobs_is_disabled_agentless( - AgentlessLLMObs, - mock_llmobs_span_agentless_writer, - mock_llmobs_eval_metric_writer, - mock_llmobs_evaluator_runner, - mock_logs, - disabled_llmobs, -): - AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_not_called() - mock_llmobs_eval_metric_writer.periodic.assert_not_called() - mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( - [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] - ) - - -def test_inject_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with( +def test_inject_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.inject_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be injected." 
) assert headers == {} -def test_inject_distributed_headers_not_dict_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers("not a dictionary", span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") +def test_inject_distributed_headers_not_dict_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers("not a dictionary", span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == "not a dictionary" - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(123, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(123, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == 123 - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(None, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(None, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers is None -def test_inject_distributed_headers_no_active_span_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with("No span provided and no currently active span found.") +def test_inject_distributed_headers_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no currently active span found.") assert headers == {} -def test_inject_distributed_headers_span_calls_httppropagator_inject(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_span_calls_httppropagator_inject(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.propagation.http.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=span) + llmobs.inject_distributed_headers({}, span=span) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_inject_distributed_headers_current_active_span_injected(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_current_active_span_injected(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=None) + llmobs.inject_distributed_headers({}, span=None) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_activate_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.activate_distributed_headers({}) - mock_logs.warning.assert_called_once_with( +def test_activate_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + llmobs.activate_distributed_headers({}) + mock_llmobs_logs.warning.assert_called_once_with( 
"LLMObs.activate_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be activated." ) -def test_activate_distributed_headers_calls_httppropagator_extract(LLMObs, mock_logs): +def test_activate_distributed_headers_calls_httppropagator_extract(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_extract.assert_called_once_with({}) -def test_activate_distributed_headers_no_trace_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_trace_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(span_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_span_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_span_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(trace_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456") mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") mock_activate.assert_called_once_with(dummy_context) -def test_activate_distributed_headers_activates_context(LLMObs, mock_logs): +def test_activate_distributed_headers_activates_context(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456", meta={PROPAGATED_PARENT_ID_KEY: "789"}) mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_activate.assert_called_once_with(dummy_context) @@ -1514,16 +1329,10 @@ def test_llmobs_fork_recreates_and_restarts_span_writer(): if pid: # parent assert 
llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._llmobs_span_writer == original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._llmobs_span_writer != original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1569,18 +1378,10 @@ def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluato if pid: # parent assert llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._evaluator_runner == original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._evaluator_runner != original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1667,42 +1468,6 @@ def test_llmobs_fork_evaluator_runner_run(monkeypatch): llmobs_service.disable() -def test_llmobs_fork_custom_filter(monkeypatch): - """Test that forking a process correctly keeps any custom filters.""" - - class CustomFilter(TraceFilter): - def process_trace(self, trace): - return trace - - monkeypatch.setenv("_DD_LLMOBS_WRITER_INTERVAL", 5.0) - with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"): - tracer = DummyTracer() - custom_filter = CustomFilter() - tracer.configure(settings={"FILTERS": [custom_filter]}) - llmobs_service.enable(_tracer=tracer, ml_app="test_app") - assert custom_filter in llmobs_service._instance.tracer._filters - pid = os.fork() - if pid: # parent - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - else: # child - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - llmobs_service.disable() - os._exit(12) - - _, status = os.waitpid(pid, 0) - exit_code = os.WEXITSTATUS(status) - assert exit_code == 12 - llmobs_service.disable() - - def test_llmobs_fork_disabled(monkeypatch): """Test that after being disabled the service remains disabled when forking""" monkeypatch.setenv("DD_LLMOBS_ENABLED", "0") @@ -1746,46 +1511,46 @@ def test_llmobs_fork_disabled_then_enabled(monkeypatch): svc.disable() -def test_llmobs_with_evaluator_runner(LLMObs, mock_llmobs_evaluator_runner): - with LLMObs.llm(model_name="test_model"): +def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner): + with llmobs.llm(model_name="test_model"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 1 + assert llmobs._instance._evaluator_runner.enqueue.call_count 
== 1 -def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): +def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.workflow(name="test"): +def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.workflow(name="test"): pass - with LLMObs.agent(name="test"): + with llmobs.agent(name="test"): pass - with LLMObs.task(name="test"): + with llmobs.task(name="test"): pass - with LLMObs.embedding(model_name="test"): + with llmobs.embedding(model_name="test"): pass - with LLMObs.retrieval(name="test"): + with llmobs.retrieval(name="test"): pass - with LLMObs.tool(name="test"): + with llmobs.tool(name="test"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_annotation_context_modifies_span_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_modifies_span_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -def test_annotation_context_modifies_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1793,80 +1558,80 @@ def test_annotation_context_modifies_prompt(LLMObs): } -def test_annotation_context_modifies_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -def test_annotation_context_finished_context_does_not_modify_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): +def test_annotation_context_finished_context_does_not_modify_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -def test_annotation_context_finished_context_does_not_modify_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): +def test_annotation_context_finished_context_does_not_modify_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with 
llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -def test_annotation_context_finished_context_does_not_modify_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): +def test_annotation_context_finished_context_does_not_modify_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -def test_annotation_context_nested(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested(llmobs): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} -def test_annotation_context_nested_overrides_name(LLMObs): - with LLMObs.annotation_context(name="unexpected"): - with LLMObs.annotation_context(name="expected"): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested_overrides_name(llmobs): + with llmobs.annotation_context(name="unexpected"): + with llmobs.annotation_context(name="expected"): + with llmobs.agent(name="test_agent") as span: assert span.name == "expected" -def test_annotation_context_nested_maintains_trace_structure(LLMObs, mock_llmobs_span_writer): +def test_annotation_context_nested_maintains_trace_structure(llmobs, llmobs_events): """This test makes sure starting/stopping annotation contexts do not modify the llmobs trace structure""" - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span") as parent_span: - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.workflow(name="child_span") as child_span: + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span") as parent_span: + with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.workflow(name="child_span") as child_span: assert child_span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} assert parent_span._get_ctx_item(TAGS) == {"foo": "bar", "boo": "bar"} - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - parent_span, child_span = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] + assert len(llmobs_events) == 2 + parent_span, child_span = llmobs_events[1], llmobs_events[0] assert child_span["trace_id"] == parent_span["trace_id"] assert child_span["span_id"] != parent_span["span_id"] assert child_span["parent_id"] == parent_span["span_id"] assert parent_span["parent_id"] == "undefined" - mock_llmobs_span_writer.reset_mock() - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span"): +def test_annotation_context_separate_traces_maintained(llmobs, llmobs_events): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span"): pass - with LLMObs.workflow(name="child_span"): + with llmobs.workflow(name="child_span"): pass - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - trace_one, trace_two = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] - assert trace_one["trace_id"] != trace_two["trace_id"] - assert trace_one["span_id"] != 
trace_two["span_id"] - assert trace_two["parent_id"] == "undefined" - assert trace_one["parent_id"] == "undefined" + assert len(llmobs_events) == 2 + agent_span, workflow_span = llmobs_events[1], llmobs_events[0] + assert agent_span["trace_id"] != workflow_span["trace_id"] + assert agent_span["span_id"] != workflow_span["span_id"] + assert workflow_span["parent_id"] == "undefined" + assert agent_span["parent_id"] == "undefined" -def test_annotation_context_only_applies_to_local_context(LLMObs): +def test_annotation_context_only_applies_to_local_context(llmobs): """ tests that annotation contexts only apply to spans belonging to the same trace context and not globally to all spans. @@ -1882,8 +1647,8 @@ def test_annotation_context_only_applies_to_local_context(LLMObs): def context_one(): nonlocal agent_has_correct_name nonlocal agent_has_correct_tags - with LLMObs.annotation_context(name="expected_agent", tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: + with llmobs.annotation_context(name="expected_agent", tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: event.wait() agent_has_correct_tags = span._get_ctx_item(TAGS) == {"foo": "bar"} agent_has_correct_name = span.name == "expected_agent" @@ -1892,9 +1657,9 @@ def context_one(): def context_two(): nonlocal tool_has_correct_name nonlocal tool_does_not_have_tags - with LLMObs.agent(name="test_agent"): - with LLMObs.annotation_context(name="expected_tool"): - with LLMObs.tool(name="test_tool") as tool_span: + with llmobs.agent(name="test_agent"): + with llmobs.annotation_context(name="expected_tool"): + with llmobs.tool(name="test_tool") as tool_span: event.wait() tool_does_not_have_tags = tool_span._get_ctx_item(TAGS) is None tool_has_correct_name = tool_span.name == "expected_tool" @@ -1904,7 +1669,7 @@ def context_two(): thread_one.start() thread_two.start() - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span._get_ctx_item(TAGS) is None @@ -1920,15 +1685,15 @@ def context_two(): assert tool_does_not_have_tags is True -async def test_annotation_context_async_modifies_span_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_modifies_span_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -async def test_annotation_context_async_modifies_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1936,41 +1701,42 @@ async def test_annotation_context_async_modifies_prompt(LLMObs): } -async def test_annotation_context_async_modifies_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as 
span: assert span.name == "test_agent_override" -async def test_annotation_context_async_finished_context_does_not_modify_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): +async def test_annotation_context_async_finished_context_does_not_modify_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -async def test_annotation_context_async_finished_context_does_not_modify_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): +async def test_annotation_context_async_finished_context_does_not_modify_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -async def test_annotation_context_finished_context_async_does_not_modify_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): +async def test_annotation_context_finished_context_async_does_not_modify_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -async def test_annotation_context_async_nested(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - async with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_nested(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + async with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} def test_service_enable_starts_evaluator_runner_when_evaluators_exist(): + pytest.importorskip("ragas") with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")): dummy_tracer = DummyTracer() diff --git a/tests/llmobs/test_llmobs_span_agent_writer.py b/tests/llmobs/test_llmobs_span_agent_writer.py index 76fe0f21aef..d16bb9f0e2c 100644 --- a/tests/llmobs/test_llmobs_span_agent_writer.py +++ b/tests/llmobs/test_llmobs_span_agent_writer.py @@ -44,7 +44,8 @@ def test_flush_queue_when_event_cause_queue_to_exceed_payload_limit( [ mock.call("flushing queue because queuing next event will exceed EVP payload limit"), mock.call("encode %d LLMObs span events to be sent", 5), - ] + ], + any_order=True, ) diff --git a/tests/llmobs/test_llmobs_span_agentless_writer.py b/tests/llmobs/test_llmobs_span_agentless_writer.py index 4882f3553d8..4a54faf130d 100644 --- a/tests/llmobs/test_llmobs_span_agentless_writer.py +++ b/tests/llmobs/test_llmobs_span_agentless_writer.py @@ -75,26 +75,25 @@ def test_truncating_oversized_events(mock_writer_logs, mock_http_writer_send_pay ) -def test_send_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() 
llmobs_span_writer.enqueue(_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_chat_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() +@mock.patch("ddtrace.internal.writer.writer.log") def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put_response_forbidden): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) @@ -109,7 +108,7 @@ def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put ) -def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_timed_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -122,10 +121,9 @@ def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_wr llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_multiple_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -135,10 +133,9 @@ def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 2)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): +def test_send_on_exit(run_python_code_in_subprocess): env = os.environ.copy() pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] if "PYTHONPATH" in env: diff --git a/tests/llmobs/test_llmobs_trace_processor.py b/tests/llmobs/test_llmobs_trace_processor.py deleted file mode 100644 index b55286d49c8..00000000000 --- a/tests/llmobs/test_llmobs_trace_processor.py +++ /dev/null @@ -1,36 +0,0 @@ -import mock - -from ddtrace._trace.span import Span -from ddtrace.ext import SpanTypes -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor -from tests.utils import override_global_config - - -def test_processor_returns_all_traces_by_default(): - """Test that the LLMObsTraceProcessor 
returns all traces by default.""" - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_all_traces_if_not_agentless(): - """Test that the LLMObsTraceProcessor returns all traces if DD_LLMOBS_AGENTLESS_ENABLED is not set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=False)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_none_in_agentless_mode(): - """Test that the LLMObsTraceProcessor returns None if DD_LLMOBS_AGENTLESS_ENABLED is set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=True)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) is None diff --git a/tests/llmobs/test_propagation.py b/tests/llmobs/test_propagation.py index d892c6b98a2..e3ab9c80d66 100644 --- a/tests/llmobs/test_propagation.py +++ b/tests/llmobs/test_propagation.py @@ -157,39 +157,39 @@ def test_no_llmobs_parent_id_propagated_if_no_llmobs_spans(run_python_code_in_su assert _get_llmobs_parent_id(span) == "undefined" -def test_inject_distributed_headers_simple(LLMObs): +def test_inject_distributed_headers_simple(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM) as root_span: - request_headers = LLMObs.inject_distributed_headers({}, span=root_span) + request_headers = llmobs.inject_distributed_headers({}, span=root_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_nested_llmobs_non_llmobs(LLMObs): +def test_inject_distributed_headers_nested_llmobs_non_llmobs(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM): with dummy_tracer.trace("Non-LLMObs span") as child_span: - request_headers = LLMObs.inject_distributed_headers({}, span=child_span) + request_headers = llmobs.inject_distributed_headers({}, span=child_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_non_llmobs_root_span(LLMObs): +def test_inject_distributed_headers_non_llmobs_root_span(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("Non-LLMObs span"): with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM) as child_span: - request_headers = LLMObs.inject_distributed_headers({}, span=child_span) + request_headers = llmobs.inject_distributed_headers({}, span=child_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_inject_distributed_headers_nested_llmobs_spans(LLMObs): +def test_inject_distributed_headers_nested_llmobs_spans(llmobs): dummy_tracer = DummyTracer() with dummy_tracer.trace("LLMObs span", span_type=SpanTypes.LLM): with dummy_tracer.trace("LLMObs child span", span_type=SpanTypes.LLM): with dummy_tracer.trace("Last LLMObs child span", span_type=SpanTypes.LLM) as last_llmobs_span: - request_headers = LLMObs.inject_distributed_headers({}, 
span=last_llmobs_span) + request_headers = llmobs.inject_distributed_headers({}, span=last_llmobs_span) assert PROPAGATED_PARENT_ID_KEY in request_headers["x-datadog-tags"] -def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID is propagated in the headers in a simple distributed scenario. Service A (subprocess) has a root LLMObs span and a non-LLMObs child span. Service B (outside subprocess) has a LLMObs span. @@ -216,16 +216,15 @@ def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple( env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) - with LLMObs.workflow("LLMObs span") as span: + llmobs.activate_distributed_headers(headers) + with llmobs.workflow("LLMObs span") as span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == headers["_DD_LLMOBS_SPAN_ID"] -def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID is propagated in the headers in a more complex trace. Service A (subprocess) has a root LLMObs span and a non-LLMObs child span. Service B (outside subprocess) has a non-LLMObs local root span and a LLMObs child span. @@ -252,19 +251,18 @@ def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_pyt env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) + llmobs.activate_distributed_headers(headers) dummy_tracer = DummyTracer() with dummy_tracer.trace("Non-LLMObs span") as span: - with LLMObs.llm(model_name="llm_model", name="LLMObs span") as llm_span: + with llmobs.llm(model_name="llm_model", name="LLMObs span") as llm_span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == headers["_DD_LLMOBS_SPAN_ID"] assert _get_llmobs_parent_id(llm_span) == headers["_DD_LLMOBS_SPAN_ID"] -def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_python_code_in_subprocess, LLMObs): +def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_python_code_in_subprocess, llmobs): """Test that the correct LLMObs parent ID (None) is extracted from the headers in a simple distributed scenario. Service A (subprocess) has spans, but none are LLMObs spans. Service B (outside subprocess) has a LLMObs span. 
@@ -289,10 +287,9 @@ def test_activate_distributed_headers_does_not_propagate_if_no_llmobs_spans(run_ env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) - LLMObs.activate_distributed_headers(headers) - with LLMObs.task("LLMObs span") as span: + llmobs.activate_distributed_headers(headers) + with llmobs.task("LLMObs span") as span: assert str(span.parent_id) == headers["x-datadog-parent-id"] assert _get_llmobs_parent_id(span) == "undefined" diff --git a/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json b/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json new file mode 100644 index 00000000000..fe7c9e3b0f2 --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai.test_chat_completion_stream.json @@ -0,0 +1,53 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createChatCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "67741fca00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/chat/completions", + "openai.request.messages.0.content": "Who won the world series in 2020?", + "openai.request.messages.0.name": "", + "openai.request.messages.0.role": "user", + "openai.request.method": "POST", + "openai.request.model": "gpt-3.5-turbo", + "openai.request.n": "None", + "openai.request.stream": "True", + "openai.request.user": "ddtrace-test", + "openai.response.choices.0.finish_reason": "stop", + "openai.response.choices.0.message.content": "The Los Angeles Dodgers won the World Series in 2020.", + "openai.response.choices.0.message.role": "assistant", + "openai.response.model": "gpt-3.5-turbo-0301", + "openai.user.api_key": "sk-...key>", + "runtime-id": "d174f65e33314f43ad1de8cf0a5ca4e0" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 0, + "openai.response.completion_tokens_estimated": 0, + "openai.response.usage.completion_tokens": 19, + "openai.response.usage.prompt_tokens": 17, + "openai.response.usage.total_tokens": 36, + "process_id": 22982 + }, + "duration": 29869000, + "start": 1735663562179157000 + }]] diff --git a/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json b/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json new file mode 100644 index 00000000000..7cf644cfb3d --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai.test_completion_stream.json @@ -0,0 +1,49 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6774231f00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": 
"datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/completions", + "openai.request.method": "POST", + "openai.request.model": "ada", + "openai.request.n": "None", + "openai.request.prompt.0": "Hello world", + "openai.request.stream": "True", + "openai.response.choices.0.finish_reason": "length", + "openai.response.choices.0.text": "! ... A page layouts page drawer? ... Interesting. The \"Tools\" is", + "openai.response.model": "ada", + "openai.user.api_key": "sk-...key>", + "runtime-id": "11872c9ca653441db861b108a4f795eb" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 0, + "openai.response.completion_tokens_estimated": 0, + "openai.response.usage.completion_tokens": 2, + "openai.response.usage.prompt_tokens": 2, + "openai.response.usage.total_tokens": 4, + "process_id": 27488 + }, + "duration": 28739000, + "start": 1735664415266386000 + }]] diff --git a/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json b/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json new file mode 100644 index 00000000000..445dc39db98 --- /dev/null +++ b/tests/snapshots/tests.contrib.openai.test_openai_v1.test_completion_stream_est_tokens.json @@ -0,0 +1,49 @@ +[[ + { + "name": "openai.request", + "service": "tests.contrib.openai", + "resource": "createCompletion", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "677c221c00000000", + "component": "openai", + "language": "python", + "openai.base_url": "https://api.openai.com/v1/", + "openai.organization.name": "datadog-4", + "openai.request.client": "OpenAI", + "openai.request.endpoint": "/v1/completions", + "openai.request.method": "POST", + "openai.request.model": "ada", + "openai.request.n": "None", + "openai.request.prompt.0": "Hello world", + "openai.request.stream": "True", + "openai.response.choices.0.finish_reason": "length", + "openai.response.choices.0.text": "! ... A page layouts page drawer? ... Interesting. 
The \"Tools\" is", + "openai.response.model": "ada", + "openai.user.api_key": "sk-...key>", + "runtime-id": "24f8e851c87e4f758c73d6acd0aaf82b" + }, + "metrics": { + "_dd.measured": 1, + "_dd.top_level": 1, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1, + "openai.organization.ratelimit.requests.limit": 3000, + "openai.organization.ratelimit.requests.remaining": 2999, + "openai.organization.ratelimit.tokens.limit": 250000, + "openai.organization.ratelimit.tokens.remaining": 249979, + "openai.request.prompt_tokens_estimated": 1, + "openai.response.completion_tokens_estimated": 1, + "openai.response.usage.completion_tokens": 16, + "openai.response.usage.prompt_tokens": 2, + "openai.response.usage.total_tokens": 18, + "process_id": 47101 + }, + "duration": 37957000, + "start": 1736188444222291000 + }]] diff --git a/tests/tracer/test_propagation.py b/tests/tracer/test_propagation.py index 61fec650a70..0d4c5d7c01d 100644 --- a/tests/tracer/test_propagation.py +++ b/tests/tracer/test_propagation.py @@ -1888,6 +1888,14 @@ def test_extract_tracecontext(headers, expected_context): B3_SINGLE_HEADERS_VALID, CONTEXT_EMPTY, ), + ( + "baggage_case_insensitive", + None, + {"BAgGage": "key1=val1,key2=val2"}, + { + "baggage": {"key1": "val1", "key2": "val2"}, + }, + ), # All valid headers ( "valid_all_headers_default_style", @@ -2278,14 +2286,14 @@ def test_propagation_extract_w_config(name, styles, headers, expected_context, r overrides = {} if styles is not None: overrides["_propagation_style_extract"] = styles - with override_global_config(overrides): - context = HTTPPropagator.extract(headers) - if not expected_context.get("tracestate"): - assert context == Context(**expected_context) - else: - copied_expectation = expected_context.copy() - tracestate = copied_expectation.pop("tracestate") - assert context == Context(**copied_expectation, meta={"tracestate": tracestate}) + with override_global_config(overrides): + context = HTTPPropagator.extract(headers) + if not expected_context.get("tracestate"): + assert context == Context(**expected_context) + else: + copied_expectation = expected_context.copy() + tracestate = copied_expectation.pop("tracestate") + assert context == Context(**copied_expectation, meta={"tracestate": tracestate}) EXTRACT_OVERRIDE_FIXTURES = [