Fix evaluator errors #1538

Merged 6 commits on Mar 3, 2025
4 changes: 4 additions & 0 deletions python/langsmith/client.py
@@ -1,4 +1,4 @@
"""Client for interacting with the LangSmith API.

[GitHub Actions benchmark annotation on python/langsmith/client.py: the benchmark suite ran on this PR and the comparison against main shows no meaningful change (geometric mean 1.02x faster; largest improvement 1.11x on dumps_pydanticv1_nested_50x100, largest regression 1.03x slower on create_10_000_run_trees).]

Use the client to customize API keys / workspace connections, SSL certs,
etc. for tracing.
@@ -5344,6 +5344,7 @@
run_id_ = res.target_run_id
elif run is not None:
run_id_ = run.id
error = res.extra.get("error", None) if res.extra is not None else None

_submit_feedback(
run_id=run_id_,
@@ -5361,6 +5362,7 @@
project_id=project_id,
extra=res.extra,
trace_id=run.trace_id if run else None,
error=error,
)
return results

@@ -5430,6 +5432,7 @@
feedback_group_id: Optional[ID_TYPE] = None,
extra: Optional[Dict] = None,
trace_id: Optional[ID_TYPE] = None,
error: Optional[bool] = None,
**kwargs: Any,
) -> ls_schemas.Feedback:
"""Create a feedback in the LangSmith API.
@@ -5547,6 +5550,7 @@
),
feedback_group_id=_ensure_uuid(feedback_group_id, accept_null=True),
extra=extra,
error=error,
)

use_multipart = (self.info.batch_ingest_config or {}).get(
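Taken together, the client.py hunks propagate an evaluator-failure flag into the feedback record: when a result's `extra` carries `"error": True`, the client now forwards `error=True` to `create_feedback`. A minimal sketch of that flow, assuming a result object shaped like the `EvaluationResult` built in the runners below (illustrative only, not the actual client internals):

```python
import uuid

from langsmith.evaluation.evaluator import EvaluationResult

# Hypothetical id for illustration only.
evaluator_run_id = uuid.uuid4()

# A failed evaluator produces a result like the one built in _runner.py below.
res = EvaluationResult(
    key="correctness",                 # hypothetical feedback key
    source_run_id=evaluator_run_id,    # the evaluator's own run
    comment=repr(ValueError("boom")),
    extra={"error": True},
)

# client.py derives the flag exactly as in the added line above...
error = res.extra.get("error", None) if res.extra is not None else None

# ...and forwards it to the feedback call, so the feedback row is marked as
# errored, e.g. client.create_feedback(..., error=error).
```

The new `error` field on `FeedbackCreate` (see schemas.py below) is what carries this flag to the API.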
6 changes: 4 additions & 2 deletions python/langsmith/evaluation/_arunner.py
@@ -983,10 +983,12 @@ async def _arun_evaluators(
eval_results = current_results["evaluation_results"]

async def _run_single_evaluator(evaluator):
evaluator_run_id = uuid.uuid4()
try:
evaluator_response = await evaluator.aevaluate_run(
evaluator_response = await evaluator.aevaluate_run( # type: ignore[call-arg]
run=run,
example=self._get_example_with_readers(example),
source_run_id=evaluator_run_id,
)
selected_results = self.client._select_eval_results(
evaluator_response
@@ -1005,7 +1007,7 @@ async def _run_single_evaluator(evaluator):
results=[
EvaluationResult(
key=key,
source_run_id=run.id,
Contributor:
not sure i follow this bit, does source run not refer to the run being evaluated?

Contributor Author:
yeah this is confusing. this is the run for the evaluator. basically the issue was that by setting it to the parent run, when you were in langsmith in the experiment view and clicked on a failed evaluator run (or what should have been the evaluator run) it just opened up the target run. This fixes that.

source_run_id=evaluator_run_id,
comment=repr(e),
extra={"error": True},
)
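To make the discussion above concrete, here is a hedged sketch of the error branch shared by _arunner.py (above) and _runner.py (below). The key change is that the run id is minted before the evaluator is invoked, passed in as `source_run_id`, and reused when the evaluator raises, so the error feedback points at the evaluator's own run rather than the target run. The helper name and feedback key are hypothetical:

```python
import uuid

from langsmith.evaluation.evaluator import EvaluationResult


def run_one_evaluator(evaluator, run, example):
    """Hypothetical helper mirroring the error branch in _runner.py."""
    evaluator_run_id = uuid.uuid4()  # minted before the evaluator runs
    try:
        return evaluator.evaluate_run(
            run=run,                          # the target run being evaluated
            example=example,
            source_run_id=evaluator_run_id,   # evaluator trace reuses this id
        )
    except Exception as e:
        # Before this PR the fallback used source_run_id=run.id, so clicking a
        # failed evaluator in the experiment view opened the target run instead.
        return EvaluationResult(
            key="correctness",                # hypothetical feedback key
            source_run_id=evaluator_run_id,   # now points at the evaluator's own run
            comment=repr(e),
            extra={"error": True},
        )
```

The async path in _arunner.py applies the same pattern with `aevaluate_run`.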
6 changes: 4 additions & 2 deletions python/langsmith/evaluation/_runner.py
@@ -1630,10 +1630,12 @@ def _run_evaluators(
example = current_results["example"]
eval_results = current_results["evaluation_results"]
for evaluator in evaluators:
evaluator_run_id = uuid.uuid4()
try:
evaluator_response = evaluator.evaluate_run(
evaluator_response = evaluator.evaluate_run( # type: ignore[call-arg]
run=run,
example=example,
source_run_id=evaluator_run_id,
)

eval_results["results"].extend(
@@ -1652,7 +1654,7 @@
results=[
EvaluationResult(
key=key,
source_run_id=run.id,
source_run_id=evaluator_run_id,
comment=repr(e),
extra={"error": True},
)
18 changes: 14 additions & 4 deletions python/langsmith/evaluation/evaluator.py
@@ -307,7 +307,10 @@ def is_async(self) -> bool:
return hasattr(self, "afunc")

def evaluate_run(
self, run: Run, example: Optional[Example] = None
self,
run: Run,
example: Optional[Example] = None,
source_run_id: Optional[uuid.UUID] = None,
Contributor:
can we give this a diff name, like evaluator_run_id?

) -> Union[EvaluationResult, EvaluationResults]:
"""Evaluate a run using the wrapped function.

@@ -329,7 +332,8 @@ def evaluate_run(
)
else:
return running_loop.run_until_complete(self.aevaluate_run(run, example))
source_run_id = uuid.uuid4()
if source_run_id is None:
source_run_id = uuid.uuid4()
metadata: Dict[str, Any] = {"target_run_id": run.id}
if getattr(run, "session_id", None):
metadata["experiment"] = str(run.session_id)
@@ -340,7 +344,12 @@
)
return self._format_result(result, source_run_id)

async def aevaluate_run(self, run: Run, example: Optional[Example] = None):
async def aevaluate_run(
self,
run: Run,
example: Optional[Example] = None,
source_run_id: Optional[uuid.UUID] = None,
):
"""Evaluate a run asynchronously using the wrapped async function.

This method directly invokes the wrapped async function with the
@@ -356,7 +365,8 @@ async def aevaluate_run(self, run: Run, example: Optional[Example] = None):
"""
if not hasattr(self, "afunc"):
return await super().aevaluate_run(run, example)
source_run_id = uuid.uuid4()
if source_run_id is None:
source_run_id = uuid.uuid4()
metadata: Dict[str, Any] = {"target_run_id": run.id}
if getattr(run, "session_id", None):
metadata["experiment"] = str(run.session_id)
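The evaluator.py change keeps the new parameter optional: callers that pass nothing still get a freshly generated UUID, while the experiment runners can pass the id they minted up front so the evaluator trace and any error feedback share it. A tiny sketch of that default logic, using a hypothetical helper name rather than the real method bodies:

```python
import uuid
from typing import Optional


def resolve_source_run_id(source_run_id: Optional[uuid.UUID] = None) -> uuid.UUID:
    """Mirror the 'if source_run_id is None' fallback added to (a)evaluate_run."""
    return source_run_id if source_run_id is not None else uuid.uuid4()


# Existing callers that pass nothing are unaffected:
assert isinstance(resolve_source_run_id(), uuid.UUID)

# The runners pin the id up front and reuse it for error feedback:
pinned = uuid.uuid4()
assert resolve_source_run_id(pinned) == pinned
```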
2 changes: 2 additions & 0 deletions python/langsmith/schemas.py
@@ -643,6 +643,8 @@ class FeedbackCreate(FeedbackBase):
feedback_source: FeedbackSourceBase
"""The source of the feedback."""
feedback_config: Optional[FeedbackConfig] = None
"""The config for the feedback"""
error: Optional[bool] = None


class Feedback(FeedbackBase):