Update Python signature
jacoblee93 committed Feb 24, 2025
1 parent 952df10 commit d2ffc47
Showing 5 changed files with 58 additions and 8 deletions.
12 changes: 10 additions & 2 deletions python/langsmith/client.py
@@ -5236,7 +5236,10 @@ def _resolve_example_id(
     def _select_eval_results(
         self,
         results: Union[
-            ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults, dict
+            ls_evaluator.EvaluationResult,
+            list[ls_evaluator.EvaluationResult],
+            ls_evaluator.EvaluationResults,
+            dict,
         ],
         *,
         fn_name: Optional[str] = None,
@@ -5261,6 +5264,8 @@ def _is_eval_results(results: Any) -> TypeGuard[ls_evaluator.EvaluationResults]:
 
         if isinstance(results, ls_evaluator.EvaluationResult):
             results_ = [results]
+        elif isinstance(results, list):
+            results_ = [_cast_result(r) for r in results]
         elif _is_eval_results(results):
            results_ = [_cast_result(r) for r in results["results"]]
         elif isinstance(results, dict):
@@ -5319,7 +5324,10 @@ def evaluate_run(
     def _log_evaluation_feedback(
         self,
         evaluator_response: Union[
-            ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults, dict
+            ls_evaluator.EvaluationResult,
+            list[ls_evaluator.EvaluationResult],
+            ls_evaluator.EvaluationResults,
+            dict,
         ],
         run: Optional[ls_schemas.Run] = None,
         source_info: Optional[Dict[str, Any]] = None,
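
With this change, the client-side coercion helpers accept a bare list of results in addition to a single EvaluationResult, an EvaluationResults dict, or a plain dict. A minimal sketch of the list shape that _select_eval_results now flattens (the metric keys and scores below are illustrative, not taken from the diff):

from langsmith.evaluation import EvaluationResult

# Each entry carries its own feedback key; the client coerces the list
# the same way it already coerced {"results": [...]}.
multi_results = [
    EvaluationResult(key="accuracy", score=1.0),
    EvaluationResult(key="verbosity", score=0.4, comment="Answer is long-winded."),
]
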
8 changes: 5 additions & 3 deletions python/langsmith/evaluation/_runner.py
@@ -78,14 +78,16 @@
     RunEvaluator,
     Callable[
         [schemas.Run, Optional[schemas.Example]],
-        Union[EvaluationResult, EvaluationResults],
+        Union[EvaluationResult, list[EvaluationResult], EvaluationResults],
     ],
-    Callable[..., Union[dict, EvaluationResults, EvaluationResult]],
+    Callable[
+        ..., Union[dict, EvaluationResults, list[EvaluationResult], EvaluationResult]
+    ],
 ]
 AEVALUATOR_T = Union[
     Callable[
         [schemas.Run, Optional[schemas.Example]],
-        Awaitable[Union[EvaluationResult, EvaluationResults]],
+        Awaitable[Union[EvaluationResult, list[EvaluationResult], EvaluationResults]],
     ],
 ]
 EXPERIMENT_T = Union[str, uuid.UUID, schemas.TracerSession]
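
The widened EVALUATOR_T and AEVALUATOR_T aliases mean a custom evaluator can now be annotated as returning list[EvaluationResult]. A rough sketch of such a function (the name length_checks and its metric keys are invented for illustration):

from typing import Optional

from langsmith.evaluation import EvaluationResult
from langsmith.schemas import Example, Run


def length_checks(run: Run, example: Optional[Example]) -> list[EvaluationResult]:
    # One evaluator call, several feedback entries.
    answer = (run.outputs or {}).get("output", "")
    return [
        EvaluationResult(key="non_empty", score=bool(answer)),
        EvaluationResult(key="under_100_words", score=len(str(answer).split()) < 100),
    ]
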
16 changes: 14 additions & 2 deletions python/langsmith/evaluation/evaluator.py
@@ -146,7 +146,9 @@ async def aevaluate_run(
         )
 
 
-_RUNNABLE_OUTPUT = Union[EvaluationResult, EvaluationResults, dict]
+_RUNNABLE_OUTPUT = Union[
+    EvaluationResult, list[EvaluationResult], EvaluationResults, dict
+]
 
 
 class ComparisonEvaluationResult(BaseModel):
@@ -281,14 +283,24 @@ def _coerce_evaluation_results(
     def _format_result(
         self,
         result: Union[
-            EvaluationResult, EvaluationResults, dict, str, int, bool, float, list
+            EvaluationResult,
+            list[EvaluationResult],
+            EvaluationResults,
+            dict,
+            str,
+            int,
+            bool,
+            float,
+            list,
         ],
         source_run_id: uuid.UUID,
     ) -> Union[EvaluationResult, EvaluationResults]:
         if isinstance(result, EvaluationResult):
             if not result.source_run_id:
                 result.source_run_id = source_run_id
             return result
+        elif isinstance(result, list):
+            return [self._format_result(r, source_run_id) for r in result]
         result = _format_evaluator_result(result)
         return self._coerce_evaluation_results(result, source_run_id)
 
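
The new list branch in _format_result recurses over the items so that each entry is stamped with the evaluator's source_run_id before being returned. A standalone sketch of that stamping step (an illustration of the idea, not the library code):

import uuid

from langsmith.evaluation import EvaluationResult


def stamp_source_run(
    results: list[EvaluationResult], source_run_id: uuid.UUID
) -> list[EvaluationResult]:
    # Fill in source_run_id only where the evaluator left it unset,
    # mirroring the branch added above.
    for result in results:
        if not result.source_run_id:
            result.source_run_id = source_run_id
    return results
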
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langsmith"
-version = "0.3.10"
+version = "0.3.11"
 description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
 authors = ["LangChain <[email protected]>"]
 license = "MIT"
28 changes: 28 additions & 0 deletions python/tests/evaluation/test_evaluation.py
@@ -218,6 +218,11 @@ async def slow_accuracy(run: Run, example: Example):
         await asyncio.sleep(2)
         return {"score": expected.lower() == pred.lower()}
 
+    async def multi_accuracy(run: Run, example: Example):
+        pred = run.outputs["output"]  # type: ignore
+        expected = example.outputs["answer"]  # type: ignore
+        return [{"score": expected.lower() == pred.lower()}]
+
     def precision(runs: Sequence[Run], examples: Sequence[Example]):
         predictions = [run.outputs["output"].lower() for run in runs]  # type: ignore
         expected = [example.outputs["answer"].lower() for example in examples]  # type: ignore
@@ -293,6 +298,15 @@ def check_run_count():
     )
     assert len(results4) == 10
 
+    # Multiple return values
+    results5 = await aevaluate(
+        apredict,
+        data=dataset.name,
+        evaluators=[multi_accuracy],
+        experiment=results.experiment_name,
+    )
+    assert len(results5["results"]) == 10
+
 
 def test_evaluate():
     client = Client()
@@ -306,6 +320,11 @@ def accuracy(run: Run, example: Example):
         expected = example.outputs["answer"]  # type: ignore
         return {"score": expected.lower() == pred.lower()}
 
+    def multi_accuracy(run: Run, example: Example):
+        pred = run.outputs["output"]  # type: ignore
+        expected = example.outputs["answer"]  # type: ignore
+        return [{"score": expected.lower() == pred.lower()}]
+
     def precision(runs: Sequence[Run], examples: Sequence[Example]):
         predictions = [run.outputs["output"].lower() for run in runs]  # type: ignore
         expected = [example.outputs["answer"].lower() for example in examples]  # type: ignore
@@ -378,6 +397,15 @@ def predict(inputs: dict) -> dict:
     )
     assert len(results4) == 10
 
+    # Multiple return values
+    results5 = evaluate(
+        predict,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
+        evaluators=[multi_accuracy],
+        experiment=results.experiment_name,
+    )
+    assert len(results5["results"]) == 10
+
 
 @test
 def test_foo():
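
The added tests exercise the list return path end to end through evaluate and aevaluate. Outside the test suite, the same pattern looks roughly like this (the dataset name, target function, and metric keys are placeholders):

from langsmith.evaluation import evaluate
from langsmith.schemas import Example, Run


def target(inputs: dict) -> dict:
    return {"output": inputs["question"].strip()}


def multi_metric(run: Run, example: Example) -> list[dict]:
    predicted = run.outputs["output"]
    expected = example.outputs["answer"]
    # One evaluator call produces two feedback entries.
    return [
        {"key": "exact_match", "score": predicted == expected},
        {"key": "length_ok", "score": len(predicted) < 200},
    ]


results = evaluate(
    target,
    data="my-placeholder-dataset",
    evaluators=[multi_metric],
)
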
