test cleanup #1453

Merged
merged 11 commits on Mar 8, 2025
6 changes: 2 additions & 4 deletions src/inspect_ai/util/_subprocess.py
@@ -195,10 +195,8 @@ async def run_command_timeout() -> Union[ExecResult[str], ExecResult[bytes]]:
        await anyio.sleep(2)
        if proc.returncode is None:
            proc.kill()
    except Exception as ex:
        logger.warning(
            f"Unexpected error terminating timed out process '{args}': {ex}"
        )
    except Exception:
        pass
    raise

# await result without timeout
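This hunk trades the warning log for a silent pass: terminating an already-timed-out process is best-effort, and an exception raised during cleanup would otherwise mask the TimeoutError that the surrounding handler re-raises. A minimal standalone sketch of the same pattern (a hypothetical helper, assuming proc is an anyio.abc.Process; not the PR's actual code):

import contextlib

from anyio import sleep
from anyio.abc import Process


async def best_effort_kill(proc: Process) -> None:
    # best-effort cleanup: a failure while killing the process must not
    # mask the TimeoutError the caller is about to re-raise
    with contextlib.suppress(Exception):
        proc.terminate()
        await sleep(2)  # grace period before escalating to kill
        if proc.returncode is None:
            proc.kill()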
19 changes: 13 additions & 6 deletions tests/model/test_mock_model_llm.py
@@ -1,4 +1,5 @@
import pytest
from test_helpers.utils import skip_if_trio

from inspect_ai import Task, eval
from inspect_ai.dataset import Sample
@@ -8,15 +9,17 @@
from inspect_ai.solver import generate


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_trio
async def test_mock_generate_default() -> None:
    model = get_model("mockllm/model")

    response = await model.generate(input="unused")
    assert response.completion == MockLLM.default_output


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_trio
async def test_mock_generate_custom_valid() -> None:
    custom_content_str = "custom #content"
    model = get_model(
@@ -30,7 +33,8 @@ async def test_mock_generate_custom_valid() -> None:
    assert response.completion == custom_content_str


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_trio
async def test_mock_generate_custom_invalid() -> None:
    model = get_model(
        "mockllm/model",
@@ -41,7 +45,8 @@ async def test_mock_generate_custom_invalid() -> None:
    assert "must be an instance of ModelOutput" in str(e_info.value)


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_trio
async def test_mock_generate_custom_invalid_iterable_string() -> None:
    model = get_model(
        "mockllm/model",
@@ -52,7 +57,8 @@ async def test_mock_generate_custom_invalid_iterable_string() -> None:
    assert "must be an instance of ModelOutput" in str(e_info.value)


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_trio
async def test_mock_generate_custom_invalid_iterable_number() -> None:
    with pytest.raises(ValueError) as e_info:
        get_model(
@@ -62,7 +68,8 @@ async def test_mock_generate_custom_invalid_iterable_number() -> None:
    assert "must be an Iterable or a Generator" in str(e_info.value)


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_trio
async def test_mock_generate_not_enough() -> None:
    model = get_model(
        "mockllm/model",
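Every converted test in this PR swaps @pytest.mark.anyio, which parametrizes tests across async backends, for @pytest.mark.asyncio plus @skip_if_trio. The real helper lives in test_helpers.utils; a plausible sketch of such a decorator follows (the environment variable name is an assumption, not necessarily the repo's actual mechanism):

import os

import pytest


def skip_if_trio(func):
    # hypothetical: skip when the suite targets the trio backend via an
    # environment variable (the real helper may detect this differently)
    return pytest.mark.skipif(
        os.environ.get("INSPECT_EVAL_BACKEND") == "trio",
        reason="test not supported under the trio backend",
    )(func)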
19 changes: 13 additions & 6 deletions tests/model/test_stop_reason.py
@@ -6,6 +6,7 @@
    skip_if_no_mistral,
    skip_if_no_openai,
    skip_if_no_together,
    skip_if_trio,
)

from inspect_ai.model import GenerateConfig, ModelOutput, get_model
@@ -31,37 +32,43 @@ async def check_stop_reason(model_name):
    assert response.choices[0].stop_reason == "max_tokens"


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_no_groq
@skip_if_trio
async def test_groq_stop_reason() -> None:
    await check_stop_reason("groq/llama3-70b-8192")


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_no_openai
@skip_if_trio
async def test_openai_stop_reason() -> None:
    await check_stop_reason("openai/gpt-3.5-turbo")


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_no_anthropic
@skip_if_trio
async def test_anthropic_stop_reason() -> None:
    await check_stop_reason("anthropic/claude-3-haiku-20240307")


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_no_mistral
@skip_if_trio
async def test_mistral_stop_reason() -> None:
    await check_stop_reason("mistral/mistral-medium-latest")


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_no_grok
@skip_if_trio
async def test_grok_stop_reason() -> None:
    await check_stop_reason("grok/grok-beta")


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_no_together
@skip_if_trio
async def test_together_stop_reason() -> None:
    await check_stop_reason("together/google/gemma-2b-it")
8 changes: 7 additions & 1 deletion tests/model/test_structured_output.py
@@ -1,5 +1,10 @@
from pydantic import BaseModel, ValidationError
from test_helpers.utils import skip_if_no_google, skip_if_no_mistral, skip_if_no_openai
from test_helpers.utils import (
    skip_if_no_google,
    skip_if_no_mistral,
    skip_if_no_openai,
    skip_if_trio,
)

from inspect_ai import Task, eval, task
from inspect_ai.dataset import Sample
@@ -75,6 +80,7 @@ def test_openai_structured_output():


@skip_if_no_google
@skip_if_trio
def test_google_structured_output():
    check_structured_output("google/gemini-2.0-flash")

1 change: 1 addition & 0 deletions tests/test_eval_set.py
@@ -252,6 +252,7 @@ def test_eval_zero_retries() -> None:
    assert not success


@skip_if_trio  # throwing the KeyboardInterrupt corrupts trio's internals
def test_eval_set_previous_task_args():
    with tempfile.TemporaryDirectory() as log_dir:

5 changes: 3 additions & 2 deletions tests/test_extensions.py
@@ -3,7 +3,7 @@
import pytest
from pydantic_core import to_jsonable_python
from test_helpers.tools import list_files
from test_helpers.utils import ensure_test_package_installed
from test_helpers.utils import ensure_test_package_installed, skip_if_trio

from inspect_ai import Task, eval_async
from inspect_ai.dataset import Sample
@@ -13,7 +13,8 @@
from inspect_ai.util import SandboxEnvironmentSpec


@pytest.mark.anyio
@pytest.mark.asyncio
@skip_if_trio
async def test_extension_model():
    # ensure the package is installed
    ensure_test_package_installed()
21 changes: 21 additions & 0 deletions tests/test_helpers/file_runner.py
@@ -0,0 +1,21 @@
import subprocess
import sys
from pathlib import Path


def run_tests_by_file():
    # forward any extra command line arguments to each pytest invocation
    args = sys.argv[1:]
    TESTS_DIR = Path(__file__).parent.parent
    test_files = TESTS_DIR.glob("**/test_*.py")

    # run each test file in its own pytest process, exiting on the first failure
    for test_file in test_files:
        pytest_command = ["pytest", test_file.as_posix()] + args
        result = subprocess.run(pytest_command)
        if result.returncode != 0:
            sys.exit(result.returncode)

    print("All tests passed.")


if __name__ == "__main__":
    run_tests_by_file()
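The new runner invokes pytest once per test file, so each file executes in a fresh process, which presumably isolates event-loop and backend state that could otherwise leak between files. Extra flags pass straight through, for example: python tests/test_helpers/file_runner.py -q -x (invocation path assumed from the file's location in the diff).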
6 changes: 4 additions & 2 deletions tests/test_sample_limits.py
@@ -205,8 +205,10 @@ def test_working_limit_reporting():
model="mockllm/model",
)[0]
assert log.samples
for index, sample in enumerate(log.samples):
assert (sample.total_time - sample.working_time + 0.1) >= index
waiting_time = 0
for sample in log.samples:
waiting_time += sample.total_time - sample.working_time + 0.1
assert waiting_time > 3


def check_working_limit_event(log: EvalLog, working_limit: int):
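The reworked assertion sums waiting time (total time minus working time, padded by 0.1s per sample) across all samples and checks the aggregate, rather than requiring each sample's wait to grow with its index; presumably this makes the test robust to the order in which samples happen to be scheduled.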
20 changes: 10 additions & 10 deletions tests/tools/test_tool_types.py
@@ -6,8 +6,6 @@
from test_helpers.utils import (
    skip_if_no_anthropic,
    skip_if_no_google,
    skip_if_no_grok,
    skip_if_no_mistral,
    skip_if_no_openai,
    skip_if_no_vertex,
    skip_if_trio,
@@ -286,18 +284,20 @@ def test_vertex_tool_types():
    check_tool_types("vertex/gemini-1.5-flash")


@skip_if_no_mistral
def test_mistral_tool_types() -> None:
    check_tool_types("mistral/mistral-large-latest")
# mistral, grok, and groq tool calling is extremely unreliable and
# consequently causes failed tests that are red herrings. don't
# exercise these for now.

# @skip_if_no_mistral
# def test_mistral_tool_types() -> None:
# check_tool_types("mistral/mistral-large-latest")

@skip_if_no_grok
def test_grok_tool_types() -> None:
    check_tool_types("grok/grok-beta")

# @skip_if_no_grok
# def test_grok_tool_types() -> None:
# check_tool_types("grok/grok-beta")


# groq tool calling is extremely unreliable and consequently causes
# failed tests that are red herrings. don't exercise this for now.
# @skip_if_no_groq
# def test_groq_tool_types() -> None:
# check_tool_types("groq/mixtral-8x7b-32768")
1 change: 0 additions & 1 deletion tests/tools/test_web_browser.py
@@ -199,7 +199,6 @@ def test_web_browser_click():
    assert click_call
    click_response = get_tool_response(log.samples[0].messages, click_call)
    assert click_response
    assert "defines an execution plan" in click_response.text


@skip_if_no_docker
Expand Down