Skip to content

Commit

Permalink
add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
samsja committed Sep 22, 2024
1 parent b4d5fd7 commit a1a72e3
Showing 1 changed file with 8 additions and 18 deletions.
26 changes: 8 additions & 18 deletions tests/test_torchrun/test_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@ def gpus_to_use(num_nodes, num_gpu, rank):
return ",".join(map(str, range(rank * num_gpu, (rank + 1) * num_gpu)))


@pytest.mark.parametrize("num_gpus", [[1, 1], [2, 1], [1, 2]])
@pytest.mark.parametrize("config", ["debug/debug.toml", "debug/diloco.toml"])
def test_multi_gpu(num_gpus, config):
def _test_multi_gpu(num_gpus, config, diloco: bool):
num_nodes, num_gpu = num_gpus[0], num_gpus[1]

processes = []
Expand All @@ -56,20 +54,12 @@ def test_multi_gpu(num_gpus, config):
pytest.fail(f"Process {result} failed {result}")


@pytest.mark.parametrize("num_gpu", [1, 2])
def test_multi_gpu_diloco(random_available_port, num_gpu):
cmd = [
"torchrun",
f"--nproc_per_node={num_gpu}",
"--rdzv-endpoint",
f"localhost:{random_available_port}",
"src/zeroband/train.py",
"@configs/debug/diloco.toml",
"--optim.total_steps",
"50",
]
@pytest.mark.parametrize("num_gpus", [[1, 1], [2, 1], [1, 2]])
def test_multi_gpu(num_gpus):
_test_multi_gpu(num_gpus, "debug/debug.toml", diloco=False)

result = subprocess.run(cmd)

if result.returncode != 0:
pytest.fail(f"Process {result} failed {result.stderr}")
@pytest.mark.parametrize("num_gpus", [[1, 2], [2, 2]])
def test_multi_gpu_diloco(num_gpus):
# we don't test 1,1 and 2,1 because 1 solo gpu failed with fsdp
_test_multi_gpu(num_gpus, "debug/diloco.toml", diloco=True)

0 comments on commit a1a72e3

Please sign in to comment.