diff --git a/tests/test_torchrun/test_train.py b/tests/test_torchrun/test_train.py
index 8e98716a..7bb545dd 100644
--- a/tests/test_torchrun/test_train.py
+++ b/tests/test_torchrun/test_train.py
@@ -28,9 +28,7 @@ def gpus_to_use(num_nodes, num_gpu, rank):
     return ",".join(map(str, range(rank * num_gpu, (rank + 1) * num_gpu)))
 
 
-@pytest.mark.parametrize("num_gpus", [[1, 1], [2, 1], [1, 2]])
-@pytest.mark.parametrize("config", ["debug/debug.toml", "debug/diloco.toml"])
-def test_multi_gpu(num_gpus, config):
+def _test_multi_gpu(num_gpus, config, diloco: bool):
     num_nodes, num_gpu = num_gpus[0], num_gpus[1]
 
     processes = []
@@ -56,20 +54,12 @@ def test_multi_gpu(num_gpus, config):
         pytest.fail(f"Process {result} failed {result}")
 
 
-@pytest.mark.parametrize("num_gpu", [1, 2])
-def test_multi_gpu_diloco(random_available_port, num_gpu):
-    cmd = [
-        "torchrun",
-        f"--nproc_per_node={num_gpu}",
-        "--rdzv-endpoint",
-        f"localhost:{random_available_port}",
-        "src/zeroband/train.py",
-        "@configs/debug/diloco.toml",
-        "--optim.total_steps",
-        "50",
-    ]
+@pytest.mark.parametrize("num_gpus", [[1, 1], [2, 1], [1, 2]])
+def test_multi_gpu(num_gpus):
+    _test_multi_gpu(num_gpus, "debug/debug.toml", diloco=False)
 
-    result = subprocess.run(cmd)
-
-    if result.returncode != 0:
-        pytest.fail(f"Process {result} failed {result.stderr}")
+
+@pytest.mark.parametrize("num_gpus", [[1, 2], [2, 2]])
+def test_multi_gpu_diloco(num_gpus):
+    # we don't test [1, 1] or [2, 1] because a single GPU fails with FSDP
+    _test_multi_gpu(num_gpus, "debug/diloco.toml", diloco=True)