simplify setup
fnattino committed Jun 27, 2024
1 parent: de84ec8 · commit: 7011183
Showing 5 changed files with 60 additions and 41 deletions.
15 changes: 4 additions & 11 deletions config/dask/jobqueue.yaml
@@ -11,16 +11,9 @@ jobqueue:
     local-directory: '$TMPDIR'    # Location of fast local storage like /scratch or $TMPDIR
     walltime: '1:00:00'
     scheduler-options:
-      port: 8786
-      host: 'localhost'
-      # contact_address: 'tcp://${SLURM_LOGIN_NODE}:8787'
-      # preload: ${DASK_CONFIG}/scheduler-setup.py
+      host: '0.0.0.0'
     job_script_prologue:
       - 'APPTAINER_TMPDIR=${TMPDIR}'
-      - 'WORKER_PORT=`shuf -i 8400-9400 -n 1`'
+      - 'WORKER_PORT=`shuf -i 8900-9900 -n 1`'
     worker_extra_args:
-      - '--listen-address'
-      - 'tcp://0.0.0.0:${WORKER_PORT}'
-      - '--contact-address'
-      - 'tcp://localhost:${WORKER_PORT}'
-    # python: 'apptainer exec ${APPTAINER_IMAGE} python'
+      - '--worker-port'
+      - '${WORKER_PORT}'
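With these defaults in place, starting the cluster from the client side reduces to instantiating `SLURMCluster` with no arguments. A minimal sketch, assuming the section above sits under `jobqueue.slurm` as dask-jobqueue expects:

```
from dask_jobqueue import SLURMCluster

# Defaults are read from config/dask/jobqueue.yaml: the scheduler binds to
# 0.0.0.0, and each worker listens on the random port drawn by `shuf` in
# job_script_prologue, passed on via --worker-port.
cluster = SLURMCluster()
cluster.scale(jobs=2)  # submit two SLURM jobs, each starting one worker
```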
47 changes: 37 additions & 10 deletions config/dask/scheduler-setup.py
@@ -21,27 +21,52 @@
 class SchedulerPluginForRemoteWorkers(SchedulerPlugin):
     """
     Scheduler plugin to connect to workers running on a remote
-    SLURM system. When a new worker is added to the cluster,
+    SLURM system. When a new worker is added to the cluster,
     the port to which the worker binds is forwarded locally.
+
+    Note, however, that the workers will still advertise their
+    "real" address, which is likely on the private network of the
+    SLURM cluster (10.X.X.X). For the scheduler to reach them, we
+    thus need to redirect all traffic towards these IP addresses to
+    localhost, e.g. with the following iptables rule (replace
+    XXXX:YYYY with the range of ports used by the workers, e.g.
+    8900:9900 - see `jobqueue.yaml` for the range employed):
+
+    ```
+    sudo iptables -t nat -I OUTPUT --dst 10.0.0.0/16 -p tcp --match multiport --dports XXXX:YYYY -j REDIRECT
+    ```
+
+    Also note that the workers are assumed to be able to reach the
+    scheduler: the port to which the scheduler binds should be open
+    on the machine where the scheduler runs.
     """
-    def __init__(self, slurm_user=None, slurm_host=None):
+    def __init__(self, slurm_user=None, slurm_host=None, slurm_ssh_key_filename=None):
 
-        slurm_user = os.environ.get('SLURM_USER', None) if slurm_user is None else slurm_user
-        slurm_host = os.environ.get('SLURM_HOST', None) if slurm_host is None else slurm_host
+        slurm_user = os.environ.get('SLURM_USER', None) \
+            if slurm_user is None else slurm_user
+        slurm_host = os.environ.get('SLURM_HOST', None) \
+            if slurm_host is None else slurm_host
 
+        # SSH key is not required, we may be using an SSH agent
         if (slurm_user is None) or (slurm_host is None):
             raise ValueError('Provide SLURM user and host names')
 
+        slurm_ssh_key_filename = os.environ.get('SLURM_SSH_KEY_FILENAME', None) \
+            if slurm_ssh_key_filename is None else slurm_ssh_key_filename
+        connect_kwargs = {"key_filename": slurm_ssh_key_filename} \
+            if slurm_ssh_key_filename is not None else None
+
         self.forward_locals = {}
-        self.connection = fabric.Connection(user=slurm_user, host=slurm_host)
+        self.connection = fabric.Connection(
+            user=slurm_user, host=slurm_host, connect_kwargs=connect_kwargs,
+        )
         super().__init__()

     def add_worker(self, scheduler, worker):
         """
         When a worker starts, set up local forwarding of the worker's port.
         """
-        nanny = scheduler.workers[worker].nanny
-        host, _ = get_address_host_port(nanny)
-        _, port = get_address_host_port(worker)
+        host, port = get_address_host_port(worker)
         forward_local = self.connection.forward_local(
             remote_port=port,
             local_port=port,
@@ -66,6 +91,8 @@ def remove_worker(self, scheduler, worker, *, stimulus_id, **kwargs):
 @click.command()
 @click.option("--slurm-user", type=str)
 @click.option("--slurm-host", type=str)
+@click.option("--slurm-ssh-key-filename", type=str)
-def dask_setup(scheduler, slurm_user=None, slurm_host=None):
-    plugin = SchedulerPluginForRemoteWorkers(slurm_user, slurm_host)
+def dask_setup(scheduler, slurm_user=None, slurm_host=None, slurm_ssh_key_filename=None):
+    plugin = SchedulerPluginForRemoteWorkers(slurm_user, slurm_host, slurm_ssh_key_filename)
     scheduler.add_plugin(plugin)
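For reference, the per-worker forwarding the plugin manages is fabric's standard `Connection.forward_local`. A hand-rolled sketch of the same mechanism, with placeholder user, host, and port (not values from the repo):

```
import fabric

# Placeholder credentials; the plugin reads the port from the worker's
# advertised address instead of hard-coding it.
conn = fabric.Connection(user="someuser", host="login.example.org")
with conn.forward_local(local_port=8900, remote_port=8900):
    # While this block is active, tcp://localhost:8900 on this machine
    # reaches port 8900 on the SLURM login node, where a worker listens.
    ...
```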

12 changes: 11 additions & 1 deletion notebooks/load_daymet_dcache.ipynb
@@ -51,6 +51,16 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b547eeb-5ca8-40af-a463-b3263cdb7b6e",
"metadata": {},
"outputs": [],
"source": [
"ds"
]
},
{
"cell_type": "markdown",
"id": "75df734f-03c9-41fb-b0cc-05dfb1736196",
@@ -109,7 +119,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.7"
+   "version": "3.11.8"
   }
  },
 "nbformat": 4,
18 changes: 5 additions & 13 deletions run-jupyter.bsh
@@ -5,17 +5,10 @@ source var

 # Start SSH agent and add key to connect to SLURM system
 eval `ssh-agent -s`
-ssh-add ${SLURM_SSH_PRIVATE_KEY}
+ssh-add ${SLURM_SSH_KEY}
 
-# Remote forwarding of the port used by the scheduler. This is so that workers can connect to the
-# scheduler. By default, remote hosts other than localhost are not allowed to connect to the forwarded port.
-# Either set "GatewayPorts yes" in /etc/ssh/sshd_config or run the command on the remote to bind to
-# all interfaces (however, **it needs to use a different port**) - see https://iximiuz.com/en/posts/ssh-tunnels/
-ssh -A -tt -f -M \
-  -S /tmp/.ssh-slurm \
-  -R 0.0.0.0:${LOCAL_DASK_SCHEDULER_PORT}:localhost:${LOCAL_DASK_SCHEDULER_PORT} \
-  ${SLURM_USER}@${SLURM_HOST} \
-  "ssh -g -N -L ${SLURM_DASK_SCHEDULER_PORT}:localhost:${LOCAL_DASK_SCHEDULER_PORT} localhost"
+# Define the address at which to contact the scheduler
+HOST_PUBLIC=`curl ifconfig.me`
 
 # Export the socket location to the container (also, need to bind /tmp);
 # this is needed to submit jobs to the SLURM system and to locally forward the workers' ports
@@ -25,7 +18,8 @@ export SLURM_USER
 export SLURM_HOST
 # Dask settings
 export DASK_CONFIG="${PWD}/config/dask"
-export DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__CONTACT_ADDRESS="tcp://${SLURM_LOGIN_NODE}:${SLURM_DASK_SCHEDULER_PORT}"
+export DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__PORT="8786"  # this port should be open for the workers to reach the scheduler
+export DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__CONTACT_ADDRESS="tcp://${HOST_PUBLIC}:${DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__PORT}"
 export DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__PRELOAD="${DASK_CONFIG}/scheduler-setup.py"
 export DASK_JOBQUEUE__SLURM__PYTHON="apptainer exec ${APPTAINER_IMAGE} python"
 # Enable aliases for SLURM commands
@@ -41,5 +35,3 @@ apptainer -d exec -B /tmp:/tmp ${APPTAINER_IMAGE} jupyter lab --no-browser --por
 # kill the SSH agent
 kill ${SSH_AGENT_PID}
 
-# Close remote forwarding, also killing the remote SSH command
-ssh -S /tmp/.ssh-slurm -O exit ${SLURM_USER}@${SLURM_HOST}
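The `DASK_JOBQUEUE__...` exports above rely on dask's standard environment-variable configuration, where double underscores map to nested config keys. A quick sanity check from inside the container (a sketch; dask normalizes underscores and hyphens in key names, so the spelling below is assumed interchangeable with `scheduler-options`):

```
import dask.config

dask.config.refresh()  # re-read the DASK_* environment variables
print(dask.config.get("jobqueue.slurm.scheduler_options.contact_address"))
# e.g. 'tcp://<public-ip>:8786', matching the CONTACT_ADDRESS export above
print(dask.config.get("jobqueue.slurm.python"))
# 'apptainer exec <image> python'
```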
9 changes: 3 additions & 6 deletions var.template
@@ -1,8 +1,5 @@
 APPTAINER_IMAGE="oras://ghcr.io/rs-dat/jupyterdaskcloud2cluster:latest"
-LOCAL_DASK_SCHEDULER_PORT="8786"
-SLURM_DASK_SCHEDULER_PORT="8787" # should differ from the local one, both should be available on SLURM login node
-SLURM_USERNAME="login-name"
-SLURM_SSH_PRIVATE_KEY="/path/to/the/ssh/key"
-SLURM_LOGIN_NODE="ui-01"
-SLURM_HOST="${SLURM_LOGIN_NODE}.spider.surfsara.nl"
+SLURM_USER="login-name"
+SLURM_SSH_KEY="/path/to/the/ssh/key"
+SLURM_HOST="hostname.surf.nl"
 DCACHE_TOKEN="paste-the-macaroon-here"
