From 7011183446159984fbf21fde6241c4b2c2765800 Mon Sep 17 00:00:00 2001
From: Francesco Nattino
Date: Thu, 27 Jun 2024 14:19:59 +0200
Subject: [PATCH] simplify setup

---
 config/dask/jobqueue.yaml          | 15 +++-------
 config/dask/scheduler-setup.py     | 58 ++++++++++++++++++++++++------
 notebooks/load_daymet_dcache.ipynb | 12 +++++++-
 run-jupyter.bsh                    | 18 ++++--------
 var.template                       |  9 ++----
 5 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/config/dask/jobqueue.yaml b/config/dask/jobqueue.yaml
index 1697a9c..af2b8ea 100644
--- a/config/dask/jobqueue.yaml
+++ b/config/dask/jobqueue.yaml
@@ -11,16 +11,9 @@ jobqueue:
     local-directory: '$TMPDIR'  # Location of fast local storage like /scratch or $TMPDIR
     walltime: '1:00:00'
     scheduler-options:
-      port: 8786
-      host: 'localhost'
-#      contact_address: 'tcp://${SLURM_LOGIN_NODE}:8787'
-#      preload: ${DASK_CONFIG}/scheduler-setup.py
+      host: '0.0.0.0'
     job_script_prologue:
-      - 'APPTAINER_TMPDIR=${TMPDIR}'
-      - 'WORKER_PORT=`shuf -i 8400-9400 -n 1`'
+      - 'WORKER_PORT=`shuf -i 8900-9900 -n 1`'
     worker_extra_args:
-      - '--listen-address'
-      - 'tcp://0.0.0.0:${WORKER_PORT}'
-      - '--contact-address'
-      - 'tcp://localhost:${WORKER_PORT}'
-#    python: 'apptainer exec ${APPTAINER_IMAGE} python'
+      - '--worker-port'
+      - '${WORKER_PORT}'
diff --git a/config/dask/scheduler-setup.py b/config/dask/scheduler-setup.py
index 296a3e9..60fe8da 100644
--- a/config/dask/scheduler-setup.py
+++ b/config/dask/scheduler-setup.py
@@ -21,27 +21,63 @@ class SchedulerPluginForRemoteWorkers(SchedulerPlugin):

     """
     Scheduler plugin to connect to workers running on a remote
-    SLURM system. When a new worker is added to the cluster, 
+    SLURM system. When a new worker is added to the cluster,
     the port to which the worker binds is forwarded locally.
+
+    Note, however, that the workers will still advertise their
+    "real" address, which is likely on the private network of the
+    SLURM cluster (10.X.X.X). For the scheduler to reach them, we
+    thus need to redirect all traffic towards these IP addresses to
+    localhost, e.g. with the following iptables rule (replace
+    XXXX:YYYY with the range of ports used by the workers, e.g.
+    8900:9900 - see `jobqueue.yaml` for the range employed):
+
+    ```
+    sudo iptables -t nat -I OUTPUT --dst 10.0.0.0/16 -p tcp --match multiport --dports XXXX:YYYY -j REDIRECT
+    ```
+
+    Also note that it is assumed that the workers can reach the
+    scheduler: the port to which the scheduler binds should be open
+    on the machine where the scheduler runs.
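+
+    A minimal sketch of loading this plugin by hand (it is normally
+    loaded through the scheduler `preload` option set in
+    `run-jupyter.bsh`; the user and host values are placeholders):
+
+    ```
+    plugin = SchedulerPluginForRemoteWorkers(
+        slurm_user="login-name", slurm_host="hostname.surf.nl",
+    )
+    scheduler.add_plugin(plugin)
+    ```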
""" - def __init__(self, slurm_user=None, slurm_host=None): + def __init__(self, slurm_user=None, slurm_host=None, slurm_ssh_key_filename=None): + + slurm_user = os.environ.get('SLURM_USER', None) \ + if slurm_user is None else slurm_user + slurm_host = os.environ.get('SLURM_HOST', None) \ + if slurm_host is None else slurm_host - slurm_user = os.environ.get('SLURM_USER', None) if slurm_user is None else slurm_user - slurm_host = os.environ.get('SLURM_HOST', None) if slurm_host is None else slurm_host + # SSH key is not required, we can be using a SSH agent if (slurm_user is None) or (slurm_host is None): raise ValueError('Provide SLURM user and host names') + slurm_ssh_key_filename = os.environ.get('SLURM_SSH_KEY_FILENAME', None) \ + if slurm_ssh_key_filename is None else slurm_ssh_key_filename + connect_kwargs = {"key_filename": slurm_ssh_key_filename} \ + if slurm_ssh_key_filename is not None else None + self.forward_locals = {} - self.connection = fabric.Connection(user=slurm_user, host=slurm_host) + self.connection = fabric.Connection( + user=slurm_user, host=slurm_host, connect_kwargs=connect_kwargs, + ) super().__init__() def add_worker(self, scheduler, worker): """ When a worker starts, local forwarding of the worker's port. """ - nanny = scheduler.workers[worker].nanny - host, _ = get_address_host_port(nanny) - _, port = get_address_host_port(worker) + host, port = get_address_host_port(worker) forward_local = self.connection.forward_local( remote_port=port, local_port=port, @@ -66,6 +91,8 @@ def remove_worker(self, scheduler, worker, *, stimulus_id, **kwargs): @click.command() @click.option("--slurm-user", type=str) @click.option("--slurm-host", type=str) -def dask_setup(scheduler, slurm_user=None, slurm_host=None): - plugin = SchedulerPluginForRemoteWorkers(slurm_user, slurm_host) +@click.option("--slurm-ssh-key-filename", type=str) +def dask_setup(scheduler, slurm_user=None, slurm_host=None, slurm_ssh_key_filename=None): + plugin = SchedulerPluginForRemoteWorkers(slurm_user, slurm_host, slurm_ssh_key_filename) scheduler.add_plugin(plugin) + diff --git a/notebooks/load_daymet_dcache.ipynb b/notebooks/load_daymet_dcache.ipynb index 0a2d0ca..610b186 100644 --- a/notebooks/load_daymet_dcache.ipynb +++ b/notebooks/load_daymet_dcache.ipynb @@ -51,6 +51,16 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b547eeb-5ca8-40af-a463-b3263cdb7b6e", + "metadata": {}, + "outputs": [], + "source": [ + "ds" + ] + }, { "cell_type": "markdown", "id": "75df734f-03c9-41fb-b0cc-05dfb1736196", @@ -109,7 +119,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/run-jupyter.bsh b/run-jupyter.bsh index ae382d2..74ed13d 100644 --- a/run-jupyter.bsh +++ b/run-jupyter.bsh @@ -5,17 +5,10 @@ source var # Start SSH agent and add key to connect to SLURM system eval `ssh-agent -s` -ssh-add ${SLURM_SSH_PRIVATE_KEY} +ssh-add ${SLURM_SSH_KEY} -# Remote forwarding of the port used by the scheduler. This is so that workers can connect to the -# scheduler. By default remote hosts other than localhost are not allowed to the forwarded port. 
-# Either set "GatewayPorts yes" in /etc/ssh/sshd_config or run the command on remote to bind to -# all interfaces (however, **it needs to use a different port**) - see https://iximiuz.com/en/posts/ssh-tunnels/ -ssh -A -tt -f -M \ - -S /tmp/.ssh-slurm \ - -R 0.0.0.0:${LOCAL_DASK_SCHEDULER_PORT}:localhost:${LOCAL_DASK_SCHEDULER_PORT} \ - ${SLURM_USER}@${SLURM_HOST} \ - "ssh -g -N -L ${SLURM_DASK_SCHEDULER_PORT}:localhost:${LOCAL_DASK_SCHEDULER_PORT} localhost" +# Define address where to contact the scheduler +HOST_PUBLIC=`curl ifconfig.me` # Export the socket location to the container (also, need to bind /tmp) # this will be needed to submit jobs to the SLURM system and to local forward worker's ports @@ -25,7 +18,8 @@ export SLURM_USER export SLURM_HOST # Dask settings export DASK_CONFIG="${PWD}/config/dask" -export DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__CONTACT_ADDRESS="tcp://${SLURM_LOGIN_NODE}:${SLURM_DASK_SCHEDULER_PORT}" +export DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__PORT="8786" # port should be open for the workers to reach the scheduler +export DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__CONTACT_ADDRESS="tcp://${HOST_PUBLIC}:${DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__PORT}" export DASK_JOBQUEUE__SLURM__SCHEDULER_OPTIONS__PRELOAD="${DASK_CONFIG}/scheduler-setup.py" export DASK_JOBQUEUE__SLURM__PYTHON="apptainer exec ${APPTAINER_IMAGE} python" # Enable aliases for SLURM commands @@ -41,5 +35,3 @@ apptainer -d exec -B /tmp:/tmp ${APPTAINER_IMAGE} jupyter lab --no-browser --por # kill the SSH agent kill ${SSH_AGENT_PID} -# Close remote forwarding, also killing remote SSH commmand -ssh -S /tmp/.ssh-slurm -O exit ${SLURM_USER}@${SLURM_HOST} diff --git a/var.template b/var.template index ddc9625..b3f14a7 100644 --- a/var.template +++ b/var.template @@ -1,8 +1,5 @@ APPTAINER_IMAGE="oras://ghcr.io/rs-dat/jupyterdaskcloud2cluster:latest" -LOCAL_DASK_SCHEDULER_PORT="8786" -SLURM_DASK_SCHEDULER_PORT="8787" # should differ from the local one, both should be available on SLURM login node -SLURM_USERNAME="login-name" -SLURM_SSH_PRIVATE_KEY="/path/to/the/ssh/key" -SLURM_LOGIN_NODE="ui-01" -SLURM_HOST="${SLURM_LOGIN_NODE}.spider.surfsara.nl" +SLURM_USER="login-name" +SLURM_SSH_KEY="/path/to/the/ssh/key" +SLURM_HOST="hostname.surf.nl" DCACHE_TOKEN="paste-the-macaroon-here"
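
For reference, a minimal sketch (assumed usage, not part of the patch) of how the
simplified setup is consumed from the Jupyter session started by `run-jupyter.bsh`,
assuming the `DASK_JOBQUEUE__SLURM__*` variables exported above are visible to the
notebook kernel:

```
# A rough sketch: dask-jobqueue reads its defaults from jobqueue.yaml and the
# DASK_JOBQUEUE__SLURM__* environment variables, so the scheduler binds to
# 0.0.0.0:8786, advertises the HOST_PUBLIC address to the workers, and loads
# scheduler-setup.py as a preload.
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster()  # no arguments: everything comes from the config
cluster.scale(jobs=2)     # workers bind to a port in the 8900-9900 range
client = Client(cluster)
```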