diff --git a/scripts/user_setup.sh b/scripts/user_setup.sh deleted file mode 100755 index c7ce97c..0000000 --- a/scripts/user_setup.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/bash - -# bash script to setup user account for htr2hpc pilot integration -# - adds ssh key to authorized keys -# - conda env setup -# - create htr2hpc working directory in scratch - -echo "Setting up your account for htr2hpc ...." -echo "This process may take at least five minutes. Please do not exit until the process completes." - -# ensure ssh directory exists -if [ ! -d "$HOME/.ssh" ]; then - echo "Creating $HOME/.ssh directory" - mkdir ~/.ssh -fi - -# add test-htr public key to authorized keys if not already present -ssh_key='ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJzoR8jstrofzFKVoiXSFP5jGw/WbXHxFyIaS5b4vSWC test-htr.lib.princeton.edu' -if ! grep -q "$ssh_key" $HOME/.ssh/authorized_keys; then - echo "Adding htr2hpc ssh key to authorized keys" - echo $ssh_key >> ~/.ssh/authorized_keys -else - echo "ssh key is already in authorized keys" -fi - -# create conda environment named htr2hpc -conda_env_name=htr2hpc -module load anaconda3/2024.2 -if { conda env list | grep $conda_env_name; } >/dev/null 2>&1; then - echo "htr2hpc conda env already exists" -else - echo "Creating conda environment and installing dependencies" - cd /scratch/gpfs/rkoeser/htr2hpc_setup/kraken - conda env create -f environment_cuda.yml -n $conda_env_name - conda activate $conda_env_name - pip install -q torchvision torch==2.1 torchaudio==2.1 - pip install -q git+https://github.com/Princeton-CDH/htr2hpc.git@develop#egg=htr2hpc - # go back to previous directory - cd - -fi - -htrworkingdir=/scratch/gpfs/$USER/htr2hpc -# create working directory -if [ ! -d $htrworkingdir ]; then - echo "Creating htr2hpc working directory in scratch: $htrworkingdir" - mkdir $htrworkingdir -else - echo "htr2hpc scratch working directory already exists: $htrworkingdir" -fi - -echo "Setup complete! 
@shared_task(default_retry_delay=60 * 60, bind=True)
def hpc_user_setup(self, user_pk=None):
    """Run the htr2hpc user setup script on the HPC host as the given user.

    Copies the bundled ``user_setup.sh`` to the remote host over ssh, runs it
    with ``--skip-ssh-setup --reinstall-htr2hpc``, removes the uploaded copy,
    and reports the outcome through user notifications and, when one exists
    for this task id, the task report.

    :param user_pk: primary key of the user whose remote account is set up;
        the task logs an error and returns if no such user exists.
    :raises OSError: re-raised after notifying the user when the ssh
        connection cannot be established (timeout, unreachable host).
    """
    try:
        user = User.objects.get(pk=user_pk)
    except User.DoesNotExist:
        # error / bail out
        logger.error(f"hpc_user_setup called with invalid user_pk {user_pk}")
        return

    # by default, escriptorium reporting code attaches signal handlers
    # that should create a task group and task report for this task id
    TaskReport = apps.get_model("reporting", "TaskReport")
    # don't error if the task report can't be found
    task_report = TaskReport.objects.filter(task_id=self.request.id).first()

    def report(message):
        # task report is optional; only record output when one was found
        if task_report:
            task_report.append(message)

    # hostname and ssh key path set in django config
    logger.debug(
        f"Connecting to {settings.HPC_HOSTNAME} as {user.username} with keyfile {settings.HPC_SSH_KEYFILE}"
    )

    # bash setup script is included with this package
    user_setup_script = settings.HTR2HPC_INSTALL_DIR / "train" / "user_setup.sh"
    user.notify(
        "Running user setup script, on first run this may take a while...",
        id="htr2hpc-setup-start",
    )
    try:
        with Connection(
            host=settings.HPC_HOSTNAME,
            user=user.username,
            connect_timeout=10,
            connect_kwargs={"key_filename": settings.HPC_SSH_KEYFILE},
        ) as conn:
            # copy setup script to server
            conn.put(user_setup_script)
            # run the script with options; skip ssh setup (must already be
            # setup for this task to run) and ensure htr2hpc install is
            # up to date
            setup_cmd = (
                f"./{user_setup_script.name} --skip-ssh-setup --reinstall-htr2hpc"
            )
            # document setup command options in task report
            report(f"Running setup script:\n {setup_cmd}\n\n")

            try:
                result = conn.run(setup_cmd)
            finally:
                # always remove the uploaded script, even when the script
                # itself failed; warn=True so a failed cleanup does not
                # raise UnexpectedExit (if user clicks the button twice
                # the file may already be removed)
                conn.run(f"rm -f ./{user_setup_script.name}", warn=True)

        # script output is stored in result.stdout/result.stderr
        report(f"\n\nsetup script output:\n\n{result.stdout}\n\n{result.stderr}\n\n")
        # log script output for debugging
        logger.debug(f"user setup script output:\n{result.stdout}")

        if "Setup complete" in result.stdout:
            user.notify(
                "Remote setup completed",
                id="htr2hpc-setup-success",
                level="success",
            )
        else:
            # script exited cleanly but never printed its completion marker;
            # tell the user rather than leaving them waiting for a result
            logger.error("user setup script finished without 'Setup complete'")
            user.notify(
                "Remote setup did not report successful completion",
                id="setup-error",
                level="danger",
            )
    except AuthenticationException as err:
        error_message = f"Authentication exception to remote connection: {err}"
        logger.error(error_message)
        report(error_message)
        # notify the user of the error
        user.notify(
            "Authentication failed; check that your account on della is set up for remote access",
            id="setup-error",
            level="danger",
        )
    except UnexpectedExit as err:
        error_message = f"Error running remote setup script: {err}"
        logger.error(error_message)
        report(error_message)
        user.notify(
            "Something went wrong running remote user setup",
            id="setup-error",
            level="danger",
        )
    except OSError as err:
        # connection timeouts and unreachable hosts surface as OSError;
        # notify the user, then re-raise so the task is still marked failed
        error_message = f"Error connecting to {settings.HPC_HOSTNAME}: {err}"
        logger.error(error_message)
        report(error_message)
        user.notify(
            "Could not connect to the remote server; please try again later",
            id="setup-error",
            level="danger",
        )
        raise
+

HPC account management

+

First time setup

+

To enable key-based ssh access to your account from the eScriptorium server, follow the three steps below. (This is a one-time process.)

+
    +
  1. Ensure that you are either on campus and connected to the University wifi, or connected to the Princeton network through the VPN like so.
  2. +
  3. Open Terminal or Command Line on your computer, paste in the code below, and press enter. You will be prompted for your NetID password.
    + ssh {{ request.user.username }}@della.princeton.edu 'mkdir -p ~/.ssh && echo "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJzoR8jstrofzFKVoiXSFP5jGw/WbXHxFyIaS5b4vSWC test-htr.lib.princeton.edu" >> ~/.ssh/authorized_keys' +
  4. + +
  5. Click the "HPC setup" button below for initial environment setup. This step may take a few minutes to complete -- a notification should appear when it has finished. After this, your account will be prepared to run training tasks on Princeton's HPC.
    +
    + {% csrf_token %} + +
    +
  6. +
+

HPC account updates

+

This instance of eScriptorium is currently being tested and occasional updates might be necessary. When prompted to update your HPC account, simply click the "HPC setup" button above to automatically run the update process.

+
#!/bin/bash

# bash script to set up a user account for the htr2hpc pilot integration
# - adds ssh key to authorized keys (unless --skip-ssh-setup is given)
# - conda env setup (reinstalls htr2hpc when --reinstall-htr2hpc is given)
# - create htr2hpc working directory in scratch
#
# The script exits non-zero as soon as a required step fails, so the final
# "Setup complete" message is only printed when setup actually succeeded.

# defaults
ssh_setup=true
reinstall_htr2hpc=false

# supported options:
#   --skip-ssh-setup
#   --reinstall-htr2hpc
for arg in "$@"; do
    case "$arg" in
        --skip-ssh-setup) ssh_setup=false ;;
        --reinstall-htr2hpc) reinstall_htr2hpc=true ;;
    esac
done

# print an error message to stderr and stop with a failing exit code
fail() {
    echo "Error: $1" >&2
    exit 1
}

echo "Setting up your account for htr2hpc ...."
echo "This process may take five minutes or more on first run. Do not exit until the process completes."


# skip ssh setup if --skip-ssh-setup is specified
if $ssh_setup; then
    # ensure ssh directory exists (owner-only permissions, as sshd expects)
    if [ ! -d "$HOME/.ssh" ]; then
        echo "Creating $HOME/.ssh directory"
        mkdir -p -m 700 "$HOME/.ssh" || fail "could not create $HOME/.ssh"
    fi

    # add test-htr public key to authorized keys if not already present
    ssh_key='ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJzoR8jstrofzFKVoiXSFP5jGw/WbXHxFyIaS5b4vSWC test-htr.lib.princeton.edu'
    authorized_keys="$HOME/.ssh/authorized_keys"
    # -F: match the key as a fixed string, not a regex;
    # stderr is silenced because the file may not exist yet
    if ! grep -qF "$ssh_key" "$authorized_keys" 2>/dev/null; then
        echo "Adding htr2hpc ssh key to authorized keys"
        echo "$ssh_key" >> "$authorized_keys"
    else
        echo "ssh key is already in authorized keys"
    fi
fi

# create conda environment named htr2hpc
conda_env_name=htr2hpc
htr2hpc_pkg="git+https://github.com/Princeton-CDH/htr2hpc.git@develop#egg=htr2hpc"
module load anaconda3/2024.2
# compare against the env name column only, as an exact match; a plain
# substring grep would also match env paths that merely contain "htr2hpc"
if conda env list 2>/dev/null | awk '{print $1}' | grep -qxF "$conda_env_name"; then
    echo "conda env $conda_env_name already exists"

    # when conda env already exists, if requested
    # uninstall and reinstall htr2hpc
    if $reinstall_htr2hpc; then
        echo "Reinstalling htr2hpc"
        conda activate "$conda_env_name" || fail "could not activate conda env $conda_env_name"
        pip uninstall -q --yes htr2hpc
        pip install -q "$htr2hpc_pkg" || fail "could not install htr2hpc"
    fi

else
    echo "Creating conda environment $conda_env_name and installing dependencies"
    setup_dir="/scratch/gpfs/$USER/setup_htr2hpc"
    # start from a clean temp directory; an interrupted earlier run
    # may have left one behind
    rm -rf "$setup_dir"
    mkdir -p "$setup_dir" || fail "could not create $setup_dir"
    cp -r /scratch/gpfs/rkoeser/htr2hpc_setup/kraken "$setup_dir" || fail "could not copy kraken setup files"
    cd "$setup_dir/kraken" || fail "could not change to $setup_dir/kraken"
    conda env create -f environment_cuda.yml -n "$conda_env_name" || fail "could not create conda env $conda_env_name"
    conda activate "$conda_env_name" || fail "could not activate conda env $conda_env_name"
    pip install -q "$htr2hpc_pkg" || fail "could not install htr2hpc"
    pip install -q torchvision torch==2.1 torchaudio==2.1 || fail "could not install torch dependencies"
    # go back to scratch and delete temp directory
    cd "/scratch/gpfs/$USER" || fail "could not change to /scratch/gpfs/$USER"
    rm -rf "$setup_dir"
fi

htrworkingdir="/scratch/gpfs/$USER/htr2hpc"
# create working directory
if [ ! -d "$htrworkingdir" ]; then
    echo "Creating htr2hpc working directory in scratch: $htrworkingdir"
    mkdir -p "$htrworkingdir" || fail "could not create $htrworkingdir"
else
    echo "htr2hpc scratch working directory already exists: $htrworkingdir"
fi

echo "Setup complete! 🚀 🚃"
@login_required
@require_http_methods(["POST"])
def remote_user_setup(request):
    """Queue the HPC user setup task for the logged-in user.

    Accepts POST only; responds with a 303 redirect to the profile page.
    """
    # task reports appear to require an associated document, so this view
    # relies on user notifications instead of task reporting
    hpc_user_setup.delay(user_pk=request.user.pk)

    # send the browser back to the profile page; 303 See Other makes the
    # follow-up request a GET
    profile_url = reverse("profile-api-key")
    response = HttpResponseRedirect(profile_url)
    response.status_code = 303
    return response