Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/hpc setup task #47

Open
wants to merge 14 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 0 additions & 51 deletions scripts/user_setup.sh

This file was deleted.

90 changes: 90 additions & 0 deletions src/htr2hpc/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,93 @@ def train(
# - mark model as no longer being trained
model.training = False
model.save()


@shared_task(default_retry_delay=60 * 60, bind=True)
def hpc_user_setup(self, user_pk=None):
    """Run the bundled user setup script in a user's account on the HPC host.

    Connects to ``settings.HPC_HOSTNAME`` over ssh as the user's own
    username, copies ``user_setup.sh`` to the remote account, runs it with
    ``--skip-ssh-setup --reinstall-htr2hpc`` and removes it again. The
    outcome is reported through user notifications; the command and script
    output are appended to the task report for this task id when one exists.

    :param user_pk: primary key of the user to set up; if no such user
        exists the error is logged and the task returns without doing
        anything.
    """
    try:
        user = User.objects.get(pk=user_pk)
    except User.DoesNotExist:
        # error / bail out
        logger.error(f"hpc_user_setup called with invalid user_pk {user_pk}")
        return

    # by default, escriptorium reporting code attaches signal handlers
    # that should create a task group and task report for this task id
    TaskReport = apps.get_model("reporting", "TaskReport")
    # don't error if the task report can't be found
    task_report = TaskReport.objects.filter(task_id=self.request.id).first()

    # hostname and ssh key path set in django config
    logger.debug(
        f"Connecting to {settings.HPC_HOSTNAME} as {user.username} with keyfile {settings.HPC_SSH_KEYFILE}"
    )

    # bash setup script is included with this package
    user_setup_script = settings.HTR2HPC_INSTALL_DIR / "train" / "user_setup.sh"
    user.notify(
        "Running user setup script, on first run this may take a while...",
        id="htr2hpc-setup-start",
        level="info",
    )
    try:
        with Connection(
            host=settings.HPC_HOSTNAME,
            user=user.username,
            connect_timeout=10,
            connect_kwargs={"key_filename": settings.HPC_SSH_KEYFILE},
        ) as conn:
            # copy setup script to server
            conn.put(user_setup_script)
            # run the script with options; skip ssh setup (must already be setup
            # for this task to run) and ensure htr2hpc install is up to date
            setup_cmd = (
                f"./{user_setup_script.name} --skip-ssh-setup --reinstall-htr2hpc"
            )
            # document setup command options in task report
            if task_report:
                task_report.append(f"Running setup script:\n {setup_cmd}\n\n")

            try:
                result = conn.run(setup_cmd)
            finally:
                # remove the setup script from the server whether or not it
                # succeeded, so a failed run does not leave it behind;
                # -f: don't error if not there (if user clicks the button
                # twice it may already be removed)
                conn.run(f"rm -f ./{user_setup_script.name}")

            # add script output to task report
            if task_report:
                # script output is stored in result.stdout/result.stderr
                task_report.append(
                    f"\n\nsetup script output:\n\n{result.stdout}\n\n{result.stderr}\n\n"
                )

            # the script prints "Setup complete" as its final message
            if "Setup complete" in result.stdout:
                user.notify(
                    "Remote setup completed",
                    id="htr2hpc-setup-success",
                    level="success",
                )
            # log script output for debugging
            logger.debug(f"user setup script output:\n{result.stdout}")
    except AuthenticationException as err:
        error_message = f"Authentication exception to remote connection: {err}"
        logger.error(error_message)
        if task_report:
            task_report.append(error_message)
        # notify the user of the error
        user.notify(
            "Authentication failed; check that your account on della is set up for remote access",
            id="setup-error",
            level="danger",
        )
    except UnexpectedExit as err:
        # raised when a remote command exits with a non-zero status
        error_message = f"Error running remote setup script: {err}"
        logger.error(error_message)
        if task_report:
            task_report.append(error_message)
        user.notify(
            "Something went wrong running remote user setup",
            id="setup-error",
            level="danger",
        )
36 changes: 36 additions & 0 deletions src/htr2hpc/templates/users/profile.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{% extends "users/profile.html" %}

{% block body %}
{{ block.super }}

<div class="container pt-5">
<h2>HPC account management</h2>
<h3>First time setup</h3>
<p>To enable key-based ssh access to your account from the eScriptorium server, follow
the three steps below. (This is a one-time process.)</p>
<ol>
<li>Ensure that you are either on campus and connected to the University wifi, or
connected to the Princeton network through the VPN
<a href="https://informationsecurity.princeton.edu/connecting-to-princeton-n">like so</a>.</li>
<li>Open Terminal or Command Line on your computer,
paste in the code below, and press enter. You will be prompted for your NetID password.<br>
<code>ssh {{ request.user.username }}@della.princeton.edu 'mkdir -p ~/.ssh &amp;&amp; echo "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJzoR8jstrofzFKVoiXSFP5jGw/WbXHxFyIaS5b4vSWC test-htr.lib.princeton.edu" >> ~/.ssh/authorized_keys'</code>
</li>

<li>Click the "HPC setup" button below for initial environment setup. This step may take
a few minutes to complete -- a notification should appear when it has finished.
After this, your account will be prepared to run training tasks on Princeton's HPC.
<br>
<form method="post" action="{% url 'hpc-setup' %}">
{% csrf_token %}
<input type="submit" value="HPC setup" class="btn btn-success">
</form>
</li>
</ol>
<h3>HPC account updates</h3>
<p>This instance of eScriptorium is currently being tested and occasional updates might be necessary.
When prompted to update your HPC account, simply click the "HPC setup" button above to automatically
run the update process.</p>
</div>

{% endblock %}
4 changes: 2 additions & 2 deletions src/htr2hpc/train/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def segtrain(

# add commands for setup steps
segtrain_slurm.add_cmd("module purge")
segtrain_slurm.add_cmd("module load anaconda3/2024.6")
segtrain_slurm.add_cmd("module load anaconda3/2024.2")
segtrain_slurm.add_cmd("conda activate htr2hpc")
logger.info(f"sbatch file\n: {segtrain_slurm}")
# sbatch returns the job id for the created job
Expand Down Expand Up @@ -74,7 +74,7 @@ def recognition_train(
# time=datetime.timedelta(hours=2),
)
recogtrain_slurm.add_cmd("module purge")
recogtrain_slurm.add_cmd("module load anaconda3/2024.6")
recogtrain_slurm.add_cmd("module load anaconda3/2024.2")
recogtrain_slurm.add_cmd("conda activate htr2hpc")
logger.info(f"sbatch file\n: {recogtrain_slurm}")
# sbatch returns the job id for the created job
Expand Down
80 changes: 80 additions & 0 deletions src/htr2hpc/train/user_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/bin/bash

# bash script to setup user account for htr2hpc pilot integration
# - adds ssh key to authorized keys
# - conda env setup
# - create htr2hpc working directory in scratch
#
# Prints "Setup complete" as its last line on success. Exits with a
# non-zero status as soon as a required step fails, so that a caller
# checking the exit code or the output does not mistake a broken setup
# for a successful one.

# defaults
ssh_setup=true
reinstall_htr2hpc=false

# print an error message to stderr and abort the script
fail() {
    echo "Error: $1" >&2
    exit 1
}

# supported options:
#  --skip-ssh-setup
#  --reinstall-htr2hpc
# (any other argument is ignored)
for arg in "$@"; do
    if [[ "$arg" == "--skip-ssh-setup" ]]; then
        ssh_setup=false
    elif [[ "$arg" == "--reinstall-htr2hpc" ]]; then
        reinstall_htr2hpc=true
    fi
done

echo "Setting up your account for htr2hpc ...."
echo "This process may take five minutes or more on first run. Do not exit until the process completes."


# skip ssh setup if --skip-ssh-setup is specified
if $ssh_setup; then
    authorized_keys="$HOME/.ssh/authorized_keys"

    # ensure ssh directory exists
    if [ ! -d "$HOME/.ssh" ]; then
        echo "Creating $HOME/.ssh directory"
        mkdir "$HOME/.ssh" || fail "could not create $HOME/.ssh"
        # keep the new directory private to the owner regardless of umask
        chmod 700 "$HOME/.ssh"
    fi

    # add test-htr public key to authorized keys if not already present
    ssh_key='ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJzoR8jstrofzFKVoiXSFP5jGw/WbXHxFyIaS5b4vSWC test-htr.lib.princeton.edu'
    # -F: match the key as a literal string, not a regular expression;
    # stderr is discarded because the file may not exist yet
    if ! grep -qF "$ssh_key" "$authorized_keys" 2>/dev/null; then
        echo "Adding htr2hpc ssh key to authorized keys"
        echo "$ssh_key" >> "$authorized_keys" || fail "could not update $authorized_keys"
    else
        echo "ssh key is already in authorized keys"
    fi
fi

# create conda environment named htr2hpc
conda_env_name=htr2hpc
module load anaconda3/2024.2 || fail "could not load anaconda3/2024.2 module"
# compare against the first column (environment name) exactly, so that an
# environment such as "htr2hpc-dev" is not mistaken for this one
if conda env list 2>/dev/null | awk '{print $1}' | grep -qxF "$conda_env_name"; then
    echo "conda env $conda_env_name already exists"

    # when conda env already exists, if requested
    # uninstall and reinstall htr2hpc
    if $reinstall_htr2hpc; then
        echo "Reinstalling htr2hpc"
        conda activate "$conda_env_name" || fail "could not activate conda env $conda_env_name"
        # best effort: nothing to remove if htr2hpc is not installed yet
        pip uninstall -q --yes htr2hpc
        pip install -q git+https://github.com/Princeton-CDH/htr2hpc.git@develop#egg=htr2hpc \
            || fail "could not install htr2hpc"
    fi

else
    echo "Creating conda environment $conda_env_name and installing dependencies"
    cd /scratch/gpfs/rkoeser/htr2hpc_setup/kraken || fail "kraken setup directory not found"
    conda env create -f environment_cuda.yml -n "$conda_env_name" \
        || fail "could not create conda env $conda_env_name"
    conda activate "$conda_env_name" || fail "could not activate conda env $conda_env_name"
    pip install -q torchvision torch==2.1 torchaudio==2.1 || fail "could not install torch packages"
    pip install -q git+https://github.com/Princeton-CDH/htr2hpc.git@develop#egg=htr2hpc \
        || fail "could not install htr2hpc"
    # go back to previous directory
    cd -
fi

htrworkingdir=/scratch/gpfs/$USER/htr2hpc
# create working directory
if [ ! -d "$htrworkingdir" ]; then
    echo "Creating htr2hpc working directory in scratch: $htrworkingdir"
    mkdir -p "$htrworkingdir" || fail "could not create $htrworkingdir"
else
    echo "htr2hpc scratch working directory already exists: $htrworkingdir"
fi

echo "Setup complete! 🚀 🚃"
3 changes: 3 additions & 0 deletions src/htr2hpc/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

from escriptorium.urls import urlpatterns

from htr2hpc.views import remote_user_setup

# add htr2hpc routes to the url patterns imported from escriptorium:
# the routes provided by pucas.cas_urls under accounts/, and the
# endpoint that queues remote HPC account setup
urlpatterns.extend(
    [
        path("accounts/", include("pucas.cas_urls")),
        path("profile/hpc-setup/", remote_user_setup, name="hpc-setup"),
    ]
)
20 changes: 20 additions & 0 deletions src/htr2hpc/views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from django.contrib.auth.decorators import login_required
from django.http import HttpResponseRedirect
from django.urls import reverse
from django.views.decorators.http import require_http_methods

from htr2hpc.tasks import hpc_user_setup


@login_required
@require_http_methods(["POST"])
def remote_user_setup(request):
    """Queue the HPC setup task for the logged-in user and return to the profile.

    Restricted to authenticated users and POST requests by the decorators.
    The setup itself is handed to celery via ``hpc_user_setup.delay``; this
    view only queues it and responds with a 303 redirect to the ``profile``
    url.
    """
    # no task report is created in this view (original author's note: it
    # seems a task report must be associated with a document)
    hpc_user_setup.delay(user_pk=request.user.pk)

    # send the browser back to the profile page; 303 = See Other
    response = HttpResponseRedirect(reverse("profile"))
    response.status_code = 303
    return response