From 1a9c17ec3223dd6d836f2713d1938018f60495f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Manuel=20Dom=C3=ADnguez?=
Date: Tue, 7 Nov 2023 15:42:41 +0100
Subject: [PATCH] Configure Galaxy to be able to access the secondary HTCondor cluster

Add a runner and TPV destinations for the secondary HTCondor cluster.
Sending jobs to the secondary cluster is controlled via the
`condor-secondary` TPV scheduling tag.

TPV destinations are automatically duplicated by Jinja. It creates extra
destinations whose name is prefixed with `secondary_`, makes them inherit
from the original destination, replaces the runner, and makes them
require the `condor-secondary` scheduling tag.

`job_conf.yml.j2` was also modified so that concurrency limits affecting
condor destinations also get cloned.

This commit also lets the Galaxy user invoke the appropriate `condor_*`
commands using `systemd-run`.
---
Review notes (free-form text in this section is not part of the commit):

* `setenv: true` was removed from the sudoers task in htcondor.yml. The
  runner params in templates/galaxy/config/job_conf.yml call `sudo` without
  `-E` and without inline `VAR=value` assignments, so the SETENV privilege
  was unused, while granting it allows the user to bypass sudo's
  `env_reset` for commands that run as root.
* NOTE(review): this copy of the patch had lost its line structure. Line
  breaks and indentation were reconstructed; the added-line counts match
  every hunk header, but the indentation depth of hunk lines is an
  assumption. Verify against the target files before applying.
* NOTE(review): the post-image blob id on the htcondor.yml `index` line
  predates the `setenv` change.

 files/galaxy/tpv/destinations.yml.j2          | 16 ++++++++++++++++
 group_vars/htcondor-secondary-submit-host.yml |  5 +++++
 htcondor.yml                                  | 15 +++++++++++++++
 templates/galaxy/config/job_conf.yml          |  6 ++++++
 templates/galaxy/config/job_conf.yml.j2       | 14 ++++++++++++++
 5 files changed, 56 insertions(+)

diff --git a/files/galaxy/tpv/destinations.yml.j2 b/files/galaxy/tpv/destinations.yml.j2
index f40643958..30f934255 100644
--- a/files/galaxy/tpv/destinations.yml.j2
+++ b/files/galaxy/tpv/destinations.yml.j2
@@ -318,6 +318,8 @@ destinations:
   # LOCAL CONDOR DESTINATIONS #
   #############################
 
+{# Save condor destinations to replicate them in the secondary cluster. #}
+{% set condor %}
   condor_docker:
     inherits: basic_docker_destination
     runner: condor
@@ -430,3 +432,17 @@ destinations:
       GPU_AVAILABLE: 1
     params:
       requirements: 'GalaxyGroup == "compute_gpu"'
+{%- endset %}{{ condor }}
+
+{# Generate secondary cluster destinations. #}
+{% for name, destination in (condor | from_yaml).items() %}
+  {% if destination.runner is defined and destination.runner == "condor" %}
+  secondary_{{ name }}:
+    inherits: {{ name }}
+    runner: condor_secondary
+    scheduling:
+      require:
+        - condor-secondary
+
+  {% endif %}
+{% endfor %}
diff --git a/group_vars/htcondor-secondary-submit-host.yml b/group_vars/htcondor-secondary-submit-host.yml
index 5263d8982..fbcbc9b36 100644
--- a/group_vars/htcondor-secondary-submit-host.yml
+++ b/group_vars/htcondor-secondary-submit-host.yml
@@ -61,6 +61,11 @@ nspawn_ssh_authorized_keys:
   - "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDV7gfNbNN5O8vH6/tM/iOFXKBP2YKRHXOmdfV8ogvu9BdVV0IPmDzk2EooVpThDE1VMv1hz3811tvBhHRJ6IgNhVIV/61w/+RazQD/AU27X8bX+Hb9EQ/bP4DW+6ySd/z5vdDLzpH5dbiMhzPEDkXVsylUT+hkQnas6cHspDhHmtKQ5MWOgDe3D/IEudTDJQe8hxxaU4TaZUmFzn7eYp9HvuK8qW0yCy4NWOxJJHA+G5wSCyLuKnaKo4AitUIzSKF1AB94oq7b96KONhPxgRptAk4OYIUTdNFbrI5HDaSNzHLnF5FbjQvG+Eu6m5nY5yvJMogE+jiuWeIXCZTCFljg287FUo0ohmbZpd802L6VXun14VumRC+rRgPrvBALo/CsyCsPIoBSTKhVElxKVOcRjmTLNfrUZM0GQxqJhIvah8BV+JTExkipPwkrKTdMAWIXvCoehxV+WMpBWqtEEzAzEoqJpaiec7HfriwsHTGESZWAPYEbFjzbHXQZtqBkbOvtokPMRmTWfWKxaplCMN6ddJeeY6faorD0w/e6lszWES1Q1ieajiPKDy37UvybKKvPTk4o3MzyzYOS4c8HQj+jnGeR5Q3ETuyz4psLyOfuBtIrfOeuxV42rFDmkYM3IrrRR+F9oklFG6Ig8DVfgQEzSG36NkgvpF4OdFvigYqXvw== cloud@vgcn"
 nspawn_ssh_host_trust_container: yes
 
+nspawn_condor_systemd_run: "/usr/bin/systemd-run --uid={{ galaxy_user.uid }} --gid={{ galaxy_group.gid }} --pipe --quiet --machine {{ nspawn_name }}"
+nspawn_condor_rm_command: "{{ nspawn_condor_systemd_run }} /usr/bin/condor_rm"
+nspawn_condor_ssh_to_job_command: "{{ nspawn_condor_systemd_run }} /usr/bin/condor_ssh_to_job"
+nspawn_condor_submit_command: "{{ nspawn_condor_systemd_run }} /usr/bin/condor_submit"
+
 ssh_allow_tcp_forwarding: "local"
 sshd_custom_options:
   - "PermitOpen 127.0.0.1:{{ nspawn_ssh_config.Port }}"
diff --git a/htcondor.yml b/htcondor.yml
index 597414625..66b259e02 100644
--- a/htcondor.yml
+++ b/htcondor.yml
@@ -177,6 +177,21 @@
         key: "[127.0.0.1]:{{ nspawn_ssh_config.Port }} {{ nspawn_ssh_host_key.content | b64decode }}"
       when: nspawn_ssh_host_trust_container
 
+    - name: Allow the Galaxy user to run HTCondor commands in the container.
+      # Uses /etc/sudoers. Ideally this would be solved using what is requested
+      # in this issue https://github.com/systemd/systemd/issues/10997, but the
+      # issue is still open.
+      community.general.sudoers:
+        name: htcondor-nspawn
+        user: "{{ galaxy_user.name }}"
+        nopassword: true
+        validation: required
+        # SETENV is not granted: it would let the user bypass sudo's env_reset.
+        commands:
+          - "{{ nspawn_condor_rm_command }} *"
+          - "{{ nspawn_condor_ssh_to_job_command }} *"
+          - "{{ nspawn_condor_submit_command }} *"
+
 - name: HTCondor cluster.
   hosts: htcondor:!sn06.galaxyproject.eu
   handlers:
diff --git a/templates/galaxy/config/job_conf.yml b/templates/galaxy/config/job_conf.yml
index 045f5729c..2b6bea45c 100644
--- a/templates/galaxy/config/job_conf.yml
+++ b/templates/galaxy/config/job_conf.yml
@@ -15,6 +15,12 @@ galaxy_jobconf:
       #workers: 3
     - id: condor
       load: galaxy.jobs.runners.condor:CondorJobRunner
+    - id: condor_secondary
+      load: galaxy.jobs.runners.condor:CondorJobRunner
+      params:
+        condor_rm_cmd: "sudo {{ nspawn_condor_rm_command }}"
+        condor_ssh_to_job_cmd: "sudo {{ nspawn_condor_ssh_to_job_command }}"
+        condor_submit_cmd: "sudo {{ nspawn_condor_submit_command }}"
     - id: local
       load: galaxy.jobs.runners.local:LocalJobRunner
     - id: pulsar_embedded
diff --git a/templates/galaxy/config/job_conf.yml.j2 b/templates/galaxy/config/job_conf.yml.j2
index 52ad02a9b..45414081a 100644
--- a/templates/galaxy/config/job_conf.yml.j2
+++ b/templates/galaxy/config/job_conf.yml.j2
@@ -136,3 +136,17 @@ limits:
     window: {{ limit['window'] }}
 {% endif %}
 {% endfor %}
+{# Replicate destination limits for the secondary HTCondor cluster #}
+{% for limit in galaxy_jobconf['limits'] | sort(attribute='type') %}
+{% if limit['type'].startswith('destination_') and 'id' in limit
+and limit['id'].startswith('condor_') %}
+  - type: {{ limit['type'] }}
+    value: {{ limit['value'] }}
+    id: secondary_{{ limit['id'] }}
+{% if 'tag' in limit %}
+    tag: {{ limit['tag'] }}
+{% elif 'window' in limit %}
+    window: {{ limit['window'] }}
+{% endif %}
+{% endif %}
+{% endfor %}