Skip to content

Commit

Permalink
Avoid using sinfo -N to support cluster creation with large compute…
Browse files Browse the repository at this point in the history
… nodes

Prior to this commit, cluster creation could not be supported if the total number of dynamic compute nodes was larger than 130k. After this commit, that limit becomes 180k.

Signed-off-by: Hanwen <[email protected]>
  • Loading branch information
hanwen-cluster committed Aug 29, 2024
1 parent d39131d commit 2aa9aec
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@
mode '0755'
end

# Render the is_fleet_ready.sh helper script from its ERB template and install
# it under /usr/local/bin as a root-owned file with mode 0755.
template '/usr/local/bin/is_fleet_ready.sh' do
  source 'compute_fleet_status/is_fleet_ready.erb'
  mode '0755'
  owner 'root'
  group 'root'
end

template "#{node['cluster']['etc_dir']}/clusterstatusmgtd.conf" do
source 'clusterstatusmgtd/clusterstatusmgtd.conf.erb'
owner 'root'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
#
# Prints, one per line, the hostnames of nodes that are NOT yet in one of the
# states idle/alloc/mix/maint and whose name matches the
# "<prefix>-st-<suffix>-<number>" pattern. Empty output means no such node is
# left to wait for.
#
# sinfo is called without -N, so nodes sharing a state are reported as one
# compressed node list per line; each list is expanded with
# "scontrol show hostnames" before the name filter is applied.
#
# Exit status: non-zero if sinfo itself fails, so that the caller does not
# mistake "could not query Slurm" for "nothing left to wait for".

# pipefail is enabled only inside the command-substitution subshell so that a
# sinfo failure is propagated; grep finding no line to keep is not an error.
sinfo_output=$(set -o pipefail; <%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | { grep -v -E '(idle|alloc|mix|maint)$' || true; }) || exit $?

# read splits each "<nodelist> <state>" line on whitespace; only the first
# field (the node list) is needed here.
while read -r nodelist _state; do
  # The here-string yields a single empty line when sinfo_output is empty:
  # do not call scontrol with an empty node list.
  [ -n "$nodelist" ] || continue
  <%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; }
done <<< "$sinfo_output"
5 changes: 1 addition & 4 deletions cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -212,10 +212,7 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
# spot-st-t2large-2 idle
# capacity-block-st-t2micro-1 maint
# capacity-block-dy-t2micro-1 maint
is_fleet_ready_command = Shellwords.escape(
"set -o pipefail && #{node['cluster']['slurm']['install_dir']}/bin/sinfo -N -h -o '%N %t' | { grep -E '^[a-z0-9\\-]+\\-st\\-[a-z0-9\\-]+\\-[0-9]+ .*' || true; } | { grep -v -E '(idle|alloc|mix|maint)$' || true; }"
)
until shell_out!("/bin/bash -c #{is_fleet_ready_command}").stdout.strip.empty?
until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
check_for_protected_mode(fleet_status_command)

Chef::Log.info("Waiting for static fleet capacity provisioning")
Expand Down

0 comments on commit 2aa9aec

Please sign in to comment.