Skip to content

Commit

Permalink
Consolidate nvidia fabric_manager and gdrcopy configure in the resource
Browse files Browse the repository at this point in the history
Signed-off-by: Francesco Giordano <[email protected]>
  • Loading branch information
francesco-giordano committed Apr 14, 2023
1 parent 13334d8 commit fb6e593
Show file tree
Hide file tree
Showing 20 changed files with 33 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,13 @@
return if arm_instance? || !(node['cluster']['nvidia']['enabled'] == 'yes' || node['cluster']['nvidia']['enabled'] == true)
action_install_package
end

# Configure action: start and enable the 'nvidia-fabricmanager' service,
# but only when get_nvswitches reports more than one NVSwitch.
action :configure do
  # Start nvidia fabric manager on NVSwitch enabled systems only
  return unless get_nvswitches > 1

  service 'nvidia-fabricmanager' do
    supports status: true
    action %i(start enable)
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,17 @@ def gdrcopy_version_extended
"#{node['cluster']['nvidia']['gdrcopy']['version']}-1"
end
end

# Configure action for NVIDIA GDRCopy: enable and start the gdrcopy service.
# Does nothing unless graphic_instance? is true and is_service_installed?
# confirms the service named by node['cluster']['nvidia']['gdrcopy']['service'].
action :configure do
  return unless graphic_instance? && is_service_installed?(node['cluster']['nvidia']['gdrcopy']['service'])

  # Enabling goes through a plain systemctl command instead of the service
  # resource because of: https://github.com/chef/chef/issues/12053
  execute "enable #{node['cluster']['nvidia']['gdrcopy']['service']} service" do
    command "systemctl enable #{node['cluster']['nvidia']['gdrcopy']['service']}"
  end

  # Starting the service is still handled by the regular service resource.
  service node['cluster']['nvidia']['gdrcopy']['service'] do
    supports status: true
    action :start
  end
end
9 changes: 7 additions & 2 deletions cookbooks/aws-parallelcluster-config/recipes/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,13 @@
# Amazon Time Sync
include_recipe 'aws-parallelcluster-config::chrony'

# NVIDIA services (fabric manager)
include_recipe "aws-parallelcluster-config::nvidia"
# NVIDIA services
fabric_manager 'Configure fabric manager' do
action :configure
end
gdrcopy 'Configure gdrcopy' do
action :configure
end

# EFA runtime configuration
efa 'Configure system for EFA' do
Expand Down
36 changes: 0 additions & 36 deletions cookbooks/aws-parallelcluster-config/recipes/nvidia.rb

This file was deleted.

2 changes: 1 addition & 1 deletion kitchen.recipes-install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ suites:
- /nvidia/
driver:
# nvidia_driver can be executed only on a graphic EC2 instance, for example: g5.xlarge (x86_64) or g5g.xlarge (arm64)
instance_type: g5.xlarge
instance_type: g4dn.2xlarge
attributes:
dependencies:
- recipe:aws-parallelcluster-install::directories
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

control 'tag:config_expected_versions_of_nvidia_fabric_manager_installed' do
only_if do
!(os_properties.centos7? && os_properties.arm?) && !os_properties.redhat8? && !os_properties.arm? && !instance.custom_ami? &&
!(os_properties.centos7? && os_properties.arm?) && !os_properties.arm? && !instance.custom_ami? &&
(node['cluster']['nvidia']['enabled'] == 'yes' || node['cluster']['nvidia']['enabled'] == true)
end

Expand Down

0 comments on commit fb6e593

Please sign in to comment.