diff --git a/CHANGELOG.md b/CHANGELOG.md
index 08b8ec5421..0dd7bf7a2b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 - Add log rotation support for ParallelCluster managed logs.
 - Track head node memory and root volume disk utilization using the `mem_used_percent` and `disk_used_percent` metrics collected through the CloudWatch Agent.
 - Enforce the DCV Authenticator Server to use at least `TLS-1.2` protocol when creating the SSL Socket.
+- Load kernel module [nvidia-uvm](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) by default.
+- Install [Nvidia persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html) as a system service.
 
 **CHANGES**
 - Upgrade Slurm to version 23.02.1.
diff --git a/cookbooks/aws-parallelcluster-config/files/default/nvidia/nvidia.conf b/cookbooks/aws-parallelcluster-config/files/default/nvidia/nvidia.conf
new file mode 100644
index 0000000000..a84f8bb9d3
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-config/files/default/nvidia/nvidia.conf
@@ -0,0 +1 @@
+nvidia-uvm
diff --git a/cookbooks/aws-parallelcluster-config/recipes/base.rb b/cookbooks/aws-parallelcluster-config/recipes/base.rb
index 5ebf30570e..d886015235 100644
--- a/cookbooks/aws-parallelcluster-config/recipes/base.rb
+++ b/cookbooks/aws-parallelcluster-config/recipes/base.rb
@@ -39,6 +39,9 @@
   action :configure
 end
 
+# Configure Nvidia driver
+include_recipe "aws-parallelcluster-config::nvidia"
+
 # EFA runtime configuration
 efa 'Configure system for EFA' do
   action :configure
diff --git a/cookbooks/aws-parallelcluster-config/recipes/nvidia.rb b/cookbooks/aws-parallelcluster-config/recipes/nvidia.rb
new file mode 100644
index 0000000000..504af5635a
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-config/recipes/nvidia.rb
@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+
+#
+# Cookbook:: aws-parallelcluster
+# Recipe:: nvidia
+#
+# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
+# License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Runtime NVIDIA setup: load the nvidia-uvm kernel module now and at every boot, and install the
+# persistence daemon as a system service. Nothing is done unless both guards below hold.
+if graphic_instance? && nvidia_installed?
+  # Load kernel module nvidia-uvm on the running system
+  kernel_module 'nvidia-uvm' do
+    action :load
+  end
+
+  # Make sure kernel module nvidia-uvm is loaded at instance boot time
+  cookbook_file 'nvidia.conf' do
+    source 'nvidia/nvidia.conf'
+    path '/etc/modules-load.d/nvidia.conf'
+    owner 'root'
+    group 'root'
+    mode '0644'
+  end
+
+  # Install nvidia-persistenced as a system service from the init-scripts archive in the samples directory below
+  bash 'Install nvidia_persistenced' do
+    cwd '/usr/share/doc/NVIDIA_GLX-1.0/samples'
+    user 'root'
+    group 'root'
+    # set -e: do not attempt install.sh when the archive could not be extracted
+    code <<~NVIDIA
+      set -e
+      tar -xf nvidia-persistenced-init.tar.bz2
+      ./nvidia-persistenced-init/install.sh
+    NVIDIA
+    # Keep the converge idempotent: skip when the service is already enabled and running
+    not_if 'systemctl is-enabled --quiet nvidia-persistenced && systemctl is-active --quiet nvidia-persistenced'
+  end
+end
diff --git a/test/recipes/controls/aws_parallelcluster_config/nvidia_spec.rb b/test/recipes/controls/aws_parallelcluster_config/nvidia_spec.rb
index a8bd99a9d7..21a59b8df0 100644
--- a/test/recipes/controls/aws_parallelcluster_config/nvidia_spec.rb
+++ b/test/recipes/controls/aws_parallelcluster_config/nvidia_spec.rb
@@ -39,6 +39,27 @@
   end
 end
 
+# nvidia-uvm must be loaded and configured for boot-time loading; nvidia-persistenced must be an enabled, running service
+control 'tag:config_nvidia_uvm_and_persistenced_on_graphic_instances' do
+  only_if do
+    !(os_properties.centos7? && os_properties.arm?) &&
+      !instance.custom_ami? && instance.graphic?
+  end
+
+  describe kernel_module('nvidia_uvm') do
+    it { should be_loaded }
+  end
+
+  describe file('/etc/modules-load.d/nvidia.conf') do
+    its('content') { should match(/^nvidia-uvm$/) }
+  end
+
+  describe service('nvidia-persistenced') do
+    it { should be_enabled }
+    it { should be_running }
+  end
+end
+
 control 'tag:config_gdrcopy_disabled_on_non_graphic_instances' do
   only_if do
     !(os_properties.centos7? && os_properties.arm?) &&