diff --git a/CHANGELOG.md b/CHANGELOG.md index d0a3990f2..0a075da8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,13 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Add support for RHEL9. - Add support for Rocky Linux 9 as `CustomAmi` created through `build-image` process. No public official ParallelCluster Rocky9 Linux AMI is made available at this time. - Add the configuration parameter `DeploymentSettings/DefaultUserHome` to allow users to move the default user's home directory to `/local/home` instead of `/home` (default). -- Add support for installing Intel OneAPI Base Toolkit and HPC Toolkit, and Intel Python. - - Intel OneAPI Base Toolkits: 2023.2.0 - - Intel OneAPI HPC Toolkits: 2023.2.0 - - Intel Python: 2023.2.0 - - Critical Update for Intel oneAPI DPC++/C++ Compiler: 2023.2.1 - - Critical Update for Intel Fortran Compiler & Intel Fortran Compiler Classic: 2023.2.1 - - Add possibility to choose between Open and Closed Source Nvidia Drivers when building an AMI, through the ```['cluster']['nvidia']['kernel_open']``` cookbook node attribute. +- Add possibility to choose between Open and Closed Source Nvidia Drivers when building an AMI, through the ```['cluster']['nvidia']['kernel_open']``` cookbook node attribute. **CHANGES** - Upgrade Slurm to 23.11.3 (from 23.02.7). diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index bf7fc57f0..43fc43d62 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -12,9 +12,8 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -fetch_config 'Fetch and load cluster configs' do - update true -end +# Fetch and load cluster configs +include_recipe 'aws-parallelcluster-platform::update' # generate the updated shared storages mapping file include_recipe 'aws-parallelcluster-environment::update_fs_mapping' @@ -26,5 +25,3 @@ if is_custom_node? include_recipe 'aws-parallelcluster-computefleet::update_parallelcluster_node' end - -sudo_access "Update Sudo Access" if node['cluster']['scheduler'] == 'slurm' diff --git a/cookbooks/aws-parallelcluster-platform/kitchen.platform-config.yml b/cookbooks/aws-parallelcluster-platform/kitchen.platform-config.yml index 3640f9c9e..edf84fb96 100644 --- a/cookbooks/aws-parallelcluster-platform/kitchen.platform-config.yml +++ b/cookbooks/aws-parallelcluster-platform/kitchen.platform-config.yml @@ -93,7 +93,6 @@ suites: verifier: controls: - /tag:config_intel_hpc/ - - /tag:intel_one_api/ attributes: resource: intel_hpc:configure dependencies: @@ -101,8 +100,6 @@ suites: - resource:intel_hpc:setup cluster: enable_intel_hpc_platform: 'true' - install_intel_base_toolkit: 'true' - install_intel_hpc_toolkit: 'true' node_type: HeadNode - name: sticky_bits run_list: diff --git a/cookbooks/aws-parallelcluster-platform/recipes/update.rb b/cookbooks/aws-parallelcluster-platform/recipes/update.rb new file mode 100644 index 000000000..f7b041111 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/recipes/update.rb @@ -0,0 +1,22 @@ +# frozen_string_literal: true + +# +# Cookbook:: aws-parallelcluster-platform +# Recipe:: update +# +# Copyright:: 2013-2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +fetch_config 'Fetch and load cluster configs' do + update true +end + +sudo_access "Update Sudo Access" if node['cluster']['scheduler'] == 'slurm' diff --git a/cookbooks/aws-parallelcluster-platform/resources/intel_hpc/install_intel_software.rb b/cookbooks/aws-parallelcluster-platform/resources/intel_hpc/install_intel_software.rb deleted file mode 100644 index 85bb7fec0..000000000 --- a/cookbooks/aws-parallelcluster-platform/resources/intel_hpc/install_intel_software.rb +++ /dev/null @@ -1,41 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -provides :install_intel_software - -unified_mode true - -property :intel_offline_installer_dir, String, required: true -property :software_name, String, required: true -property :software_url, String, required: true - -default_action :install - -action :install do - remote_file "#{new_resource.intel_offline_installer_dir}/#{new_resource.software_name}" do - source new_resource.software_url - mode '0744' - retries 3 - retry_delay 5 - action :create_if_missing - end - bash "install Intel #{new_resource.software_name}" do - cwd new_resource.intel_offline_installer_dir - code <<-INTEL - set -e - sh #{new_resource.software_name} -a -s --eula accept - rm -f #{new_resource.software_name} - INTEL - end -end diff --git a/cookbooks/aws-parallelcluster-platform/resources/intel_hpc/intel_hpc.rb b/cookbooks/aws-parallelcluster-platform/resources/intel_hpc/intel_hpc.rb index 0dcb76372..f797f2d13 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/intel_hpc/intel_hpc.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/intel_hpc/intel_hpc.rb @@ -22,71 +22,5 @@ end action :configure do - return unless intel_hpc_supported? && (node['cluster']['install_intel_base_toolkit'] == "true" || node['cluster']['install_intel_hpc_toolkit'] == "true" || node['cluster']['install_intel_python'] == "true") - - base_toolkit_version = "2023.2.0.49397" - base_toolkit_name = "l_BaseKit_p_#{base_toolkit_version}_offline.sh" - base_toolkit_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/992857b9-624c-45de-9701-f6445d845359/#{base_toolkit_name}" - hpc_toolkit_version = "2023.2.0.49440" - hpc_toolkit_name = "l_HPCKit_p_#{hpc_toolkit_version}_offline.sh" - hpc_toolkit_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/0722521a-34b5-4c41-af3f-d5d14e88248d/#{hpc_toolkit_name}" - intel_python_version = "2023.2.0.49422" - intel_python_name = "l_pythoni39_oneapi_p_#{intel_python_version}_offline.sh" - intel_python_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/03aae3a8-623a-47cf-9655-5dd8fcf86430/#{intel_python_name}" - # Below are critical security updates not included in the tookits: - cpp_compiler_version = "2023.2.1.8" - cpp_compiler_name = "l_dpcpp-cpp-compiler_p_#{cpp_compiler_version}_offline.sh" - cpp_compiler_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/ebf5d9aa-17a7-46a4-b5df-ace004227c0e/#{cpp_compiler_name}" - fortran_compiler_version = "2023.2.1.8" - fortran_compiler_name = "l_fortran-compiler_p_#{fortran_compiler_version}_offline.sh" - fortran_compiler_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/0d65c8d4-f245-4756-80c4-6712b43cf835/#{fortran_compiler_name}" - - intel_offline_installer_dir = '/opt/intel/offlineInstaller' - - directory intel_offline_installer_dir do - recursive true - end - - if node['cluster']['install_intel_base_toolkit'] == "true" - install_intel_software "Install Intel Base Toolkit" do - software_name base_toolkit_name - software_url base_toolkit_url - intel_offline_installer_dir intel_offline_installer_dir - end - install_intel_software "Critical Update for Intel oneAPI DPC++/C++ Compiler" do - software_name cpp_compiler_name - software_url cpp_compiler_url - intel_offline_installer_dir intel_offline_installer_dir - end - end - if node['cluster']['install_intel_hpc_toolkit'] == "true" - install_intel_software "Intel OneAPI HPC Toolkits" do - software_name hpc_toolkit_name - software_url hpc_toolkit_url - intel_offline_installer_dir intel_offline_installer_dir - end - install_intel_software "Critical Update for Intel Fortran Compiler & Intel Fortran Compiler Classic" do - software_name fortran_compiler_name - software_url fortran_compiler_url - intel_offline_installer_dir intel_offline_installer_dir - end - end - if node['cluster']['install_intel_python'] == "true" - install_intel_software "Install Intel Python" do - software_name intel_python_name - software_url intel_python_url - intel_offline_installer_dir intel_offline_installer_dir - end - end - bash "copy Intel modulefiles to MODULEPATH" do - cwd "/opt/intel" - code <<-INTEL - set -e - ./modulefiles-setup.sh --output-dir="/usr/share/Modules/modulefiles/intel" - INTEL - end -end - -def intel_hpc_supported? - !arm_instance? + # do nothing end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_centos7.rb index acd9621f4..a7fc54201 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_centos7.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_centos7.rb @@ -21,3 +21,8 @@ def nvidia_driver_enabled? !arm_instance? && nvidia_enabled? end + +# Pinning the Nvidia Driver version for centos7 due to incompatibility with Gdrcopy 2.3.1 +def _nvidia_driver_version + '535.129.03' +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/update_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/update_spec.rb new file mode 100644 index 000000000..220feae15 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/update_spec.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +# Copyright:: 2024 Amazon.com, Inc. and its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +require 'spec_helper' + +describe 'aws-parallelcluster-platform::update' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + context "when scheduler is slurm" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['scheduler'] = 'slurm' + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'it fetches and updates cluster configs' do + is_expected.to run_fetch_config('Fetch and load cluster configs') + end + it 'it updates sudo access' do + is_expected.to setup_sudo_access('Update Sudo Access') + end + end + + context "when scheduler is awsbatch" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['scheduler'] = 'awsbatch' + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'it fetches and updates cluster configs' do + is_expected.to run_fetch_config('Fetch and load cluster configs') + end + it 'it doesnt update sudo access' do + is_expected.not_to setup_sudo_access('Update Sudo Access') + end + end + end + end +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index 7aed15258..18c781e55 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -163,11 +163,12 @@ def self.setup(chef_run, nvidia_driver_version: nil) [%w(false kernel), %w(true kernel-open)].each do |kernel_open, kernel_module| context "on #{platform}#{version} when nvidia_driver enabled and node['cluster']['nvidia']['kernel_open'] is #{kernel_open}" do - cached(:nvidia_arch) { 'nvidia_arch' } - cached(:nvidia_driver_version) { 'nvidia_driver_version' } - cached(:nvidia_kernel_module) { 'nvidia_kernel_module' } + if platform == 'centos' + cached(:nvidia_driver_version) { '535.129.03' } + else + cached(:nvidia_driver_version) { 'nvidia_driver_version' } + end cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" } - cached(:chef_run) do stubs_for_resource('nvidia_driver') do |res| allow(res).to receive(:nvidia_driver_enabled?).and_return(true) diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/intel_hpc_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/intel_hpc_spec.rb index ce4f85798..63c8f0115 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/intel_hpc_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/intel_hpc_spec.rb @@ -137,31 +137,3 @@ its('mode') { should cmp '0755' } end end - -control 'tag:intel_one_api_toolkits_configured' do - # TODO: Enable this test in daily run. This test requires larger root volume size. - # TODO: After increasing the root volume size, config_intel_hpc_enough_space_on_root_volume needs to be ajusted. - title 'Checks Intel OneApi Toolkits have been installed' - - only_if { !os_properties.on_docker? } - only_if { !os_properties.centos7? && !os_properties.arm? } - - intel_directory = "/opt/intel" - - if node['cluster']['install_intel_base_toolkit'] == 'true' - %w(advisor ccl compiler dal dnnl dpl ipp ippcp mkl vtune).each do |software| - describe directory("#{intel_directory}/#{software}") do - it { should exist } - end - end - end - - modulefile_dir = "/usr/share/Modules/modulefiles" - # Intel PSXE module file - describe file("#{modulefile_dir}/intel") do - it { should exist } - its('owner') { should eq 'root' } - its('group') { should eq 'root' } - its('mode') { should cmp '0755' } - end -end