Merge branch 'develop' into developforpr
hanwen-pcluste authored Feb 14, 2024
2 parents 6c4b1e7 + e4fee1f commit e458114
Showing 10 changed files with 90 additions and 155 deletions.
8 changes: 1 addition & 7 deletions CHANGELOG.md
@@ -10,13 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluster
- Add support for RHEL9.
- Add support for Rocky Linux 9 as a `CustomAmi` created through the `build-image` process. No official public ParallelCluster Rocky Linux 9 AMI is available at this time.
- Add the configuration parameter `DeploymentSettings/DefaultUserHome` to allow users to move the default user's home directory to `/local/home` instead of `/home` (default).
- Add support for installing Intel OneAPI Base Toolkit and HPC Toolkit, and Intel Python.
- Intel OneAPI Base Toolkits: 2023.2.0
- Intel OneAPI HPC Toolkits: 2023.2.0
- Intel Python: 2023.2.0
- Critical Update for Intel oneAPI DPC++/C++ Compiler: 2023.2.1
- Critical Update for Intel Fortran Compiler & Intel Fortran Compiler Classic: 2023.2.1
- Add possibility to choose between Open and Closed Source Nvidia Drivers when building an AMI, through the ```['cluster']['nvidia']['kernel_open']``` cookbook node attribute.
- Add possibility to choose between Open and Closed Source Nvidia Drivers when building an AMI, through the ```['cluster']['nvidia']['kernel_open']``` cookbook node attribute.

**CHANGES**
- Upgrade Slurm to 23.11.3 (from 23.02.7).
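The changelog entries above introduce the `['cluster']['nvidia']['kernel_open']` cookbook node attribute. Per the updated nvidia spec later in this diff, the string `'false'` maps to the closed-source `kernel` module and `'true'` to the open-source `kernel-open` module. Below is a minimal sketch of how a recipe might branch on the attribute; the surrounding logging call is illustrative only, not part of the cookbook.

```ruby
# Sketch only: derive the Nvidia kernel module flavor from the node attribute
# added in this change. Per the spec update in this diff, 'false' selects the
# closed-source 'kernel' module and 'true' selects 'kernel-open'.
nvidia_kernel_module =
  if node['cluster']['nvidia']['kernel_open'] == 'true'
    'kernel-open'
  else
    'kernel'
  end

Chef::Log.info("Building the AMI with the Nvidia #{nvidia_kernel_module} module")
```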
7 changes: 2 additions & 5 deletions cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb
@@ -12,9 +12,8 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

fetch_config 'Fetch and load cluster configs' do
update true
end
# Fetch and load cluster configs
include_recipe 'aws-parallelcluster-platform::update'

# generate the updated shared storages mapping file
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'
@@ -26,5 +25,3 @@
if is_custom_node?
include_recipe 'aws-parallelcluster-computefleet::update_parallelcluster_node'
end

sudo_access "Update Sudo Access" if node['cluster']['scheduler'] == 'slurm'
@@ -93,16 +93,13 @@ suites:
verifier:
controls:
- /tag:config_intel_hpc/
- /tag:intel_one_api/
attributes:
resource: intel_hpc:configure
dependencies:
- resource:package_repos:update
- resource:intel_hpc:setup
cluster:
enable_intel_hpc_platform: 'true'
install_intel_base_toolkit: 'true'
install_intel_hpc_toolkit: 'true'
node_type: HeadNode
- name: sticky_bits
run_list:
22 changes: 22 additions & 0 deletions cookbooks/aws-parallelcluster-platform/recipes/update.rb
@@ -0,0 +1,22 @@
# frozen_string_literal: true

#
# Cookbook:: aws-parallelcluster-platform
# Recipe:: update
#
# Copyright:: 2013-2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

fetch_config 'Fetch and load cluster configs' do
update true
end

sudo_access "Update Sudo Access" if node['cluster']['scheduler'] == 'slurm'

This file was deleted.

@@ -22,71 +22,5 @@
end

action :configure do
return unless intel_hpc_supported? && (node['cluster']['install_intel_base_toolkit'] == "true" || node['cluster']['install_intel_hpc_toolkit'] == "true" || node['cluster']['install_intel_python'] == "true")

base_toolkit_version = "2023.2.0.49397"
base_toolkit_name = "l_BaseKit_p_#{base_toolkit_version}_offline.sh"
base_toolkit_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/992857b9-624c-45de-9701-f6445d845359/#{base_toolkit_name}"
hpc_toolkit_version = "2023.2.0.49440"
hpc_toolkit_name = "l_HPCKit_p_#{hpc_toolkit_version}_offline.sh"
hpc_toolkit_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/0722521a-34b5-4c41-af3f-d5d14e88248d/#{hpc_toolkit_name}"
intel_python_version = "2023.2.0.49422"
intel_python_name = "l_pythoni39_oneapi_p_#{intel_python_version}_offline.sh"
intel_python_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/03aae3a8-623a-47cf-9655-5dd8fcf86430/#{intel_python_name}"
# Below are critical security updates not included in the toolkits:
cpp_compiler_version = "2023.2.1.8"
cpp_compiler_name = "l_dpcpp-cpp-compiler_p_#{cpp_compiler_version}_offline.sh"
cpp_compiler_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/ebf5d9aa-17a7-46a4-b5df-ace004227c0e/#{cpp_compiler_name}"
fortran_compiler_version = "2023.2.1.8"
fortran_compiler_name = "l_fortran-compiler_p_#{fortran_compiler_version}_offline.sh"
fortran_compiler_url = "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/0d65c8d4-f245-4756-80c4-6712b43cf835/#{fortran_compiler_name}"

intel_offline_installer_dir = '/opt/intel/offlineInstaller'

directory intel_offline_installer_dir do
recursive true
end

if node['cluster']['install_intel_base_toolkit'] == "true"
install_intel_software "Install Intel Base Toolkit" do
software_name base_toolkit_name
software_url base_toolkit_url
intel_offline_installer_dir intel_offline_installer_dir
end
install_intel_software "Critical Update for Intel oneAPI DPC++/C++ Compiler" do
software_name cpp_compiler_name
software_url cpp_compiler_url
intel_offline_installer_dir intel_offline_installer_dir
end
end
if node['cluster']['install_intel_hpc_toolkit'] == "true"
install_intel_software "Intel OneAPI HPC Toolkits" do
software_name hpc_toolkit_name
software_url hpc_toolkit_url
intel_offline_installer_dir intel_offline_installer_dir
end
install_intel_software "Critical Update for Intel Fortran Compiler & Intel Fortran Compiler Classic" do
software_name fortran_compiler_name
software_url fortran_compiler_url
intel_offline_installer_dir intel_offline_installer_dir
end
end
if node['cluster']['install_intel_python'] == "true"
install_intel_software "Install Intel Python" do
software_name intel_python_name
software_url intel_python_url
intel_offline_installer_dir intel_offline_installer_dir
end
end
bash "copy Intel modulefiles to MODULEPATH" do
cwd "/opt/intel"
code <<-INTEL
set -e
./modulefiles-setup.sh --output-dir="/usr/share/Modules/modulefiles/intel"
INTEL
end
end

def intel_hpc_supported?
!arm_instance?
# do nothing
end
@@ -21,3 +21,8 @@
def nvidia_driver_enabled?
!arm_instance? && nvidia_enabled?
end

# Pinning the Nvidia Driver version for centos7 due to incompatibility with Gdrcopy 2.3.1
def _nvidia_driver_version
'535.129.03'
end
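The new `_nvidia_driver_version` helper pins the driver to 535.129.03 because of the Gdrcopy 2.3.1 incompatibility noted in the comment. The nvidia spec change below expects the pin to apply on centos only, with other platforms keeping their default version; the driver is then downloaded from `https://us.download.nvidia.com/tesla/<version>/NVIDIA-Linux-<arch>-<version>.run`, as that spec constructs the URL. A hedged sketch of how a caller might fold the pin in; the `default_version` fallback is an assumption, not part of this diff:

```ruby
# Sketch under assumptions: return the pinned 535.129.03 driver on centos
# (Gdrcopy 2.3.1 incompatibility, per the comment above) and fall back to a
# caller-supplied default elsewhere. The fallback argument is hypothetical.
def nvidia_driver_version(default_version)
  return _nvidia_driver_version if platform?('centos')

  default_version
end
```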
@@ -0,0 +1,54 @@
# frozen_string_literal: true

# Copyright:: 2024 Amazon.com, Inc. and its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

require 'spec_helper'

describe 'aws-parallelcluster-platform::update' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
context "when scheduler is slurm" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['scheduler'] = 'slurm'
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'it fetches and updates cluster configs' do
is_expected.to run_fetch_config('Fetch and load cluster configs')
end
it 'it updates sudo access' do
is_expected.to setup_sudo_access('Update Sudo Access')
end
end

context "when scheduler is awsbatch" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['scheduler'] = 'awsbatch'
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'it fetches and updates cluster configs' do
is_expected.to run_fetch_config('Fetch and load cluster configs')
end
it 'it doesnt update sudo access' do
is_expected.not_to setup_sudo_access('Update Sudo Access')
end
end
end
end
end
@@ -163,11 +163,12 @@ def self.setup(chef_run, nvidia_driver_version: nil)

[%w(false kernel), %w(true kernel-open)].each do |kernel_open, kernel_module|
context "on #{platform}#{version} when nvidia_driver enabled and node['cluster']['nvidia']['kernel_open'] is #{kernel_open}" do
cached(:nvidia_arch) { 'nvidia_arch' }
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
cached(:nvidia_kernel_module) { 'nvidia_kernel_module' }
if platform == 'centos'
cached(:nvidia_driver_version) { '535.129.03' }
else
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
end
cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }

cached(:chef_run) do
stubs_for_resource('nvidia_driver') do |res|
allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
@@ -137,31 +137,3 @@
its('mode') { should cmp '0755' }
end
end

control 'tag:intel_one_api_toolkits_configured' do
# TODO: Enable this test in daily run. This test requires larger root volume size.
# TODO: After increasing the root volume size, config_intel_hpc_enough_space_on_root_volume needs to be adjusted.
title 'Checks Intel OneApi Toolkits have been installed'

only_if { !os_properties.on_docker? }
only_if { !os_properties.centos7? && !os_properties.arm? }

intel_directory = "/opt/intel"

if node['cluster']['install_intel_base_toolkit'] == 'true'
%w(advisor ccl compiler dal dnnl dpl ipp ippcp mkl vtune).each do |software|
describe directory("#{intel_directory}/#{software}") do
it { should exist }
end
end
end

modulefile_dir = "/usr/share/Modules/modulefiles"
# Intel PSXE module file
describe file("#{modulefile_dir}/intel") do
it { should exist }
its('owner') { should eq 'root' }
its('group') { should eq 'root' }
its('mode') { should cmp '0755' }
end
end
