Skip to content

Commit

Permalink
merging release 1.7 to main
Browse files Browse the repository at this point in the history
  • Loading branch information
priti-parate committed Jan 23, 2025
1 parent 614c93b commit a06c9d1
Show file tree
Hide file tree
Showing 1,343 changed files with 34,587 additions and 20,537 deletions.
2 changes: 2 additions & 0 deletions .ansible-lint
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# ansible-lint configuration: rule IDs listed under skip_list are not enforced.
# NOTE(review): presumably var-naming[no-role-prefix] is skipped because role
# variables in this repository are not prefixed with their role name — confirm.
skip_list:
- var-naming[no-role-prefix]
3 changes: 2 additions & 1 deletion .metadata/omnia_version
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
omnia_version: 1.6.1
omnia_version: 1.7
omnia_installation_path: ""
35 changes: 23 additions & 12 deletions accelerator/accelerator.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -13,34 +13,34 @@
# limitations under the License.
---

- name: Check if virtual environment is active
ansible.builtin.import_playbook: ../utils/check_venv.yml
when: not ( check_venv_executed | default(false) | bool )

- name: Update Inventory with ansible_host information
ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml

- name: Validate accelerator inputs
hosts: localhost
gather_facts: true
connection: local
roles:
- accelerator_validation
tags: amd, nvidia
tags: amd, nvidia, intel

- name: Update Repositories/Registries on nodes
ansible.builtin.import_playbook: ../utils/update_user_repo.yml
when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool )

# - name: Validate repo file and subscription
# hosts: all
# gather_facts: true
# roles:
# - repo_validation
# tags: amd, nvidia

- name: Gather Cluster Facts
hosts: all
hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
gather_facts: true
roles:
- common
tags: amd, nvidia
tags: amd, nvidia, intel

- name: Perform GPU driver and ROCm installation for AMD Accelerators
hosts: all
hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
gather_facts: true
any_errors_fatal: true
roles:
Expand All @@ -66,3 +66,14 @@
# - name: Reboot node
# ansible.builtin.reboot:
# tags: nvidia

# Applies the `intel` role to the listed cluster groups.
# The play carries only the `intel` tag, so tag-filtered runs select it with `--tags intel`.
# any_errors_fatal: a failure on any one host ends the play for every host.
- name: Install Intel Gaudi drivers on nodes
hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
gather_facts: true
any_errors_fatal: true
roles:
- intel
tags: intel

- name: Import playbook to set performance profile on nodes
ansible.builtin.import_playbook: "../utils/performance_profile/performance_profile.yml"
4 changes: 3 additions & 1 deletion accelerator/ansible.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ log_path = /var/log/omnia/accelerator.log
host_key_checking = false
forks = 5
timeout = 180
collections_path = $VIRTUAL_ENV
executable = /bin/bash

[persistent_connection]
command_timeout = 180
connect_timeout = 180

[ssh_connection]
retries = 3
ssh_args = -o ControlMaster=auto -o ControlPersist=180
ssh_args = -o ControlMaster=auto -o ControlPersist=180
25 changes: 23 additions & 2 deletions accelerator/roles/accelerator_validation/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,33 @@
# limitations under the License.
---

- name: Saving distribution of os
- name: Saving distribution and version of OS
ansible.builtin.set_fact:
control_plane_os: "{{ ansible_distribution | lower }}"
oim_os: "{{ ansible_distribution | lower }}"
oim_os_version: "{{ ansible_distribution_version | lower }}"

- name: Include local_repo variables
ansible.builtin.include_tasks: include_local_repo_config.yml

# Run the per-vendor validations. validate_amd.yml sets amdgpu_config_status and
# validate_intel_gaudi.yml sets intel_gaudi_config_status as facts on localhost.
# (Both tasks were previously named "Check xcat installation status", which did
# not describe what they include.)
- name: Validate AMD GPU inputs
  ansible.builtin.include_tasks: validate_amd.yml

- name: Validate Intel Gaudi inputs
  ansible.builtin.include_tasks: validate_intel_gaudi.yml

# Print the resulting status facts so the run log shows which vendor stack was detected.
- name: Debug intel_gaudi_config_status
  ansible.builtin.debug:
    msg: "intel_gaudi_config_status is {{ hostvars['localhost']['intel_gaudi_config_status'] }}"

- name: Debug amdgpu_config_status
  ansible.builtin.debug:
    msg: "amdgpu_config_status is {{ hostvars['localhost']['amdgpu_config_status'] }}"

# Stop only when both facts are defined and both are false, i.e. neither
# accelerator stack validated successfully.
- name: Check if both intel_gaudi_config_status and amdgpu_config_status are false
  ansible.builtin.fail:
    msg: "{{ driver_not_found_msg }}"
  when: >
    hostvars['localhost']['intel_gaudi_config_status'] is defined and
    hostvars['localhost']['amdgpu_config_status'] is defined and
    not hostvars['localhost']['intel_gaudi_config_status'] | bool and
    not hostvars['localhost']['amdgpu_config_status'] | bool
79 changes: 50 additions & 29 deletions accelerator/roles/accelerator_validation/tasks/validate_amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
file: "{{ software_config_json_file }}"
name: user_config

- name: Include vars for {{ control_plane_os }}
ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml"
- name: Include vars for {{ oim_os }}
ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml"

- name: Get amdgpu status
ansible.builtin.set_fact:
Expand All @@ -47,10 +47,11 @@
loop_control:
loop_var: item

- name: Failed, AMDGPU ROCm software stack not present in software_config.json
ansible.builtin.fail:
msg: "{{ amdgpu_input_fail_msg }}"
when: not amdgpu_input_status
- name: Check if the rocm offline repo exists
ansible.builtin.stat:
path: "{{ offline_rocm_directory }}/rocm/"
register: check_rocm_repo
when: rocm_input_status

- name: Set amdgpu_config_status
when: amdgpu_input_status
Expand All @@ -63,28 +64,28 @@
ansible.builtin.set_fact:
amdgpu_directory: "{{ offline_rocm_directory }}/amdgpu/{{ amdgpu_version }}/"

- name: Check amdgpu_version exists or not
- name: Check amdgpu version directory exists or not
ansible.builtin.stat:
path: "{{ amdgpu_directory }}"
register: check_amdgpu_dir
failed_when: not check_amdgpu_dir.stat.exists

- name: Set amdgpu_config_status to true
- name: Set amdgpu_config_status based on directory existence
ansible.builtin.set_fact:
amdgpu_config_status: true
when: check_amdgpu_dir.stat.exists
amdgpu_config_status: "{{ check_amdgpu_dir.stat.exists | ternary(true, false) }}"
rescue:
- name: Failed, amdgpu directory repo not found
ansible.builtin.fail:
msg: "{{ amdgpu_repo_fail_msg }}"
when: not check_amdgpu_dir.stat.exists
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "

- name: Failed, amdgpu version not found
ansible.builtin.fail:
msg: "{{ amdgpu_version_fail_msg }}"
- name: Set amdgpu_config_status to false
ansible.builtin.set_fact:
amdgpu_config_status: false

- name: Set rocm_config_status
when: rocm_input_status
when:
- rocm_input_status
- user_config.repo_config == 'always' or user_config.repo_config == 'partial'
- check_rocm_repo.stat.exists
block:
- name: Fetch rocm_version
ansible.builtin.set_fact:
Expand All @@ -98,18 +99,38 @@
ansible.builtin.stat:
path: "{{ rocm_directory }}"
register: check_rocm_dir
failed_when: not check_rocm_dir.stat.exists

- name: Set rocm_config_status based on directory existence
ansible.builtin.set_fact:
rocm_config_status: "{{ check_rocm_dir.stat.exists | ternary(true, false) }}"

rescue:
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "

- name: Set rocm_config_status to false
ansible.builtin.set_fact:
rocm_config_status: false

- name: Set rocm_config_status
when:
- rocm_input_status
- user_config.repo_config == 'never' or user_config.repo_config == 'partial'
- not check_rocm_repo.stat.exists
block:
- name: Fetch rocm_version
ansible.builtin.set_fact:
rocm_version: "{{ user_config.amdgpu | selectattr('name', 'equalto', 'rocm') | map(attribute='version') | first }}"

- name: Set rocm_config_status to true
ansible.builtin.set_fact:
rocm_config_status: true
when: check_rocm_dir.stat.exists
rescue:
- name: Failed, rocm directory repo not found
ansible.builtin.fail:
msg: "{{ rocm_repo_fail_msg }}"
when: not check_rocm_dir.stat.exists

- name: Failed, rocm version not found
ansible.builtin.fail:
msg: "{{ rocm_version_fail_msg }}"
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "

- name: Set rocm_config_status to false
ansible.builtin.set_fact:
rocm_config_status: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2024 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Validate the Intel Gaudi inputs.
# Outcome facts:
#   intel_gaudi_input_status  - 'intelgaudi' is listed under softwares in software_config.json
#   habana_input_status       - an 'intel' entry is listed under gaudi in software_config.json
#   intel_gaudi_config_status - the versioned intelgaudi directory exists in the offline repo
#   habana_config_status      - driver packages matching gaudi_search_pattern were found
# All four start as false and are only ever flipped to true.
- name: Set default intel gaudi status
  ansible.builtin.set_fact:
    habana_config_status: false
    habana_input_status: false
    intel_gaudi_config_status: false
    intel_gaudi_input_status: false

- name: Load software_config.json
  ansible.builtin.include_vars:
    file: "{{ software_config_json_file }}"
    name: user_config

- name: Include vars for {{ oim_os }}
  ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml"

- name: Get Intel Gaudi status
  ansible.builtin.set_fact:
    intel_gaudi_input_status: true
  loop: "{{ user_config.softwares | default([]) }}"
  when:
    - "'intelgaudi' in item.name"
  loop_control:
    loop_var: item

- name: Get habana status only if intel gaudi input is present
  ansible.builtin.set_fact:
    habana_input_status: true
  loop: "{{ user_config.gaudi | default([]) }}"
  when:
    - intel_gaudi_input_status
    - "'intel' in item.name"
  loop_control:
    loop_var: item

- name: Set intel_gaudi_config_status
  when: intel_gaudi_input_status
  block:
    - name: Fetch intelgaudi_version
      ansible.builtin.set_fact:
        intelgaudi_version: "{{ user_config.softwares | selectattr('name', 'equalto', 'intelgaudi') | map(attribute='version') | first }}"

    - name: Set intelgaudi_directory
      ansible.builtin.set_fact:
        intelgaudi_directory: "{{ offline_intelgaudi_directory }}/intelgaudi/{{ intelgaudi_version }}/"

    - name: Set gaudi_directory
      ansible.builtin.set_fact:
        gaudi_directory: "{{ intelgaudi_directory }}"

    - name: Check gaudi_directory exists or not
      ansible.builtin.stat:
        path: "{{ gaudi_directory }}"
      register: check_gaudi_dir

    - name: Set intel_gaudi_config_status to true
      ansible.builtin.set_fact:
        intel_gaudi_config_status: true
      when: check_gaudi_dir.stat.exists

  rescue:
    # The block can fail before the stat task has registered a result (for
    # example while fetching the version), so the guard must tolerate a
    # missing check_gaudi_dir instead of raising inside the rescue itself.
    - name: Intel Gaudi not found
      ansible.builtin.debug:
        msg: "{{ intel_gaudi_repo_fail_msg }}"
      when: check_gaudi_dir is not defined or not (check_gaudi_dir.stat.exists | default(false))

# Gate on habana_input_status: habana_config_status is still false at this
# point (it is only set to true inside this block), so gating on it meant the
# driver package check could never run.
- name: Set habana_config_status
  when: habana_input_status
  block:
    - name: Check driver packages inside offline_gaudi_directory
      ansible.builtin.find:
        paths: "{{ offline_gaudi_directory }}"
        patterns: "{{ gaudi_search_pattern }}"
      register: check_driver_packages

    - name: Set habana_config_status to true
      ansible.builtin.set_fact:
        habana_config_status: true
      when: check_driver_packages.matched > 0

  rescue:
    # A failed find result may carry no 'matched' key; treat that as zero matches.
    - name: Intel Gaudi driver packages not found
      ansible.builtin.debug:
        msg: "{{ intel_gaudi_repo_fail_msg }}"
      when: (check_driver_packages.matched | default(0)) == 0
9 changes: 9 additions & 0 deletions accelerator/roles/accelerator_validation/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,16 @@ amdgpu_version_fail_msg: "Failed, software_config.json does not have the version
amdgpu_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading AMDGPU packages."
rocm_version_fail_msg: "Failed, software_config.json does not have the version for ROCM."
rocm_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading ROCM packages."
# Logged from the rescue sections of validate_amd.yml, which cover both the
# amdgpu_config_status and the rocm_config_status blocks.
amdgpu_fail_msg: "An error occurred while setting the amdgpu_config_status or rocm_config_status."

# Usage: include_local_repo_config.yml
local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml"
local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again."

# Usage: validate_intel_gaudi.yml
intel_gaudi_input_fail_msg: "Failed, software_config.json does not have the intelgaudi software stack."
intel_gaudi_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading Intel Gaudi driver packages."

# Usage: main.yml
# Failure text shown when both intel_gaudi_config_status and amdgpu_config_status are false.
# Folded and stripped (>-) so the message is a single line with no trailing
# newline; the previous literal block scalar also kept the surrounding double
# quotes as part of the message.
driver_not_found_msg: >-
  Please ensure that either 'intelgaudi' or 'amdgpu' is included in 'software_config.json'
  and then run 'accelerator.yml' to install GPU drivers.
5 changes: 5 additions & 0 deletions accelerator/roles/accelerator_validation/vars/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,8 @@

# Usage: validate_amd.yml
offline_rocm_directory: "{{ repo_store_path }}/cluster/apt"

# Usage: validate_intel_gaudi.yml
offline_intelgaudi_directory: "{{ repo_store_path }}/cluster/apt"
offline_gaudi_directory: "{{ repo_store_path }}/cluster/{{ oim_os }}/{{ oim_os_version }}/deb"
gaudi_search_pattern: "habanalabs*.deb"
Loading

0 comments on commit a06c9d1

Please sign in to comment.