diff --git a/README.md b/README.md
index f5edf93..65b6d62 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Gaudi Setup and Installation
+# Intel® Gaudi® Accelerator Setup and Installation
@@ -6,7 +6,7 @@
-By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Habana software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/).
+By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/).
@@ -18,7 +18,7 @@ By installing, copying, accessing, or using the software, you agree to be legall
Welcome to the Setup and Installation GitHub Repository!
-The full installation documentation has been consolidated into the Installation Guide in our Habana Documentation. Please reference our [Habana docs](https://docs.habana.ai/en/latest/Installation_Guide/GAUDI_Installation_Guide.html) for the full installation guide.
+The full installation documentation has been consolidated into the Installation Guide in our Intel Gaudi Documentation. Please reference our [Intel Gaudi docs](https://docs.habana.ai/en/latest/Installation_Guide/GAUDI_Installation_Guide.html) for the full installation guide.
This repository contains the following references:
- dockerfiles -- Reference dockerfiles and build script to build Gaudi Docker images
diff --git a/dockerfiles/base/Dockerfile.rhel8.6 b/dockerfiles/base/Dockerfile.rhel8.6
index aa74d90..5d93727 100644
--- a/dockerfiles/base/Dockerfile.rhel8.6
+++ b/dockerfiles/base/Dockerfile.rhel8.6
@@ -18,13 +18,13 @@ RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.n
RUN echo "[appstream]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \
echo "name=CentOS Linux 8 - AppStream" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \
- echo "mirrorlist=http://mirrorlist.centos.org/?release=\$releasever-stream&arch=\$basearch&repo=AppStream&infra=\$infra" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \
+ echo "baseurl=https://vault.centos.org/8-stream/AppStream/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \
echo "gpgcheck=0" >> /etc/yum.repos.d/CentOS-Linux-AppStream.repo
RUN echo "[BaseOS]" > /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \
echo "name=CentOS Linux 8 - BaseOS" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \
- echo "mirrorlist=http://mirrorlist.centos.org/?release=\$releasever-stream&arch=\$basearch&repo=BaseOS&infra=\$infra" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \
+ echo "baseurl=https://vault.centos.org/8-stream/BaseOS/x86_64/os" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo && \
echo "gpgcheck=0" >> /etc/yum.repos.d/CentOS-Linux-BaseOS.repo
RUN dnf install -y \
@@ -77,7 +77,7 @@ RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \
echo "name=powertools" >> /etc/yum.repos.d/powertools.repo && \
- echo "baseurl=http://mirror.centos.org/centos/8-stream/PowerTools/x86_64/os/" >> /etc/yum.repos.d/powertools.repo && \
+ echo "baseurl=https://vault.centos.org/8-stream/PowerTools/x86_64/os/" >> /etc/yum.repos.d/powertools.repo && \
echo "gpgcheck=0" >> /etc/yum.repos.d/powertools.repo
RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el8 \
diff --git a/dockerfiles/common.mk b/dockerfiles/common.mk
index 84d8c23..81f3293 100644
--- a/dockerfiles/common.mk
+++ b/dockerfiles/common.mk
@@ -6,8 +6,8 @@ BUILD_DIR ?= $(CURDIR)/dockerbuild
REPO_SERVER ?= vault.habana.ai
PT_VERSION ?= 2.2.2
-RELEASE_VERSION ?= 1.16.0
-RELEASE_BUILD_ID ?= 526
+RELEASE_VERSION ?= 1.16.1
+RELEASE_BUILD_ID ?= 7
BASE_IMAGE_URL ?= base-installer-$(BUILD_OS)
IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID)
diff --git a/utils/README.md b/utils/README.md
index 7c29b9d..af6a8c0 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -1,6 +1,6 @@
# Gaudi Utils
-By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Habana software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/).
+By installing, copying, accessing, or using the software, you agree to be legally bound by the terms and conditions of the Intel Gaudi software license agreement [defined here](https://habana.ai/habana-outbound-software-license-agreement/).
## Table of Contents
@@ -14,20 +14,20 @@ By installing, copying, accessing, or using the software, you agree to be legall
- [Status](#status)
- [Set IP](#set-ip)
- [Unset IP](#unset-ip)
- - [check\_habana\_framework\_env](#check_habana_framework_env)
- - [Habana Health Screen (HHS)](#habana-health-screen-hhs)
+ - [check\_framework\_env](#check_framework_env)
+ - [Intel Gaudi Health Screen (IGHS)](#intel-gaudi-health-screen-ighs)
## Overview
-Welcome to Gaudi's Util Scripts!
+Welcome to Intel Gaudi's Util Scripts!
-This folder contains some Gaudi utility scripts that users can access as reference.
+This folder contains Intel Gaudi utility scripts that users can use as a reference.
## manage_network_ifs
Moved to habanalabs-qual. Example: /opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh.
-This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Gaudi network interfaces.
+This script can be used as a reference to bring up, take down, set IPs, unset IPs, and check the status of the Intel Gaudi network interfaces.
The following is the usage of the script:
@@ -35,11 +35,11 @@ The following is the usage of the script:
usage: ./manage_network_ifs.sh [options]
options:
- --up toggle up all Habana network interfaces
- --down toggle down all Habana network interfaces
- --status print status of all Habana network interfaces
- --set-ip set IP for all internal Habana network interfaces
- --unset-ip unset IP from all internal Habana network interfaces
+ --up toggle up all Intel Gaudi network interfaces
+ --down toggle down all Intel Gaudi network interfaces
+ --status print status of all Intel Gaudi network interfaces
+ --set-ip set IP for all internal Intel Gaudi network interfaces
+ --unset-ip unset IP from all internal Intel Gaudi network interfaces
-v, --verbose print more logs
-h, --help print this help
@@ -47,67 +47,67 @@ Note: Please run this script with one operation at a time
```
## Operations
-Before executing any operation, this script finds all the Habana network interfaces available on the system and stores the Habana interface information into a list.
-The list will be used for the operations. If no Habana network interface is found, the script will exit.
+Before executing any operation, this script finds all the Intel Gaudi network interfaces available on the system and stores the Intel Gaudi interface information into a list.
+The list will be used for the operations. If no Intel Gaudi network interface is found, the script will exit.
### Up
-Use the following command to bring all Habana network interfaces online:
+Use the following command to bring all Intel Gaudi network interfaces online:
```
sudo manage_network_ifs.sh --up
```
-Once all the Habana interfaces are toggled up, IPs will be set by default. Please refer [Set Ip](#set-ip) for more detail. To unset IPs, run this script with '--unset-ip'
+Once all the Intel Gaudi interfaces are toggled up, IPs will be set by default. Please refer to [Set IP](#set-ip) for more details. To unset IPs, run this script with '--unset-ip'.
### Down
-Use the following command to bring all Habana network interfaces offline:
+Use the following command to bring all Intel Gaudi network interfaces offline:
```
sudo manage_network_ifs.sh --down
```
### Status
-Print the current operational state of all Habana network interfaces such as how many ports are up/down:
+Print the current operational state of all Intel Gaudi network interfaces, such as how many ports are up/down:
```
sudo manage_network_ifs.sh --status
```
### Set IP
-Use the following command to assign a default IP for all Habana network interfaces:
+Use the following command to assign a default IP for all Intel Gaudi network interfaces:
```
sudo manage_network_ifs.sh --set-ip
```
Note: Default IPs are 192.168.100.1, 192.168.100.2, 192.168.100.3 and so on
### Unset IP
-Remove IP from all available Habana network interfaces by the following command:
+Remove IP from all available Intel Gaudi network interfaces by the following command:
```
sudo manage_network_ifs.sh --unset-ip
```
-## check_habana_framework_env
+## check_framework_env
-This script can be used as reference to check the environment for running PyTorch on Habana.
+This script can be used as a reference to check the environment for running PyTorch on Intel Gaudi.
The following is the usage of the script:
```
-usage: check_habana_framework_env.py [-h] [--cards CARDS]
+usage: check_framework_env.py [-h] [--cards CARDS]
-Check health of HPUs for PyTorch
+Check health of Intel Gaudi for PyTorch
optional arguments:
-h, --help show this help message and exit
--cards CARDS Set number of cards to test (default: 1)
```
-## Habana Health Screen (HHS)
+## Intel Gaudi Health Screen (IGHS)
-**Habana Health Screen** (HHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test
+The **Intel Gaudi Health Screen** (IGHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test
includes checking Gaudi port status, running small workloads, and running standard collective operations across multiple systems.
``` bash
usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES]
[--job-id JOB_ID] [--round ROUND] [--config CONFIG]
- [--hhs-check [{node,hccl-demo,none}]] [--node-write-report]
+ [--ighs-check [{node,hccl-demo,none}]] [--node-write-report]
[--node-name NODE_NAME] [--logs-dir LOGS_DIR]
optional arguments:
@@ -119,18 +119,18 @@ optional arguments:
--job-id JOB_ID Needed to identify hccl-demo running log
--round ROUND Needed to identify hccl-demo running round log
--config CONFIG Configuration file for Health Screener
- --hhs-check [{node,hccl-demo,none}]
- Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce
+ --ighs-check [{node,hccl-demo,none}]
+ Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce
                        (HCCL_DEMO between pairs of nodes)
--node-write-report Write Individual Node Health Report
--node-name NODE_NAME Name of Node
--logs-dir LOGS_DIR Output directory of health screen results
```
-To run a full HHS test, run the below command:
+To run a full IGHS test, run the command below:
``` bash
-# Creates HHS Report and screens clusters for any infected nodes.
+# Creates IGHS Report and screens clusters for any infected nodes.
# Will check Level 1 and 2 by default
python screen.py --initialize --screen
```
\ No newline at end of file
diff --git a/utils/check_habana_framework_env.py b/utils/check_framework_env.py
old mode 100755
new mode 100644
similarity index 82%
rename from utils/check_habana_framework_env.py
rename to utils/check_framework_env.py
index 359aac0..c12bf28
--- a/utils/check_habana_framework_env.py
+++ b/utils/check_framework_env.py
@@ -15,7 +15,7 @@
import concurrent.futures
def parse_arguments():
- parser = argparse.ArgumentParser(description="Check health of HPUs for PyTorch")
+ parser = argparse.ArgumentParser(description="Check health of Intel Gaudi for PyTorch")
parser.add_argument("--cards",
default=1,
@@ -29,11 +29,11 @@ def parse_arguments():
return args
def pytorch_test(device_id=0):
- """ Checks health of HPU through running a basic
- PyTorch example on HPU
+ """ Checks health of Intel Gaudi through running a basic
+ PyTorch example on Intel Gaudi
Args:
- device_id (int, optional): ID of HPU. Defaults to 0.
+ device_id (int, optional): ID of Intel Gaudi. Defaults to 0.
"""
os.environ["ID"] = str(device_id)
@@ -42,7 +42,7 @@ def pytorch_test(device_id=0):
import torch
import habana_frameworks.torch.core
except Exception as e:
- print(f"Card {device_id} Failed to initialize Habana PyTorch: {str(e)}")
+ print(f"Card {device_id} Failed to initialize Intel Gaudi PyTorch: {str(e)}")
raise
try:
@@ -50,7 +50,7 @@ def pytorch_test(device_id=0):
y = x + x
assert y == 4, 'Sanity check failed: Wrong Add output'
- assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Habana Device'
+ assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card'
except (RuntimeError, AssertionError) as e:
print(f"Card {device_id} Failure: {e}")
raise
@@ -64,7 +64,7 @@ def pytorch_test(device_id=0):
for device_id, res in zip(range(args.cards), executor.map(pytorch_test, range(args.cards))):
print(f"Card {device_id} PASSED")
except Exception as e:
- print(f"Failed to initialize Habana, error: {str(e)}")
+ print(f"Failed to initialize on Intel Gaudi, error: {str(e)}")
print(f"Check FAILED")
exit(1)
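
The renamed script keeps the same per-card sanity check shown in the hunks above: set the `ID` environment variable to pick a card, import the Intel Gaudi PyTorch bridge, run a small add on the `hpu` device, and assert on the result. A minimal standalone sketch of that pattern (it assumes the Intel Gaudi PyTorch packages are installed; it is not the script itself):

``` python
import os

def pytorch_test(device_id=0):
    """ Run a basic PyTorch add on one Intel Gaudi card (sketch of the pattern above). """
    os.environ["ID"] = str(device_id)  # select the card before importing the bridge

    # Importing habana_frameworks.torch.core registers the 'hpu' device with PyTorch.
    import torch
    import habana_frameworks.torch.core

    x = torch.tensor([2]).to('hpu')
    y = x + x
    assert y == 4, 'Sanity check failed: Wrong Add output'
    assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card'
    return device_id
```

The real script maps this check over `range(args.cards)` with `concurrent.futures` and prints a PASSED/FAILED line per card.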
diff --git a/utils/habana_health_screen/version.txt b/utils/habana_health_screen/version.txt
deleted file mode 100644
index afaf360..0000000
--- a/utils/habana_health_screen/version.txt
+++ /dev/null
@@ -1 +0,0 @@
-1.0.0
\ No newline at end of file
diff --git a/utils/habana_health_screen/.gitignore b/utils/intel_gaudi_health_screen/.gitignore
similarity index 100%
rename from utils/habana_health_screen/.gitignore
rename to utils/intel_gaudi_health_screen/.gitignore
diff --git a/utils/habana_health_screen/HabanaHealthReport.py b/utils/intel_gaudi_health_screen/HealthReport.py
similarity index 90%
rename from utils/habana_health_screen/HabanaHealthReport.py
rename to utils/intel_gaudi_health_screen/HealthReport.py
index 4ac194e..b0409f1 100644
--- a/utils/habana_health_screen/HabanaHealthReport.py
+++ b/utils/intel_gaudi_health_screen/HealthReport.py
@@ -18,12 +18,12 @@
import logging
-_logger = logging.getLogger("habana_health_screener")
+_logger = logging.getLogger("health_screener")
-class HabanaHealthReport():
+class HealthReport():
def __init__(self, f_dir="tmp", report_name="health_report.csv"):
- """ Initialize Habana Health Report Class
+ """ Initialize Health Report Class
Args:
f_dir (str, optional): File Directory to store Health Report logs and results. Defaults to "tmp".
@@ -83,8 +83,8 @@ def write_rows(self, cards=list(), node_id="", data=list(), level=1):
""" Write health check results to Health Report CSV. Can write multiple rows at once
Args:
- cards ([HCard], optional): Level 1 HCards to report about. Defaults to list().
- node_id (str, optional): Node ID of HCards. Defaults to "".
+ cards ([IGCard], optional): Level 1 IGCards to report about. Defaults to list().
+ node_id (str, optional): Node ID of IGCards. Defaults to "".
data (_type_, optional): Health Report CSV Row data. Defaults to list().
level (int, optional): Health Screen Level. Defaults to 1.
"""
@@ -118,12 +118,12 @@ def update_health_report(self, detected_nodes, infected_nodes, missing_nodes):
infected_nodes (list[str]): List of infected node_ids
missing_nodes (list[str]): List of missing node_ids
"""
- tempfile = NamedTemporaryFile(mode='w', delete=False)
+ temp_file = NamedTemporaryFile(mode='w', delete=False)
detected_nodes_cp = detected_nodes.copy()
- with open(self.f_path, 'r', newline='') as csvfile, tempfile:
- reader = csv.DictReader(csvfile)
- writer = csv.DictWriter(tempfile, fieldnames=self.header)
+ with open(self.f_path, 'r', newline='') as csv_file, temp_file:
+ reader = csv.DictReader(csv_file)
+ writer = csv.DictWriter(temp_file, fieldnames=self.header)
writer.writeheader()
for row in reader:
@@ -148,22 +148,22 @@ def update_health_report(self, detected_nodes, infected_nodes, missing_nodes):
for n in missing_nodes:
writer.writerow({"node_id": n, "multi_node_fail": True, "missing": True})
- shutil.move(tempfile.name, self.f_path)
+ shutil.move(temp_file.name, self.f_path)
def update_hccl_demo_health_report(self, round, all_node_pairs, multi_node_fail, qpc_fail, missing_nodes):
""" Update health_report with hccl_demo results, based on infected_nodes.
Args:
- all_node_pairs (list[str]): List of all node pairs reported by Level 2 round
+ all_node_pairs (list[str]): List of all Node Pairs reported by Level 2 round
multi_node_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test
qpc_fail (list[str]): List of Node Pairs that failed HCCL_Demo Test due to QPC error
missing_nodes (list[str]): List of Node Pairs that couldn't run HCCL_Demo
"""
- tempfile = NamedTemporaryFile(mode='w', delete=False)
+ temp_file = NamedTemporaryFile(mode='w', delete=False)
- with open(self.f_path_hccl_demo, 'r', newline='') as csvfile, tempfile:
- reader = csv.DictReader(csvfile)
- writer = csv.DictWriter(tempfile, fieldnames=self.header_hccl_demo, extrasaction='ignore')
+ with open(self.f_path_hccl_demo, 'r', newline='') as csv_file, temp_file:
+ reader = csv.DictReader(csv_file)
+ writer = csv.DictWriter(temp_file, fieldnames=self.header_hccl_demo, extrasaction='ignore')
writer.writeheader()
for row in reader:
@@ -181,7 +181,7 @@ def update_hccl_demo_health_report(self, round, all_node_pairs, multi_node_fail,
if len(all_node_pairs):
writer.writerows(list(all_node_pairs.values()))
- shutil.move(tempfile.name, self.f_path_hccl_demo)
+ shutil.move(temp_file.name, self.f_path_hccl_demo)
def check_screen_complete(self, num_nodes, hccl_demo=False, round=0):
""" Check on status of Health Screen Check.
@@ -306,11 +306,11 @@ def gather_health_report(self, level, remote_path, hosts):
""" Gathers Health Report from all hosts
Args:
- level (str): HHS Level
- remote_path (str): Remote Destintation of HHS Report
- hosts (list, optional): List of IP Addresses to gather HHS Reports
+ level (str): IGHS Level
+            remote_path (str): Remote Destination of IGHS Report
+ hosts (list, optional): List of IP Addresses to gather IGHS Reports
"""
- copy_files(src=f"{remote_path}/habana_health_screen/{self.f_dir}/L{level}",
+ copy_files(src=f"{remote_path}/intel_gaudi_health_screen/{self.f_dir}/L{level}",
dst=f"{self.f_dir}",
hosts=hosts,
to_remote=False)
@@ -319,7 +319,7 @@ def consolidate_health_report(self, level, report_dir):
""" Consolidates the health_report_*.csv from worker pods into a single master csv file
Args:
- level (str): HHS Level
+ level (str): IGHS Level
report_dir (str): Directory of CSV files to merge
"""
data = list()
@@ -327,8 +327,8 @@ def consolidate_health_report(self, level, report_dir):
csv_files = glob.glob(path)
for f in csv_files:
- with open(f, 'r', newline='') as csvfile:
- reader = csv.DictReader(csvfile)
+ with open(f, 'r', newline='') as csv_file:
+ reader = csv.DictReader(csv_file)
for row in reader:
data.append(row)
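
The `temp_file` renames above all touch the same rewrite-in-place pattern used by the report updates: read the existing CSV, write updated rows through `csv.DictWriter` into a `NamedTemporaryFile`, then move the temporary file over the original. A minimal sketch of that pattern; `mark_nodes` and its arguments are illustrative stand-ins, not the report's actual API:

``` python
import csv
import shutil
from tempfile import NamedTemporaryFile

def mark_nodes(csv_path, header, infected_nodes):
    """ Rewrite csv_path, flagging rows whose node_id is in infected_nodes (illustrative). """
    temp_file = NamedTemporaryFile(mode='w', delete=False, newline='')

    with open(csv_path, 'r', newline='') as csv_file, temp_file:
        reader = csv.DictReader(csv_file)
        writer = csv.DictWriter(temp_file, fieldnames=header, extrasaction='ignore')
        writer.writeheader()

        for row in reader:
            if row["node_id"] in infected_nodes:
                row["multi_node_fail"] = True
            writer.writerow(row)

    # Replace the original report with the updated copy.
    shutil.move(temp_file.name, csv_path)
```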
diff --git a/utils/habana_health_screen/HNodes.py b/utils/intel_gaudi_health_screen/IGNodes.py
similarity index 77%
rename from utils/habana_health_screen/HNodes.py
rename to utils/intel_gaudi_health_screen/IGNodes.py
index 4cf0abf..3980209 100644
--- a/utils/habana_health_screen/HNodes.py
+++ b/utils/intel_gaudi_health_screen/IGNodes.py
@@ -10,24 +10,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import os, time, yaml, csv
+import os, time, csv
import logging
-from multiprocessing.pool import Pool
+import multiprocessing
-from HabanaHealthReport import HabanaHealthReport
+from HealthReport import HealthReport
from utilities import run_cmd, create_logger
-from hccl_demo_helper import find_groups
-_logger = logging.getLogger("habana_health_screener")
+_logger = logging.getLogger("health_screener")
-class HNodes():
+class IGNodes():
- def __init__(self, health_report=HabanaHealthReport()):
+ def __init__(self, health_report=HealthReport()):
""" Keeps Track of Nodes and their current states
Args:
- health_report (HabanaHealthReport, optional): HHS Health Report. Defaults to creating a new HabanaHealthReport().
+ health_report (HealthReport, optional): IGHS Health Report. Defaults to creating a new HealthReport().
"""
self.all_nodes = list()
self.launcher_nodes = list()
@@ -42,14 +41,13 @@ def __init__(self, health_report=HabanaHealthReport()):
-class HNode():
+class IGNode():
- def __init__(self, name="", health_report=HabanaHealthReport(), num_checks_link_state=10, log_level=logging.INFO):
+ def __init__(self, name="", health_report=HealthReport(), num_checks_link_state=10, log_level=logging.INFO):
self.name = name
if name == "" and "MY_NODE_NAME" in os.environ:
self.name = os.environ["MY_NODE_NAME"]
-
self.cards = dict()
self.num_checks_link_state = num_checks_link_state
@@ -77,21 +75,31 @@ def scan_cards(self):
memory_used = int(row[3].split()[0])
temperature_C = int(row[4].split()[0])
- card = HCard(index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger)
+ card = IGCard(index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger)
self.cards[i] = card
self.cards = dict(sorted(self.cards.items()))
def health_check(self, target_cards=[], write_report=False):
- checked_cards = list()
+ checked_cards = list()
+ processes = list()
+ card_queue = multiprocessing.Queue()
if len(target_cards) == 0:
target_cards = self.cards.keys()
for i in target_cards:
card = self.cards[str(i)]
- card.check_health(num_checks_link_state=self.num_checks_link_state)
+ p = multiprocessing.Process(target=card.check_health, args=(self.num_checks_link_state,card_queue))
+
+ p.start()
+ processes.append((card,p))
+
+ for card,p in processes:
+ p.join()
+ card_queue.put(None)
+ for card in iter(card_queue.get, None):
checked_cards.append(card)
self.logger.info(card)
@@ -99,8 +107,7 @@ def health_check(self, target_cards=[], write_report=False):
self.health_report.write_rows(node_id=self.name, cards=checked_cards)
-
-class HCard():
+class IGCard():
def __init__(self, index=-1, module_id=-1, pci_address="", memory_used=-1, framework="pytorch", temperature=-1, logger=None):
self.logger = logger
@@ -119,11 +126,13 @@ def __init__(self, index=-1, module_id=-1, pci_address="", memory_used=-1, frame
self.external_ports = [1, 8, 9]
self.incorrect_ports_direction = list()
- def check_health(self,num_checks_link_state=10):
+ def check_health(self,num_checks_link_state=10, checked_cards=[]):
self.check_link_state(attempts=num_checks_link_state, sleep_sec=0.2)
self.check_device_acquire_fail()
self.check_temperature_state()
+ checked_cards.put(self)
+
def check_link_state(self, attempts=10, sleep_sec=0.5):
self.logger.debug(f"Checking {self.pci_address} Link State. Will check {attempts} times")
cmd = f"hl-smi -n link -i {self.pci_address}"
@@ -170,21 +179,26 @@ def check_port_direction(self):
def check_device_acquire_fail(self):
self.logger.debug(f"Checking {self.pci_address} for Device Acquire Issues")
+ self.device_acquire_fail = False
- from build.Setup_and_Install.utils import check_habana_framework_env
+ os.environ["ID"] = str(self.module_id)
- self.device_acquire_fail = False
- fw_test = check_habana_framework_env.pytorch_test
- if self.framework == "tensorflow":
- fw_test = check_habana_framework_env.tensorflow_test
+ try:
+ import torch
+ import habana_frameworks.torch.core
+ except Exception as e:
+ self.logger.error(f"Card {self.module_id} {self.pci_address} Failed to initialize Intel Gaudi PyTorch: {str(e)}")
+ self.device_acquire_fail = True
try:
- with Pool() as pool:
- result = pool.apply(fw_test, args=(self.module_id))
+ x = torch.tensor([2]).to('hpu')
+ y = x + x
+ assert y == 4, 'Sanity check failed: Wrong Add output'
+            assert 'hpu' in y.device.type.lower(), 'Sanity check failed: Operation not executed on Intel Gaudi Card'
except (RuntimeError, AssertionError, Exception) as e:
+ self.logger.error(f"{self.pci_address} Device Acquire Failure: {e}")
self.device_acquire_fail = True
- self.logger.warning(f"{self.pci_address} Device Acquire Failure")
return self.device_acquire_fail
@@ -197,16 +211,8 @@ def check_temperature_state(self):
self.temperature_state_C = "CRITICAL"
elif self.temperature_C - base_temperature >= max_delta:
self.temperature_state_C = "WARN"
-
- def check_temperature_state(self):
- max_good_temperature = 83
- base_temperature = 25
- max_delta = 25
-
- if self.temperature_C >= max_good_temperature:
- self.temperature_state_C = "CRITICAL"
- elif self.temperature_C - base_temperature >= max_delta:
- self.temperature_state_C = "WARN"
+ else:
+ self.temperature_state_C = "NORMAL"
def __str__(self):
report_str = f""" Index: {self.index}
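
The reworked `health_check` above fans the per-card checks out to one process per card and collects results through a `multiprocessing.Queue`, pushing a `None` sentinel after every worker has joined so the drain loop knows when to stop. A self-contained sketch of that fan-out/collect pattern, with `check_card` as a trivial stand-in for the real `IGCard.check_health`:

``` python
import multiprocessing

def check_card(card_id, result_queue):
    """ Stand-in for IGCard.check_health: do the per-card work, then report back. """
    result_queue.put({"card": card_id, "healthy": True})

def health_check(card_ids):
    processes = list()
    card_queue = multiprocessing.Queue()

    # Fan out: one process per card, mirroring the loop in IGNode.health_check.
    for card_id in card_ids:
        p = multiprocessing.Process(target=check_card, args=(card_id, card_queue))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # Sentinel terminates the drain loop; one result is expected per card.
    card_queue.put(None)
    return [result for result in iter(card_queue.get, None)]

if __name__ == "__main__":
    print(health_check(range(4)))
```

Because the sentinel is only pushed after all joins, `iter(card_queue.get, None)` returns exactly one result per spawned card.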
diff --git a/utils/habana_health_screen/README.md b/utils/intel_gaudi_health_screen/README.md
similarity index 85%
rename from utils/habana_health_screen/README.md
rename to utils/intel_gaudi_health_screen/README.md
index ae23ced..a0f89d5 100644
--- a/utils/habana_health_screen/README.md
+++ b/utils/intel_gaudi_health_screen/README.md
@@ -1,20 +1,20 @@
-# Habana Health Screen 1.0.0
+# Intel Gaudi Health Screen 2.0.0
-A large scale Gaudi cluster contains a lot of moving parts. To ensure distributed training proceeds smoothly, it is recommended to check the
+A large-scale Intel Gaudi cluster contains a lot of moving parts. To ensure distributed training proceeds smoothly, it is recommended to check the
cluster network health. Troubleshooting issues on a large cluster can be tedious. To simplify the debugging process, the
-**Habana Health Screen** (HHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test
+The **Intel Gaudi Health Screen** (IGHS) tool has been developed to verify the cluster network health through a suite of diagnostic tests. The test
includes checking Gaudi port status, running small workloads, and running standard collective operations across multiple systems.
-HHS is capable of running on a Kubernetes cluster or on a baremetal cluster. It is an active scan, which will block other users from training
-on a gaudi systems until the scans are complete. At the end of the scans, HHS produces a CSV report detailing the state of each gaudi card.
+IGHS can run on a Kubernetes cluster or on a bare-metal cluster. It is an active scan, which will block other users from training
+on the Gaudi systems until the scans are complete. At the end of the scans, IGHS produces a CSV report detailing the state of each Gaudi card.
-It is reccomended to run HHS in the below scenarios:
+It is recommended to run IGHS in the following scenarios:
* After a system upgrade/update
* Before running a long-term training
* Pinpointing problematic systems in a cluster if a problem can't be isolated to a single system
-HHS runs a multi-tiered configurable scan:
+IGHS runs a multi-tiered configurable scan:
* Level 1 - Individual System Diagnostics
* Level 2 - Multi-System Communication Diagnostics
@@ -27,10 +27,11 @@ Level 1 focuses on individual Gaudi Cards Health Diagnostics.
| ------------------------- | ---------------------------------------------------------- |
| Gaudi Ports Status | Checks if ports are DOWN |
| Device Acquire Failures | Checks if devices are busy |
+| Device Temperature        | Checks if device temperatures are within an acceptable range |
**2 System Cluster Example**
-Here is an example of running HHS on a 2 system cluster. It identifies the Gaudi Cards that have down links, device acquire issues, and
+Here is an example of running IGHS on a 2-system cluster. It identifies the Gaudi Cards that have down links, device acquire issues, and
flags multi-node communication failures.
| node_id | index | module_id | pci_address | temperature_C | temperature_state_C | device_acquire_fail | down_links | multi_node_fail | missing |
@@ -73,7 +74,7 @@ first round.
**Multi-Node Cluster Example**
-Here is an example of running HHS for 2 rounds and the results gets recorded to `hccl_demo_health_report.csv`. It identifies node pairs that failed the all_reduce test. If "True" is flagged
+Here is an example of running IGHS for 2 rounds, with the results recorded to `hccl_demo_health_report.csv`. It identifies node pairs that failed the all_reduce test. If "True" is flagged
in the multi_node_fail column, then one of the nodes has a communication issue. The list of infected nodes will be printed to
the log as well as the `health_report.csv` multi_node_fail column.
@@ -125,61 +126,61 @@ been tested, such as having missing cards, it is occupied by another session, or
## Setup
-HHS is compatible with python3 default packages and does not require additional packages
+IGHS is compatible with python3 default packages and does not require additional packages
to be installed
If your setup environment requires custom configuration, update the yaml files located in the templates folder. The default template
-relies on storing HHS in a shared file system.
+relies on storing IGHS in a shared file system.
If running on a bare-metal system, install `pdsh` on your system.
Update [config.yaml](config.yaml) to match your system environment.
``` yaml
-# Sets HHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal).
+# Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal).
system-info:
type: "k8s"
# Namespace is only required for k8s settings
- namespace: "habanalabs"
+ namespace: "intelgaudi"
  # Can specify specific systems. For k8s, comment out hostfile to scan the entire cluster
- hostfile: "./hostfile"
+ # hostfile: "./hostfile"
# Bare Metal Configurations
ssh-path: "./ssh"
tcp-interface: "10.3.124.0/24"
-# Image to run Habana Health Screen
+# Image to run Intel Gaudi Health Screen
image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest"
-# Node Label used to identify a Gaudi Node
-gaudi-node-label: "brightcomputing.com/node-category=gaudi"
+# Node Label used to identify an Intel Gaudi Node
+gaudi-node-label: "ighs_label=gaudi"
# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL)
log-level: "DEBUG"
-# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure)
+# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure, Device Temperature)
level-1:
run: true
- timeout_s: 300
+ timeout_s: 150
# Number of times to check Port Status
num-checks-link-state: 10
# Level 2 - Checks All Reduce between node pairs in the cluster.
level-2:
run: true
- timeout_s: 180
+ timeout_s: 130
# Number of times to check Network connections between nodes
num-rounds: 5
```
-To learn the features of HHS, run the below command:
+To learn the features of IGHS, run the command below:
``` bash
python screen.py --help
usage: screen.py [-h] [--initialize] [--screen] [--target-nodes TARGET_NODES]
[--job-id JOB_ID] [--round ROUND] [--config CONFIG]
- [--hhs-check [{node,hccl-demo,none}]] [--node-write-report]
+ [--ighs-check [{node,hccl-demo,none}]] [--node-write-report]
[--node-name NODE_NAME] [--logs-dir LOGS_DIR]
optional arguments:
@@ -191,18 +192,18 @@ optional arguments:
--job-id JOB_ID Needed to identify hccl-demo running log
--round ROUND Needed to identify hccl-demo running round log
--config CONFIG Configuration file for Health Screener
- --hhs-check [{node,hccl-demo,none}]
- Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce
+ --ighs-check [{node,hccl-demo,none}]
+ Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce
                        (HCCL_DEMO between pairs of nodes)
--node-write-report Write Individual Node Health Report
--node-name NODE_NAME Name of Node
--logs-dir LOGS_DIR Output directory of health screen results
```
-To Run HHS, run the below command:
+To run IGHS, run the command below:
``` bash
-# Creates HHS Report and screens clusters for any infected nodes.
+# Creates IGHS Report and screens clusters for any infected nodes.
# Will check Level 1 and 2 by default
python screen.py --initialize --screen
```
@@ -212,11 +213,11 @@ python screen.py --initialize --screen
To run on bare-metal systems, update [config.yaml](config.yaml) to use the bare-metal configuration.
``` yaml
-# Sets HHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal).
+# Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal).
system-info:
type: "bare-metal"
# Namespace is only required for k8s settings
- namespace: "habanalabs"
+ namespace: "intelgaudi"
  # Can specify specific systems. For k8s, comment out hostfile to scan the entire cluster
hostfile: "./hostfile"
@@ -224,26 +225,26 @@ system-info:
ssh-path: "./ssh"
tcp-interface: "10.3.124.0/24"
-# Image to run Habana Health Screen
+# Image to run Intel Gaudi Health Screen
image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest"
-# Node Label used to identify a Gaudi Node
+# Node Label used to identify an Intel Gaudi Node
gaudi-node-label: "brightcomputing.com/node-category=gaudi"
# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL)
log-level: "DEBUG"
-# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure)
+# Level 1 - Checks Individual Node Health (Ports status, Device Acquire failure, Device Temperature)
level-1:
run: true
- timeout_s: 300
+ timeout_s: 150
# Number of times to check Port Status
num-checks-link-state: 10
# Level 2 - Checks All Reduce between node pairs in the cluster.
level-2:
run: true
- timeout_s: 180
+ timeout_s: 130
# Number of times to check Network connections between nodes
num-rounds: 5
```
@@ -252,9 +253,9 @@ Before running the screening test, you need to generate the ssh key used for pas
``` bash
# Keys to setup initial bare-metal passwordless ssh connection between systems
-ssh-keygen -t rsa -f ssh/hhs_rsa
-chmod 600 ssh/hhs_rsa;
-chmod 644 ssh/hhs_rsa.pub;
+ssh-keygen -t rsa -f ssh/ighs_rsa
+chmod 600 ssh/ighs_rsa;
+chmod 644 ssh/ighs_rsa.pub;
# Keys to setup containers passwordless ssh connection
ssh-keygen -t rsa -f template/bare-metal/ssh/id_rsa
diff --git a/utils/habana_health_screen/config.yaml b/utils/intel_gaudi_health_screen/config.yaml
similarity index 69%
rename from utils/habana_health_screen/config.yaml
rename to utils/intel_gaudi_health_screen/config.yaml
index cc984e5..34f8c88 100644
--- a/utils/habana_health_screen/config.yaml
+++ b/utils/intel_gaudi_health_screen/config.yaml
@@ -1,8 +1,8 @@
-# Sets HHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal). k8s does not require any system info
+# Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal). k8s does not require any system info
system-info:
type: "k8s"
# Namespace is only required for k8s settings
- namespace: "habanalabs"
+ namespace: "intelgaudi"
  # Can specify specific systems. For k8s, comment out hostfile to scan the entire cluster
# hostfile: "./hostfile"
@@ -11,25 +11,25 @@ system-info:
ssh-path: "./ssh"
tcp-interface: "10.3.124.0/24"
-# Image to run Habana Health Screen
+# Image to run Intel Gaudi Health Screen
image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest"
-# Node Label used to identify a Gaudi Node
-gaudi-node-label: "hhs_label=gaudi"
+# Node Label used to identify an Intel Gaudi Node
+gaudi-node-label: "ighs_label=gaudi"
# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL)
log-level: "DEBUG"
-# Level 1 - Checks Individual Node Health (Ports status, Device Busy, Device Acquire failure)
+# Level 1 - Checks Individual Node Health (Ports status, Device Busy, Device Acquire failure, Device Temperature)
level-1:
run: true
- timeout_s: 300
+ timeout_s: 150
# Number of times to check Port Status
num-checks-link-state: 12
# Level 2 - Checks All Reduce between node pairs in the cluster.
level-2:
run: true
- timeout_s: 100
+ timeout_s: 130
# Number of times to check Network connections between nodes
num-rounds: 5
\ No newline at end of file
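
For reference, screen.py (its hunks appear further down in this diff) reads this file with `yaml.safe_load`, so the renamed keys above are ordinary nested dictionary lookups. A minimal sketch, assuming the config.yaml shown here:

``` python
import yaml

with open("config.yaml", "r") as f:
    config_data = yaml.safe_load(f)

# Values taken from the renamed config above.
namespace = config_data["system-info"]["namespace"]             # "intelgaudi"
level1_timeout = config_data["level-1"]["timeout_s"]            # 150
link_checks = config_data["level-1"]["num-checks-link-state"]   # 12
print(namespace, level1_timeout, link_checks)
```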
diff --git a/utils/habana_health_screen/hccl_demo_helper.py b/utils/intel_gaudi_health_screen/hccl_demo_helper.py
similarity index 97%
rename from utils/habana_health_screen/hccl_demo_helper.py
rename to utils/intel_gaudi_health_screen/hccl_demo_helper.py
index 3525ac9..7868178 100644
--- a/utils/habana_health_screen/hccl_demo_helper.py
+++ b/utils/intel_gaudi_health_screen/hccl_demo_helper.py
@@ -13,7 +13,7 @@
import random, math, os, yaml, glob
import logging
-_logger = logging.getLogger("habana_health_screener")
+_logger = logging.getLogger("health_screener")
def find_groups(nodes_to_test, groups_tracker):
""" Find a list of node groups to run hccl_demo all reduce test
@@ -116,7 +116,7 @@ def gather_hccl_logs(job_path, round, log_dir, health_report):
job_path (str): Base directory of job yamls executed
round (int): Round to retrieve HCCL_Demo logs
log_dir (str): Base directory of HCCL_Demo logs
- health_report (HabanaHealthReport): Tracks and reports health of hccl_demo
+ health_report (HealthReport): Tracks and reports health of hccl_demo
"""
path = f"{job_path}/**/r{round}/*.yaml"
job_files = glob.glob(path, recursive=True)
@@ -159,7 +159,7 @@ def hccl_demo_check(job_id, target_nodes, health_report, write=True):
Args:
job_id (str): Metadata name of the Job
target_nodes ([str]): Nodes that are used in hccl_demo testing
- health_report (HabanaHealthReport): Tracks and reports health of hccl_demo
+ health_report (HealthReport): Tracks and reports health of hccl_demo
        write (bool, optional): Writes to Report. Used to collect hccl results and update Base Health Report. Defaults to True
Returns:
diff --git a/utils/habana_health_screen/hostfile b/utils/intel_gaudi_health_screen/hostfile
similarity index 100%
rename from utils/habana_health_screen/hostfile
rename to utils/intel_gaudi_health_screen/hostfile
diff --git a/utils/habana_health_screen/run_hhs.sh b/utils/intel_gaudi_health_screen/run_hhs.sh
similarity index 100%
rename from utils/habana_health_screen/run_hhs.sh
rename to utils/intel_gaudi_health_screen/run_hhs.sh
diff --git a/utils/habana_health_screen/screen.py b/utils/intel_gaudi_health_screen/screen.py
similarity index 79%
rename from utils/habana_health_screen/screen.py
rename to utils/intel_gaudi_health_screen/screen.py
index 9ad8eb2..9a0b3e2 100644
--- a/utils/habana_health_screen/screen.py
+++ b/utils/intel_gaudi_health_screen/screen.py
@@ -10,16 +10,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import os, datetime, shutil, yaml, sys
+import os, datetime, yaml, sys
import argparse
import logging
-from utilities import download_repos, clear_hhs_pods, create_logger, get_logging_level
+from utilities import download_repos, clear_ighs_pods, create_logger, get_logging_level
from hccl_demo_helper import hccl_demo_check
from system_utils import KubeUtils, BareMetalUtils
-from HabanaHealthReport import HabanaHealthReport
-from HNodes import HNodes, HNode
+from HealthReport import HealthReport
+from IGNodes import IGNodes, IGNode
_logger = None
@@ -36,22 +36,22 @@ def main(args):
args.logs_dir = f"logs/{date_year_format}/{date_format}/{date_format}_{time_format}"
- hhs_report_name = "health_report.csv"
- hhs_log_dir = args.logs_dir
+ ighs_report_name = "health_report.csv"
+ ighs_log_dir = args.logs_dir
if args.node_name:
- hhs_level = os.environ["HHS_LEVEL"]
- hhs_report_name = f"health_report_{args.node_name}.csv"
- hhs_log_dir = f"{args.logs_dir}/L{hhs_level}"
+ ighs_level = os.environ["IGHS_LEVEL"]
+ ighs_report_name = f"health_report_{args.node_name}.csv"
+ ighs_log_dir = f"{args.logs_dir}/L{ighs_level}"
- health_report = HabanaHealthReport(f_dir=hhs_log_dir, report_name=hhs_report_name)
+ health_report = HealthReport(f_dir=ighs_log_dir, report_name=ighs_report_name)
job_path = "tmp/jobs"
with open(args.config, 'r') as f:
config_data = yaml.safe_load(f)
log_level = get_logging_level(config_data["log-level"])
- _logger, _ = create_logger(logger_name="habana_health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level)
+ _logger, _ = create_logger(logger_name="health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level)
if config_data["system-info"]["type"] == "k8s":
system_mode = KubeUtils(image=config_data["image"],
@@ -82,8 +82,8 @@ def main(args):
if args.screen:
start_time = datetime.datetime.now()
- habana_nodes = HNodes(health_report=health_report)
- habana_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"])
+ intel_gaudi_nodes = IGNodes(health_report=health_report)
+ intel_gaudi_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"])
if config_data["level-1"]["run"]:
_logger.info("Running Level 1 Checks: Card Diagnostics")
@@ -91,14 +91,14 @@ def main(args):
os.makedirs(f"{health_report.f_dir}/L1")
system_mode.initialize_node_jobs(level=1,
- nodes=habana_nodes,
+ nodes=intel_gaudi_nodes,
job_base_path=job_path)
- healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_hhs_status(level=1,
- nodes=habana_nodes,
+ healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_ighs_status(level=1,
+ nodes=intel_gaudi_nodes,
timeout_s=config_data["level-1"]["timeout_s"])
system_mode.diagnose_unhealthy_nodes(infected_nodes, missing_nodes)
- system_mode.clear_hhs_pods()
+ system_mode.clear_ighs_pods()
if config_data["level-2"]["run"]:
_logger.info("Running Level 2 Checks: Pair HCCL_DEMO All Reduce")
@@ -107,16 +107,16 @@ def main(args):
for i in range(config_data["level-2"]["num-rounds"]):
system_mode.initialize_node_jobs(level=2,
- nodes=habana_nodes,
+ nodes=intel_gaudi_nodes,
job_base_path=job_path,
round=i)
- healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_hhs_status(level=2,
- nodes=habana_nodes,
+ healthy_nodes, infected_nodes, missing_nodes = system_mode.monitor_ighs_status(level=2,
+ nodes=intel_gaudi_nodes,
timeout_s=config_data["level-2"]["timeout_s"],
round=i)
system_mode.diagnose_unhealthy_nodes(infected_nodes, missing_nodes)
- system_mode.clear_hhs_pods(job_type="mpijobs")
+ system_mode.clear_ighs_pods(job_type="mpijobs")
if len(infected_nodes) == 0:
_logger.info(f"Round {i}/{config_data['level-2']['num-rounds']}: No Infected Nodes found. Exit screening early.")
@@ -127,13 +127,13 @@ def main(args):
diff_time = (end_time - start_time)
_logger.info(f"Total Run Time: {diff_time}")
- if args.hhs_check == "node":
- node = HNode(health_report=health_report,
+ if args.ighs_check == "node":
+ node = IGNode(health_report=health_report,
num_checks_link_state=config_data["level-1"]["num-checks-link-state"],
log_level=log_level)
node.scan_cards()
node.health_check(write_report=args.node_write_report)
- elif args.hhs_check == "hccl-demo":
+ elif args.ighs_check == "hccl-demo":
health_report.create(create_base=False, create_hccl_demo=True)
target_nodes = args.target_nodes.strip("[']").replace("'","").split(',')
@@ -149,8 +149,8 @@ def main(args):
parser.add_argument("--job-id", type=str, default="", help="Needed to identify hccl-demo running log")
parser.add_argument("--round", type=str, default="", help="Needed to identify hccl-demo running round log")
parser.add_argument("--config", type=str, default="config.yaml", help="Configuration file for Health Screener")
- parser.add_argument("--hhs-check", default="none", const="none", nargs="?", choices=["node", "hccl-demo", "none"],
- help="Check HHS Status for Node (Ports status, Device Acquire Fail) or all_reduce (HCCL_DEMO between paris of nodes)")
+ parser.add_argument("--ighs-check", default="none", const="none", nargs="?", choices=["node", "hccl-demo", "none"],
+                        help="Check IGHS Status for Node (Ports status, Device Acquire Fail, Device Temperature) or all_reduce (HCCL_DEMO between pairs of nodes)")
parser.add_argument("--node-write-report", action="store_true", help="Write Individual Node Health Report")
parser.add_argument("--node-name", type=str, default="", help="Name of Node")
diff --git a/utils/habana_health_screen/system_utils.py b/utils/intel_gaudi_health_screen/system_utils.py
similarity index 80%
rename from utils/habana_health_screen/system_utils.py
rename to utils/intel_gaudi_health_screen/system_utils.py
index 2d530c3..76363d8 100644
--- a/utils/habana_health_screen/system_utils.py
+++ b/utils/intel_gaudi_health_screen/system_utils.py
@@ -17,12 +17,12 @@
from hccl_demo_helper import find_groups, gather_hccl_logs
-_logger = logging.getLogger("habana_health_screener")
+_logger = logging.getLogger("health_screener")
class SystemUtils():
- def __init__(self, image, log_dir, remote_path="/tmp/hhs"):
+ def __init__(self, image, log_dir, remote_path="/tmp/ighs"):
self.job_path = "tmp/jobs"
self.image = image
self.log_dir = log_dir
@@ -47,7 +47,7 @@ def extract_host(self, hostfile):
return hosts
- def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True):
+ def monitor_ighs_status(self, level, nodes, timeout_s=240, round=0, monitor=True):
is_finished = False
attempt = 0
max_attempts = (timeout_s // 10) + min(timeout_s % 10, 1)
@@ -58,7 +58,7 @@ def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True)
else:
num_nodes = len(nodes.all_nodes)
- _logger.info(f"Checking HHS Level {level} Status")
+ _logger.info(f"Checking IGHS Level {level} Status")
if monitor:
for attempt in range(max_attempts):
@@ -85,7 +85,7 @@ def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True)
if len(nodes.launcher_nodes) > 0:
hosts = nodes.launcher_nodes
- nodes.health_report.gather_health_report(level, remote_path="/tmp/hhs", hosts=hosts)
+ nodes.health_report.gather_health_report(level, remote_path="/tmp/ighs", hosts=hosts)
nodes.health_report.consolidate_health_report(level=level, report_dir=f"{self.log_dir}")
if level == 1:
@@ -127,8 +127,8 @@ def __init__(self, image, hostfile, namespace, log_dir):
self.hostfile = hostfile
def initialize_system(self):
- self.clear_hhs_pods()
- self.clear_hhs_pods(job_type="mpijobs")
+ self.clear_ighs_pods()
+ self.clear_ighs_pods(job_type="mpijobs")
self.clear_jobs()
def collect_nodes(self, gaudi_node_label):
@@ -158,14 +158,14 @@ def initialize_node_jobs(self, level,
}
if level == 1:
- source_f = "template/k8s/pt-habana-health-screen-L1.yaml"
+ source_f = "template/k8s/intel-gaudi-health-screen-L1.yaml"
update_val["num-nodes"] = len(nodes.all_nodes)
update_val["target-nodes"] = nodes.all_nodes
node_groups = nodes.all_nodes
job_path = f"{job_base_path}/L1"
yaml_type = "job"
elif level == 2:
- source_f = "template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml"
+ source_f = "template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml"
yaml_type = "mpijob"
if len(nodes.healthy_nodes) > 0:
@@ -178,11 +178,11 @@ def initialize_node_jobs(self, level,
for i, node_group in enumerate(node_groups):
if level == 1:
- update_val["metadata-name"] = f"hhs-{node_group}"
+ update_val["metadata-name"] = f"ighs-{node_group}"
update_val["target-nodes"] = [node_group]
out_file = f"{node_group}.yaml"
elif level == 2:
- update_val["metadata-name"] = f"hhs-hccl-r{round}-{i}"
+ update_val["metadata-name"] = f"ighs-hccl-r{round}-{i}"
update_val["target-nodes"] = node_group
update_val["num-nodes"] = len(node_group)
out_file = f"{update_val['metadata-name']}.yaml"
@@ -200,7 +200,7 @@ def initialize_node_jobs(self, level,
def update_yaml_job(self, update_val={},
- source_file="template/k8s/pt-habana-health-screen-L1.yaml",
+ source_file="template/k8s/intel-gaudi-health-screen-L1.yaml",
out_dir="tmp/jobs",
out_file="default.yaml",
yaml_type="job"):
@@ -252,21 +252,21 @@ def update_yaml_job(self, update_val={},
return out_f
- def clear_hhs_pods(self, job_type="jobs"):
- """ Clear Pods with label=hhs,hhs-hccl
+ def clear_ighs_pods(self, job_type="jobs"):
+ """ Clear Pods with label=ighs,ighs-hccl
Args:
job_type (str, optional): Type of Job to delete. Options: [jobs, mpijobs]. Defaults to "jobs".
"""
- _logger.info(f"Checking for existing HHS Pods ({job_type})")
+ _logger.info(f"Checking for existing IGHS Pods ({job_type})")
- metadata_app = "hhs" if (job_type == "jobs") else "hhs-hccl"
+ metadata_app = "ighs" if (job_type == "jobs") else "ighs-hccl"
cmd = f"kubectl get pods -n {self.namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
output = run_cmd(cmd).strip()
if len(output) > 0:
- _logger.info(f"Found existing HHS Pods ({job_type}). Will delete.")
+ _logger.info(f"Found existing IGHS Pods ({job_type}). Will delete.")
cmd = f"kubectl get {job_type} -n {self.namespace} -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
output = run_cmd(cmd).strip()
@@ -338,8 +338,8 @@ def __init__(self,
ssh_path,
tcp_interface,
log_dir,
- docker_compose_f="template/pt-hhs-docker-compose-L1.yaml"):
- super().__init__(image, log_dir, remote_path="/tmp/hhs")
+ docker_compose_f="template/intel-gaudi-docker-compose-L1.yaml"):
+ super().__init__(image, log_dir, remote_path="/tmp/ighs")
self.hostfile = hostfile
self.ssh_path = ssh_path
@@ -361,20 +361,20 @@ def initialize_ssh(self):
cmd = f"ssh-agent -s"
output = run_cmd(cmd)
- _logger.debug("Adding hhs private key to ssh-agent")
- cmd = f"ssh-add {self.ssh_path}/hhs_rsa"
+ _logger.debug("Adding ighs private key to ssh-agent")
+ cmd = f"ssh-add {self.ssh_path}/ighs_rsa"
output = run_cmd(cmd)
def initialize_system(self):
- self.clear_hhs_pods()
- self.clear_hhs_pods(job_type="mpijobs")
+ self.clear_ighs_pods()
+ self.clear_ighs_pods(job_type="mpijobs")
self.clear_jobs()
self.clear_remote_jobs()
_logger.info(f"Setting up ssh connection for hosts: {self.hosts}")
for h in self.hosts:
- cmd = f"ssh-copy-id -o StrictHostKeyChecking=no -i {self.ssh_path}/hhs_rsa.pub {os.environ['USER']}@{h}"
+ cmd = f"ssh-copy-id -o StrictHostKeyChecking=no -i {self.ssh_path}/ighs_rsa.pub {os.environ['USER']}@{h}"
output = run_cmd(cmd)
self.initialize_ssh()
@@ -406,7 +406,7 @@ def initialize_node_jobs(self, level,
job_path = f"{job_base_path}/L1"
elif level == 2:
if len(nodes.healthy_nodes) > 0:
- nodes_to_test = [n.replace("hhs-","").replace(":48","") for n in nodes.healthy_nodes]
+ nodes_to_test = [n.replace("ighs-","").replace(":48","") for n in nodes.healthy_nodes]
else:
nodes_to_test = nodes.all_nodes.copy()
@@ -426,24 +426,24 @@ def initialize_node_jobs(self, level,
copy_files(src="tmp/jobs", dst=f"{self.remote_path}", hosts=update_val["target-nodes"])
copy_files(src="template/bare-metal/dockerfile", dst=f"{self.remote_path}/jobs/L1", hosts=update_val["target-nodes"])
copy_files(src="./ssh", dst=f"{self.remote_path}/jobs/L1", hosts=update_val["target-nodes"])
- copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/habana_health_screen", hosts=update_val["target-nodes"])
+ copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/intel_gaudi_health_screen", hosts=update_val["target-nodes"])
elif level == 2:
- update_val["metadata-name"] = f"hhs-hccl-r{round}-{i}"
+ update_val["metadata-name"] = f"ighs-hccl-r{round}-{i}"
update_val["target-nodes"] = node_group
update_val["master-node"] = node_group[0]
update_val["num-nodes"] = len(node_group)
- self.update_yaml_job(source_file="template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml",
+ self.update_yaml_job(source_file="template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml",
update_val=update_val,
out_dir=job_path,
- out_file=f"pt-hhs-docker-compose-L2-launcher.yaml",
+ out_file=f"intel-gaudi-docker-compose-L2-launcher.yaml",
yaml_type="mpijob_launcher")
- self.update_yaml_job(source_file="template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml",
+ self.update_yaml_job(source_file="template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml",
update_val=update_val,
out_dir=job_path,
- out_file=f"pt-hhs-docker-compose-L2-worker.yaml",
+ out_file=f"intel-gaudi-docker-compose-L2-worker.yaml",
yaml_type="mpijob_worker")
nodes.launcher_nodes.append(node_group[0])
nodes.worker_nodes.extend(node_group[1:])
@@ -451,13 +451,13 @@ def initialize_node_jobs(self, level,
copy_files(src="tmp/jobs", dst=f"{self.remote_path}", hosts=update_val["target-nodes"])
copy_files(src="template/bare-metal/dockerfile", dst=f"{self.remote_path}/jobs/L2/r{round}", hosts=update_val["target-nodes"])
copy_files(src="template/bare-metal/ssh", dst=f"{self.remote_path}/jobs/L2/r{round}", hosts=update_val["target-nodes"])
- copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/habana_health_screen", hosts=update_val["target-nodes"])
+ copy_files(src="tmp/config.yaml", dst=f"{self.remote_path}/intel_gaudi_health_screen", hosts=update_val["target-nodes"])
_logger.info(f"Launching Level {level} Jobs at {job_path}")
if level == 1:
- cmd = f"{self.docker_compose_cmd} -f {self.remote_path}/jobs/L1/pt-hhs-docker-compose-L1.yaml up"
+ cmd = f"{self.docker_compose_cmd} -f {self.remote_path}/jobs/L1/intel-gaudi-docker-compose-L1.yaml up"
output = run_cmd(cmd).strip()
elif level == 2:
with open(f"{job_base_path}/L2/r{round}/hostfile_launchers", mode='wt', encoding='utf-8') as f:
@@ -466,50 +466,50 @@ def initialize_node_jobs(self, level,
f.write('\n'.join(nodes.worker_nodes))
cmd_list = [
- f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-worker.yaml build",
- f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-worker.yaml up -d --remove-orphans",
- f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-launcher.yaml build",
- f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/pt-hhs-docker-compose-L2-launcher.yaml up --remove-orphans"
+ f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/intel-gaudi-docker-compose-L2-worker.yaml build",
+ f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_workers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/intel-gaudi-docker-compose-L2-worker.yaml up -d --remove-orphans",
+ f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/intel-gaudi-docker-compose-L2-launcher.yaml build",
+ f"pdsh -w ^{job_base_path}/L2/r{round}/hostfile_launchers {self.docker_compose_alias} -f {self.remote_path}/jobs/L2/r{round}/intel-gaudi-docker-compose-L2-launcher.yaml up --remove-orphans"
]
for cmd in cmd_list:
output = run_cmd(cmd).strip()
def update_yaml_job(self,
- source_file="template/bare-metal/pt-hhs-docker-compose-L1.yaml",
+ source_file="template/bare-metal/intel-gaudi-docker-compose-L1.yaml",
out_dir="tmp/jobs",
- out_file="pt-hhs-docker-compose-L1.yaml",
+ out_file="intel-gaudi-docker-compose-L1.yaml",
update_val={},
yaml_type="job"):
with open(source_file, 'r') as f:
template_data = yaml.safe_load(f)
if yaml_type == "job":
- template_data["services"]["hhs_level1"]["build"]["args"]["BASE_IMAGE"] = self.image
+ template_data["services"]["ighs_level1"]["build"]["args"]["BASE_IMAGE"] = self.image
- template_data["services"]["hhs_level1"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
- template_data["services"]["hhs_level1"]["environment"].append(f"LOG_DIR={self.log_dir}")
+ template_data["services"]["ighs_level1"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
+ template_data["services"]["ighs_level1"]["environment"].append(f"LOG_DIR={self.log_dir}")
elif yaml_type == "mpijob_launcher":
- template_data["services"]["hhs_level2_launcher"]["build"]["args"]["BASE_IMAGE"] = self.image
-
- template_data["services"]["hhs_level2_launcher"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
- template_data["services"]["hhs_level2_launcher"]["environment"].append(f"LOG_DIR={self.log_dir}")
- template_data["services"]["hhs_level2_launcher"]["environment"].append(f"ROUND=r{update_val['round']}")
- template_data["services"]["hhs_level2_launcher"]["environment"].append(f"NUM_NODES={update_val['num-nodes']}")
- template_data["services"]["hhs_level2_launcher"]["environment"].append(f'TARGET_NODES={",".join(update_val["target-nodes"])}')
- template_data["services"]["hhs_level2_launcher"]["environment"].append(f"MASTER_ADDR={update_val['master-node']}")
- template_data["services"]["hhs_level2_launcher"]["environment"].append(f"TCP_INTERFACE={self.tcp_interface}")
- template_data["services"]["hhs_level2_launcher"]["environment"].append(f"JOB_ID={update_val['metadata-name']}")
+ template_data["services"]["ighs_level2_launcher"]["build"]["args"]["BASE_IMAGE"] = self.image
+
+ template_data["services"]["ighs_level2_launcher"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
+ template_data["services"]["ighs_level2_launcher"]["environment"].append(f"LOG_DIR={self.log_dir}")
+ template_data["services"]["ighs_level2_launcher"]["environment"].append(f"ROUND=r{update_val['round']}")
+ template_data["services"]["ighs_level2_launcher"]["environment"].append(f"NUM_NODES={update_val['num-nodes']}")
+ template_data["services"]["ighs_level2_launcher"]["environment"].append(f'TARGET_NODES={",".join(update_val["target-nodes"])}')
+ template_data["services"]["ighs_level2_launcher"]["environment"].append(f"MASTER_ADDR={update_val['master-node']}")
+ template_data["services"]["ighs_level2_launcher"]["environment"].append(f"TCP_INTERFACE={self.tcp_interface}")
+ template_data["services"]["ighs_level2_launcher"]["environment"].append(f"JOB_ID={update_val['metadata-name']}")
elif yaml_type == "mpijob_worker":
- template_data["services"]["hhs_level2_worker"]["build"]["args"]["BASE_IMAGE"] = self.image
- template_data["services"]["hhs_level2_worker"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
- template_data["services"]["hhs_level2_worker"]["environment"].append(f"LOG_DIR={self.log_dir}")
- template_data["services"]["hhs_level2_worker"]["environment"].append(f"JOB_ID={update_val['metadata-name']}")
+ template_data["services"]["ighs_level2_worker"]["build"]["args"]["BASE_IMAGE"] = self.image
+ template_data["services"]["ighs_level2_worker"]["environment"].append(f"MY_NODE_NAME={update_val['metadata-name']}")
+ template_data["services"]["ighs_level2_worker"]["environment"].append(f"LOG_DIR={self.log_dir}")
+ template_data["services"]["ighs_level2_worker"]["environment"].append(f"JOB_ID={update_val['metadata-name']}")
elif yaml_type == "config":
hostfile = template_data["system-info"]["hostfile"]
ssh_path = template_data["system-info"]["ssh-path"]
- template_data["system-info"]["hostfile"] = f"/tmp/hhs/habana_health_screen/{os.path.basename(hostfile)}"
- template_data["system-info"]["ssh-path"] = f"/tmp/hhs/habana_health_screen/{os.path.basename(ssh_path)}"
+ template_data["system-info"]["hostfile"] = f"/tmp/ighs/intel_gaudi_health_screen/{os.path.basename(hostfile)}"
+ template_data["system-info"]["ssh-path"] = f"/tmp/ighs/intel_gaudi_health_screen/{os.path.basename(ssh_path)}"
out_f = f"{out_dir}/{out_file}"
dir_name = os.path.dirname(out_f)
@@ -521,14 +521,14 @@ def update_yaml_job(self,
_logger.info(f"Created Yaml: {out_f}")
- def monitor_hhs_status(self, level, nodes, timeout_s=240, round=0, monitor=True):
- return super().monitor_hhs_status(level=level, nodes=nodes, timeout_s=timeout_s, round=round, monitor=False)
+ def monitor_ighs_status(self, level, nodes, timeout_s=240, round=0, monitor=True):
+ return super().monitor_ighs_status(level=level, nodes=nodes, timeout_s=timeout_s, round=round, monitor=False)
- def clear_hhs_pods(self, job_type="jobs"):
+ def clear_ighs_pods(self, job_type="jobs"):
work_dir = f"{self.remote_path}/jobs"
if job_type == "jobs":
- cmd = f"{self.docker_compose_cmd} -f {work_dir}/L1/pt-hhs-docker-compose-L1.yaml down"
+ cmd = f"{self.docker_compose_cmd} -f {work_dir}/L1/intel-gaudi-docker-compose-L1.yaml down"
output = run_cmd(cmd).strip()
else:
files = glob.glob(f"{work_dir}/L2/**/*.yaml", recursive=True)
@@ -544,7 +544,7 @@ def clear_hhs_pods(self, job_type="jobs"):
output = run_cmd(cmd).strip()
def clear_remote_jobs(self):
- cmd = f"{self.pdsh_cmd} rm -R /tmp/hhs/jobs/"
+ cmd = f"{self.pdsh_cmd} rm -R /tmp/ighs/jobs/"
output = run_cmd(cmd)
def diagnose_unhealthy_nodes(self, infected_nodes, missing_nodes):
diff --git a/utils/habana_health_screen/template/bare-metal/dockerfile b/utils/intel_gaudi_health_screen/template/bare-metal/dockerfile
similarity index 100%
rename from utils/habana_health_screen/template/bare-metal/dockerfile
rename to utils/intel_gaudi_health_screen/template/bare-metal/dockerfile
diff --git a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L1.yaml b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml
similarity index 56%
rename from utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L1.yaml
rename to utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml
index c7b7f97..fbee7d9 100644
--- a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L1.yaml
+++ b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L1.yaml
@@ -1,28 +1,28 @@
version: '3.3'
services:
- hhs_level1:
- image: hhs_level1
+ ighs_level1:
+ image: ighs_level1
build:
context: .
network: host
args:
BASE_IMAGE: "${BASE_IMAGE}"
- container_name: hhs_level1
+ container_name: ighs_level1
runtime: habana
environment:
- HABANA_VISIBLE_DEVICES=all
- OMPI_MCA_btl_vader_single_copy_mechanism=none
- - HHS_LEVEL=1
+ - IGHS_LEVEL=1
cap_add:
- SYS_NICE
ipc: host
network_mode: host
- working_dir: /tmp/hhs/habana_health_screen
+ working_dir: /tmp/ighs/intel_gaudi_health_screen
volumes:
- ./ssh:/root/.ssh/
- - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen
+ - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen
- /etc/localtime:/etc/localtime:ro
command: >
- bash -c "python screen.py --hhs-check node --logs-dir $${LOG_DIR} --node-name $${MY_NODE_NAME} --node-write-report && \
+ bash -c "python screen.py --ighs-check node --logs-dir $${LOG_DIR} --node-name $${MY_NODE_NAME} --node-write-report && \
chmod 777 -R $${LOG_DIR}"
diff --git a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml
similarity index 65%
rename from utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml
rename to utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml
index b19c303..454550e 100644
--- a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-launcher.yaml
+++ b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-launcher.yaml
@@ -1,27 +1,27 @@
version: '3.3'
services:
- hhs_level2_launcher:
- image: hhs_level2
+ ighs_level2_launcher:
+ image: ighs_level2
build:
context: .
network: host
args:
BASE_IMAGE: "${BASE_IMAGE}"
- container_name: hhs_level2_launcher
+ container_name: ighs_level2_launcher
runtime: habana
environment:
- HABANA_VISIBLE_DEVICES=all
- OMPI_MCA_btl_vader_single_copy_mechanism=none
- - HHS_LEVEL=2
+ - IGHS_LEVEL=2
cap_add:
- SYS_NICE
ipc: host
network_mode: host
- working_dir: /tmp/hhs/habana_health_screen
+ working_dir: /tmp/ighs/intel_gaudi_health_screen
volumes:
- ./ssh:/root/.ssh/
- - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen
+ - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen
- /etc/localtime:/etc/localtime:ro
command: >
template/bare-metal/run_hccl_demo.sh
\ No newline at end of file
diff --git a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml
similarity index 63%
rename from utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml
rename to utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml
index a8f6c6a..8a99927 100644
--- a/utils/habana_health_screen/template/bare-metal/pt-hhs-docker-compose-L2-worker.yaml
+++ b/utils/intel_gaudi_health_screen/template/bare-metal/intel-gaudi-docker-compose-L2-worker.yaml
@@ -1,26 +1,26 @@
version: '3.3'
services:
- hhs_level2_worker:
- image: hhs_level2
+ ighs_level2_worker:
+ image: ighs_level2
build:
context: .
network: host
args:
BASE_IMAGE: "${BASE_IMAGE}"
- container_name: hhs_level2_worker
+ container_name: ighs_level2_worker
runtime: habana
environment:
- HABANA_VISIBLE_DEVICES=all
- OMPI_MCA_btl_vader_single_copy_mechanism=none
- - HHS_LEVEL=2
+ - IGHS_LEVEL=2
cap_add:
- SYS_NICE
ipc: host
network_mode: host
- working_dir: /tmp/hhs/habana_health_screen
+ working_dir: /tmp/ighs/intel_gaudi_health_screen
volumes:
- ./ssh:/root/.ssh/
- - /tmp/hhs/habana_health_screen:/tmp/hhs/habana_health_screen
+ - /tmp/ighs/intel_gaudi_health_screen:/tmp/ighs/intel_gaudi_health_screen
- /etc/localtime:/etc/localtime:ro
tty: true
diff --git a/utils/habana_health_screen/template/bare-metal/run_hccl_demo.sh b/utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh
similarity index 70%
rename from utils/habana_health_screen/template/bare-metal/run_hccl_demo.sh
rename to utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh
index b772ebf..5d51d58 100644
--- a/utils/habana_health_screen/template/bare-metal/run_hccl_demo.sh
+++ b/utils/intel_gaudi_health_screen/template/bare-metal/run_hccl_demo.sh
@@ -1,8 +1,8 @@
#!/bin/bash
NUM_NODES="${NUM_NODES:-1}";
-HOME_DIR="${HOME_DIR:-/tmp/hhs/habana_health_screen}";
-WORK_DIR="${WORK_DIR:-/tmp/hhs/habana_health_screen/build/hccl_demo}";
+HOME_DIR="${HOME_DIR:-/tmp/ighs/intel_gaudi_health_screen}";
+WORK_DIR="${WORK_DIR:-/tmp/ighs/intel_gaudi_health_screen/build/hccl_demo}";
NGPU_PER_NODE=8;
N_CARDS=$((NUM_NODES*NGPU_PER_NODE));
@@ -32,9 +32,9 @@ $CMD \
-x MASTER_ADDR \
-x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \
-x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \
-2>&1 | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;
+2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;
cd ${HOME_DIR};
-python $HOME_DIR/screen.py --hhs-check hccl-demo --logs-dir $LOG_DIR --job-id $JOB_ID --target-nodes $TARGET_NODES --node-name $MY_NODE_NAME;
+python $HOME_DIR/screen.py --ighs-check hccl-demo --logs-dir $LOG_DIR --job-id $JOB_ID --target-nodes $TARGET_NODES --node-name $MY_NODE_NAME;
chmod 777 -R $HOME_DIR/$LOG_DIR
diff --git a/utils/habana_health_screen/template/k8s/pt-habana-health-screen-L1.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml
similarity index 76%
rename from utils/habana_health_screen/template/k8s/pt-habana-health-screen-L1.yaml
rename to utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml
index 3bc647d..0161d65 100644
--- a/utils/habana_health_screen/template/k8s/pt-habana-health-screen-L1.yaml
+++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml
@@ -4,12 +4,12 @@ metadata:
name: template-metadata-name
namespace: default
labels:
- app: hhs
+ app: ighs
spec:
template:
metadata:
labels:
- app: hhs
+ app: ighs
spec:
restartPolicy: "Never"
affinity:
@@ -20,7 +20,7 @@ spec:
- key: kubernetes.io/hostname
operator: In
values:
- - HHS-DUMMY-VAL
+ - IGHS-DUMMY-VAL
volumes:
- name: mydir
hostPath:
@@ -33,18 +33,19 @@ spec:
containers:
- name: template-container-name
image: template-container-image
- workingDir: /habana_health_screen
+ imagePullPolicy: IfNotPresent
+ workingDir: /intel_gaudi_health_screen
command: ["/bin/bash", "-c"]
args:
- >-
- python $HOME_DIR/screen.py --hhs-check node --logs-dir $LOG_DIR --node-write-report;
+ python $HOME_DIR/screen.py --ighs-check node --logs-dir $LOG_DIR --node-write-report;
volumeMounts:
- name: mydir
- mountPath: /habana_health_screen
+ mountPath: /intel_gaudi_health_screen
env:
- name: HOME_DIR
- value: "/habana_health_screen"
- - name: HHS_LEVEL
+ value: "/intel_gaudi_health_screen"
+ - name: IGHS_LEVEL
value: "1"
- name: MY_POD_IP
valueFrom:
@@ -61,7 +62,9 @@ spec:
resources:
limits:
habana.ai/gaudi: 8
+ hugepages-2Mi: 29000Mi
cpu: 95
requests:
habana.ai/gaudi: 8
+ hugepages-2Mi: 29000Mi
cpu: 95
diff --git a/utils/habana_health_screen/template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml
similarity index 80%
rename from utils/habana_health_screen/template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml
rename to utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml
index 11d7b22..3768a1e 100644
--- a/utils/habana_health_screen/template/k8s/pt-habana-health-screen-L2_hccl-demo.yaml
+++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml
@@ -4,7 +4,7 @@ metadata:
name: template-metadata-name
namespace: default
labels:
- app: hhs-hccl
+ app: ighs-hccl
spec:
slotsPerWorker: 8
runPolicy:
@@ -15,7 +15,7 @@ spec:
template:
metadata:
labels:
- app: hhs-hccl
+ app: ighs-hccl
spec:
volumes:
- name: mydir
@@ -24,11 +24,12 @@ spec:
type: Directory
containers:
- image: template-container-image
- name: pt-hhs-launcher
- workingDir: /habana_health_screen
+ name: ighs-launcher
+ imagePullPolicy: IfNotPresent
+ workingDir: /intel_gaudi_health_screen
volumeMounts:
- name: mydir
- mountPath: /habana_health_screen
+ mountPath: /intel_gaudi_health_screen
env:
- name: JOB_ID
valueFrom:
@@ -39,8 +40,8 @@ spec:
fieldRef:
fieldPath: spec.nodeName
- name: HOME_DIR
- value: "/habana_health_screen"
- - name: HHS_LEVEL
+ value: "/intel_gaudi_health_screen"
+ - name: IGHS_LEVEL
value: "2"
command: ["/bin/bash", "-c"]
args:
@@ -73,19 +74,20 @@ spec:
--rank-by core --report-bindings \
--tag-output \
--merge-stderr-to-stdout --prefix $MPI_ROOT \
+ --mca btl_tcp_if_include eth0 \
-x PYTHONPATH="/usr/lib/habanalabs/:$PYTHONPATH" \
-x ENABLE_CONSOLE="true" -x LOG_LEVEL_ALL=4 \
- -x MAX_TIMEOUT=60 2>&1 | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;
+ -x MAX_TIMEOUT=60 2>&1 | ts '[%Y-%m-%d %H:%M:%S]' | tee -a $HOME_DIR/$LOG_DIR/L2/$ROUND/$JOB_ID.log;
cd ${HOME_DIR};
- python ${HOME_DIR}/screen.py --hhs-check hccl-demo --target-nodes $TARGET_NODES --job-id $JOB_ID --logs-dir $LOG_DIR --round $ROUND;
+ python ${HOME_DIR}/screen.py --ighs-check hccl-demo --target-nodes $TARGET_NODES --job-id $JOB_ID --logs-dir $LOG_DIR --round $ROUND;
Worker:
replicas: template-num-nodes
template:
metadata:
labels:
- app: hhs-hccl
+ app: ighs-hccl
spec:
affinity:
nodeAffinity:
@@ -95,7 +97,7 @@ spec:
- key: kubernetes.io/hostname
operator: In
values:
- - HHS-DUMMY-VAL
+ - IGHS-DUMMY-VAL
volumes:
- name: mydir
hostPath:
@@ -110,17 +112,22 @@ spec:
effect: "NoExecute"
containers:
- image: template-container-image
- name: pt-hhs-worker
+ name: ighs-worker
+ imagePullPolicy: IfNotPresent
resources:
limits:
habana.ai/gaudi: 8
+ hugepages-2Mi: 29000Mi
+ cpu: 95
requests:
habana.ai/gaudi: 8
+ hugepages-2Mi: 29000Mi
+ cpu: 95
volumeMounts:
- name: mydir
- mountPath: /habana_health_screen
+ mountPath: /intel_gaudi_health_screen
env:
- - name: HHS_LEVEL
+ - name: IGHS_LEVEL
value: "2"
- name: MY_POD_IP
valueFrom:
diff --git a/utils/habana_health_screen/utilities.py b/utils/intel_gaudi_health_screen/utilities.py
similarity index 91%
rename from utils/habana_health_screen/utilities.py
rename to utils/intel_gaudi_health_screen/utilities.py
index d8e015a..a782d14 100644
--- a/utils/habana_health_screen/utilities.py
+++ b/utils/intel_gaudi_health_screen/utilities.py
@@ -17,7 +17,7 @@
import logging
from logging import handlers
-_logger = logging.getLogger("habana_health_screener")
+_logger = logging.getLogger("health_screener")
def get_logging_level(log_level):
log_level = log_level.lower()
@@ -96,16 +96,11 @@ def run_cmd(cmd, timeout_s=1_800, verbose=False):
return result.stdout
def download_repos():
- """ Download Habana's Setup_and_Install and HCCL_DEMO Repos to assist in health checks
+ """ Download HCCL_DEMO Repo to assist in health checks
"""
if not os.path.exists("build"):
os.makedirs("build")
- if not os.path.exists("build/Setup_and_Install"):
- _logger.info(f"Downloading Setup_and_Install into build/")
- cmd = "git clone https://github.com/HabanaAI/Setup_and_Install.git build/Setup_and_Install"
- run_cmd(cmd)
-
if not os.path.exists("build/hccl_demo"):
_logger.info(f"Downloading hccl_demo into build/")
cmd = "git clone https://github.com/HabanaAI/hccl_demo.git build/hccl_demo"
@@ -168,21 +163,21 @@ def clear_job(job):
time.sleep(10)
-def clear_hhs_pods(job_type="jobs"):
- """ Clear Pods with label=hhs,hhs-hccl
+def clear_ighs_pods(job_type="jobs"):
+ """ Clear Pods with label=ighs,ighs-hccl
Args:
job_type (str, optional): Type of Job to delete. Options: [jobs, mpijobs]. Defaults to "jobs".
"""
- _logger.info(f"Checking for existing HHS Pods ({job_type})")
+ _logger.info(f"Checking for existing IGHS Pods ({job_type})")
- metadata_app = "hhs" if (job_type == "jobs") else "hhs-hccl"
+ metadata_app = "ighs" if (job_type == "jobs") else "ighs-hccl"
cmd = f"kubectl get pods -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
output = run_cmd(cmd).strip()
if len(output) > 0:
- _logger.info(f"Found existing HHS Pods ({job_type}). Will delete.")
+ _logger.info(f"Found existing IGHS Pods ({job_type}). Will delete.")
cmd = f"kubectl get {job_type} -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers"
output = run_cmd(cmd).strip()
diff --git a/utils/intel_gaudi_health_screen/version.txt b/utils/intel_gaudi_health_screen/version.txt
new file mode 100644
index 0000000..359a5b9
--- /dev/null
+++ b/utils/intel_gaudi_health_screen/version.txt
@@ -0,0 +1 @@
+2.0.0
\ No newline at end of file