Skip to content

Commit

Permalink
Auto detect first variant for CPU and MEM (#22)
Browse files Browse the repository at this point in the history
* Auto detect first variant for CPU and MEM

* Switching to base frequency detection

* Added logging and allowed to force auto tune

* Adding detection for freq in name and fallback

* Ignored sublime

* Added TDP and make detection; Fixed --auto mode when no parameter supplied

* Changed error message wording

* Commented-out fluctuating frequency guesser

* No auto calculation of cores

* derived -> found

* Wording

* fixes the freq problem

* Revert "fixes the freq problem"

This reverts commit e4f9820.

---------

Co-authored-by: Didi Hoffmann <[email protected]>
  • Loading branch information
ArneTR and ribalba authored Feb 16, 2024
1 parent 5557f3d commit 3232ae2
Show file tree
Hide file tree
Showing 4 changed files with 245 additions and 21 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
venv
.DS_Store
__pycache__
__pycache__
sftp-config.json
23 changes: 23 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,29 @@ disable=missing-function-docstring,
missing-class-docstring,
too-few-public-methods,
duplicate-code,
too-many-nested-blocks,
line-too-long,
too-many-boolean-expressions,
too-many-nested-blocks,
line-too-long,
protected-access,
too-many-lines,
multiple-statements,
pointless-string-statement,
too-many-locals,
too-many-public-methods,
too-many-branches,
too-many-statements,
too-many-arguments,
too-many-return-statements,
too-many-instance-attributes,
invalid-name,
wrong-import-position,
wrong-import-order,
ungrouped-imports,
fixme



[MASTER]
ignore=env
Expand Down
162 changes: 162 additions & 0 deletions auto_detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# pylint: disable=redefined-outer-name,invalid-name

import subprocess
import re
import logging
import math

def get_cpu_info(logger):
    """Best-effort auto-detection of CPU and memory characteristics on Linux.

    Every probe is wrapped in its own try/except so a failure in one source
    (RAPL sysfs, lscpu, /proc/meminfo) never aborts the others; undetected
    values simply stay None.

    Args:
        logger: logging.Logger used for progress / diagnostic messages.

    Returns:
        dict with keys:
            'freq'    - base frequency in MHz (int) or None
            'threads' - number of hardware threads (int) or None
            'cores'   - total physical cores across all sockets (int) or None
            'tdp'     - package TDP in Watts (float) or None
            'mem'     - total memory in GB, rounded up (int) or None
            'make'    - 'intel' or 'amd', or None
            'chips'   - number of CPU sockets (int) or None
    """

    data = {
        'freq' : None,
        'threads': None,
        'cores': None,
        'tdp': None,
        'mem': None,
        'make': None,
        'chips': None
    }

    # --- TDP via the RAPL powercap sysfs interface ------------------------
    try:
        file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/name'
        with open(file_path, 'r', encoding='UTF-8') as file:
            domain_name = file.read().strip()
            if domain_name != 'package-0':
                raise RuntimeError(f"Domain /sys/class/powercap/intel-rapl/intel-rapl:0/name was not package-0, but {domain_name}")

        file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_name'
        with open(file_path, 'r', encoding='UTF-8') as file:
            constraint_name = file.read().strip()
            if constraint_name != 'long_term':
                raise RuntimeError(f"Constraint /sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_name was not long_term, but {constraint_name}")

        file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_max_power_uw'
        with open(file_path, 'r', encoding='UTF-8') as file:
            tdp = file.read()
        data['tdp'] = int(tdp) / 1_000_000  # value is in micro-Watts

        logger.info('Found TDP: %d W', data['tdp'])
    #pylint: disable=broad-except
    except Exception as err:
        logger.info('Exception: %s', err)
        logger.info('Could not read RAPL powercapping info from /sys/class/powercap/intel-rapl')

    # --- socket count via the RAPL package domains -------------------------
    try:
        # Probe package domains intel-rapl:0 .. intel-rapl:5 in order; the
        # first missing/mismatching domain raises and we keep the last count.
        for chips in range(1, 7):
            file_path = f"/sys/class/powercap/intel-rapl/intel-rapl:{chips-1}/name"
            with open(file_path, 'r', encoding='UTF-8') as file:
                domain_name = file.read().strip()
                if domain_name != f"package-{chips-1}":
                    raise RuntimeError(f"Domain {file_path} was not package-{chips-1}, but {domain_name}")
                logger.info('Found Sockets: %d', chips)
                data['chips'] = chips
    #pylint: disable=broad-except
    except Exception as err:
        logger.info('Exception: %s', err)
        logger.info('Could not find (additional) chips info under file path. Most likely reached final chip. continuing ...')


    # --- threads / cores / sockets / base freq / vendor via lscpu ----------
    try:
        cpuinfo = subprocess.check_output('lscpu', encoding='UTF-8')

        match = re.search(r'On-line CPU\(s\) list:\s*(0-)?(\d+)', cpuinfo)
        if match:
            data['threads'] = int(match.group(2))+1 # +1 because 0 indexed
            logger.info('Found Threads: %d', data['threads'])
        else:
            logger.info('Could not find Threads. Using default None')

        # this will overwrite info we have from RAPL socket discovery, as we
        # deem lscpu more reliable
        match = re.search(r'Socket\(s\):\s*(\d+)', cpuinfo)
        if match:
            data['chips'] = int(match.group(1))
            logger.info('Found Sockets: %d (will take precedence if not 0)', data['chips'])
        else:
            logger.info('Could not find Chips/Sockets via lscpu')

        if data['chips']:
            match = re.search(r'Core\(s\) per socket:\s*(\d+)', cpuinfo)
            if match:
                cores_per_socket = int(match.group(1))
                data['cores'] = cores_per_socket * data['chips']
                logger.info('Found cores: %d ', data['cores'])
            else:
                logger.info('Could not find Cores. Using default None')

        # base frequency is only detectable when the model name advertises
        # it, e.g. "Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz" -> 2600 MHz
        match = re.search(r'Model name:.*@\s*([\d.]+)\s*GHz', cpuinfo)
        if match:
            data['freq'] = int(float(match.group(1))*1000)
            logger.info('Found Frequency: %s', data['freq'])
        else:
            logger.info('Could not find Frequency. Using default None')

        match = re.search(r'Model name:.*Intel\(R\)', cpuinfo)
        if match:
            data['make'] = 'intel'
            logger.info('Found Make: %s', data['make'])

        match = re.search(r'Model name:.*AMD ', cpuinfo)
        if match:
            data['make'] = 'amd'
            logger.info('Found Make: %s', data['make'])


        # we currently do not match for architecture, as this info is provided nowhere

        # we also currently do not match for make, as this info can result in ARM which is currently not supported and
        # would rather lead to confusion
    #pylint: disable=broad-except
    except Exception as err:
        logger.info('Exception: %s', err)
        logger.info('Could not check for CPU info.')


    # NOTE: Reading the live frequency from /proc/cpuinfo is deliberately
    # disabled, as the CPU freq changes rapidly sometimes, making the
    # resulting XGBoost values fluctuate a lot.
    #
    # if not data['freq']:
    #     try:
    #         cpuinfo_proc = subprocess.check_output(['cat', '/proc/cpuinfo'], encoding='UTF-8', stderr=subprocess.DEVNULL)
    #         match = re.findall(r'cpu MHz\s*:\s*([\d.]+)', cpuinfo_proc)
    #         if match:
    #             data['freq'] = round(max(map(float, match)))
    #             logger.info('Found assumed Frequency: %d', data['freq'])
    #         else:
    #             logger.info('Could not find Frequency. Using default None')
    #     #pylint: disable=broad-except
    #     except Exception as err:
    #         logger.info('Exception: %s', err)
    #         logger.info('/proc/cpuinfo not accessible on system. Could not check for Base Frequency info. Setting value to None.')


    # --- total memory via /proc/meminfo ------------------------------------
    try:
        # read the file directly instead of shelling out to `cat`
        with open('/proc/meminfo', 'r', encoding='UTF-8') as file:
            meminfo = file.read()
        match = re.search(r'MemTotal:\s*(\d+) kB', meminfo)
        if match:
            data['mem'] = math.ceil(int(match.group(1)) / 1024 / 1024)  # kB -> GB, rounded up
            logger.info('Found Memory: %d GB', data['mem'])
        else:
            logger.info('Could not find Memory. Using default None')
    #pylint: disable=broad-except
    except Exception as err:
        logger.info('Exception: %s', err)
        logger.info('/proc/meminfo not accessible on system. Could not check for Memory info. Defaulting to None.')

    return data

if __name__ == "__main__":
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

print(get_cpu_info(logger))
78 changes: 58 additions & 20 deletions xgb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,38 @@
import sys
import os
import time
import logging
import platform
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

def train_model(cpu_chips, Z, silent=False):
import auto_detect

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

def train_model(cpu_chips, Z):

df = pd.read_csv(f"{os.path.dirname(os.path.abspath(__file__))}/data/spec_data_cleaned.csv")

X = df.copy()
X = pd.get_dummies(X, columns=['CPUMake', 'Architecture'])

if not silent:
print('Model will be restricted to the following amount of chips:', cpu_chips)
if cpu_chips:
logger.info('Training data will be restricted to the following amount of chips: %d', cpu_chips)

X = X[X.CPUChips == cpu_chips] # Fit a model for every amount of CPUChips

if X.empty:
raise RuntimeError(f"The training data does not contain any servers with a chips amount ({cpu_chips}). Please select a different amount.")

X = X[X.CPUChips == cpu_chips] # Fit a model for every amount of CPUChips
y = X.power

X = X[Z.columns] # only select the supplied columns from the command line

if not silent:
print('Model will be trained on:', X.columns)
logger.info('Model will be trained on the following columns and restrictions: \n%s', Z)

# params = {
# 'max_depth': 10,
Expand Down Expand Up @@ -81,7 +91,7 @@ def interpolate_predictions(predictions):

parser = argparse.ArgumentParser()

parser.add_argument('--cpu-chips', type=int, help='Number of CPU chips', default=1)
parser.add_argument('--cpu-chips', type=int, help='Number of CPU chips')
parser.add_argument('--cpu-freq', type=int, help='CPU frequency')
parser.add_argument('--cpu-threads', type=int, help='Number of CPU threads')
parser.add_argument('--cpu-cores', type=int, help='Number of CPU cores')
Expand All @@ -90,10 +100,12 @@ def interpolate_predictions(predictions):
parser.add_argument('--ram', type=int, help='Amount of DRAM for the bare metal system')
parser.add_argument('--architecture', type=str, help='The architecture of the CPU. lowercase. ex.: haswell')
parser.add_argument('--cpu-make', type=str, help='The make of the CPU (intel or amd)')
parser.add_argument('--auto', action='store_true', help='Force auto detect. Will overwrite supplied arguments')

parser.add_argument('--vhost-ratio',
type=float,
help='Virtualization ratio of the system. Input numbers between (0,1].',
default=1.0
help='Virtualization ratio of the system. Input numbers between (0,1].'

)
parser.add_argument('--silent',
action='store_true',
Expand All @@ -111,6 +123,39 @@ def interpolate_predictions(predictions):

args = parser.parse_args()

if args.silent:
# sadly some libs have future warnings we need to suppress for
# silent mode to work in bash scripts
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
logger.setLevel(logging.WARNING)

args_dict = args.__dict__.copy()
del args_dict['silent']
del args_dict['auto']
del args_dict['energy']

# did the user supply any of the auto detectable arguments?
if not any(args_dict.values()) or args.auto:
logger.info('No arguments where supplied, or auto mode was forced. Running auto detect on the sytem.')

data = auto_detect.get_cpu_info(logger)

logger.info('The following data was auto detected: %s', data)

# only overwrite not already supplied values
args.cpu_freq = args.cpu_freq or data['freq']
args.cpu_threads = args.cpu_threads or data['threads']
args.cpu_cores = args.cpu_cores or data['cores']
args.tdp = args.tdp or data['tdp']
args.ram = args.ram or data['mem']
args.cpu_make = args.cpu_make or data['make']
args.cpu_chips = args.cpu_chips or data['chips']

# set default. We do this here and not in argparse, so we can check if anything was supplied at all
if not args.vhost_ratio:
args.vhost_ratio = 1.0

if platform.system() == 'Darwin' and args.autoinput and args.interval < 0.5:
print('''
Under MacOS the internal values are updated every 0.5 seconds by the kernel if you use the host_statistics call.
Expand All @@ -121,7 +166,6 @@ def interpolate_predictions(predictions):
''')
sys.exit(1)


Z = pd.DataFrame.from_dict({
'HW_CPUFreq' : [args.cpu_freq],
'CPUThreads': [args.cpu_threads],
Expand All @@ -138,18 +182,12 @@ def interpolate_predictions(predictions):

Z = Z.dropna(axis=1)

if args.silent:
# sadly some libs have future warnings we need to suppress for
# silent mode to work in bash scripts
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
else:
print('Sending following dataframe to model:\n', Z)
print('vHost ratio is set to ', args.vhost_ratio)
print('Infering all predictions to dictionary')

trained_model = train_model(args.cpu_chips, Z, args.silent)
logger.info('vHost ratio is set to %s', args.vhost_ratio)

trained_model = train_model(args.cpu_chips, Z)

logger.info('Infering all predictions to dictionary')

inferred_predictions = infer_predictions(trained_model, Z)
interpolated_predictions = interpolate_predictions(inferred_predictions)
Expand Down

0 comments on commit 3232ae2

Please sign in to comment.