Skip to content

Commit

Permalink
Auto detect first variant for CPU and MEM (#22)
Browse files Browse the repository at this point in the history
* Auto detect first variant for CPU and MEM

* Switching to base frequency detection

* Added logging and allowed to force auto tune

* Adding detection for freq in name and fallback

* Ignored sublime

* Added TDP and make detection; Fixed --auto mode when no parameter supplied

* Changed error message wording

* Commented-out fluctuating frequency guesser

* No auto calculation of cores

* derived -> found

* Wording

* fixes the freq problem

* Revert "fixes the freq problem"

This reverts commit e4f9820.

---------

Co-authored-by: Didi Hoffmann <[email protected]>
  • Loading branch information
ArneTR and ribalba authored Feb 16, 2024
1 parent 5557f3d commit 3232ae2
Show file tree
Hide file tree
Showing 4 changed files with 245 additions and 21 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
venv
.DS_Store
__pycache__
__pycache__
sftp-config.json
23 changes: 23 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,29 @@ disable=missing-function-docstring,
missing-class-docstring,
too-few-public-methods,
duplicate-code,
too-many-nested-blocks,
line-too-long,
too-many-boolean-expressions,
too-many-nested-blocks,
line-too-long,
protected-access,
too-many-lines,
multiple-statements,
pointless-string-statement,
too-many-locals,
too-many-public-methods,
too-many-branches,
too-many-statements,
too-many-arguments,
too-many-return-statements,
too-many-instance-attributes,
invalid-name,
wrong-import-position,
wrong-import-order,
ungrouped-imports,
fixme



[MASTER]
ignore=env
Expand Down
162 changes: 162 additions & 0 deletions auto_detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# pylint: disable=redefined-outer-name,invalid-name

import subprocess
import re
import logging
import math

def get_cpu_info(logger):
    """Best-effort auto-detection of CPU and memory characteristics on Linux.

    Every probe is wrapped in its own try/except so a failure in one source
    (RAPL sysfs, lscpu, /proc/meminfo) never aborts the others; undetected
    values simply stay None.

    Args:
        logger: logging.Logger used for progress / diagnostic messages.

    Returns:
        dict with keys:
            'freq'    - base frequency in MHz (int) or None
            'threads' - number of hardware threads (int) or None
            'cores'   - total physical cores across all sockets (int) or None
            'tdp'     - package TDP in Watts (float) or None
            'mem'     - total memory in GB, rounded up (int) or None
            'make'    - 'intel' or 'amd', or None
            'chips'   - number of CPU sockets (int) or None
    """

    data = {
        'freq' : None,
        'threads': None,
        'cores': None,
        'tdp': None,
        'mem': None,
        'make': None,
        'chips': None
    }

    # --- TDP via the RAPL powercap sysfs interface ------------------------
    try:
        file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/name'
        with open(file_path, 'r', encoding='UTF-8') as file:
            domain_name = file.read().strip()
            if domain_name != 'package-0':
                raise RuntimeError(f"Domain /sys/class/powercap/intel-rapl/intel-rapl:0/name was not package-0, but {domain_name}")

        file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_name'
        with open(file_path, 'r', encoding='UTF-8') as file:
            constraint_name = file.read().strip()
            if constraint_name != 'long_term':
                raise RuntimeError(f"Constraint /sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_name was not long_term, but {constraint_name}")

        file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_max_power_uw'
        with open(file_path, 'r', encoding='UTF-8') as file:
            tdp = file.read()
        data['tdp'] = int(tdp) / 1_000_000  # value is in micro-Watts

        logger.info('Found TDP: %d W', data['tdp'])
    #pylint: disable=broad-except
    except Exception as err:
        logger.info('Exception: %s', err)
        logger.info('Could not read RAPL powercapping info from /sys/class/powercap/intel-rapl')

    # --- socket count via the RAPL package domains -------------------------
    try:
        # Probe package domains intel-rapl:0 .. intel-rapl:5 in order; the
        # first missing/mismatching domain raises and we keep the last count.
        for chips in range(1, 7):
            file_path = f"/sys/class/powercap/intel-rapl/intel-rapl:{chips-1}/name"
            with open(file_path, 'r', encoding='UTF-8') as file:
                domain_name = file.read().strip()
                if domain_name != f"package-{chips-1}":
                    raise RuntimeError(f"Domain {file_path} was not package-{chips-1}, but {domain_name}")
                logger.info('Found Sockets: %d', chips)
                data['chips'] = chips
    #pylint: disable=broad-except
    except Exception as err:
        logger.info('Exception: %s', err)
        logger.info('Could not find (additional) chips info under file path. Most likely reached final chip. continuing ...')


    # --- threads / cores / sockets / base freq / vendor via lscpu ----------
    try:
        cpuinfo = subprocess.check_output('lscpu', encoding='UTF-8')

        match = re.search(r'On-line CPU\(s\) list:\s*(0-)?(\d+)', cpuinfo)
        if match:
            data['threads'] = int(match.group(2))+1 # +1 because 0 indexed
            logger.info('Found Threads: %d', data['threads'])
        else:
            logger.info('Could not find Threads. Using default None')

        # this will overwrite info we have from RAPL socket discovery, as we
        # deem lscpu more reliable
        match = re.search(r'Socket\(s\):\s*(\d+)', cpuinfo)
        if match:
            data['chips'] = int(match.group(1))
            logger.info('Found Sockets: %d (will take precedence if not 0)', data['chips'])
        else:
            logger.info('Could not find Chips/Sockets via lscpu')

        if data['chips']:
            match = re.search(r'Core\(s\) per socket:\s*(\d+)', cpuinfo)
            if match:
                cores_per_socket = int(match.group(1))
                data['cores'] = cores_per_socket * data['chips']
                logger.info('Found cores: %d ', data['cores'])
            else:
                logger.info('Could not find Cores. Using default None')

        # base frequency is only detectable when the model name advertises
        # it, e.g. "Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz" -> 2600 MHz
        match = re.search(r'Model name:.*@\s*([\d.]+)\s*GHz', cpuinfo)
        if match:
            data['freq'] = int(float(match.group(1))*1000)
            logger.info('Found Frequency: %s', data['freq'])
        else:
            logger.info('Could not find Frequency. Using default None')

        match = re.search(r'Model name:.*Intel\(R\)', cpuinfo)
        if match:
            data['make'] = 'intel'
            logger.info('Found Make: %s', data['make'])

        match = re.search(r'Model name:.*AMD ', cpuinfo)
        if match:
            data['make'] = 'amd'
            logger.info('Found Make: %s', data['make'])


        # we currently do not match for architecture, as this info is provided nowhere

        # we also currently do not match for make, as this info can result in ARM which is currently not supported and
        # would rather lead to confusion
    #pylint: disable=broad-except
    except Exception as err:
        logger.info('Exception: %s', err)
        logger.info('Could not check for CPU info.')


    # NOTE: Reading the live frequency from /proc/cpuinfo is deliberately
    # disabled, as the CPU freq changes rapidly sometimes, making the
    # resulting XGBoost values fluctuate a lot.
    #
    # if not data['freq']:
    #     try:
    #         cpuinfo_proc = subprocess.check_output(['cat', '/proc/cpuinfo'], encoding='UTF-8', stderr=subprocess.DEVNULL)
    #         match = re.findall(r'cpu MHz\s*:\s*([\d.]+)', cpuinfo_proc)
    #         if match:
    #             data['freq'] = round(max(map(float, match)))
    #             logger.info('Found assumed Frequency: %d', data['freq'])
    #         else:
    #             logger.info('Could not find Frequency. Using default None')
    #     #pylint: disable=broad-except
    #     except Exception as err:
    #         logger.info('Exception: %s', err)
    #         logger.info('/proc/cpuinfo not accessible on system. Could not check for Base Frequency info. Setting value to None.')


    # --- total memory via /proc/meminfo ------------------------------------
    try:
        # read the file directly instead of shelling out to `cat`
        with open('/proc/meminfo', 'r', encoding='UTF-8') as file:
            meminfo = file.read()
        match = re.search(r'MemTotal:\s*(\d+) kB', meminfo)
        if match:
            data['mem'] = math.ceil(int(match.group(1)) / 1024 / 1024)  # kB -> GB, rounded up
            logger.info('Found Memory: %d GB', data['mem'])
        else:
            logger.info('Could not find Memory. Using default None')
    #pylint: disable=broad-except
    except Exception as err:
        logger.info('Exception: %s', err)
        logger.info('/proc/meminfo not accessible on system. Could not check for Memory info. Defaulting to None.')

    return data

if __name__ == "__main__":
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

print(get_cpu_info(logger))
78 changes: 58 additions & 20 deletions xgb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,38 @@
import sys
import os
import time
import logging
import platform
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

def train_model(cpu_chips, Z, silent=False):
import auto_detect

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

def train_model(cpu_chips, Z):

df = pd.read_csv(f"{os.path.dirname(os.path.abspath(__file__))}/data/spec_data_cleaned.csv")

X = df.copy()
X = pd.get_dummies(X, columns=['CPUMake', 'Architecture'])

if not silent:
print('Model will be restricted to the following amount of chips:', cpu_chips)
if cpu_chips:
logger.info('Training data will be restricted to the following amount of chips: %d', cpu_chips)

X = X[X.CPUChips == cpu_chips] # Fit a model for every amount of CPUChips

if X.empty:
raise RuntimeError(f"The training data does not contain any servers with a chips amount ({cpu_chips}). Please select a different amount.")

X = X[X.CPUChips == cpu_chips] # Fit a model for every amount of CPUChips
y = X.power

X = X[Z.columns] # only select the supplied columns from the command line

if not silent:
print('Model will be trained on:', X.columns)
logger.info('Model will be trained on the following columns and restrictions: \n%s', Z)

# params = {
# 'max_depth': 10,
Expand Down Expand Up @@ -81,7 +91,7 @@ def interpolate_predictions(predictions):

parser = argparse.ArgumentParser()

parser.add_argument('--cpu-chips', type=int, help='Number of CPU chips', default=1)
parser.add_argument('--cpu-chips', type=int, help='Number of CPU chips')
parser.add_argument('--cpu-freq', type=int, help='CPU frequency')
parser.add_argument('--cpu-threads', type=int, help='Number of CPU threads')
parser.add_argument('--cpu-cores', type=int, help='Number of CPU cores')
Expand All @@ -90,10 +100,12 @@ def interpolate_predictions(predictions):
parser.add_argument('--ram', type=int, help='Amount of DRAM for the bare metal system')
parser.add_argument('--architecture', type=str, help='The architecture of the CPU. lowercase. ex.: haswell')
parser.add_argument('--cpu-make', type=str, help='The make of the CPU (intel or amd)')
parser.add_argument('--auto', action='store_true', help='Force auto detect. Will overwrite supplied arguments')

parser.add_argument('--vhost-ratio',
type=float,
help='Virtualization ratio of the system. Input numbers between (0,1].',
default=1.0
help='Virtualization ratio of the system. Input numbers between (0,1].'

)
parser.add_argument('--silent',
action='store_true',
Expand All @@ -111,6 +123,39 @@ def interpolate_predictions(predictions):

args = parser.parse_args()

if args.silent:
# sadly some libs have future warnings we need to suppress for
# silent mode to work in bash scripts
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
logger.setLevel(logging.WARNING)

args_dict = args.__dict__.copy()
del args_dict['silent']
del args_dict['auto']
del args_dict['energy']

# did the user supply any of the auto detectable arguments?
if not any(args_dict.values()) or args.auto:
logger.info('No arguments where supplied, or auto mode was forced. Running auto detect on the sytem.')

data = auto_detect.get_cpu_info(logger)

logger.info('The following data was auto detected: %s', data)

# only overwrite not already supplied values
args.cpu_freq = args.cpu_freq or data['freq']
args.cpu_threads = args.cpu_threads or data['threads']
args.cpu_cores = args.cpu_cores or data['cores']
args.tdp = args.tdp or data['tdp']
args.ram = args.ram or data['mem']
args.cpu_make = args.cpu_make or data['make']
args.cpu_chips = args.cpu_chips or data['chips']

# set default. We do this here and not in argparse, so we can check if anything was supplied at all
if not args.vhost_ratio:
args.vhost_ratio = 1.0

if platform.system() == 'Darwin' and args.autoinput and args.interval < 0.5:
print('''
Under MacOS the internal values are updated every 0.5 seconds by the kernel if you use the host_statistics call.
Expand All @@ -121,7 +166,6 @@ def interpolate_predictions(predictions):
''')
sys.exit(1)


Z = pd.DataFrame.from_dict({
'HW_CPUFreq' : [args.cpu_freq],
'CPUThreads': [args.cpu_threads],
Expand All @@ -138,18 +182,12 @@ def interpolate_predictions(predictions):

Z = Z.dropna(axis=1)

if args.silent:
# sadly some libs have future warnings we need to suppress for
# silent mode to work in bash scripts
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
else:
print('Sending following dataframe to model:\n', Z)
print('vHost ratio is set to ', args.vhost_ratio)
print('Infering all predictions to dictionary')

trained_model = train_model(args.cpu_chips, Z, args.silent)
logger.info('vHost ratio is set to %s', args.vhost_ratio)

trained_model = train_model(args.cpu_chips, Z)

logger.info('Infering all predictions to dictionary')

inferred_predictions = infer_predictions(trained_model, Z)
interpolated_predictions = interpolate_predictions(inferred_predictions)
Expand Down

0 comments on commit 3232ae2

Please sign in to comment.