-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_slurm_conf.py
executable file
·213 lines (187 loc) · 7.02 KB
/
get_slurm_conf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python
import subprocess
import re
import itertools
import argparse
import functools
def max_in_col(data, col_id):
    """Return the largest value, as an int, in column col_id of data.

    data is an iterable of rows (tuples or lists). Each row's entry at index
    col_id is converted with int() before the comparison. Raises ValueError
    if data is empty or an entry is not a valid integer.
    """
    column = [int(record[col_id]) for record in data]
    return max(column)
def oneline(s):
    """Converts a multi-line string to one line and removes extra spaces.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space, and leading/trailing whitespace is stripped.
    """
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", s).strip()
def return_on_error(val):
    """Decorator that makes a function return a default value instead of throwing an exception

    Only ordinary errors (subclasses of Exception) are converted into the
    default value. KeyboardInterrupt and SystemExit still propagate so the
    script can be interrupted or exited normally.

    Example:
        @return_on_error("green")
        def foo(x):
            if x:
                return "red"
            else:
                raise Exception("This will be caught and ignored")

        foo(True)   # returns "red"
        foo(False)  # returns "green"
    """
    def val_decorator(func):
        @functools.wraps(func)
        def func_wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception:
                # Deliberate best-effort: any failure yields the default.
                return val
        return func_wrapper
    return val_decorator
def get_short_intel_model_name(model):
    """Shortens a full Intel processor description.

    A description such as "Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz" becomes
    "Xeon-E5-2670". If the description does not have the expected layout, it
    is returned with "CPU" removed and whitespace collapsed, but otherwise
    unchanged.
    """
    # Clean up text before matching
    model = oneline(model.replace("CPU", ""))
    # Raw string so the regex escapes are not treated as (invalid) string escapes.
    m = re.match(r"Intel\(R\) (\w+)\(\w+\) ([\w-]+\s?\w*) @ \d+[.]\d+\w+", model)
    if not m:
        return model
    family = m.group(1)
    number = m.group(2)
    # HACK: cpuinfo sometimes includes a 0 in place of the version number that we don't care about
    if number.endswith(" 0"):
        number = number[:-2]
    number = number.replace(" ", "")
    return family + "-" + number
def get_short_power_model_name(model):
    """Trim surrounding whitespace from model and turn each space into a dash."""
    trimmed = model.strip()
    return "-".join(trimmed.split(" "))
@return_on_error("UNKNOWN")
def get_cpu_model():
    """Return a short name for this machine's CPU model, or "UNKNOWN".

    Scans /proc/cpuinfo for lines starting with "model name" or "machine" and
    shortens the first description that starts with "Intel" or "Power". Any
    other processor description yields "UNKNOWN" rather than None, so the
    result is always a string that callers can join.
    """
    # Read the file directly; spawning `cat` is unnecessary and returns bytes
    # on Python 3.
    with open("/proc/cpuinfo") as cpuinfo:
        for line in cpuinfo:
            # Look for model name
            if not line.startswith(("model name", "machine")):
                continue
            # Grab processor description; split once so a ":" inside the
            # description does not truncate it.
            model = line.split(":", 1)[1]
            if model.startswith(" Intel"):
                return get_short_intel_model_name(model)
            if model.startswith(" Power"):
                return get_short_power_model_name(model)
    return "UNKNOWN"
@return_on_error({"cpus":1, "cores":1, "sockets":1})
def get_cpu_info():
    """Return the number of cpus, cores, and sockets on this machine

    Parses `lscpu --parse=cpu,core,socket`, whose data rows are
    comma-separated zero-based IDs. Each count is the highest ID seen plus
    one. Returns a dict with the keys "cpus", "cores" and "sockets".
    """
    # universal_newlines=True makes check_output return text on Python 3 too;
    # with bytes, the str comparisons below would raise.
    cpu_text = subprocess.check_output(
        ["lscpu", "--parse=cpu,core,socket"],
        universal_newlines=True,
    )
    rows = [
        line.split(",")
        for line in cpu_text.splitlines()
        # Skip blank lines and the "#" comment header emitted by lscpu.
        if line.strip() and not line.startswith("#")
    ]
    return {
        "cpus": max_in_col(rows, 0) + 1,
        "cores": max_in_col(rows, 1) + 1,
        "sockets": max_in_col(rows, 2) + 1,
    }
@return_on_error("0")
def get_mem_info():
    """Return the usable amount of RAM on this system, in MiB

    Reads the MemTotal value (in kB) from /proc/meminfo, converts it to MiB
    and subtracts a 5% reserve. Returns an int; returns 0 if no MemTotal line
    is found.
    """
    with open("/proc/meminfo") as meminfo:
        text = meminfo.read()
    # Search line by line instead of assuming MemTotal is the very first line.
    m = re.search(r"^MemTotal:\s+(\d+) kB", text, re.MULTILINE)
    if not m:
        return 0
    # Floor division keeps the result an integer on both Python 2 and 3.
    total_memory = int(m.group(1)) // 1024
    # Nodes need to hold some memory in reserve for OS, etc
    reserved = 0.05
    return int(total_memory * (1 - reserved))
@return_on_error("UNKNOWN")
def get_hostname():
    """Return this node's hostname

    Runs the `hostname` command and returns the part of its first output line
    before the first ".".
    """
    # universal_newlines=True yields text on Python 3; splitting bytes on a
    # str separator would raise.
    output = subprocess.check_output("hostname", universal_newlines=True)
    return output.splitlines()[0].split(".")[0]
@return_on_error("0.0.0.0")
def get_ipaddr():
    """Return the address `host -4` reports for this node's hostname.

    Returns "0.0.0.0" if the output contains no "has address" line.
    """
    hostname = get_hostname()
    text = subprocess.check_output(
        ["host", "-4", hostname],
        universal_newlines=True,
    )
    # Returned text has this format:
    # host has address 0.0.0.0
    # Match that phrase explicitly rather than picking the 4th
    # space-separated token, which is wrong when the output has several lines.
    m = re.search(r"has address (\S+)", text)
    if not m:
        return "0.0.0.0"
    return m.group(1)
def get_gpu_names():
    """Return a list of names of GPU's on this node

    Queries `nvidia-smi` for one GPU name per line and removes the spaces
    from each name. Returns an empty list if nvidia-smi cannot be run or
    exits with an error.
    """
    try:
        # universal_newlines=True yields text on Python 3; bytes.replace with
        # str arguments would raise an uncaught TypeError.
        text = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
            universal_newlines=True,
        )
    except (subprocess.CalledProcessError, OSError):
        return []
    return [line.replace(" ", "") for line in text.splitlines() if line.strip()]
def get_gres_conf(include_gpu_types=False):
    """Return the text of this node's gres.conf

    The result is a single string with one line per GPU, joined by newlines;
    it is empty when no GPUs are found.

    include_gpu_types: Emit the type field for each gpu.
    """
    template = "NodeName={hostname} Name=gpu File={file}"
    if include_gpu_types:
        template += " Type={type}"
    gpu_list = get_gpu_names()
    if not gpu_list:
        return ""
    # Look the hostname up once instead of once per GPU.
    hostname = get_hostname()
    out_lines = [
        template.format(
            hostname=hostname,
            type=gpu_type,
            # HACK ordering from nvidia-smi might not match device ID's
            file="/dev/nvidia%i" % n,
        )
        for n, gpu_type in enumerate(gpu_list)
    ]
    return "\n".join(out_lines)
def get_gres_desc(include_gpu_types=False):
    """Describe the generic resources (GPUs) available on this node.

    The returned string fills the Gres field in slurm.conf for this node. It
    is empty when no GPUs are found. With include_gpu_types, one
    "gpu:<type>:<count>" entry is emitted per distinct GPU name (in sorted
    order); otherwise a single "gpu:<count>" entry is emitted.
    """
    gpu_names = get_gpu_names()
    if include_gpu_types:
        entries = [
            "gpu:{}:{}".format(name, sum(1 for _ in members))
            for name, members in itertools.groupby(sorted(gpu_names))
        ]
    elif gpu_names:
        entries = ["gpu:{}".format(len(gpu_names))]
    else:
        entries = []
    if not entries:
        return ""
    return "Gres=" + ",".join(entries)
def get_features():
    """Return a comma-separated feature list for this node.

    The list is the CPU model followed by the distinct GPU names in sorted
    order, so the output is identical from run to run.
    """
    # Guard against a missing CPU model so the join below never sees None.
    cpu_model = get_cpu_model() or "UNKNOWN"
    return ",".join([cpu_model] + sorted(set(get_gpu_names())))
def get_slurm_conf(include_gpu_types=False, include_hyperthreads=False):
    """Return a line describing this node's resources to put in slurm.conf

    include_gpu_types: Include per-type GPU counts in the Gres field.
    include_hyperthreads: Report every hardware thread in CPUs; otherwise
        report the number of cores.

    The Gres field is omitted entirely when the node has no GPUs.
    """
    cpu_info = get_cpu_info()
    cpus = cpu_info["cpus"]
    cores = cpu_info["cores"]
    sockets = cpu_info["sockets"]
    fields = [
        "NodeName={}".format(get_hostname()),
        "NodeAddr={}".format(get_ipaddr()),
        "CPUs={}".format(cpus if include_hyperthreads else cores),
        # Floor division: "/" would produce floats such as "2.0" on Python 3.
        "ThreadsPerCore={}".format(cpus // cores),
        "CoresPerSocket={}".format(cores // sockets),
        "Sockets={}".format(sockets),
        "RealMemory={}".format(get_mem_info()),
        "Feature={}".format(get_features()),
        get_gres_desc(include_gpu_types),
        "State=UNKNOWN",
    ]
    # Drop the empty Gres entry so it does not leave a double space behind.
    return " ".join(field for field in fields if field)
if __name__ == "__main__":
    # Command-line entry point: print the requested config file to stdout.
    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="Which config file to generate: slurm.conf or gres.conf?")
    parser.add_argument("--include-gpu-types", default=False, action="store_true", help="Emit the type field for each gpu")
    parser.add_argument("--include-hyperthreads", dest="include_hyperthreads", default=True, action="store_true", help="Count each hyperthread as a separate CPU")
    # The flag above defaults to True, so on its own it could never be turned
    # off; this companion flag makes the core-only count reachable while
    # keeping the existing default.
    parser.add_argument("--no-include-hyperthreads", dest="include_hyperthreads", action="store_false", help="Count only physical cores as CPUs")
    args = parser.parse_args()
    if args.file == "slurm.conf":
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(get_slurm_conf(
            include_gpu_types=args.include_gpu_types,
            include_hyperthreads=args.include_hyperthreads
        ))
    elif args.file == "gres.conf":
        print(get_gres_conf(
            include_gpu_types=args.include_gpu_types
        ))
    else:
        raise Exception("I don't know how to generate {}".format(args.file))