forked from stas00/ml-engineering
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfs-watchdog.py
185 lines (147 loc) · 7.23 KB
/
fs-watchdog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python
#
# This tool monitors filesystem usage on Jean Zay (JZ) and sends an alert when any partition is getting close to running out of disk space or inodes
#
# Example:
#
# fs-watchdog.py
#
import argparse
import re
import smtplib
import socket
import subprocess
import sys
# Project/group name passed to `idrquota -p` when querying quotas
SLURM_GROUP_NAME = "six"

# Sender of the alert emails
# this needs to be an actual email subscribed to [email protected]
# NOTE(review): the "[email protected]" values here look like redaction placeholders
# rather than real addresses - confirm the real sender/recipients before deploying
FROM_ADDR = "[email protected]"
# Recipients of the alert emails
TO_ADDRS = ["[email protected]", "[email protected]"] # wants a list
def send_email(subject, body):
    """Send a plain-text email from FROM_ADDR to every address in TO_ADDRS.

    The message is handed to the SMTP server listening on ``localhost``.

    Args:
        subject: text placed in the ``Subject:`` header.
        body: plain-text body of the message.
    """
    headers = [
        f"From: {FROM_ADDR}",
        f"To: {', '.join(TO_ADDRS)}",
        f"Subject: {subject}",
    ]
    # An empty line must separate the headers from the body, otherwise the body is
    # parsed as (malformed) header text. Assembling the message from a list keeps
    # that separator explicit instead of relying on a blank line inside a literal.
    message = "\n".join([*headers, "", body, ""])

    # The context manager issues QUIT and closes the connection even if sendmail() raises
    with smtplib.SMTP("localhost") as server:
        # server.set_debuglevel(3)  # uncomment if need to debug
        server.sendmail(FROM_ADDR, TO_ADDRS, message)
def send_email_alert(msg):
    """Email the "filesystem is getting full" alert with *msg* embedded in it.

    Args:
        msg: the alert details to embed in the body of the email.
    """
    alert_subject = "[ALERT] JZ filesystem is getting close to being full"
    alert_body = f"""
***ALERT: One or more partitions at JZ are getting close to being full! Alert someone at Eng WG***
{msg}
Please reply to this email once the issue has been taken care of, or if you are in the process of doing that, should new alerts be sent again.
If unsure what to do, please post in the #bigscience-engineering slack channel.
"""
    send_email(alert_subject, alert_body)
def check_running_on_jean_zay():
    """Refuse to continue unless the current host name looks like a JZ one.

    Raises:
        ValueError: if the name returned by ``socket.getfqdn()`` contains neither
            ``idris.fr`` nor ``idrsrv``.
    """
    fqdn = socket.getfqdn()
    # sometimes it gives fqdn, other times it doesn't, so accept either marker
    jz_markers = ("idris.fr", "idrsrv")
    if any(marker in fqdn for marker in jz_markers):
        return
    raise ValueError("This script relies on JZ's specific environment and won't work elsewhere. "
                     f"You're attempting to run it on '{fqdn}'.")
def run_cmd(cmd, check=True):
    """Run *cmd* and return its stdout with surrounding whitespace stripped.

    Args:
        cmd: the command and its arguments, handed to ``subprocess.run`` as is.
        check: when true, a non-zero exit status is turned into an error; when
            false, whatever was written to stdout is returned regardless of the
            exit status.

    Returns:
        The command's stdout decoded as UTF-8 and stripped.

    Raises:
        EnvironmentError: if ``check`` is true and the command exits with a
            non-zero status; the error carries the command's stderr.
    """
    try:
        completed = subprocess.run(
            cmd,
            check=check,
            encoding="utf-8",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as exc:
        raise EnvironmentError(exc.stderr)
    return completed.stdout.strip()
def get_args():
    """Parse the command line and return the resulting namespace.

    Returns:
        A namespace with two boolean attributes: ``debug`` (``-d``/``--debug``)
        and ``no_email`` (``--no-email``); both are false unless the flag is given.
    """
    cli = argparse.ArgumentParser()
    # (option strings, help text) - every option is a simple on/off switch
    switches = (
        (("-d", "--debug"), "enable debug"),
        (("--no-email",), "do not email alerts"),
    )
    for option_strings, description in switches:
        cli.add_argument(*option_strings, action="store_true", help=description)
    return cli.parse_args()
def main():
    """Check the monitored JZ partitions and report the ones that are filling up.

    Each check appends human-readable findings to a shared ``alerts`` list. If
    anything was found, the findings are printed and, unless ``--no-email`` was
    given, also emailed; otherwise a single "all good" line is printed.

    Raises:
        ValueError: if the host is not a JZ one, or if the output of ``du``,
            ``idrquota`` or ``df`` cannot be interpreted.
        EnvironmentError: if ``idrquota`` or ``df`` exits with a non-zero status.
    """
    check_running_on_jean_zay()

    args = get_args()

    # every analyse_* helper appends its findings here, followed by an empty
    # string so that findings are separated by a blank line in the final report
    alerts = []

    def parse_du_total(cmd, response):
        """Return the leading integer of the output of a `du -s` command."""
        fields = response.split()
        # du is run with check=False, so a complete failure shows up as empty output
        if not fields:
            raise ValueError(f"{cmd} failed: no output")
        return int(fields[0])

    def analyse_partition_bytes(partition_name, partition_path, hard_limit_bytes, alert_bytes_threshold):
        """Alert if `du -bs partition_path` exceeds `alert_bytes_threshold` (a 0-1 fraction) of `hard_limit_bytes`."""
        soft_limit_bytes = hard_limit_bytes * alert_bytes_threshold
        cmd = f"du -bs {partition_path}"
        response = run_cmd(cmd.split(), check=False)  # du could report partial errors for wrong perms
        size_bytes = parse_du_total(cmd, response)
        if args.debug:
            print(f"{partition_name} bytes: {size_bytes}")

        if size_bytes > soft_limit_bytes:
            current_usage_percent = 100*size_bytes/hard_limit_bytes
            alerts.append(f"{partition_name} is at {current_usage_percent:.2f}% bytes usage ({size_bytes/2**30:.2f}GB/{hard_limit_bytes/2**30:.2f}GB)")
            alerts.append("")

    def analyse_partition_inodes(partition_name, partition_path, hard_limit_inodes, alert_inodes_threshold):
        """Alert if the inode count of `partition_path` exceeds `alert_inodes_threshold` (a 0-1 fraction) of `hard_limit_inodes`."""
        soft_limit_inodes = hard_limit_inodes * alert_inodes_threshold
        cmd = f"du -s -BK --inodes {partition_path}"
        response = run_cmd(cmd.split(), check=False)  # du could report partial errors for wrong perms
        size_inodes = parse_du_total(cmd, response)
        if args.debug:
            print(f"{partition_name} Inodes: {size_inodes}")

        if size_inodes > soft_limit_inodes:
            current_usage_percent = 100*size_inodes/hard_limit_inodes
            # the counts are reported in units of 2**10 inodes
            alerts.append(f"{partition_name} is at {current_usage_percent:.2f}% inodes usage ({size_inodes/2**10:.2f}K/{hard_limit_inodes/2**10:.2f}K)")
            alerts.append("")

    def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_threshold, alert_inodes_threshold):
        """Alert if the bytes or inodes usage reported by `idrquota` exceeds the given 0-1 fractions."""
        cmd = f"idrquota {partition_flag} -p {SLURM_GROUP_NAME}"
        response = run_cmd(cmd.split())
        # collect every " (NN.N%)" occurrence; the first one is taken as the bytes
        # usage and the second one as the inodes usage
        percents = re.findall(r' \(([\d\.]+)%\)', response)
        if len(percents) != 2:
            raise ValueError(f"{cmd} failed: expected 2 usage percentages in its output, found {len(percents)}")
        bytes_percent, inodes_percent = [float(x) for x in percents]
        if args.debug:
            print(f"{partition_name} bytes: {bytes_percent}%")
            print(f"{partition_name} inodes: {inodes_percent}%")

        msg = []
        if bytes_percent/100 > alert_bytes_threshold:
            msg.append(f"{partition_name} is at {bytes_percent:.2f}% bytes usage")

        if inodes_percent/100 > alert_inodes_threshold:
            msg.append(f"{partition_name} is at {inodes_percent:.2f}% inodes usage")

        if msg:
            alerts.extend(msg)
            # include the raw idrquota report for context
            alerts.append(response)
            alerts.append("")

    def analyse_shared_disk(partition_name, alert_bytes_threshold):
        """Alert if the disk backing `partition_name` has fewer than `alert_bytes_threshold` bytes available.

        Unlike in the other helpers, the threshold here is an absolute number of bytes, not a fraction.
        """
        partition_name_2_disk = {
            "SCRATCH": "gpfsssd",
            "WORK": "gpfsdswork",
            "STORE": "gpfsdsstore"
        }
        cmd = "df"
        response = run_cmd(cmd.split())
        disk_metas = response.split("\n")
        column_names = disk_metas[0].split()
        disk_prefix = partition_name_2_disk[partition_name]
        # the first row whose filesystem name starts with the expected prefix
        disk_row = next((line.split() for line in disk_metas if line.startswith(disk_prefix)), None)
        if disk_row is None:
            raise ValueError(f"{cmd} failed: no filesystem starting with '{disk_prefix}' in its output")
        disk_meta = dict(zip(column_names, disk_row))

        # default `df` counts uses 1024-byte units, and `1024 == 2 ** 10`
        available_disk_left = int(disk_meta["Available"]) * 2 ** 10
        if available_disk_left < alert_bytes_threshold:
            alerts.append(f"Shared {partition_name} has {available_disk_left/2**40:.2f}TB left")
            alerts.append("")

    # WORK and STORE partitions stats can be accessed much faster through `idrquota`, and it already
    # includes the quota info
    analyse_partition_idrquota(partition_name="WORK", partition_flag="-w", alert_bytes_threshold=0.85, alert_inodes_threshold=0.85)
    analyse_partition_idrquota(partition_name="STORE", partition_flag="-s", alert_bytes_threshold=0.85, alert_inodes_threshold=0.85)

    # SCRATCH - check only bytes w/ a hard quota of 400TB - alert on lower threshold than other
    # partitions due to it filling up at a faster rate (dumping huge checkpoints)
    analyse_partition_bytes(partition_name="SCRATCH", partition_path="/gpfsssd/scratch/rech/six/", hard_limit_bytes=400*2**40, alert_bytes_threshold=0.75)
    # Actually SCRATCH is shared with everyone and we should monitor the output of `df -h | grep gpfsssd`
    # Check that there's still 100TB left
    analyse_shared_disk("SCRATCH", 100 * 2 ** 40)

    # WORKSF - check both bytes and inodes w/ hard quotas of 2TB / 3M
    analyse_partition_bytes(partition_name="WORKSF", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_bytes=2*2**40, alert_bytes_threshold=0.85)
    analyse_partition_inodes(partition_name="WORKSF", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_inodes=3*10**6, alert_inodes_threshold=0.85)

    if alerts:
        print("[ALERT] JZ filesystem is getting close to being full")
        msg = "\n".join(alerts)
        print(msg)

        if not args.no_email:
            send_email_alert(msg)
    else:
        print("All partitions are in a good standing")
# run the checks only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()