-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsop_linter.py
426 lines (347 loc) · 19.7 KB
/
sop_linter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
import argparse
import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple

from packaging.version import InvalidVersion, Version

import markdown
from bs4 import BeautifulSoup

from utils import collect_sop_files, find_tables
class SOPLinter:
    def __init__(self, verbosity: int = 0, strict: bool = False, required_sections: Optional[dict] = None):
        """
        Initializes the SOPLinter with verbosity and strict mode settings.

        :param verbosity: Level of verbosity for output messages (0-2).
        :param strict: Whether to treat warnings as errors.
        :param required_sections: Mapping of section title -> CSS selector used
            by the section-presence linting rules. Defaults to an empty mapping.
        """
        self.verbosity = verbosity
        self.strict = strict
        # Per-file results: file_path -> {"errors": [...], "warnings": [...]}
        self.results = {}
        # Per-file cache of parsed <table> elements (filled lazily by find_tables)
        self.tables = {}
        # Use a None sentinel instead of a mutable default argument: a literal
        # `{}` default would be a single dict shared by every SOPLinter instance.
        self.required_sections = required_sections if required_sections is not None else {}
def lint_sop(self, file_path: str):
    """
    Lints a single SOP file for compliance with the required rules.

    Results accumulate in self.results[file_path] under "errors"/"warnings".

    :param file_path: Path to the SOP file.
    """
    if self.verbosity > 1:
        print(f"- Starting linting for {file_path}")
    self.results[file_path] = {"errors": [], "warnings": []}
    self.tables[file_path] = None
    # Explicit encoding: SOP files are UTF-8 markdown; relying on the platform
    # default encoding can break on non-ASCII content (e.g. Windows cp1252).
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Convert markdown to HTML so rules can query structure with CSS selectors
    html_content = markdown.markdown(content, extensions=['tables'])
    soup = BeautifulSoup(html_content, 'html.parser')
    # We iterate one by one over the linting rules
    self.lr_check_title(soup, file_path)
    self.lr_check_required_sections(soup, file_path)
    self.lr_check_non_empty_sections(soup, file_path)
    self.lr_check_metadata_table(soup, file_path)
    self.lr_check_document_history(soup, file_path)
    self.lr_check_roles_and_responsibilities(soup, file_path)
    if self.verbosity > 1:
        print(f"- Finished linting for {file_path}")
        print(json.dumps(self.results[file_path], indent=2),"\n")
def lr_check_title(self, soup: BeautifulSoup, file_path: str):
    """
    Checks if the SOP title follows the required format.

    :param soup: BeautifulSoup object of the parsed SOP content.
    :param file_path: Path to the SOP file.
    """
    if self.verbosity > 1:
        print("-- Linting rule: checking title...")
    first_header = soup.find('h1')
    if not first_header:
        # Guard the None case separately: the previous single f-string message
        # dereferenced first_header.text even when no <h1> existed, raising
        # AttributeError instead of reporting a lint error.
        self.report_issue("Title must start with '# European GDI - ' followed by the SOP title. No level-1 header was found.", file_path, error=True)
    elif not first_header.text.startswith("European GDI - "):
        self.report_issue(f"Title must start with '# European GDI - ' followed by the SOP title. Current title: '{first_header.text}'", file_path, error=True)
    if self.verbosity > 1:
        print(f"{json.dumps(self.results[file_path], indent=2)}\n")
def lr_check_metadata_table(self, soup: BeautifulSoup, file_path: str):
    """
    Checks if the metadata table is correctly formatted and contains proper content.

    :param soup: BeautifulSoup object of the parsed SOP content.
    :param file_path: Path to the SOP file.
    """
    if self.verbosity > 1:
        print("-- Linting rule: checking metadata table...")
    metadata_table_headers = ["Metadata", "Value"]
    table_find_result = self.find_tables(soup = soup, file_path = file_path, aim_headers = metadata_table_headers)
    if not table_find_result:
        self.report_issue("Metadata table is missing or incorrectly formatted.", file_path, error=True)
        return
    metadata_table = table_find_result[0]
    rows = metadata_table.find_all('tr')  # Table rows (tr)
    # Expected format of the "Value" column of each row: either a regex
    # (string), a validator callable, or a list of allowed values.
    expected_metadata = {
        "template sop number": r"GDI-SOP\d{4}",  # e.g.: GDI-SOP0001
        "template sop version": self.is_valid_version,  # e.g.: v1,
        "topic": ["Data protection & security", "Data & metadata management", "Technical infrastructure & software development", "Helpdesk & operations"],
        "template sop type": ["Node-specific SOP", "European-level SOP"],
        "gdi node": r"^[A-Z]{3}$",  # e.g.: SWE (for Sweden)
        "instance version": self.is_valid_version,  # e.g.: v1
    }
    table_dict = {}
    # First pass: collect the rows (keys lower-cased) and flag malformed ones
    for row in rows[1:]:  # Skip the header row
        columns = [col.text.strip() for col in row.find_all('td')]
        if not len(columns) == 2:
            self.report_issue(f"Metadata table row is incorrectly formatted (2 columns are expected): '{' | '.join(columns)}'.", file_path, error=True)
            continue
        key, value = columns[0], columns[1]
        table_dict[key.lower()] = value
    # Second pass: validate each collected row against its rule
    for key, value in table_dict.items():
        # Linting rules for node-specific SOPs should not apply for european-level ones
        node_specific_keys = ["gdi node", "instance version"]
        if key in node_specific_keys:
            try:
                if table_dict["template sop type"].lower() == "European-level SOP".lower():
                    continue
            except KeyError:
                # "template sop type" row missing: reported further below.
                # (Was a bare `except:`, which also swallowed SystemExit and
                # KeyboardInterrupt; only a missing key is expected here.)
                pass
            # We only want to evaluate each Node-specific key format if any is present.
            # Otherwise, it could be a Node-specific SOP template (correct without these keys)
            if all(table_dict.get(k) in [None, "", []] for k in node_specific_keys):
                self.report_issue(f"At the metadata table, value column for '{key}' was empty. If the SOP is a Node-specific SOP Instance (not a template), it should have a value.", file_path, warning=True)
                continue
        if key in expected_metadata:
            # Depending on the type of format rules for each row, we apply them differently
            if isinstance(expected_metadata[key], str) and not re.match(expected_metadata[key], value):
                # e.g., GDI-SOP0001
                self.report_issue(f"At the metadata table, value column for '{key}' is incorrectly formatted: '{value}'. It should follow the regex '{expected_metadata[key]}'", file_path, error=True)
            elif callable(expected_metadata[key]) and not expected_metadata[key](value):
                # e.g., v1.0.2
                self.report_issue(f"At the metadata table, value column for '{key}' is incorrectly formatted: '{value}'.", file_path, error=True)
            elif isinstance(expected_metadata[key], list) and value.lower() not in [item.lower() for item in expected_metadata[key]]:
                # e.g., Node-specific SOP
                self.report_issue(f"At the metadata table, value column for '{key}' is invalid: '{value}'. It's value should be one of: {expected_metadata[key]}", file_path, error=True)
        else:
            self.report_issue(f"Unexpected row in the metadata table: '{' | '.join([key, value])}'.", file_path, warning=True)
    # Every expected row must be present in the table
    for key in expected_metadata.keys():
        if key.lower() not in table_dict.keys():
            self.report_issue(f"Metadata row '{key}' is missing from the metadata table.", file_path, error=True)
    try:
        if table_dict["template sop type"] == "European-level SOP" and (table_dict["gdi node"] or table_dict["instance version"]):
            self.report_issue("European-level SOPs should not have 'GDI Node' or 'Instance version' values in the metadata table.", file_path, error=True)
    except Exception:
        # The missing rows are already reported above
        pass
    if self.verbosity > 1:
        print(f"{json.dumps(self.results[file_path], indent=2)}\n")
def lr_check_required_sections(self, soup: BeautifulSoup, file_path: str):
    """
    Checks if all required sections are present in the SOP.

    A section is "present" when its configured CSS selector matches a
    (truthy) element in the parsed document.

    :param soup: BeautifulSoup object of the parsed SOP content.
    :param file_path: Path to the SOP file.
    """
    if self.verbosity > 1:
        print("-- Linting rule: checking required sections...")
    # Collect the titles whose selector finds nothing, then report each one
    missing = [title for title, css in self.required_sections.items() if not soup.select_one(css)]
    for title in missing:
        self.report_issue(f"Required section '{title}' is missing.", file_path, error=True)
    if self.verbosity > 1:
        print(f"{json.dumps(self.results[file_path], indent=2)}\n")
def lr_check_document_history(self, soup: BeautifulSoup, file_path: str):
    """
    Checks the 'Document History' table for proper version increments, non-empty change descriptions,
    valid author names, and valid date formats.

    The table is expected to be ordered newest (top) to oldest (bottom), so
    versions must strictly decrease while walking down the rows.

    :param soup: BeautifulSoup object of the parsed SOP content.
    :param file_path: Path to the SOP file.
    """
    if self.verbosity > 1:
        print("-- Linting rule: checking Document History table...")
    # The Document History table is identified by this exact header set
    aim_headers = ["Template Version", "Instance version", "Author(s)", "Description of changes", "Date"]
    table_find_result = self.find_tables(soup, file_path, aim_headers)
    if not table_find_result:
        self.report_issue("Document History table is missing or incorrectly formatted.", file_path, error=True)
        return
    document_history_table = table_find_result[0]
    rows = document_history_table.find_all('tr')[1:]  # Skip the header row
    # Versions of the previously visited (i.e. more recent) row, for comparison
    previous_template_version = None
    previous_instance_version = None
    for row in rows:
        # Versions may be written in backticks in the markdown; strip them first
        columns = [col.text.strip('`').strip() for col in row.find_all('td')]
        if len(columns) != 5:
            self.report_issue(f"Document History table row is incorrectly formatted (expected 5 columns). Row: '{' | '.join(columns)}'.", file_path, error=True)
            continue
        template_version = columns[0]
        instance_version = columns[1]
        author = columns[2]
        description = columns[3]
        date = columns[4]
        # Check versioning rules
        if self.is_valid_version(template_version):
            current_template_version = Version(template_version)
        else:
            self.report_issue(f"At the Document History table, Template Version ('{template_version}') is incorrectly formatted. Row: '{' | '.join(columns)}'.", file_path, error=True)
            continue
        if instance_version:
            if self.is_valid_version(instance_version):
                current_instance_version = Version(instance_version)
            else:
                self.report_issue(f"At the Document History table, Instance Version ('{instance_version}') is incorrectly formatted. Row: '{' | '.join(columns)}'.", file_path, error=True)
                continue
        else:
            # Instance version is optional (e.g. European-level SOPs / templates)
            current_instance_version = None
        if previous_template_version:
            # Template versions should be equal or lower than the ones above (more recent).
            # An equal template version is only acceptable when the row above
            # carried an instance version (i.e. only the instance was bumped).
            if (current_template_version > previous_template_version) or (current_template_version == previous_template_version and not previous_instance_version):
                self.report_issue(f"At the Document History table, Template version ('{current_template_version}') should be lower than the version right above ('{previous_template_version}'). Notice the order of the table: from recent (top) to older (bottom) and address the versioning. Row: '{' | '.join(columns)}'.", file_path, error=True)
        # Instance versions are not always required, only when it's a node instance
        if current_instance_version and previous_instance_version:
            # Newest-first ordering: this row's instance version must be strictly
            # lower than the instance version of the row above
            if current_instance_version >= previous_instance_version:
                self.report_issue(f"At the Document History table, Instance Version ('{current_instance_version}') should be lower than the version right above ('{previous_instance_version}'). Notice the order of the table: from recent (top) to older (bottom) and address the versioning. Row: '{' | '.join(columns)}'.", file_path, error=True)
        # Assigned for the next iteration to use for comparisons
        previous_template_version = current_template_version
        previous_instance_version = current_instance_version
        # Check author name
        if not author:
            self.report_issue(f"Author name is missing. Row: '{' | '.join(columns)}'.", file_path, error=True)
        # Check description
        if not description:
            self.report_issue(f"Description of changes is missing. Row: '{' | '.join(columns)}'.", file_path, error=True)
        # Check date format
        if not re.match(r"\d{4}\.\d{2}\.\d{2}", date):
            self.report_issue(f"Date is incorrectly formatted (expected format is YYYY.MM.DD): '{date}'. Row: '{' | '.join(columns)}'.", file_path, error=True)
    if self.verbosity > 1:
        print(f"{json.dumps(self.results[file_path], indent=2)}\n")
def lr_check_roles_and_responsibilities(self, soup: BeautifulSoup, file_path: str):
    """
    Checks if the Roles and Responsibilities table exists and has at least one non-empty Full Name for roles Author, Reviewer, and Approver.

    :param soup: BeautifulSoup object of the parsed SOP content.
    :param file_path: Path to the SOP file.
    """
    if self.verbosity > 1:
        print("-- Linting rule: checking Roles and Responsibilities table...")
    aim_headers = ["Role", "Full name", "GDI/node role", "Organisation"]
    matching_tables = self.find_tables(soup, file_path, aim_headers)
    if not matching_tables:
        self.report_issue("Roles and Responsibilities table is missing or incorrectly formatted.", file_path, error=True)
        return
    required_roles = ["Author", "Reviewer", "Approver"]
    # Roles for which a row with a non-empty Full name has been seen
    satisfied = set()
    for table_row in matching_tables[0].find_all('tr')[1:]:  # header row excluded
        cells = [cell.text.strip() for cell in table_row.find_all('td')]
        if len(cells) != 4:
            self.report_issue(f"Roles and Responsibilities table row is incorrectly formatted (expected 4 columns): '{' | '.join(cells)}'.", file_path, error=True)
            continue
        role_name, full_name = cells[0], cells[1]
        if role_name in required_roles and full_name:
            satisfied.add(role_name)
    # Report every required role that never appeared with a Full name
    for role_name in required_roles:
        if role_name not in satisfied:
            self.report_issue(f"Role '{role_name}' is missing a non-empty Full Name row in the Roles and Responsibilities table.", file_path, error=True)
    if self.verbosity > 1:
        print(f"{json.dumps(self.results[file_path], indent=2)}\n")
def lr_check_non_empty_sections(self, soup: BeautifulSoup, file_path: str):
    """
    Checks if required sections are non-empty.

    A section's content is everything between its header and the next header
    of the same or higher level; a section whose content is absent or all
    whitespace is reported as an error.

    :param soup: BeautifulSoup object of the parsed SOP content.
    :param file_path: Path to the SOP file.
    """
    if self.verbosity > 1:
        print("-- Linting rule: checking non-empty required sections...")
    for section_title, selector in self.required_sections.items():
        header = soup.select_one(selector)
        if not header:
            # No need to report the missing section as an error, since
            # that's the role of a different linting rule
            continue
        # Get all siblings of the header until the next header of the same or higher level
        section_content = []
        for sibling in header.find_next_siblings():
            # Only capture content within the current section and stop if we reach a new section of the same or higher level.
            # NOTE: 'sibling.name <= header.name' compares tag-name strings
            # (e.g. 'h2' <= 'h3') lexicographically, which orders correctly
            # for single-digit header levels h1-h6.
            if re.match(r'^h[1-6]$', sibling.name) and sibling.name <= header.name:
                break
            section_content.append(sibling)
        if not section_content or all(not sibling.text.strip() for sibling in section_content):
            self.report_issue(f"The section '{section_title}' is empty.", file_path, error=True)
    if self.verbosity > 1:
        print(f"{json.dumps(self.results[file_path], indent=2)}\n")
def find_tables(self, soup: BeautifulSoup, file_path: str, aim_headers: List[str]) -> List[BeautifulSoup]:
    """
    Finds all tables in the document whose header row matches aim_headers.

    The full list of <table> elements is cached per file in self.tables, so
    repeated rule invocations on the same file avoid re-scanning the soup.
    The actual header matching is delegated to the imported utils.find_tables.

    :param soup: BeautifulSoup object of the parsed SOP content.
    :param file_path: Path to the SOP file (cache key).
    :param aim_headers: Exact header texts that identify the wanted table.
    :return: List of matching table elements.
    """
    if not self.tables[file_path]:
        # First lookup for this file: populate the table cache
        self.tables[file_path] = soup.find_all('table')
    # Bare name resolves to the module-level helper imported from utils,
    # not this method (which would require self.)
    return find_tables(soup=soup, aim_headers=aim_headers, tables=self.tables[file_path])
def is_valid_version(self, version: str) -> bool:
    """
    Checks if the given version string follows semantic versioning.

    Validity is delegated to packaging.version.Version, which accepts any
    PEP 440-compliant version string (e.g. 'v1', '1.0.2').

    :param version: Version string to check.
    :return: True if the version string is valid, False otherwise.
    """
    try:
        Version(version)
    except InvalidVersion:
        return False
    return True
def report_issue(self, message: str, file_path: str, error: bool = False, warning: bool = False):
    """
    Reports an issue found during linting.

    In strict mode every issue is filed as an error. The `warning` flag is
    accepted purely for call-site readability: anything not marked as an
    error (and not promoted by strict mode) lands in "warnings".

    :param message: Description of the issue.
    :param file_path: Path to the SOP file.
    :param error: Whether the issue is an error (True) or a warning (False).
    :param warning: Readability flag for call sites; not consulted.
    """
    if error or self.strict:
        bucket = "errors"
    else:
        bucket = "warnings"
    self.results[file_path][bucket].append(message)
def generate_report(self) -> Tuple[str, bool]:
    """
    Generates a JSON formatted report of all linting results.

    The return annotation previously claimed `str`, but callers unpack a
    2-tuple; the annotation now matches the actual return value.

    :return: Tuple of (JSON string of the linting results, True if any file
             accumulated at least one error).
    """
    has_errors = any(file_results['errors'] for file_results in self.results.values())
    return json.dumps(self.results, indent=2), has_errors
def parse_args() -> Any:
    """
    Parses command-line arguments.

    :return: Parsed arguments (argparse.Namespace with .inputs, .verbosity and .strict).
    """
    parser = argparse.ArgumentParser(description="Lints SOP markdown files")
    # One or more files/directories; directories are explored downstream
    parser.add_argument("inputs", nargs="+", help="SOP file(s) or directories to lint. Given directories will be explored, looking for markdown files following the SOP naming conventions")
    parser.add_argument("-v", "--verbosity", type=int, default=0, help="Verbosity level (0-2). 0 prints nothing; 1 prints the end report; 2 prints the report of each file at each step")
    parser.add_argument("-s", "--strict", action="store_true", help="Treat warnings as errors.")
    return parser.parse_args()
def main():
    """
    Main function to run the SOP linter.

    Collects SOP files from the CLI inputs, lints each one, optionally prints
    the aggregated report, and exits 1 if any errors were found, else 0.
    """
    args = parse_args()
    sop_files = collect_sop_files(args.inputs)
    # Section title -> CSS selector, consumed by the section-presence rules
    required_sections = {
        "## Index": "h2:contains('Index')",
        "### Document History": "h3:contains('Document History')",
        "### Glossary": "h3:contains('Glossary')",
        "### Roles and Responsibilities": "h3:contains('Roles and Responsibilities')",
        "### Purpose": "h3:contains('Purpose')",
        "### Scope": "h3:contains('Scope')",
        "### Procedure": "h3:contains('Procedure')",
        "### References": "h3:contains('References')"
    }
    linter = SOPLinter(verbosity=args.verbosity, strict=args.strict, required_sections=required_sections)
    for sop_file in sop_files:
        linter.lint_sop(sop_file)
    report, has_errors = linter.generate_report()
    if args.verbosity > 0:
        print(report)
    # These exit codes will be interpreted downstream
    exit(1 if has_errors else 0)
# Script entry point: only runs when executed directly, so the module can
# also be imported for its pieces without side effects.
if __name__ == "__main__":
    """
    To run it as a standalone script besides importing bits of it
    """
    main()