-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #35 from GenomicDataInfrastructure/ZH-350
ZH-350 - Create automatic SOP review reminders
- Loading branch information
Showing
5 changed files
with
353 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# This workflow executes a comparison script on all SOP documents and decides
# whether the documents are due for review or not based on their last edit date
# and a given day(s) threshold.
# The workflow triggers itself every month, checking if new SOP documents are due for
# review.
#
# For more information, check:
# https://github.com/GenomicDataInfrastructure/standard-operating-procedures/tree/main/scripts/check_sop_reviews.py
# https://github.com/GenomicDataInfrastructure/standard-operating-procedures/blob/main/docs/GDI-SOP_information-service-management.md
# https://github.com/GenomicDataInfrastructure/standard-operating-procedures/blob/main/docs/GDI-SOP_charter.md
name: Monthly SOP Review Check

on:
  schedule:
    - cron: '0 0 1 * *' # This runs at midnight (UTC) on the 1st of every month
  workflow_dispatch: # Allows for manual triggering of the workflow

# The automatic GITHUB_TOKEN may default to read-only. The script opens GitHub
# issues, so write access to issues has to be granted explicitly. Once a
# "permissions" block is present every unlisted scope is set to "none", hence
# "contents: read" is listed so that the checkout step keeps working.
permissions:
  contents: read
  issues: write

jobs:
  check-sop-reviews:
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main' # Ensures the workflow only runs on the main branch

    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          requirements_f="./requirements.txt"
          if [ -f "$requirements_f" ]; then pip install -r "$requirements_f" --verbose; fi

      - name: Run SOP Review Check
        run: |
          # To vary the day(s) threshold, modify "-dr". 365 --> One full year since last edit
          python3 scripts/check_sop_reviews.py sops/ -dr 365 -v 1 -r 'GenomicDataInfrastructure/standard-operating-procedures' -ct
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Uses GitHub's automatic token for auth
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,14 @@ | ||
# Required for parsing HTML content in SOP documents | ||
beautifulsoup4>=4.12.3 | ||
|
||
# Required for parsing markdown SOP documents | ||
markdown>=3.6 | ||
|
||
# Required for checking versioning and packaging in SOPs | ||
packaging>=24 | ||
|
||
# Required for handling data processing if needed in future expansions | ||
pandas>=2.0,<2.2 | ||
|
||
# Required for making HTTP requests to GitHub API | ||
requests>=2.31.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,266 @@ | ||
import argparse
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional

import markdown
import requests
from bs4 import BeautifulSoup

from utils import find_tables, collect_sop_files, get_gh_issues
|
||
# Token used to authenticate against the GitHub API, read from the environment
gh_token = os.environ.get("GITHUB_TOKEN")

# Regular expression capturing the SOP ID (e.g., "GDI-SOP0003")
sop_id_regex = r"(GDI-SOP\d{4})"

# Regular expression for the dates written in the Document History (YYYY.MM.DD)
date_regex = r"\d{4}\.\d{2}\.\d{2}"
|
||
def parse_args() -> Any:
    """
    Builds the command-line interface of the script and parses the given arguments.
    :return: Namespace holding 'inputs', 'days_review', 'create_issues', 'repository' and 'verbosity'.
    """
    cli = argparse.ArgumentParser(
        description="Checks if SOP files are due for review based on the dates in their Document History"
    )

    # Each entry: (names/flags, keyword options handed to 'add_argument'), kept in declaration order
    argument_specs = [
        (
            ("inputs",),
            {
                "nargs": "+",
                "help": "SOP file(s) or directories to check. Given directories will be explored, looking for markdown files following the SOP naming conventions",
            },
        ),
        (
            ("-dr", "--days-review"),
            {
                "type": int,
                "default": 365,  # By default one year
                "help": "Number of days since last edit for an SOP document to be considered 'due for review'. Default: 365 (a full year).",
            },
        ),
        (
            ("-ct", "--create-issues"),
            {
                "action": "store_true",
                "help": "Flag to let the script know that, if an SOP is due for review, you want for it to create the corresponding GH issue.",
            },
        ),
        (
            ("-r", "--repository"),
            {
                "type": str,
                "default": "GenomicDataInfrastructure/standard-operating-procedures",
                "help": "Repository path where the GH issues will be created. Default: 'GenomicDataInfrastructure/standard-operating-procedures'",
            },
        ),
        (
            ("-v", "--verbosity"),
            {
                "type": int,
                "default": 0,
                "help": "Verbosity level (0-2). 0 prints nothing; 1 prints the end report; 2 prints the report of each file at each step",
            },
        ),
    ]
    for names, options in argument_specs:
        cli.add_argument(*names, **options)

    return cli.parse_args()
|
||
# Function to parse the "Document History" table and extract the last date | ||
def get_last_edit_date(soup: BeautifulSoup, sop_file: str) -> datetime:
    """
    Extracts the date of the newest entry of the Document History table.

    The newest entry is expected to be the first data row (the one right below the header row).

    :param soup: BeautifulSoup object of the parsed SOP content.
    :param sop_file: Filepath of the file being checked (only used in the error messages).
    :return: Date of the first Document History entry (datetime object).
    :raises ValueError: If no Document History table is found, the table has no data rows,
        the first data row does not have the expected number of columns, or its date is not
        a valid 'YYYY.MM.DD' date.
    """
    # Headers to match the Document History table
    aim_headers = ["Template Version", "Instance Version", "Author(s)", "Description of changes", "Date"]
    tables = find_tables(soup, aim_headers)

    if not tables:
        raise ValueError(f"No Document History table was found (based on given headers) for file '{sop_file}'.")

    document_history_table = tables[0]  # Use the first found table (assuming there's only one Document History table)
    rows = document_history_table.find_all('tr')
    if len(rows) < 2:
        # Only the header row (or nothing) is present: report it instead of failing with an IndexError
        raise ValueError(f"Document History table has no entries below its header row for file '{sop_file}'.")

    first_row = rows[1]  # Skip header row, and get first row, which should be the newest entry
    columns = [col.text.strip() for col in first_row.find_all('td')]
    if len(columns) != len(aim_headers):
        raise ValueError(f"First row of the Document History table was found malformed (had '{len(columns)}' where it should have '{len(aim_headers)}') for file '{sop_file}'.")

    date_str = columns[4]  # The last column (5th) is expected to be the date
    # 'fullmatch' rejects trailing garbage (e.g., '2024.01.15abc') that a prefix match would let through
    if not re.fullmatch(date_regex, date_str):
        raise ValueError(f"Invalid date format '{date_str}' in the Document History table for file '{sop_file}'")

    try:
        return datetime.strptime(date_str, "%Y.%m.%d")
    except ValueError as err:
        # Well-formed but impossible dates (e.g., '2024.13.45') get the same descriptive message
        raise ValueError(f"Invalid date format '{date_str}' in the Document History table for file '{sop_file}'") from err
|
||
def check_existing_github_issues(all_issues: List, sop_file: str, verbosity: int = 1) -> Optional[str]:
    """
    Checks if a GitHub issue already exists for the given SOP file.

    An issue is considered to belong to the SOP when the SOP ID (taken from the filename) appears in its title.

    :param all_issues: List of GitHub issues (dictionaries with, at least, 'title' and 'html_url') to search in.
        They are expected to be already filtered by label by the caller.
    :param sop_file: The SOP file to check.
    :param verbosity: Verbosity level. Values above 1 print the outcome of the search.
    :return: The issue HTML URL if an existing issue is found, otherwise None.
    :raises ValueError: If no SOP ID can be extracted from the filename.
    """
    sop_name = os.path.basename(sop_file)
    # We extract the SOP ID from the filename (e.g., "GDI-SOP0003")
    sop_id_match = re.match(sop_id_regex, sop_name)
    if sop_id_match is None:
        raise ValueError(f"No SOP ID could be extracted from the filename '{sop_name}' (file '{sop_file}').")
    sop_id = sop_id_match.group(1)

    if not all_issues:
        if verbosity > 1:
            print("- Given GitHub issue list was empty. Check filtering parameters if it's not expected.")
        return None

    for issue in all_issues:
        # We have already filtered all issues by the labels, so the SOP ID in the title should do the trick
        if sop_id in issue['title']:
            issue_url = issue['html_url']
            if verbosity > 1:
                print(f"-- Existing issue found for '{sop_file}', Issue URL: {issue_url}")
            return issue_url

    if verbosity > 1:
        print(f"-- No existing issue was found for '{sop_id}' ('{sop_file}') in all '{len(all_issues)}' issues that were fetched.")
    return None
|
||
def create_github_issue(gh_repo: str, gh_token: str, sop_file: str, last_edit_date: datetime, days_review: int) -> Optional[str]:
    """
    Creates a GitHub issue asking for the review of an SOP that is due for review.
    :param gh_repo: The GH repository ('owner/name') where the new issue is created.
    :param gh_token: GitHub token to authorize
    :param sop_file: The SOP file being reviewed.
    :param last_edit_date: The last review date of the SOP.
    :param days_review: The amount of days set as threshold for an SOP to have to go through review
    :return: The HTML URL of the GitHub issue if it is created successfully, otherwise None.
    :raises ValueError: If no SOP ID can be extracted from the filename.
    """
    url = f"https://api.github.com/repos/{gh_repo}/issues"
    headers = {"Authorization": f"token {gh_token}"}

    # Payload for the GitHub issue
    sop_name = os.path.basename(sop_file)
    # We extract the SOP ID from the filename (e.g., "GDI-SOP0003")
    sop_id_match = re.match(sop_id_regex, sop_name)
    if sop_id_match is None:
        raise ValueError(f"No SOP ID could be extracted from the filename '{sop_name}' (file '{sop_file}').")
    sop_id = sop_id_match.group(1)

    # Issue body with proper markdown formatting
    issue_body = (
        f"## Summary\n"
        f"The SOP **'{sop_id}'** is **due for review** and potential revision.\n\n"

        f"## Motivation\n"
        f"Part of the SOP life-cycle is the periodic review process. After **'{days_review}' days** (defined at "
        f"`.github/workflows/review_reminder.yml`) since the last entry in the Document History, **every SOP has to be formally reviewed**, "
        f"to make sure that the SOP is still relevant and up to date. Find more information inside the **[Charter](https://github.com/GenomicDataInfrastructure/standard-operating-procedures/blob/main/docs/GDI-SOP_charter.md)** "
        f"and **[ISM](https://github.com/GenomicDataInfrastructure/standard-operating-procedures/blob/main/docs/GDI-SOP_information-service-management.md)** documents.\n\n"

        f"Based on the information in ``{sop_name}``, it was last reviewed/edited on ``{last_edit_date.date()}``. "
        f"This falls beyond the period of '{days_review}' days ago set as threshold for SOPs to be reviewed.\n\n"

        f"## Required action\n"
        f"- **Review document `{sop_name}`**. This includes, but not limited to: appointing relevant reviewers within the GDI network, "
        f"bringing the SOP up for discussion within GDI, reviewing that the SOP is still relevant, reviewing that the SOP complies with the styling guide...\n"
        f"- **Modify the SOP document based on the review**. For any content modification of the SOP, follow a similar approach as the one described in "
        f"**[GDI-SOP0007](https://github.com/GenomicDataInfrastructure/standard-operating-procedures/blob/main/sops/european-level/GDI-SOP0007_SOP-template-creation.md)**.\n\n"

        f"## Reminders:\n"
        f"- Remember to **link this GH issue with the respective PR** (i.e., copy-paste the PR URL as a comment below).\n"
        f"- Remember to **add the review/revision row to the Document History**. Even if no change was required, once the review is finished, add the proper row to the Document History of the document. "
        f"This will aid with the automatic review detection and help the maintainers know which SOPs were reviewed and by whom.\n\n"

        f"## Disclaimer\n"
        f"This GitHub issue was created automatically through the execution of `scripts/check_sop_reviews.py`, likely triggered through the GitHub workflow `.github/workflows/review_reminder.yml`.\n\n"

        f"- To stop this behaviour, remove the automatic trigger (i.e., delete 'schedule' from the workflow file). **Only do so if you are sure** that this automatic review trigger is wrong.\n"
        f"- If this automatic trigger was a fluke, it may be due to the Document History of this SOP being malformed (e.g., recent entries being added at the bottom) "
        f"or other `SOP-review` GitHub issues not being named correctly (e.g., the SOP ID should appear in the title).\n"
    )

    issue = {
        # The filename carries the SOP ID, which is what the existing-issue check looks for in the title
        "title": f"[SOP Review] Review due: '{sop_name}'",
        "body": issue_body,
        "labels": ["SOP-Review"]
    }

    # Without a timeout 'requests' waits indefinitely, which could leave the scheduled job hanging
    response = requests.post(url, json=issue, headers=headers, timeout=30)

    if response.status_code == 201:
        issue_data = response.json()
        return issue_data['html_url']

    print(f"Failed to create issue for '{sop_file}': {response.status_code}, {response.text}")
    return None
|
||
def process_sop_file(sop_file: str, args, all_issues: List[Dict]) -> Dict:
    """
    Processes a single SOP file, checks if it is due for review, and creates GitHub issues if necessary.
    :param sop_file: Path to the SOP file being processed.
    :param args: Parsed command-line arguments ('days_review', 'verbosity', 'create_issues' and 'repository' are read).
    :param all_issues: List of all open GitHub issues for comparison.
    :return: A dictionary with the status report for the processed SOP file. 'existing_gh_issue' and
        'new_gh_issue' hold the issue URL, or an empty string when there is none.
    """
    individual_report = {
        "filepath": sop_file,
        "last_edit_date": "",
        "due_review": False,
        "existing_gh_issue": "",
        "new_gh_issue": ""
    }

    # Explicit encoding so that reading does not depend on the locale of the machine running the script
    with open(sop_file, 'r', encoding='utf-8') as f:
        sop_content = f.read()

    html_content = markdown.markdown(sop_content, extensions=['tables'])
    soup = BeautifulSoup(html_content, 'html.parser')
    last_edit_date = get_last_edit_date(soup, sop_file)
    individual_report["last_edit_date"] = str(last_edit_date)

    # Nothing else to do if the SOP is not due for review yet
    if not last_edit_date or (datetime.now() - last_edit_date).days <= args.days_review:
        return individual_report

    individual_report["due_review"] = True
    existing_issue_url = check_existing_github_issues(all_issues, sop_file, args.verbosity)

    if existing_issue_url:
        individual_report["existing_gh_issue"] = existing_issue_url
    elif args.create_issues:
        issue_url = create_github_issue(
            gh_repo=args.repository, gh_token=gh_token, sop_file=sop_file,
            last_edit_date=last_edit_date, days_review=args.days_review
        )
        # A failed creation returns None: keep the empty-string placeholder so the report stays uniform
        individual_report["new_gh_issue"] = issue_url or ""

    return individual_report
|
||
|
||
def generate_report(sop_files: List[str], args, all_issues: List[Dict]) -> Dict:
    """
    Runs the review check over every given SOP file and gathers the results in one report.
    :param sop_files: List of all SOP files to be processed.
    :param args: Parsed command-line arguments.
    :param all_issues: List of all open GitHub issues for comparison.
    :return: A dictionary containing the final report (global counters plus one entry per file).
    """
    summary = {
        "n_input_files": len(sop_files),
        "n_files_due_review": 0,
        "date_now": str(datetime.now()),
        "n_days_threshold": args.days_review,
        "n_created_gh_issues": 0,
        "all_files": []
    }
    per_file = summary["all_files"]

    for path in sop_files:
        if args.verbosity > 1:
            print(f"- Checking input file '{path}'")
        per_file.append(process_sop_file(path, args, all_issues))

    # Counters are derived from the individual reports once all files were processed
    summary["n_files_due_review"] = sum(1 for entry in per_file if entry["due_review"])
    summary["n_created_gh_issues"] = sum(1 for entry in per_file if entry["new_gh_issue"])

    return summary
|
||
|
||
def main():
    """
    Entry point: checks which SOPs are due for review and, if requested, opens GitHub issues for them.
    """
    cli_args = parse_args()
    if not gh_token:
        raise EnvironmentError("GitHub token not found. Please set the 'GITHUB_TOKEN' environment variable.")

    # Collect all SOP files from the given inputs
    files_to_check = collect_sop_files(cli_args.inputs)

    # Parameters handed to 'get_gh_issues' to select the GH issues to compare against
    issue_filter = {"state": "open", "labels": "SOP-Review"}
    fetched_issues = get_gh_issues(
        gh_repo=cli_args.repository, gh_token=gh_token, issue_params=issue_filter
    )

    # Build the final report, checking issues and creating new ones if needed
    final_report = generate_report(files_to_check, cli_args, fetched_issues)

    if cli_args.verbosity > 0:
        print(json.dumps(final_report, indent=2), "\n")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters