Skip to content

Commit

Permalink
fix: auto-create data drift issue (fix #5)
Browse files Browse the repository at this point in the history
  • Loading branch information
ijdoc authored Oct 9, 2024
1 parent 8e8babb commit 2db0e98
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/data_drift_retrain.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ on:
workflows: ["Production Data Batching"]
types:
- completed
push:
branches: ijdoc/issue5

jobs:
drift_check:
Expand Down
2 changes: 1 addition & 1 deletion batch/batch_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def main(args):
)
artifact = run.log_artifact(artifact).wait()
print(
f"New `{batch_type}` batch logged as `{artifact.source_name}`.\n"
f"Executed [run]({run.url}) to log new `{batch_type}` batch as `{artifact.source_name}`.\n"
f"- Iteration: {iteration}\n"
f"- Iteration Stride: {stride_days} day(s)\n"
f"- Total length: {history_days} day(s)"
Expand Down
21 changes: 10 additions & 11 deletions drift/check_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,25 +73,24 @@
artifact = run.log_artifact(artifact).wait()
# Open a github issue asking for manual review
issue_title = f"Data drift detected on {train_artifact.source_name}"
drifted_features = ", ".join(
[feature for feature, drift in drift_results.items() if drift]
)
issue_body = (
f"Drift has been detected in the following features: {drifted_features}.\n\n"
f"Please review the [logged artifact](https://wandb.ai//{run.entity}/{run.project}/artifacts/{artifact.type}/{artifact.source_name}) "
f"and the [drift report]({report_url}) to determine if the training data should be updated.\n\n"
f"If approved, link the [logged artifact](https://wandb.ai//{run.entity}/{run.project}/artifacts/{artifact.type}/{artifact.source_name}) "
f"to the training Registry (`jdoc-org/wandb-registry-dataset/training`), otherwise, close this issue."
f"Data drift has been detected when comparing the registered training dataset with recent production data.\n\n"
f"Please review the [candidate artifact](https://wandb.ai/{run.entity}/{run.project}/artifacts/{artifact.type}/{artifact.source_name}) "
f"and the [drift report]({report_url}) to determine if the registered training data should be updated.\n\n"
f"To approve the new candidate after review, link it to [the training Dataset Registry](https://wandb.ai/registry/dataset?selectionPath=jdoc-org%2Fwandb-registry-dataset%2Ftraining&view=versions) at "
f"(`jdoc-org/wandb-registry-dataset/training`), otherwise close this issue."
)
issue_url = open_github_issue(issue_title, issue_body, labels=["drift", "data"])
print(
f"Production batch `{prod_artifact.source_name}` was logged "
f"as candidate training data `{artifact.source_name}`. "
f"An [issue]({issue_url}) was created for manual review. "
f"Production batch `{prod_artifact.source_name}` has been logged "
f"as candidate to replace training data `{artifact.source_name}`. "
f"An [issue]({issue_url}) was created for manual review:\n"
)
print(f"- [Data Drift Issue]({issue_url})")
else:
print("> No drift detected.\n")

print(f"- [W&B Run]({run.url})")
print(f"- [Full data drift report]({report_url})")

# Optionally output the drift detection result in a parseable format.
Expand Down
10 changes: 5 additions & 5 deletions drift/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,19 +174,19 @@ def open_github_issue(issue_title, issue_body, labels=None):
url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/issues"

headers = {
"Authorization": f"token {token}",
"Accept": "application/vnd.github.v3+json",
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}

data = {"title": issue_title, "body": issue_body, "labels": labels}

response = requests.post(url, headers=headers, json=data)
if response.status_code == 201:
issue_url = response.json()["html_url"]
print(f"GitHub issue created: {issue_url}")
return issue_url
else:
print(f"Failed to create GitHub issue: {response.content}")
raise RuntimeError(f"Failed to create GitHub issue: {response.content}")
return None


Expand All @@ -208,7 +208,7 @@ def get_github_repo_info():
pattern = r"git@github.com:(.+)/(.+)\.git"
elif remote_url.startswith("https://"):
# HTTPS URL
pattern = r"https://github.com/(.+)/(.+)\.git"
pattern = r"https://github.com/(.+)/(.+)"
else:
# Other formats
return None, None
Expand Down

0 comments on commit 2db0e98

Please sign in to comment.