Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: deploy uptime incident to production switchover #176

Merged
merged 3 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ The CSS app should be put in maintenance mode when the production environent is
- CSS_GH_TOKEN: The personal access token for deploying the CSS app.
- CSS_BRANCH: `dev` for sandbox-dev agent, `main` for production-prod agent.

## Uptime.com Status Page Integration

The switchover agent can create and close incidents on uptime.com status pages (production is located at [https://status.loginproxy.gov.bc.ca/](https://status.loginproxy.gov.bc.ca/)). To do this, two environment variables must be configured:

- UPTIME_STATUS_PAGE_ID: This integer can be found in the non-vanity URL of the status page hosted by uptime.com.
- UPTIME_STATUS_TOKEN: The credential used for the uptime.com API. It can be found [here](https://uptime.com/api/tokens).
## Preemptive Failover

There are 4 optional environment variables if you need to schedule a failover/failback over night. These are:
Expand Down
3 changes: 3 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,7 @@
preemptive_failover_end_time=os.environ.get("PREEMPTIVE_FAILOVER_END_TIME", ""),
preemptive_failover_workflow_id=os.environ.get("PREEMPTIVE_WORKFLOW_ID", "preemptive-failover.yml"),
enable_gold_route_workflow_id=os.environ.get("ENABLE_GOLD_ROUTE_WORKFLOW_ID", "turn-off-gold-routing.yml"),
uptime_status_api="https://uptime.com/api/v1/statuspages/",
uptime_status_page_id=os.environ.get("UPTIME_STATUS_PAGE_ID", ""),
uptime_status_token=os.environ.get("UPTIME_STATUS_TOKEN", "")
)
96 changes: 86 additions & 10 deletions src/logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import traceback
import requests
import json
import datetime
from clients.dns import check_dns_by_env

from multiprocessing import Queue
Expand All @@ -17,6 +18,8 @@
css_maintenance_workflow_id = config.get('css_maintenance_workflow_id')
css_gh_token = config.get('css_gh_token')

incident_id = None


def handle_queues(queue: Queue, processes: list):
previous_valid_ip = 'undefined'
Expand Down Expand Up @@ -69,29 +72,34 @@ def action_dispatcher(ip: str, prev_ip: str, active_ip: str, passive_ip: str):
if css_maintenance_to_active:
logger.info("active_ip")
dispatch_css_maintenance_action(False)
dispatch_uptime_incident(False)
else:
logger.info("Failed to turn off the css maintenance mode")
elif (ip == passive_ip and prev_ip == active_ip):
logger.info("passive_ip")
dispatch_action_by_id(config.get('gh_workflow_id'))
dispatch_css_maintenance_action(True)
dispatch_uptime_incident(True)


# This runs a GitHub action in the sso-switchover-agent repo. It currently works with actions that have three required inputs:
# gh_branch (usually dev or main)
# project (SANDBOX or PRODUCTION)
# environment (dev, test, prod)
def dispatch_action_by_id(workflow_id: str):
environment = config.get('namespace')[7:]
url = 'https://api.github.com/repos/%s/%s/actions/workflows/%s/dispatches' % (config.get('gh_owner'), config.get('gh_repo'), workflow_id)
data = {'ref': config.get('gh_branch'), 'inputs': {'project': config.get('project'), 'environment': environment}}
bearer = 'token %s' % config.get('gh_token')
headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': bearer}
x = requests.post(url, json=data, headers=headers)
if x.status_code == 204:
logger.info('GH API status: %s' % x.status_code)
else:
logger.error('GH API error: %s' % x.content)
try:
environment = config.get('namespace')[7:]
url = 'https://api.github.com/repos/%s/%s/actions/workflows/%s/dispatches' % (config.get('gh_owner'), config.get('gh_repo'), workflow_id)
data = {'ref': config.get('gh_branch'), 'inputs': {'project': config.get('project'), 'environment': environment}}
bearer = 'token %s' % config.get('gh_token')
headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': bearer}
x = requests.post(url, json=data, headers=headers)
if x.status_code == 204:
logger.info('GH API status: %s' % x.status_code)
else:
logger.error('GH API error: %s' % x.content)
except Exception as ex:
logger.error('The dispatch action failed. %s' % ex)


def dispatch_rocketchat_webhook(maintenance_mode: str):
Expand Down Expand Up @@ -182,3 +190,71 @@ def dispatch_css_maintenance_action(maintenance_mode: bool):
logger.info('CSS GH API status: %s' % x.status_code)
else:
logger.error('GH API error: %s' % x.content)


def dispatch_uptime_incident(enable_incident: bool):
    """Create or resolve the uptime.com status-page incident for this environment.

    The id of the incident opened by this process is kept in the module-level
    ``incident_id`` so that a later call can resolve the same incident.

    Args:
        enable_incident: ``True`` to open a new incident, ``False`` to resolve
            the incident previously opened by this process.

    Returns:
        None. Every outcome is reported through ``logger``; request failures
        are logged rather than raised so the caller's switchover flow continues.
    """
    global incident_id

    url = config.get('uptime_status_api')
    uptime_token = config.get('uptime_status_token')
    uptime_statuspage_id = config.get('uptime_status_page_id')
    namespace = config.get('namespace')
    # The environment label is the namespace with its first seven characters removed.
    env = namespace[7:]
    # Falsy check so a missing (None) value is treated like an empty string.
    if not uptime_token or not uptime_statuspage_id:
        logger.error('The uptime status page incident creation/closure has not been configured')
        return

    incident_url = f"{url}{uptime_statuspage_id}/incidents/"
    headers = {'Authorization': 'token %s' % uptime_token}
    # Without a timeout a stalled API call would block the agent indefinitely.
    request_timeout = 30

    body = {
        "name": f"[{env}] Keycloak is in Disaster Recovery mode",
        "include_in_global_metrics": False,
        "updates": [
            {
                "id": 0,
                "description": f"The switchover to GoldDR was triggered for the {env} environment.",
                "incident_state": "investigating"
            }
        ],
        "incident_type": "INCIDENT",
        "update_component_status": True,
        "notify_subscribers": True
    }

    if enable_incident:
        if incident_id is not None:
            logger.error("An Uptime incident has already been created for this environment")
            return
        try:
            x = requests.post(incident_url, json=body, headers=headers, timeout=request_timeout)
            if x.status_code == 200:
                incident_id = x.json()["results"]["pk"]
                logger.info(f"Uptime incident {incident_id} was created.")
            else:
                logger.error(f"Uptime incident creation returned code {x.status_code}")
                incident_id = None
        except Exception as ex:
            # Covers network errors as well as an unexpected response shape.
            logger.error(f"Uptime incident creation failed. {ex}")
            incident_id = None
        return

    # Close the incident.
    if incident_id is None:
        # Nothing was recorded by this process (e.g. creation failed or the
        # agent restarted); avoid sending a PATCH to ".../incidents/None".
        logger.error("There is no recorded Uptime incident to close for this environment")
        return

    try:
        # Build a separate payload instead of mutating the creation body.
        close_body = {
            **body,
            "ends_at": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
            "incident_state": "resolved",
            "updates": [{
                "id": 0,
                "description": f"The {env} environement is no longer in disaster recovery mode.",
                "incident_state": "resolved"
            }],
        }
        x = requests.patch(f"{incident_url}{incident_id}", json=close_body, headers=headers, timeout=request_timeout)
        if x.status_code == 200:
            logger.info(f"Uptime incident number {incident_id}, has been closed.")
        else:
            logger.error(f"Uptime incident {incident_id} failed to close.")
    except Exception as ex:
        logger.error(f"Uptime incident number {incident_id}, failed to close. {ex}")

    # As in the original, the stored id is dropped after a close attempt
    # regardless of outcome, so a later failover can open a fresh incident.
    incident_id = None
2 changes: 1 addition & 1 deletion transition-scripts/helpers/patroni.sh
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ wait_for_patroni_xlog_close() {
count=0
wait_ready() {
synch_status=$(patroni_xlog_diffrence "$namespace")
max_xlog_lag=100000
max_xlog_lag=150000

if [ "$synch_status" == "synced" ]; then
info "patroni xlog in $namespace is $synch_status"
Expand Down
Loading