bcgov · thegentlemanphysicist · May 22, 2024 · May 16, 2024 · May 16, 2024 · May 22, 2024
diff --git a/docs/environment-variables.md b/docs/environment-variables.md
@@ -38,6 +38,12 @@ The CSS app should be put in maintenance mode when the production environent is
  - CSS_GH_TOKEN:  The personal access token for deploying the CSS app.
  - CSS_BRANCH: `dev` for sandbox-dev agent, `main` for production-prod agent.
 
+## Uptime.com Status Page Integration
+
+The switchover agent can create and close incidents on Uptime.com status pages (the production page is located at [https://status.loginproxy.gov.bc.ca/](https://status.loginproxy.gov.bc.ca/)).  To enable this, two environment variables must be configured:
+
+- UPTIME_STATUS_PAGE_ID: This integer can be found in the non-vanity URL of the status page hosted by uptime.com.
+- UPTIME_STATUS_TOKEN: The credential used for the uptime.com API.  It can be found [here](https://uptime.com/api/tokens).
 ## Preemptive Failover
 
 There are 4 optional environment variables if you need to schedule a failover/failback over night.  These are:

diff --git a/src/config.py b/src/config.py
@@ -27,4 +27,7 @@
     preemptive_failover_end_time=os.environ.get("PREEMPTIVE_FAILOVER_END_TIME", ""),
     preemptive_failover_workflow_id=os.environ.get("PREEMPTIVE_WORKFLOW_ID", "preemptive-failover.yml"),
     enable_gold_route_workflow_id=os.environ.get("ENABLE_GOLD_ROUTE_WORKFLOW_ID", "turn-off-gold-routing.yml"),
+    uptime_status_api="https://uptime.com/api/v1/statuspages/",
+    uptime_status_page_id=os.environ.get("UPTIME_STATUS_PAGE_ID", ""),
+    uptime_status_token=os.environ.get("UPTIME_STATUS_TOKEN", "")
 )
diff --git a/src/logic.py b/src/logic.py
@@ -3,6 +3,7 @@
 import traceback
 import requests
 import json
+import datetime
 from clients.dns import check_dns_by_env
 
 from multiprocessing import Queue
@@ -17,6 +18,8 @@
 css_maintenance_workflow_id = config.get('css_maintenance_workflow_id')
 css_gh_token = config.get('css_gh_token')
 
+incident_id = None  # id of the Uptime incident opened by dispatch_uptime_incident; None when none is tracked
+
 
 def handle_queues(queue: Queue, processes: list):
     previous_valid_ip = 'undefined'
@@ -69,29 +72,34 @@ def action_dispatcher(ip: str, prev_ip: str, active_ip: str, passive_ip: str):
         if css_maintenance_to_active:
             logger.info("active_ip")
             dispatch_css_maintenance_action(False)
+            dispatch_uptime_incident(False)
         else:
             logger.info("Failed to turn off the css maintenance mode")
     elif (ip == passive_ip and prev_ip == active_ip):
         logger.info("passive_ip")
         dispatch_action_by_id(config.get('gh_workflow_id'))
         dispatch_css_maintenance_action(True)
+        dispatch_uptime_incident(True)
 
 
 # This runs a github action in the sso-switchover agent repos currently works with actions with 3 three required inputs
 # gh_branch (usually dev or main)
 # project (SANDBOX or PRODUCTION)
 # environment (dev, test, prod)
 def dispatch_action_by_id(workflow_id: str):
-    environment = config.get('namespace')[7:]
-    url = 'https://api.github.com/repos/%s/%s/actions/workflows/%s/dispatches' % (config.get('gh_owner'), config.get('gh_repo'), workflow_id)
-    data = {'ref': config.get('gh_branch'), 'inputs': {'project': config.get('project'), 'environment': environment}}
-    bearer = 'token %s' % config.get('gh_token')
-    headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': bearer}
-    x = requests.post(url, json=data, headers=headers)
-    if x.status_code == 204:
-        logger.info('GH API status: %s' % x.status_code)
-    else:
-        logger.error('GH API error: %s' % x.content)
+    try:  # any exception raised below is logged instead of propagating to the caller
+        environment = config.get('namespace')[7:]  # namespace with its first 7 characters dropped
+        url = 'https://api.github.com/repos/%s/%s/actions/workflows/%s/dispatches' % (config.get('gh_owner'), config.get('gh_repo'), workflow_id)
+        data = {'ref': config.get('gh_branch'), 'inputs': {'project': config.get('project'), 'environment': environment}}
+        bearer = 'token %s' % config.get('gh_token')
+        headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': bearer}
+        x = requests.post(url, json=data, headers=headers)
+        if x.status_code == 204:  # 204 is the only status treated as a successful dispatch
+            logger.info('GH API status: %s' % x.status_code)
+        else:
+            logger.error('GH API error: %s' % x.content)
+    except Exception as ex:
+        logger.error('The dispatch action failed. %s' % ex)
 
 
 def dispatch_rocketchat_webhook(maintenance_mode: str):
@@ -182,3 +190,71 @@ def dispatch_css_maintenance_action(maintenance_mode: bool):
             logger.info('CSS GH API status: %s' % x.status_code)
         else:
             logger.error('GH API error: %s' % x.content)
+
+
+def dispatch_uptime_incident(enable_incident: bool):
+    global incident_id
+
+    url = config.get('uptime_status_api')
+    uptime_token = config.get('uptime_status_token')
+    uptime_statuspage_id = config.get('uptime_status_page_id')
+    namespace = config.get('namespace')
+    env = namespace[7:]
+    if uptime_token == "" or uptime_statuspage_id == "":
+        logger.error('The uptime status page incident creation/closure has not been configured')
+        return
+
+    incident_url = f"{url}{uptime_statuspage_id}/incidents/"
+    bearer = 'token %s' % uptime_token
+    headers = {'Authorization': bearer}
+
+    body = {
+        "name": f"[{env}] Keycloak is in Disaster Recovery mode",
+        "include_in_global_metrics": False,
+        "updates": [
+            {
+                "id": 0,
+                "description": f"The switchover to GoldDR was triggered for the {env} environment.",
+                "incident_state": "investigating"
+            }
+        ],
+        "incident_type": "INCIDENT",
+        "update_component_status": True,
+        "notify_subscribers": True
+    }
+
+    if enable_incident:
+        if incident_id is None:
+            try:
+                x = requests.post(incident_url, json=body, headers=headers)
+                if x.status_code == 200:
+                    incident_id = x.json()["results"]["pk"]
+                    logger.info(f"Uptime incident {incident_id} was created.")
+                else:
+                    logger.error(f"Uptime incident creation returned code {x.status_code}")
+                    incident_id = None
+            except BaseException:
+                logger.error("Uptime incident creation failed.")
+                incident_id = None
+        else:
+            logger.error("An Uptime incident has already been created for this environment")
+    else:
+        # close the incident
+        try:
+            close_body = body
+            close_body["ends_at"] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+            close_body["incident_state"] = "resolved"
+            close_body["updates"] = [{
+                "id": 0,
+                "description": f"The {env} environement is no longer in disaster recovery mode.",
+                "incident_state": "resolved"
+            }]
+            x = requests.patch(f"{incident_url}{incident_id}", json=body, headers=headers)
+            if x.status_code == 200:
+                logger.info(f"Uptime incident number {incident_id}, has been closed.")
+            else:
+                logger.error(f"Uptime incident {incident_id} failed to close.")
+        except BaseException:
+            logger.error(f"Uptime incident number {incident_id}, failed to close.")
+
+        incident_id = None
diff --git a/transition-scripts/helpers/patroni.sh b/transition-scripts/helpers/patroni.sh
@@ -255,7 +255,7 @@ wait_for_patroni_xlog_close() {
   count=0
   wait_ready() {
     synch_status=$(patroni_xlog_diffrence "$namespace")
-    max_xlog_lag=100000
+    max_xlog_lag=150000
 
     if [ "$synch_status" == "synced" ]; then
       info "patroni xlog in $namespace is $synch_status"