From 5211cc66dda809886e9b812f85b83d5e3584a590 Mon Sep 17 00:00:00 2001 From: Andrew Erickson Date: Mon, 21 Oct 2024 23:03:48 -0500 Subject: [PATCH 1/4] wip --- worker_health/worker_health/fitness.py | 52 +++++++++++++++++--------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/worker_health/worker_health/fitness.py b/worker_health/worker_health/fitness.py index d0e5bda3..b421d2e3 100644 --- a/worker_health/worker_health/fitness.py +++ b/worker_health/worker_health/fitness.py @@ -26,6 +26,10 @@ def get_r8_inventory_path(): return f"/Users/{getpass.getuser()}/git/ronin_puppet/inventory.d/macmini-r8.yaml" +class NoDataForHostException(Exception): + pass + + class Fitness: def __init__( self, @@ -81,6 +85,32 @@ def format_workertype_fitness_report_result(self, res): return_string += self.sr_dict_format(res) return return_string + # TODO: take provisioner? + def fitness_report_single_host(self, worker_type, worker_id): + self.get_pending_tasks_multi([worker_type]) + url = ( + f"{self.tc_url_root}/provisioners/{self.provisioner}/worker-types/{worker_type}/workers?limit=5" + # "https://queue.taskcluster.net/v1/provisioners/%s/worker-types/%s/workers?limit=5" + ) + # print(url) + worker_group_result = utils.get_jsonc(url, self.verbosity) + # worker_group = worker_group_result['workerTypes'][0][] + # import pprint + # pprint.pprint(worker_group_result) + # sys.exit() + if len(worker_group_result["workers"]) == 0: + print("%s.%s: %s" % (worker_type, worker_id, "no data")) + return + worker_group = worker_group_result["workers"][0]["workerGroup"] + self.quarantine_data[worker_type] = self.quarantine.get_quarantined_workers(self.provisioner, worker_type) + try: + _worker, result_object, _e = self.device_fitness_report(worker_type, worker_group, worker_id) + except NoDataForHostException as e: + print(f"{worker_type}.{worker_id}: {e}") + sys.exit(1) + result_object["worker_id"] = worker_id + return result_object + def main(self, provisioner, worker_type, 
worker_id): # TODO: show when worker last started a task (taskStarted in TC) # - aws metal nodes has quarantined nodes that have been deleted that never drop off from worker-data @@ -96,24 +126,7 @@ def main(self, provisioner, worker_type, worker_id): # - can't code in else below be smarter about queries (so we don't need this)? if worker_type and worker_id: worker_count = 1 - self.get_pending_tasks_multi([worker_type]) - url = ( - f"{self.tc_url_root}/provisioners/{self.provisioner}/worker-types/{worker_type}/workers?limit=5" - # "https://queue.taskcluster.net/v1/provisioners/%s/worker-types/%s/workers?limit=5" - ) - # print(url) - worker_group_result = utils.get_jsonc(url, self.verbosity) - # worker_group = worker_group_result['workerTypes'][0][] - # import pprint - # pprint.pprint(worker_group_result) - # sys.exit() - if len(worker_group_result["workers"]) == 0: - print("%s.%s: %s" % (worker_type, worker_id, "no data")) - return - worker_group = worker_group_result["workers"][0]["workerGroup"] - self.quarantine_data[worker_type] = self.quarantine.get_quarantined_workers(self.provisioner, worker_type) - _worker, res_obj, _e = self.device_fitness_report(worker_type, worker_group, worker_id) - res_obj["worker_id"] = worker_id + res_obj = self.fitness_report_single_host(worker_type, worker_id) sr_total += res_obj["sr"] print("%s.%s" % (worker_type, self.format_workertype_fitness_report_result(res_obj))) else: @@ -448,6 +461,9 @@ def device_fitness_report(self, queue, worker_group, device): task_last_started_timestamp = None task_history_success_array = [] # 1 for success, 0 for failure or exception + if "recentTasks" not in results: + raise NoDataForHostException(f"no results['recentTasks'] for {queue}.{device}") + task_ids = [] for task in results["recentTasks"]: task_id = task["taskId"] From 96cdc65d4bc8ac97cbcbb0da732806916295c797 Mon Sep 17 00:00:00 2001 From: Andrew Erickson Date: Mon, 21 Oct 2024 23:09:33 -0500 Subject: [PATCH 2/4] main: don't take provisioner 
as arg --- worker_health/fitness_check.py | 8 +++----- worker_health/worker_health/fitness.py | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/worker_health/fitness_check.py b/worker_health/fitness_check.py index c582582f..a6c97cd9 100755 --- a/worker_health/fitness_check.py +++ b/worker_health/fitness_check.py @@ -30,16 +30,14 @@ "--alert-percent", default=fitness.ALERT_PERCENT, type=float, - help="percentage of successful jobs to alert at. 0 to 1, defaults to %s." - % fitness.ALERT_PERCENT, + help="percentage of successful jobs to alert at. 0 to 1, defaults to %s." % fitness.ALERT_PERCENT, ) parser.add_argument( "-t", "--alert-time", default=fitness.ALERT_TIME, type=int, - help="alert if a worker hasn't worked in this many minutes, defaults to %s." - % fitness.ALERT_TIME, + help="alert if a worker hasn't worked in this many minutes, defaults to %s." % fitness.ALERT_TIME, ) parser.add_argument( "-o", @@ -114,4 +112,4 @@ ) # TODO: just pass args? f.args = args - f.main(args.provisioner, arg_worker_type, arg_worker_id) + f.main(arg_worker_type, arg_worker_id) diff --git a/worker_health/worker_health/fitness.py b/worker_health/worker_health/fitness.py index b421d2e3..6f41a32c 100644 --- a/worker_health/worker_health/fitness.py +++ b/worker_health/worker_health/fitness.py @@ -111,7 +111,7 @@ def fitness_report_single_host(self, worker_type, worker_id): result_object["worker_id"] = worker_id return result_object - def main(self, provisioner, worker_type, worker_id): + def main(self, worker_type, worker_id): # TODO: show when worker last started a task (taskStarted in TC) # - aws metal nodes has quarantined nodes that have been deleted that never drop off from worker-data @@ -135,7 +135,7 @@ def main(self, provisioner, worker_type, worker_id): worker_types = [worker_type] # provisioner mode else: - worker_types_result = self.get_worker_types(provisioner) + worker_types_result = self.get_worker_types(self.provisioner) worker_types = [] if 
"workerTypes" in worker_types_result: for provisioner in worker_types_result["workerTypes"]: From a315c6ca69308c927477dba93ba7e65104deff15 Mon Sep 17 00:00:00 2001 From: Andrew Erickson Date: Mon, 21 Oct 2024 23:09:49 -0500 Subject: [PATCH 3/4] avoid var shadowing --- worker_health/worker_health/fitness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/worker_health/worker_health/fitness.py b/worker_health/worker_health/fitness.py index 6f41a32c..3519d7fd 100644 --- a/worker_health/worker_health/fitness.py +++ b/worker_health/worker_health/fitness.py @@ -138,8 +138,8 @@ def main(self, worker_type, worker_id): worker_types_result = self.get_worker_types(self.provisioner) worker_types = [] if "workerTypes" in worker_types_result: - for provisioner in worker_types_result["workerTypes"]: - worker_type = provisioner["workerType"] + for prov in worker_types_result["workerTypes"]: + worker_type = prov["workerType"] worker_types.append(worker_type) # print(worker_types) else: From 22f8d41d57e975ec8b76e3d05198448c1c6fac06 Mon Sep 17 00:00:00 2001 From: Andrew Erickson Date: Mon, 21 Oct 2024 23:45:59 -0500 Subject: [PATCH 4/4] add tool placeholder with outline --- worker_health/quarantine_eval.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 worker_health/quarantine_eval.py diff --git a/worker_health/quarantine_eval.py b/worker_health/quarantine_eval.py new file mode 100644 index 00000000..b175b0b0 --- /dev/null +++ b/worker_health/quarantine_eval.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Idea: We have X hosts in quarantine. Let them run one job and then see if +# they're better, if not quarantine them again. +# +# - Detects if there are pending jobs and only runs then? +# - Maybe always put them back in quarantine after one job and present a report? 
+ +# v0: single host +# - take single host as input +# - check that there are jobs to run +# - record current state of host +# - remove from quarantine +# - wait for job to start +# - place back in quarantine (done immediately so only single job runs) +# - loop until the job is done +# - record current state of host +# - record success/failure for host + +# v1: multiple hosts +# step 1: take list of quarantined hosts +# step 2: repeat v0 over all hosts +# step 3: show report + +# v2: advanced +# step 1: gather quarantined hosts