Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactoring and quarantined host eval tool #133

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions worker_health/fitness_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,14 @@
"--alert-percent",
default=fitness.ALERT_PERCENT,
type=float,
help="percentage of successful jobs to alert at. 0 to 1, defaults to %s."
% fitness.ALERT_PERCENT,
help="percentage of successful jobs to alert at. 0 to 1, defaults to %s." % fitness.ALERT_PERCENT,
)
parser.add_argument(
"-t",
"--alert-time",
default=fitness.ALERT_TIME,
type=int,
help="alert if a worker hasn't worked in this many minutes, defaults to %s."
% fitness.ALERT_TIME,
help="alert if a worker hasn't worked in this many minutes, defaults to %s." % fitness.ALERT_TIME,
)
parser.add_argument(
"-o",
Expand Down Expand Up @@ -114,4 +112,4 @@
)
# TODO: just pass args?
f.args = args
f.main(args.provisioner, arg_worker_type, arg_worker_id)
f.main(arg_worker_type, arg_worker_id)
26 changes: 26 additions & 0 deletions worker_health/quarantine_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env python3

# Idea: We have X hosts in quarantine. Let them run one job and then see if
# they're better, if not quarantine them again.
#
# - Detects if there are pending jobs and only runs then?
# - Maybe always put them back in quarantine after one job and present a report?

# v0: single host
# - take single host as input
# - check that there are jobs to run
# - record current state of host
# - remove from quarantine
# - wait for job to start
# - place back in quarantine (done immediately so only single job runs)
# - loop until the job is done
# - record current state of host
# - record success/failure for host

# v1: multiple hosts
# step 1: take list of quarantined hosts
# step 2: repeat v0 over all hosts
# step 3: show report

# v2: advanced
# step 1: gather quarantined hosts
60 changes: 38 additions & 22 deletions worker_health/worker_health/fitness.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ def get_r8_inventory_path():
return f"/Users/{getpass.getuser()}/git/ronin_puppet/inventory.d/macmini-r8.yaml"


class NoDataForHostException(Exception):
    """Signal that no task data is available for a host.

    Raised by ``device_fitness_report`` when the worker's results carry no
    ``recentTasks`` entry.
    """


class Fitness:
def __init__(
self,
Expand Down Expand Up @@ -81,7 +85,33 @@ def format_workertype_fitness_report_result(self, res):
return_string += self.sr_dict_format(res)
return return_string

def main(self, provisioner, worker_type, worker_id):
# TODO: take provisioner?
def fitness_report_single_host(self, worker_type, worker_id):
    """Produce the fitness result for a single worker of ``worker_type``.

    Steps, in order:
      1. call ``get_pending_tasks_multi`` for the worker type,
      2. fetch the worker listing (``.../workers?limit=5``) and take the
         ``workerGroup`` of the first listed worker,
      3. store the quarantined workers for the type in ``self.quarantine_data``,
      4. delegate to ``device_fitness_report``.

    Returns:
        The result dict from ``device_fitness_report`` with a ``"worker_id"``
        key added, or ``None`` (after printing ``<type>.<id>: no data``) when
        the worker listing is empty.

    Exits the process with status 1 when ``device_fitness_report`` raises
    ``NoDataForHostException``.
    """
    # NOTE(review): the empty-listing path returns None, while the caller
    # visible in main() subscripts the result directly - confirm callers
    # handle None.
    self.get_pending_tasks_multi([worker_type])

    workers_url = (
        f"{self.tc_url_root}/provisioners/{self.provisioner}/worker-types/{worker_type}/workers?limit=5"
    )
    listing = utils.get_jsonc(workers_url, self.verbosity)
    workers = listing["workers"]
    if len(workers) == 0:
        print("%s.%s: %s" % (worker_type, worker_id, "no data"))
        return None

    # The worker group is taken from the first listed worker of this type.
    worker_group = workers[0]["workerGroup"]
    self.quarantine_data[worker_type] = self.quarantine.get_quarantined_workers(self.provisioner, worker_type)

    try:
        _, report, _ = self.device_fitness_report(worker_type, worker_group, worker_id)
    except NoDataForHostException as no_data:
        print(f"{worker_type}.{worker_id}: {no_data}")
        sys.exit(1)
    else:
        report["worker_id"] = worker_id
        return report

def main(self, worker_type, worker_id):
# TODO: show when worker last started a task (taskStarted in TC)
# - aws metal nodes has quarantined nodes that have been deleted that never drop off from worker-data

Expand All @@ -96,24 +126,7 @@ def main(self, provisioner, worker_type, worker_id):
# - can't code in else below be smarter about queries (so we don't need this)?
if worker_type and worker_id:
worker_count = 1
self.get_pending_tasks_multi([worker_type])
url = (
f"{self.tc_url_root}/provisioners/{self.provisioner}/worker-types/{worker_type}/workers?limit=5"
# "https://queue.taskcluster.net/v1/provisioners/%s/worker-types/%s/workers?limit=5"
)
# print(url)
worker_group_result = utils.get_jsonc(url, self.verbosity)
# worker_group = worker_group_result['workerTypes'][0][]
# import pprint
# pprint.pprint(worker_group_result)
# sys.exit()
if len(worker_group_result["workers"]) == 0:
print("%s.%s: %s" % (worker_type, worker_id, "no data"))
return
worker_group = worker_group_result["workers"][0]["workerGroup"]
self.quarantine_data[worker_type] = self.quarantine.get_quarantined_workers(self.provisioner, worker_type)
_worker, res_obj, _e = self.device_fitness_report(worker_type, worker_group, worker_id)
res_obj["worker_id"] = worker_id
res_obj = self.fitness_report_single_host(worker_type, worker_id)
sr_total += res_obj["sr"]
print("%s.%s" % (worker_type, self.format_workertype_fitness_report_result(res_obj)))
else:
Expand All @@ -122,11 +135,11 @@ def main(self, provisioner, worker_type, worker_id):
worker_types = [worker_type]
# provisioner mode
else:
worker_types_result = self.get_worker_types(provisioner)
worker_types_result = self.get_worker_types(self.provisioner)
worker_types = []
if "workerTypes" in worker_types_result:
for provisioner in worker_types_result["workerTypes"]:
worker_type = provisioner["workerType"]
for prov in worker_types_result["workerTypes"]:
worker_type = prov["workerType"]
worker_types.append(worker_type)
# print(worker_types)
else:
Expand Down Expand Up @@ -448,6 +461,9 @@ def device_fitness_report(self, queue, worker_group, device):
task_last_started_timestamp = None
task_history_success_array = [] # 1 for success, 0 for failure or exception

if "recentTasks" not in results:
raise NoDataForHostException(f"no results['recentTasks'] for {queue}.{device}")

task_ids = []
for task in results["recentTasks"]:
task_id = task["taskId"]
Expand Down