diff --git a/discovery-infra/bm_inventory_api.py b/discovery-infra/bm_inventory_api.py
index b2d7352..4062fa1 100644
--- a/discovery-infra/bm_inventory_api.py
+++ b/discovery-infra/bm_inventory_api.py
@@ -89,10 +89,18 @@ def get_hosts_id_with_macs(self, cluster_id):
         hosts = self.get_cluster_hosts(cluster_id)
         hosts_data = {}
         for host in hosts:
-            hw = json.loads(host.hardware_info)
-            hosts_data[host.id] = [nic["mac"] for nic in hw["nics"]]
+            hw = json.loads(host["hardware_info"])
+            hosts_data[host["id"]] = [nic["mac"] for nic in hw["nics"]]
         return hosts_data
 
+    def get_host_by_mac(self, cluster_id, mac):
+        hosts = self.get_cluster_hosts(cluster_id)
+
+        for host in hosts:
+            hw = json.loads(host["hardware_info"])
+            if mac.lower() in [nic["mac"].lower() for nic in hw["nics"]]:
+                return host
+
     def download_and_save_file(self, cluster_id, file_name, file_path):
         log.info("Downloading %s to %s", file_name, file_path)
         response = self.client.download_cluster_files(cluster_id=cluster_id, file_name=file_name,
diff --git a/discovery-infra/install_cluster.py b/discovery-infra/install_cluster.py
index 0375b5f..3d0a791 100755
--- a/discovery-infra/install_cluster.py
+++ b/discovery-infra/install_cluster.py
@@ -31,6 +31,7 @@ def _install_cluster(client, cluster):
 
 def wait_till_installed(client, cluster, timeout=60*60*2):
     log.info("Waiting %s till cluster finished installation", timeout)
+    # TODO: Change host validation for only previous known hosts
     utils.wait_till_all_hosts_are_in_status(client=client, cluster_id=cluster.id,
                                             nodes_count=len(cluster.hosts),
                                             statuses=[consts.NodesStatus.INSTALLED],
diff --git a/discovery-infra/start_discovery.py b/discovery-infra/start_discovery.py
index 3edaee9..eff9a35 100755
--- a/discovery-infra/start_discovery.py
+++ b/discovery-infra/start_discovery.py
@@ -60,15 +60,17 @@ def create_nodes_and_wait_till_registered(inventory_client, cluster, image_path,
                                           master_count, nodes_details):
     nodes_count = master_count + nodes_details["worker_count"]
     create_nodes(image_path, storage_path=storage_path, master_count=master_count, nodes_details=nodes_details)
+
+    # TODO: Check for only new nodes
     utils.wait_till_nodes_are_ready(nodes_count=nodes_count, cluster_name=nodes_details["cluster_name"])
     if not inventory_client:
         log.info("No inventory url, will not wait till nodes registration")
         return
 
     log.info("Wait till nodes will be registered")
-    waiting.wait(lambda: len(inventory_client.get_cluster_hosts(cluster.id)) >= nodes_count,
+    waiting.wait(lambda: utils.are_all_libvirt_nodes_in_cluster_hosts(inventory_client, cluster.id),
                  timeout_seconds=consts.NODES_REGISTERED_TIMEOUT,
-                 sleep_seconds=5, waiting_for="Nodes to be registered in inventory service")
+                 sleep_seconds=10, waiting_for="Nodes to be registered in inventory service")
     log.info("Registered nodes are:")
     pprint.pprint(inventory_client.get_cluster_hosts(cluster.id))
 
@@ -77,13 +79,16 @@ def create_nodes_and_wait_till_registered(inventory_client, cluster, image_path,
 # If master in name -> role will be master, same for worker
 def set_hosts_roles(client, cluster_id):
     hosts = []
-    libvirt_macs = utils.get_libvirt_nodes_mac_role_ip_and_name()
+    libvirt_nodes = utils.get_libvirt_nodes_mac_role_ip_and_name()
     inventory_hosts = client.get_cluster_hosts(cluster_id)
-    assert len(libvirt_macs) == len(inventory_hosts)
-    for host in inventory_hosts:
-        hw = json.loads(host["hardware_info"])
-        role = [libvirt_macs[nic["mac"]]["role"] for nic in hw["nics"] if nic["mac"].lower() in libvirt_macs][0]
-        hosts.append({"id": host["id"], "role": role})
+
+    for libvirt_mac, libvirt_metadata in libvirt_nodes.items():
+        for host in inventory_hosts:
+            hw = json.loads(host["hardware_info"])
+
+            if libvirt_mac.lower() in map(lambda nic: nic["mac"].lower(), hw["nics"]):
+                hosts.append({"id": host["id"], "role": libvirt_metadata["role"]})
+
     if hosts:
         client.set_hosts_roles(cluster_id=cluster_id, hosts_with_roles=hosts)
 
@@ -141,15 +146,22 @@ def nodes_flow(client, cluster_name, cluster):
                                           master_count=args.master_count, nodes_details=nodes_details)
 
     if client:
-        nodes_count = args.master_count + args.number_of_workers
-        utils.wait_till_all_hosts_are_in_status(client=client, cluster_id=cluster.id,
-                                                nodes_count=nodes_count, statuses=[consts.NodesStatus.INSUFFICIENT])
-        set_cluster_vips(client, cluster.id)
+        cluster_info = client.cluster_get(cluster.id)
+        macs = utils.get_libvirt_nodes_macs()
+
+        if not (cluster_info.api_vip and cluster_info.ingress_vip):
+            utils.wait_till_hosts_with_macs_are_in_status(client=client, cluster_id=cluster.id, macs=macs,
+                                                          statuses=[consts.NodesStatus.INSUFFICIENT])
+            set_cluster_vips(client, cluster.id)
+        else:
+            log.info("VIPs already configured")
+
         set_hosts_roles(client, cluster.id)
-        utils.wait_till_all_hosts_are_in_status(client=client, cluster_id=cluster.id,
-                                                nodes_count=nodes_count, statuses=[consts.NodesStatus.KNOWN])
+        utils.wait_till_hosts_with_macs_are_in_status(client=client, cluster_id=cluster.id, macs=macs,
+                                                      statuses=[consts.NodesStatus.KNOWN])
         log.info("Printing after setting roles")
         pprint.pprint(client.get_cluster_hosts(cluster.id))
+
     if args.install_cluster:
         time.sleep(10)
         install_cluster.run_install_flow(client=client, cluster_id=cluster.id,
diff --git a/discovery-infra/utils.py b/discovery-infra/utils.py
index cf7dbbc..bcebfad 100644
--- a/discovery-infra/utils.py
+++ b/discovery-infra/utils.py
@@ -1,4 +1,5 @@
 import os
+import itertools
 import subprocess
 from pathlib import Path
 import shlex
@@ -44,7 +45,7 @@ def get_service_url(service_name):
 
 
 def wait_till_nodes_are_ready(nodes_count, cluster_name):
-    log.info("Wait till %s hosts will have ips", nodes_count)
+    log.info("Wait till %s nodes will be ready and have ips", nodes_count)
     cmd = "%s %s| grep %s | wc -l" % (VIRSH_LEASES_COMMAND, consts.TEST_NETWORK, cluster_name)
     try:
         waiting.wait(lambda: int(run_command(cmd, shell=True).strip()) >= nodes_count,
@@ -59,7 +60,6 @@ def wait_till_nodes_are_ready(nodes_count, cluster_name):
 
 # Require wait_till_nodes_are_ready has finished and all nodes are up
 def get_libvirt_nodes_mac_role_ip_and_name():
-    log.info("Get nodes macs and roles from libvirt")
     cmd = "%s %s" % (VIRSH_LEASES_COMMAND, consts.TEST_NETWORK)
     nodes_data = {}
     try:
@@ -77,6 +77,19 @@ def get_libvirt_nodes_mac_role_ip_and_name():
         raise
 
 
+def get_libvirt_nodes_macs():
+    return get_libvirt_nodes_mac_role_ip_and_name().keys()
+
+
+def are_all_libvirt_nodes_in_cluster_hosts(client, cluster_id):
+    hosts_macs = client.get_hosts_id_with_macs(cluster_id)
+    return all(mac.lower() in map(str.lower, itertools.chain(*hosts_macs.values())) for mac in get_libvirt_nodes_macs())
+
+
+def get_cluster_hosts_with_mac(client, cluster_id, macs):
+    return [client.get_host_by_mac(cluster_id, mac) for mac in macs]
+
+
 def get_tfvars():
     if not os.path.exists(consts.TFVARS_JSON_FILE):
         raise Exception("%s doesn't exists" % consts.TFVARS_JSON_FILE)
@@ -85,8 +98,7 @@ def get_libvirt_nodes_mac_role_ip_and_name():
     return tfvars
 
 
-def are_all_hosts_in_status(client, cluster_id, nodes_count, statuses, fall_on_error_status=True):
-    hosts = client.get_cluster_hosts(cluster_id)
+def are_hosts_in_status(client, cluster_id, hosts, nodes_count, statuses, fall_on_error_status=True):
     hosts_in_status = [host for host in hosts if host["status"] in statuses]
     if len(hosts_in_status) >= nodes_count:
         return True
@@ -95,22 +107,43 @@ def are_all_hosts_in_status(client, cluster_id, nodes_count, statuses, fall_on_e
         log.error("Some of the hosts are in insufficient or error status. Hosts in error %s", hosts_in_error)
         raise Exception("All the nodes must be in valid status, but got some in error")
 
-    log.info("Asked all hosts to be in one of the statuses from %s and currently hosts statuses are %s", statuses,
+    log.info("Asked hosts to be in one of the statuses from %s and currently hosts statuses are %s", statuses,
              [(host["id"], host["status"], host["status_info"]) for host in hosts])
     return False
 
 
+def wait_till_hosts_with_macs_are_in_status(client, cluster_id, macs, statuses,
+                                            timeout=consts.NODES_REGISTERED_TIMEOUT,
+                                            fall_on_error_status=True, interval=5):
+    log.info("Wait till %s nodes are in one of the statuses %s", len(macs), statuses)
+
+    try:
+        waiting.wait(lambda: are_hosts_in_status(client, cluster_id, get_cluster_hosts_with_mac(client, cluster_id, macs),
+                                                 len(macs), statuses, fall_on_error_status),
+                     timeout_seconds=timeout,
+                     sleep_seconds=interval, waiting_for="Nodes to be in of the statuses %s" % statuses)
+    except:
+        hosts = get_cluster_hosts_with_mac(client, cluster_id, macs)
+        log.info("All nodes: %s", hosts)
+        pprint.pprint(hosts)
+        raise
+
+
 def wait_till_all_hosts_are_in_status(client, cluster_id, nodes_count, statuses,
                                       timeout=consts.NODES_REGISTERED_TIMEOUT,
                                       fall_on_error_status=True, interval=5):
+    hosts = client.get_cluster_hosts(cluster_id)
     log.info("Wait till %s nodes are in one of the statuses %s", nodes_count, statuses)
+
     try:
-        waiting.wait(lambda: are_all_hosts_in_status(client, cluster_id, nodes_count, statuses, fall_on_error_status),
+        waiting.wait(lambda: are_hosts_in_status(client, cluster_id, client.get_cluster_hosts(cluster_id),
+                                                 nodes_count, statuses, fall_on_error_status),
                      timeout_seconds=timeout,
                      sleep_seconds=interval, waiting_for="Nodes to be in of the statuses %s" % statuses)
     except:
-        log.info("All nodes: %s", client.get_cluster_hosts(cluster_id))
-        pprint.pprint(client.get_cluster_hosts(cluster.id))
+        hosts = client.get_cluster_hosts(cluster_id)
+        log.info("All nodes: %s", hosts)
+        pprint.pprint(hosts)
         raise
 
 
diff --git a/scripts/utils.sh b/scripts/utils.sh
index 40a325e..9307a6f 100755
--- a/scripts/utils.sh
+++ b/scripts/utils.sh
@@ -71,7 +71,7 @@ function wait_for_url_and_run() {
         url_reachable "$1" && STATUS=$? || STATUS=$?
     done
     if [ $RETRIES -eq 0 ]; then
-        echo "Timeout reached, url not reachable"
+        echo "Timeout reached, url $1 not reachable"
         exit 1
     fi
 }
diff --git a/skipper.yaml b/skipper.yaml
index 4783c30..0e4ff1a 100644
--- a/skipper.yaml
+++ b/skipper.yaml
@@ -34,4 +34,5 @@ env:
   RUN_WITH_VIPS: $RUN_WITH_VIPS
   KUBECONFIG_GENERATE_IMAGE: $KUBECONFIG_GENERATE_IMAGE
   REMOTE_INVENTORY_URL: $REMOTE_INVENTORY_URL
-  CLUSTER_ID: $CLUSTER_ID
\ No newline at end of file
+  CLUSTER_ID: $CLUSTER_ID
+  NUM_MASTERS: $NUM_MASTERS
\ No newline at end of file