
Commit

Allow deploying nodes to an existing cluster with hosts
YuviGold committed Jun 16, 2020
1 parent ce5c6c2 commit 3cf4d01
Showing 6 changed files with 81 additions and 26 deletions.
12 changes: 10 additions & 2 deletions discovery-infra/bm_inventory_api.py
@@ -89,10 +89,18 @@ def get_hosts_id_with_macs(self, cluster_id):
         hosts = self.get_cluster_hosts(cluster_id)
         hosts_data = {}
         for host in hosts:
-            hw = json.loads(host.hardware_info)
-            hosts_data[host.id] = [nic["mac"] for nic in hw["nics"]]
+            hw = json.loads(host["hardware_info"])
+            hosts_data[host["id"]] = [nic["mac"] for nic in hw["nics"]]
         return hosts_data
 
+    def get_host_by_mac(self, cluster_id, mac):
+        hosts = self.get_cluster_hosts(cluster_id)
+
+        for host in hosts:
+            hw = json.loads(host["hardware_info"])
+            if mac.lower() in [nic["mac"].lower() for nic in hw["nics"]]:
+                return host
+
     def download_and_save_file(self, cluster_id, file_name, file_path):
         log.info("Downloading %s to %s", file_name, file_path)
         response = self.client.download_cluster_files(cluster_id=cluster_id, file_name=file_name,
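For readers skimming the diff, the new get_host_by_mac helper is just a case-insensitive MAC lookup over each host's hardware_info JSON. A minimal, self-contained sketch of that matching logic, using made-up host records (the host IDs and MAC addresses below are illustrative, not from the repository):

import json

hosts = [
    {"id": "host-1", "hardware_info": json.dumps({"nics": [{"mac": "52:54:00:AA:BB:01"}]})},
    {"id": "host-2", "hardware_info": json.dumps({"nics": [{"mac": "52:54:00:aa:bb:02"}]})},
]

def get_host_by_mac(hosts, mac):
    # Compare MACs case-insensitively, as the commit does with .lower().
    for host in hosts:
        hw = json.loads(host["hardware_info"])
        if mac.lower() in [nic["mac"].lower() for nic in hw["nics"]]:
            return host

print(get_host_by_mac(hosts, "52:54:00:AA:BB:02")["id"])  # prints host-2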
1 change: 1 addition & 0 deletions discovery-infra/install_cluster.py
@@ -31,6 +31,7 @@ def _install_cluster(client, cluster):
 
 def wait_till_installed(client, cluster, timeout=60*60*2):
     log.info("Waiting %s till cluster finished installation", timeout)
+    # TODO: Change host validation for only previous known hosts
     utils.wait_till_all_hosts_are_in_status(client=client, cluster_id=cluster.id,
                                             nodes_count=len(cluster.hosts),
                                             statuses=[consts.NodesStatus.INSTALLED],
40 changes: 26 additions & 14 deletions discovery-infra/start_discovery.py
@@ -60,15 +60,17 @@ def create_nodes_and_wait_till_registered(inventory_client, cluster, image_path,
                                           master_count, nodes_details):
     nodes_count = master_count + nodes_details["worker_count"]
     create_nodes(image_path, storage_path=storage_path, master_count=master_count, nodes_details=nodes_details)
+
+    # TODO: Check for only new nodes
     utils.wait_till_nodes_are_ready(nodes_count=nodes_count, cluster_name=nodes_details["cluster_name"])
     if not inventory_client:
         log.info("No inventory url, will not wait till nodes registration")
         return
 
     log.info("Wait till nodes will be registered")
-    waiting.wait(lambda: len(inventory_client.get_cluster_hosts(cluster.id)) >= nodes_count,
+    waiting.wait(lambda: utils.are_all_libvirt_nodes_in_cluster_hosts(inventory_client, cluster.id),
                  timeout_seconds=consts.NODES_REGISTERED_TIMEOUT,
-                 sleep_seconds=5, waiting_for="Nodes to be registered in inventory service")
+                 sleep_seconds=10, waiting_for="Nodes to be registered in inventory service")
     log.info("Registered nodes are:")
     pprint.pprint(inventory_client.get_cluster_hosts(cluster.id))
 
@@ -77,13 +79,16 @@ def create_nodes_and_wait_till_registered(inventory_client, cluster, image_path,
 # If master in name -> role will be master, same for worker
 def set_hosts_roles(client, cluster_id):
     hosts = []
-    libvirt_macs = utils.get_libvirt_nodes_mac_role_ip_and_name()
+    libvirt_nodes = utils.get_libvirt_nodes_mac_role_ip_and_name()
     inventory_hosts = client.get_cluster_hosts(cluster_id)
-    assert len(libvirt_macs) == len(inventory_hosts)
-    for host in inventory_hosts:
-        hw = json.loads(host["hardware_info"])
-        role = [libvirt_macs[nic["mac"]]["role"] for nic in hw["nics"] if nic["mac"].lower() in libvirt_macs][0]
-        hosts.append({"id": host["id"], "role": role})
+
+    for libvirt_mac, libvirt_metadata in libvirt_nodes.items():
+        for host in inventory_hosts:
+            hw = json.loads(host["hardware_info"])
+
+            if libvirt_mac.lower() in map(lambda nic: nic["mac"].lower(), hw["nics"]):
+                hosts.append({"id": host["id"], "role": libvirt_metadata["role"]})
+
     if hosts:
         client.set_hosts_roles(cluster_id=cluster_id, hosts_with_roles=hosts)
 
@@ -141,15 +146,22 @@ def nodes_flow(client, cluster_name, cluster):
                                           master_count=args.master_count,
                                           nodes_details=nodes_details)
     if client:
-        nodes_count = args.master_count + args.number_of_workers
-        utils.wait_till_all_hosts_are_in_status(client=client, cluster_id=cluster.id,
-                                                nodes_count=nodes_count, statuses=[consts.NodesStatus.INSUFFICIENT])
-        set_cluster_vips(client, cluster.id)
+        cluster_info = client.cluster_get(cluster.id)
+        macs = utils.get_libvirt_nodes_macs()
+
+        if not (cluster_info.api_vip and cluster_info.ingress_vip):
+            utils.wait_till_hosts_with_macs_are_in_status(client=client, cluster_id=cluster.id, macs=macs,
+                                                          statuses=[consts.NodesStatus.INSUFFICIENT])
+            set_cluster_vips(client, cluster.id)
+        else:
+            log.info("VIPs already configured")
+
         set_hosts_roles(client, cluster.id)
-        utils.wait_till_all_hosts_are_in_status(client=client, cluster_id=cluster.id,
-                                                nodes_count=nodes_count, statuses=[consts.NodesStatus.KNOWN])
+        utils.wait_till_hosts_with_macs_are_in_status(client=client, cluster_id=cluster.id, macs=macs,
+                                                      statuses=[consts.NodesStatus.KNOWN])
         log.info("Printing after setting roles")
         pprint.pprint(client.get_cluster_hosts(cluster.id))
 
     if args.install_cluster:
         time.sleep(10)
         install_cluster.run_install_flow(client=client, cluster_id=cluster.id,
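The reworked set_hosts_roles no longer assumes a one-to-one count between libvirt nodes and inventory hosts (the assert is gone); it pairs each libvirt MAC with whichever inventory host reports it and reuses the libvirt role. A standalone sketch of that pairing with fabricated data (all IDs, MACs, and roles below are illustrative, not from the repository):

import json

libvirt_nodes = {
    "52:54:00:aa:bb:01": {"role": "master"},
    "52:54:00:aa:bb:02": {"role": "worker"},
}
inventory_hosts = [
    {"id": "host-1", "hardware_info": json.dumps({"nics": [{"mac": "52:54:00:AA:BB:01"}]})},
    {"id": "host-2", "hardware_info": json.dumps({"nics": [{"mac": "52:54:00:AA:BB:02"}]})},
]

hosts = []
for libvirt_mac, libvirt_metadata in libvirt_nodes.items():
    for host in inventory_hosts:
        hw = json.loads(host["hardware_info"])
        # Case-insensitive MAC comparison, mirroring the loop added in this commit.
        if libvirt_mac.lower() in map(lambda nic: nic["mac"].lower(), hw["nics"]):
            hosts.append({"id": host["id"], "role": libvirt_metadata["role"]})

print(hosts)  # [{'id': 'host-1', 'role': 'master'}, {'id': 'host-2', 'role': 'worker'}]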
49 changes: 41 additions & 8 deletions discovery-infra/utils.py
@@ -1,4 +1,5 @@
 import os
+import itertools
 import subprocess
 from pathlib import Path
 import shlex
@@ -44,7 +45,7 @@ def get_service_url(service_name):
 
 
 def wait_till_nodes_are_ready(nodes_count, cluster_name):
-    log.info("Wait till %s hosts will have ips", nodes_count)
+    log.info("Wait till %s nodes will be ready and have ips", nodes_count)
     cmd = "%s %s| grep %s | wc -l" % (VIRSH_LEASES_COMMAND, consts.TEST_NETWORK, cluster_name)
     try:
         waiting.wait(lambda: int(run_command(cmd, shell=True).strip()) >= nodes_count,
@@ -59,7 +60,6 @@ def wait_till_nodes_are_ready(nodes_count, cluster_name):
 
 # Require wait_till_nodes_are_ready has finished and all nodes are up
 def get_libvirt_nodes_mac_role_ip_and_name():
-    log.info("Get nodes macs and roles from libvirt")
     cmd = "%s %s" % (VIRSH_LEASES_COMMAND, consts.TEST_NETWORK)
     nodes_data = {}
     try:
@@ -77,6 +77,19 @@ def get_libvirt_nodes_mac_role_ip_and_name():
         raise
 
 
+def get_libvirt_nodes_macs():
+    return get_libvirt_nodes_mac_role_ip_and_name().keys()
+
+
+def are_all_libvirt_nodes_in_cluster_hosts(client, cluster_id):
+    hosts_macs = client.get_hosts_id_with_macs(cluster_id)
+    return all(mac.lower() in map(str.lower, itertools.chain(*hosts_macs.values())) for mac in get_libvirt_nodes_macs())
+
+
+def get_cluster_hosts_with_mac(client, cluster_id, macs):
+    return [client.get_host_by_mac(cluster_id, mac) for mac in macs]
+
+
 def get_tfvars():
     if not os.path.exists(consts.TFVARS_JSON_FILE):
         raise Exception("%s doesn't exists" % consts.TFVARS_JSON_FILE)
@@ -85,8 +98,7 @@ def get_tfvars():
     return tfvars
 
 
-def are_all_hosts_in_status(client, cluster_id, nodes_count, statuses, fall_on_error_status=True):
-    hosts = client.get_cluster_hosts(cluster_id)
+def are_hosts_in_status(client, cluster_id, hosts, nodes_count, statuses, fall_on_error_status=True):
     hosts_in_status = [host for host in hosts if host["status"] in statuses]
     if len(hosts_in_status) >= nodes_count:
         return True
@@ -95,22 +107,43 @@ def are_all_hosts_in_status(client, cluster_id, nodes_count, statuses, fall_on_e
         log.error("Some of the hosts are in insufficient or error status. Hosts in error %s", hosts_in_error)
         raise Exception("All the nodes must be in valid status, but got some in error")
 
-    log.info("Asked all hosts to be in one of the statuses from %s and currently hosts statuses are %s", statuses,
+    log.info("Asked hosts to be in one of the statuses from %s and currently hosts statuses are %s", statuses,
             [(host["id"], host["status"], host["status_info"]) for host in hosts])
     return False
 
 
+def wait_till_hosts_with_macs_are_in_status(client, cluster_id, macs, statuses,
+                                            timeout=consts.NODES_REGISTERED_TIMEOUT,
+                                            fall_on_error_status=True, interval=5):
+    log.info("Wait till %s nodes are in one of the statuses %s", len(macs), statuses)
+
+    try:
+        waiting.wait(lambda: are_hosts_in_status(client, cluster_id, get_cluster_hosts_with_mac(client, cluster_id, macs),
+                                                 len(macs), statuses, fall_on_error_status),
+                     timeout_seconds=timeout,
+                     sleep_seconds=interval, waiting_for="Nodes to be in of the statuses %s" % statuses)
+    except:
+        hosts = get_cluster_hosts_with_mac(client, cluster_id, macs)
+        log.info("All nodes: %s", hosts)
+        pprint.pprint(hosts)
+        raise
+
+
 def wait_till_all_hosts_are_in_status(client, cluster_id, nodes_count, statuses,
                                       timeout=consts.NODES_REGISTERED_TIMEOUT,
                                       fall_on_error_status=True, interval=5):
+    hosts = client.get_cluster_hosts(cluster_id)
     log.info("Wait till %s nodes are in one of the statuses %s", nodes_count, statuses)
 
     try:
-        waiting.wait(lambda: are_all_hosts_in_status(client, cluster_id, nodes_count, statuses, fall_on_error_status),
+        waiting.wait(lambda: are_hosts_in_status(client, cluster_id, client.get_cluster_hosts(cluster_id),
+                                                 nodes_count, statuses, fall_on_error_status),
                      timeout_seconds=timeout,
                      sleep_seconds=interval, waiting_for="Nodes to be in of the statuses %s" % statuses)
     except:
-        log.info("All nodes: %s", client.get_cluster_hosts(cluster_id))
-        pprint.pprint(client.get_cluster_hosts(cluster_id))
+        hosts = client.get_cluster_hosts(cluster_id)
+        log.info("All nodes: %s", hosts)
+        pprint.pprint(hosts)
         raise


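The new are_all_libvirt_nodes_in_cluster_hosts helper reduces to a case-insensitive subset test: every MAC known to libvirt must already appear among the MACs registered in the inventory, which is what lets the registration wait in start_discovery.py tolerate pre-existing hosts. A self-contained sketch of that check with fabricated data (the MACs and host IDs below are illustrative, and the set is a simplification of the helper's repeated map/chain scan):

import itertools

libvirt_macs = ["52:54:00:AA:BB:01", "52:54:00:aa:bb:02"]
hosts_macs = {
    "host-1": ["52:54:00:aa:bb:01"],
    "host-2": ["52:54:00:AA:BB:02", "52:54:00:aa:bb:03"],
}

# Flatten the per-host MAC lists and lowercase them; the helper does the same
# flattening with itertools.chain inside all().
registered = set(map(str.lower, itertools.chain(*hosts_macs.values())))
print(all(mac.lower() in registered for mac in libvirt_macs))  # True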
2 changes: 1 addition & 1 deletion scripts/utils.sh
@@ -71,7 +71,7 @@ function wait_for_url_and_run() {
         url_reachable "$1" && STATUS=$? || STATUS=$?
     done
     if [ $RETRIES -eq 0 ]; then
-        echo "Timeout reached, url not reachable"
+        echo "Timeout reached, url $1 not reachable"
         exit 1
     fi
 }
Expand Down
3 changes: 2 additions & 1 deletion skipper.yaml
@@ -34,4 +34,5 @@ env:
   RUN_WITH_VIPS: $RUN_WITH_VIPS
   KUBECONFIG_GENERATE_IMAGE: $KUBECONFIG_GENERATE_IMAGE
   REMOTE_INVENTORY_URL: $REMOTE_INVENTORY_URL
-  CLUSTER_ID: $CLUSTER_ID
+  CLUSTER_ID: $CLUSTER_ID
+  NUM_MASTERS: $NUM_MASTERS
