Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16557 test: Add debug to NvmeEnospace ftest #15559

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 216 additions & 49 deletions src/tests/ftest/nvme/enospace.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
(C) Copyright 2020-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
'''
Expand Down Expand Up @@ -33,7 +34,42 @@ def __init__(self, *args, **kwargs):
"""Initialize a NvmeEnospace object."""
super().__init__(*args, **kwargs)

self.metric_names = ['engine_pool_vos_space_scm_used', 'engine_pool_vos_space_nvme_used']
self.space_metric_names = [
'engine_pool_vos_space_scm_used',
'engine_pool_vos_space_nvme_used'
]
self.aggr_metric_names = [
# -- Merged records --
"engine_pool_vos_aggregation_merged_size",
"engine_pool_vos_aggregation_merged_recs",
# -- Deleted records --
"engine_pool_vos_aggregation_deleted_ev",
"engine_pool_vos_aggregation_deleted_sv",
# -- Errors --
"engine_pool_vos_aggregation_fail_count",
"engine_pool_vos_aggregation_csum_errors",
"engine_pool_vos_aggregation_uncommitted",
"engine_pool_vos_aggregation_agg_blocked",
"engine_pool_vos_aggregation_discard_blocked",
# -- Details stat counter --
"engine_pool_vos_aggregation_obj_deleted",
"engine_pool_vos_aggregation_obj_scanned",
"engine_pool_vos_aggregation_obj_skipped",
"engine_pool_vos_aggregation_akey_deleted",
"engine_pool_vos_aggregation_akey_scanned",
"engine_pool_vos_aggregation_akey_skipped",
"engine_pool_vos_aggregation_dkey_deleted",
"engine_pool_vos_aggregation_dkey_scanned",
"engine_pool_vos_aggregation_dkey_skipped",
# -- Duration --
"engine_pool_vos_aggregation_epr_duration",
"engine_pool_vos_aggregation_epr_duration_max",
"engine_pool_vos_aggregation_epr_duration_mean",
"engine_pool_vos_aggregation_epr_duration_min",
"engine_pool_vos_aggregation_epr_duration_stddev"
]
self.metric_names = self.space_metric_names + self.aggr_metric_names

self.media_names = ['SCM', 'NVMe']
self.expected_errors = [self.DER_NOSPACE, self.DER_TIMEDOUT]

Expand All @@ -55,26 +91,31 @@ def setUp(self):
self.daos_cmd = DaosCommand(self.bin)
self.create_pool_max_size()

def get_pool_space_metrics(self, pool_uuid):
def get_pool_space_metrics(self, pool, metrics):
"""Return the metrics on space usage of a given pool.

Args:
pool_uuid (str): Unique id of a pool.
pool (TestPool): target TestPool.
metrics (dict): telemetry metrics.

Returns:
dict: metrics on space usage.

"""
metrics = {}
for hostname, data in self.telemetry.get_metrics(",".join(self.metric_names)).items():
pool_uuid = pool.uuid
space_metrics = {}
for hostname, data in metrics.items():
for metric_name, entry in data.items():
if metric_name not in metrics:
metrics[metric_name] = {
if metric_name not in self.space_metric_names:
continue

if metric_name not in space_metrics:
space_metrics[metric_name] = {
"description": entry['description'],
"hosts": {}
}

hosts = metrics[metric_name]["hosts"]
hosts = space_metrics[metric_name]["hosts"]
for metric in entry['metrics']:
if metric['labels']['pool'].casefold() != pool_uuid.casefold():
continue
Expand All @@ -89,11 +130,60 @@ def get_pool_space_metrics(self, pool_uuid):
target = metric['labels']['target']
hosts[hostname][rank][target] = metric['value']

return metrics
return space_metrics

def get_pool_aggr_metrics(self, pool, metrics):
    """Return the metrics on aggregation counters and gauges.

    Args:
        pool (TestPool): target TestPool.
        metrics (dict): telemetry metrics.

    Returns:
        dict: metrics on aggregation.

    """
    # Pool UUIDs in telemetry labels may differ in case; compare casefolded.
    uuid_lc = pool.uuid.casefold()
    result = {
        "metric_descriptions": {},
        "metric_values": {}
    }
    descriptions = result["metric_descriptions"]
    values = result["metric_values"]

    for host, host_data in metrics.items():
        per_host = values.setdefault(host, {})

        for name, entry in host_data.items():
            # Only keep the aggregation-related metrics.
            if name not in self.aggr_metric_names:
                continue

            # Record each metric description once.
            descriptions.setdefault(name, entry["description"])

            for sample in entry['metrics']:
                labels = sample['labels']
                # Skip samples belonging to other pools.
                if labels['pool'].casefold() != uuid_lc:
                    continue

                per_rank = per_host.setdefault(labels['rank'], {})
                per_target = per_rank.setdefault(labels['target'], {})
                per_target[name] = sample['value']

    return result

def get_pool_usage(self, pool_space):
"""Get the pool storage used % for SCM and NVMe.

Args:
pool_space (object): space usage information of a pool.

Returns:
list: a list of SCM/NVMe pool space usage in %(float)

Expand All @@ -106,14 +196,55 @@ def get_pool_usage(self, pool_space):

return pool_usage

def display_pool_stats(self, pool_space, pool_space_metrics):
"""Display statistics on pool usage.
def display_table(self, title, table, align_idx):
    """Pretty print table content.

    Args:
        title (str): Title of the table.
        table (list): Table to print on stdout.
        align_idx (int): Last column to left align.
    """
    # Width of each column: the widest cell of that column over all rows.
    widths = [max(len(row[col]) for row in table) for col in range(len(table[0]))]
    # Full line width: all columns plus a 3-char " | " separator between them.
    total = sum(widths) + 3 * (len(widths) - 1)

    self.log.debug("")
    # Title centered and padded with dashes over the full table width.
    self.log.debug(f"{' ' + title + ' ':-^{total}}")

    # Header row: every cell centered in its column.
    header = " | ".join(
        f"{cell:^{widths[col]}}" for col, cell in enumerate(table[0]))
    self.log.debug(header)

    # Separator row: dashes with "-+-" at each column boundary.
    self.log.debug("-+-".join("-" * width for width in widths))

    # Data rows: left align up to align_idx, right align afterwards.
    for row in table[1:]:
        cells = []
        for col, cell in enumerate(row):
            marker = "<" if col <= align_idx else ">"
            cells.append(f"{cell:{marker}{widths[col]}}")
        self.log.debug(" | ".join(cells))

def display_pool_space(self, pool_space, pool_space_metrics):
"""Display space usage statistics of a given pool.

Args:
pool_space (object): space usage information of a pool.
pool_space_metrics (dict): dict of metrics on space usage of a pool.
"""

self.log.debug("")
title = f"{' Pool Space Usage ':-^80}"
self.log.debug(title)

Expand All @@ -135,34 +266,65 @@ def display_pool_stats(self, pool_space, pool_space_metrics):

for metric in pool_space_metrics.values():
table = [["Hostname", "Rank", "Target", "Size"]]
cols_size = []
for cell in table[0]:
cols_size.append(len(cell))
for hostname, ranks in metric['hosts'].items():
for rank, targets in ranks.items():
for target, size in targets.items():
row = [hostname, rank, target, get_display_size(size)]
table.append(row)
for idx, elt in enumerate(cols_size):
cols_size[idx] = max(elt, len(row[idx]))
hostname = ""
rank = ""

for idx, elt in enumerate(table[0]):
table[0][idx] = f"{elt:^{cols_size[idx]}}"
row = ' | '.join(table[0])
title = f"{' ' + metric['description'] + ' ':-^{len(row)}}"
self.log.debug("")
self.log.debug(title)
self.log.debug(row)
self.log.debug("-" * len(row))
for row in table[1:]:
for idx, elt in enumerate(row):
align_op = "<"
if idx + 1 == len(row):
align_op = ">"
row[idx] = f"{elt:{align_op}{cols_size[idx]}}"
self.log.debug(" | ".join(row))
self.display_table(metric['description'], table, 2)

def display_pool_aggregation(self, metrics):
    """Display record aggregation statistics of a given pool.

    Args:
        metrics (dict): dict of metrics on pool aggregation, with keys
            "metric_descriptions" (metric name -> description) and
            "metric_values" (hostname -> rank -> target -> metric name
            -> value), as built by get_pool_aggr_metrics().
    """
    # Header: fixed identification columns followed by one column per
    # aggregation metric, labeled with the metric description.
    table = [["Hostname", "Rank", "Target"]]
    for metric_name in self.aggr_metric_names:
        table[0].append(metrics["metric_descriptions"][metric_name])

    for hostname in sorted(metrics["metric_values"]):
        row = [hostname]

        for rank in sorted(metrics["metric_values"][hostname]):
            if not row:
                # Blank repeated hostname cells for readability.
                row = [""]
            row.append(rank)

            for target in sorted(metrics["metric_values"][hostname][rank]):
                if not row:
                    # Blank repeated hostname/rank cells for readability.
                    row = ["", ""]
                row.append(target)

                # NOTE: the previous version maintained an unused `idx`
                # counter here; it was dead code and has been removed.
                for metric_name in self.aggr_metric_names:
                    value = metrics["metric_values"][hostname][rank][target][metric_name]
                    if metric_name == "engine_pool_vos_aggregation_merged_size":
                        # Merged size is a byte count: render human-readable.
                        row.append(get_display_size(value))
                    else:
                        row.append(str(value))

                table.append(row)
                row = None

    self.display_table('Pool Aggregation stats', table, 2)

def display_stats(self):
    """Display usage statistics of the tested pool."""
    # Refresh the pool information before reading pi_space.
    self.pool.get_info()
    # Fetch all the telemetry metrics of interest in a single query.
    telemetry_data = self.telemetry.get_metrics(",".join(self.metric_names))

    space_metrics = self.get_pool_space_metrics(self.pool, telemetry_data)
    self.display_pool_space(self.pool.info.pi_space, space_metrics)

    aggr_metrics = self.get_pool_aggr_metrics(self.pool, telemetry_data)
    self.display_pool_aggregation(aggr_metrics)
    self.log.debug("")

def verify_enospace_log(self, log_file):
"""Function checking logs consistency.
Expand Down Expand Up @@ -207,10 +369,14 @@ def err_to_str(err_no):
"Number of errors %s (%s) is > 0: got=%d",
err_to_str(error), error, errors_count[error])

def delete_all_containers(self):
"""Delete all the containers."""
def delete_all_containers(self, pool):
"""Delete all the containers of a given pool.

Args:
pool (TestPool): target TestPool.
"""
# List all the container
kwargs = {"pool": self.pool.uuid}
kwargs = {"pool": pool.uuid}
data = self.daos_cmd.container_list(**kwargs)
containers = [uuid_label["uuid"] for uuid_label in data["response"]]

Expand Down Expand Up @@ -291,17 +457,22 @@ def run_enospace_foreground(self, log_file):
log_file (str): name prefix of the log files to check.
"""
self.log.info('----Starting main IOR load----')
self.display_stats()

# Fill 75% of current SCM free space. Aggregation is Enabled so NVMe space will
# start to fill up.
self.log.info('--Filling 75% of the current SCM free space--')
self.start_ior_load(storage='SCM', operation="Auto_Write", percent=75)
self.log.info(self.pool.pool_percentage_used())
try:
self.start_ior_load(storage='SCM', operation="Auto_Write", percent=75)
finally:
self.display_stats()

# Fill 50% of current SCM free space. Aggregation is Enabled so NVMe space will
# continue to fill up.
self.start_ior_load(storage='SCM', operation="Auto_Write", percent=50)
self.log.info(self.pool.pool_percentage_used())
try:
self.start_ior_load(storage='SCM', operation="Auto_Write", percent=50)
finally:
self.display_stats()

# Fill 60% of current SCM free space. This time, NVMe will be Full so data will
# not be moved to NVMe and continue to fill up SCM. SCM will be full and this
Expand All @@ -314,18 +485,14 @@ def run_enospace_foreground(self, log_file):
self.log.info('Test is expected to fail because of DER_NOSPACE')
else:
self.fail('This test is suppose to FAIL because of DER_NOSPACE but it Passed')

# Display the pool statistics
self.pool.get_info()
pool_space = self.pool.info.pi_space
pool_space_metrics = self.get_pool_space_metrics(self.pool.uuid)
self.display_pool_stats(pool_space, pool_space_metrics)
finally:
self.display_stats()

# verify the DER_NO_SPACE error count is expected and no other Error in client log
self.verify_enospace_log(log_file)

# Check both NVMe and SCM are full.
pool_usage = self.get_pool_usage(pool_space)
pool_usage = self.get_pool_usage(self.pool.info.pi_space)
for idx, elt in enumerate(self.media_names):
if pool_usage[idx] >= self.pool_usage_min[idx]:
continue
Expand Down Expand Up @@ -413,7 +580,7 @@ def test_enospace_lazy_with_fg(self):
log_file = f"-loop_{_loop}".join(os.path.splitext(self.client_log))
self.run_enospace_foreground(log_file)
# Delete all the containers
self.delete_all_containers()
self.delete_all_containers(self.pool)
# Delete container will take some time to release the space
time.sleep(60)

Expand Down Expand Up @@ -475,7 +642,7 @@ def test_enospace_time_with_fg(self):
log_file = f"-loop_{_loop}".join(os.path.splitext(self.client_log))
self.run_enospace_with_bg_job(log_file)
# Delete all the containers
self.delete_all_containers()
self.delete_all_containers(self.pool)
# Delete container will take some time to release the space
time.sleep(60)

Expand Down Expand Up @@ -571,7 +738,7 @@ def test_enospace_no_aggregation(self):
self.verify_enospace_log(log_file)

# Delete all the containers
self.delete_all_containers()
self.delete_all_containers(self.pool)

# Wait for the SCM space to be released. (Usage goes below 60%)
scm_released = False
Expand Down