Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically computing the regex for indeterministic properties #323

Merged
merged 9 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions acto/input/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,11 @@ def __init__(
)
)

for base_schema, k8s_schema_name in self.full_matched_schemas:
base_schema.attributes |= (
property_attribute.PropertyAttribute.Mapped
)

# Apply custom property attributes based on the property_attribute module
self.apply_custom_field()

Expand Down
209 changes: 191 additions & 18 deletions acto/post_process/post_diff_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import argparse
import difflib
import glob
import hashlib
import itertools
import json
import logging
import multiprocessing
Expand All @@ -13,10 +15,11 @@
import threading
import time
from copy import deepcopy
from typing import Dict, List, Optional
from typing import Optional

import pandas as pd
import pydantic
import yaml
from deepdiff import DeepDiff
from deepdiff.helper import CannotCompare
from deepdiff.model import DiffLevel, TreeResult
Expand All @@ -33,22 +36,21 @@
from acto.serialization import ActoEncoder
from acto.snapshot import Snapshot
from acto.trial import Step
from acto.utils import add_acto_label, get_thread_logger
from acto.utils.error_handler import handle_excepthook, thread_excepthook
from acto.utils import add_acto_label, error_handler, get_thread_logger


class DiffTestResult(pydantic.BaseModel):
"""The result of a diff test
It contains the input digest, the snapshot, the original trial and generation,
and the time spent on each step
The oracle result is separated from this dataclass, so that it can easily recomputed
after changing the oracle
The oracle result is separated from this dataclass,
so that it can easily recomputed after changing the oracle
"""

input_digest: str
snapshot: Snapshot
originals: list[Dict]
time: Dict
originals: list[dict]
time: dict

@classmethod
def from_file(cls, file_path: str) -> "DiffTestResult":
Expand Down Expand Up @@ -332,7 +334,7 @@

def __init__(
self,
context: Dict,
context: dict,
deploy: Deploy,
workdir: str,
cluster: base.KubernetesEngine,
Expand Down Expand Up @@ -403,7 +405,7 @@
def __init__(
self,
workqueue: multiprocessing.Queue,
context: Dict,
context: dict,
deploy: Deploy,
workdir: str,
cluster: base.KubernetesEngine,
Expand Down Expand Up @@ -517,6 +519,80 @@
generation += 1


def compute_common_regex(paths: list[str]) -> list[str]:
"""Compute the common regex from the list of paths"""
common_regex: set[str] = set()
sorted_paths = sorted(paths)
curr_regex = None
for i in range(len(sorted_paths)):
if curr_regex is None or not re.match(curr_regex, sorted_paths[i]):
# if we do not have enough items to look ahead, then we give up
if i + 2 >= len(sorted_paths):
common_regex.add("^" + re.escape(sorted_paths[i]) + "$")
continue

path = sorted_paths[i]
next_path = sorted_paths[i + 1]
next_next_path = sorted_paths[i + 2]

# if the resource type is different, then we give up to avoid
# a wildcard regex
if (
path.startswith("root")
and next_path.startswith("root")
and next_next_path.startswith("root")
):
resource_type = re.findall(r"root\['(.*?)'\]", path)[0]
next_resource_type = re.findall(r"root\['(.*?)'\]", next_path)[
0
]
next_next_resource_type = re.findall(
r"root\['(.*?)'\]", next_next_path
)[0]
if (
resource_type != next_resource_type
or next_resource_type != next_next_resource_type
):
common_regex.add("^" + re.escape(path) + "$")
continue

Check warning on line 557 in acto/post_process/post_diff_test.py

View workflow job for this annotation

GitHub Actions / coverage-report

Missing coverage

Missing coverage on lines 556-557

# Look ahead to find a regex candidate
# Construct the regex candidate using the difflib
matched_blocks = difflib.SequenceMatcher(
None,
path,
next_path,
).get_matching_blocks()
regex_candidate = r""
for block in matched_blocks:
if block.a == 0 and block.b == 0:
regex_candidate += r"^"
elif block.size != 0:
# we may have duplicate wild card here,
# but it is fine because two consequence wild cards is
# equivalent to one wild card
regex_candidate += r".*"

if block.size <= 3:
# prevent accidental single character match in random string
continue
regex_candidate += re.escape(
path[block.a : block.a + block.size]
)
regex_candidate += r"$"

# Check if the regex candidate is valid for the third item
# if matched, then we have a valid regex
# if not, then we give up finding a common regex for the current
# item
if re.match(regex_candidate, next_next_path):
common_regex.add(regex_candidate)
curr_regex = regex_candidate
else:
common_regex.add(path)

Check warning on line 592 in acto/post_process/post_diff_test.py

View workflow job for this annotation

GitHub Actions / coverage-report

Missing coverage

Missing coverage on line 592
return list(common_regex)


class PostDiffTest(PostProcessor):
"""Post diff test class for Acto"""

Expand Down Expand Up @@ -566,9 +642,8 @@
],
)

self.unique_inputs: Dict[
str, pd.DataFrame
] = {} # input digest -> group of steps
# input digest -> group of steps
self.unique_inputs: dict[str, pd.DataFrame] = {}
groups = self.df.groupby("input_digest")
for digest, group in groups:
self.unique_inputs[digest] = group
Expand Down Expand Up @@ -605,7 +680,7 @@
for unique_input_group in self.unique_inputs.values():
workqueue.put(unique_input_group)

runners: List[DeployRunner] = []
runners: list[DeployRunner] = []

Check warning on line 683 in acto/post_process/post_diff_test.py

View workflow job for this annotation

GitHub Actions / coverage-report

Missing coverage

Missing coverage on line 683
for i in range(num_workers):
runner = DeployRunner(
workqueue,
Expand All @@ -630,18 +705,44 @@
def check(self, workdir: str, num_workers: int = 1):
"""Check the post process result"""
logger = get_thread_logger(with_prefix=True)
logger.info(
"Additional exclude paths: %s", self.config.diff_ignore_fields
)
trial_dirs = glob.glob(os.path.join(workdir, "trial-*"))

with open(self.config.seed_custom_resource, "r", encoding="utf-8") as f:
seed_cr = yaml.load(f, Loader=yaml.FullLoader)
seed_input_digest = hashlib.md5(

Check warning on line 712 in acto/post_process/post_diff_test.py

View workflow job for this annotation

GitHub Actions / coverage-report

Missing coverage

Missing coverage on lines 710-712
json.dumps(seed_cr, sort_keys=True).encode("utf-8")
).hexdigest()

workqueue: multiprocessing.Queue = multiprocessing.Queue()
for trial_dir in trial_dirs:
for diff_test_result_path in glob.glob(
os.path.join(trial_dir, "difftest-*.json")
):
# 1. Populate the workqueue
workqueue.put(diff_test_result_path)

# 2. Find the seed test result and compute the common regex
diff_test_result = DiffTestResult.from_file(
diff_test_result_path
)
if diff_test_result.input_digest == seed_input_digest:
diff_skip_regex = self.__get_diff_paths(
diff_test_result, num_workers
)
logger.info(
"Seed input digest: %s, diff_skip_regex: %s",
seed_input_digest,
diff_skip_regex,
)
if self.config.diff_ignore_fields is None:
self.config.diff_ignore_fields = diff_skip_regex
else:
self.config.diff_ignore_fields.extend(diff_skip_regex)

logger.info(

Check warning on line 742 in acto/post_process/post_diff_test.py

View workflow job for this annotation

GitHub Actions / coverage-report

Missing coverage

Missing coverage on lines 725-742
"Additional exclude paths: %s", self.config.diff_ignore_fields
)

processes = []
for i in range(num_workers):
p = multiprocessing.Process(
Expand Down Expand Up @@ -822,6 +923,78 @@
to_state=diff_test_result.snapshot.system_state,
)

def __get_diff_paths(
self, diff_test_result: DiffTestResult, num_workers: int
) -> list[str]:
"""Get the diff paths from a diff test result
Algorithm:
Iterate on the original trials, in principle they should be the same
If they are not, the diffs should be the indeterministic fields
to be skipped when doing the comparison.
Naively, we can just append all the indeterministic fields and return
them, however, there are cases where the field path itself is not
deterministic (e.g. the name of the secret could be randomly generated)
To handle this randomness in the name, we can use the first two
original trials to compare the system state to get the initial
regex for skipping.
If we do not have random names, the subsequent trials should not
have additional indeterministic fields.

Then, we keep iterating on the rest of the original results, and
check if we have additional indeterministic fields. If we do, we
collect them and try to figure out the proper regex to skip them.

Args:
diff_test_result (DiffTestResult): The diff test result

Returns:
list[str]: The list of diff paths
"""

indeterministic_regex: set[str] = set()

args = []
for original in diff_test_result.originals:
trial = original["trial"]
gen = original["gen"]
trial_basename = os.path.basename(trial)
original_result = self.trial_to_steps[trial_basename].steps[
str(gen)
]
args.append([diff_test_result, original_result, self.config])

with multiprocessing.Pool(num_workers) as pool:
diff_results = pool.starmap(get_diff_paths_helper, args)

for diff_item in itertools.chain.from_iterable(diff_results):
indeterministic_regex.add(diff_item)

# Handle the case where the name is not deterministic
common_regex = compute_common_regex(list(indeterministic_regex))

return common_regex

Check warning on line 975 in acto/post_process/post_diff_test.py

View workflow job for this annotation

GitHub Actions / coverage-report

Missing coverage

Missing coverage on lines 954-975


def get_diff_paths_helper(
diff_test_result: DiffTestResult,
original_result: Step,
config: OperatorConfig,
) -> list[str]:
"""Get the diff paths helper"""
diff_result = PostDiffTest.check_diff_test_step(
diff_test_result, original_result, config
)
indeterministic_regex = set()
if diff_result is not None:
for diff in diff_result.diff.values():
if not isinstance(diff, list):
continue
for diff_item in diff:
if not isinstance(diff_item, DiffLevel):
continue
indeterministic_regex.add(diff_item.path())
return list(indeterministic_regex)

Check warning on line 996 in acto/post_process/post_diff_test.py

View workflow job for this annotation

GitHub Actions / coverage-report

Missing coverage

Missing coverage on lines 984-996


def main():
"""Main entry point."""
Expand All @@ -834,8 +1007,8 @@
args = parser.parse_args()

# Register custom exception hook
sys.excepthook = handle_excepthook
threading.excepthook = thread_excepthook
sys.excepthook = error_handler.handle_excepthook
threading.excepthook = error_handler.thread_excepthook

Check warning on line 1011 in acto/post_process/post_diff_test.py

View workflow job for this annotation

GitHub Actions / coverage-report

Missing coverage

Missing coverage on lines 1010-1011

log_filename = "check.log" if args.checkonly else "test.log"
os.makedirs(args.workdir_path, exist_ok=True)
Expand Down
Loading
Loading