Various changes to several scripts
dwest77a committed Nov 11, 2024
1 parent 3b4d598 commit af48b71
Showing 7 changed files with 239 additions and 17 deletions.
4 changes: 2 additions & 2 deletions padocc/core/filehandlers.py
@@ -28,7 +28,7 @@ class FileIOMixin(LoggedOperation):
     no attributes passed to either of these.
         fh.create_file()
-        fh.save_file()
+        fh.close()
     3. Get/set:
Expand Down Expand Up @@ -126,7 +126,7 @@ def create_file(self):
         else:
             self.logger.info(f'DRYRUN: Skipped creating "{self._file}"')
 
-    def save_file(self):
+    def close(self):
        """
        Wrapper for _set_content method
        """
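The rename applied in this file (the public save_file() method becomes close()) propagates to every call site in the files below. A minimal sketch of the resulting filehandler lifecycle, assuming a concrete FileIOMixin subclass; the JSONFileHandler name and its constructor arguments are hypothetical, only create_file/set/close come from the diff:

    # Hypothetical concrete filehandler built on FileIOMixin.
    fh = JSONFileHandler(workdir, 'base_cfg', logger=logger)
    fh.create_file()   # creates the file on disk; logs and skips under dryrun
    fh.set(content)    # stage new content in memory
    fh.close()         # formerly save_file(); wraps _set_content to write out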
8 changes: 4 additions & 4 deletions padocc/core/project.py
@@ -253,10 +253,10 @@ def update_status(

     def save_files(self):
         # Add all files here.
-        self.base_cfg.save_file()
-        self.detail_cfg.save_file()
-        self.allfiles.save_file()
-        self.status_log.save_file()
+        self.base_cfg.close()
+        self.detail_cfg.close()
+        self.allfiles.close()
+        self.status_log.close()
 
     def _configure_filelist(self):
         pattern = self.base_cfg['pattern']
8 changes: 4 additions & 4 deletions padocc/operations/group.py
@@ -315,11 +315,11 @@ def add_project(self):

     def _save_proj_codes(self):
         for pc in self.proj_codes.keys():
-            self.proj_codes[pc].save_file()
+            self.proj_codes[pc].close()
 
     def save_files(self):
-        self.blacklist_codes.save_file()
-        self.datasets.save_file()
+        self.blacklist_codes.close()
+        self.datasets.close()
         self._save_proj_codes()
 
     def _add_proj_codeset(self, name : str, newcodes : list):
@@ -495,7 +495,7 @@ def _create_job_array(
         ]
 
         sbatch.update(sbatch_contents)
-        sbatch.save_file()
+        sbatch.close()
 
         if self._dryrun:
             self.logger.info('DRYRUN: sbatch command: ')
10 changes: 5 additions & 5 deletions padocc/phases/compute.py
@@ -121,7 +121,7 @@ def run(self, nfile: str, filehandler=None, extension=None, **kwargs) -> dict:

         if filehandler:
             filehandler.set(tdict)
-            filehandler.save_file()
+            filehandler.close()
         return tdict, ctype
 
     def _convert_kerchunk(self, nfile: str, ctype, **kwargs) -> None:
@@ -326,12 +326,12 @@ def _run_with_timings(self, func):
         detail['timings']['compute_actual'] = compute_time
 
         self.detail_cfg.set(detail)
-        self.detail_cfg.save_file()
+        self.detail_cfg.close()
         return 'Success'
 
     def save_files(self):
         super().save_files()
-        self.temp_zattrs.save_file()
+        self.temp_zattrs.close()
 
     @property
     def outpath(self):
@@ -703,7 +703,7 @@ def create_refs(self) -> None:
             if not self.quality_required:
                 self._perform_shape_checks(ref)
             CacheFile.set(ref)
-            CacheFile.save_file()
+            CacheFile.close()
             ctypes.append(ctype)
 
         self.success = converter.success
@@ -871,7 +871,7 @@ def _data_to_json(self, refs: dict) -> None:

         if not self.partial:
             self.logger.info(f'Written to JSON file - {self.outfile}')
-            self.kfile.save_file()
+            self.kfile.close()
         else:
             self.logger.info(f'Skipped writing to JSON file - {self.outfile}')

2 changes: 1 addition & 1 deletion padocc/phases/scan.py
@@ -360,7 +360,7 @@ def _compile_outputs(self, std_vars, cpf, volms, timings, ctypes, escape=None, i
             existing_details = details
 
         self.detail_cfg.set(existing_details)
-        self.detail_cfg.save_file()
+        self.detail_cfg.close()
 
 if __name__ == '__main__':
     print('Kerchunk Pipeline Config Scanner - run using master scripts')
209 changes: 208 additions & 1 deletion padocc/phases/validate_new.py
@@ -14,6 +14,8 @@
 import logging
 import math
 import re
+from functools import reduce
+from itertools import groupby

 from padocc.core.errors import (
     ShapeMismatchError,
@@ -37,7 +39,7 @@
 SUFFIXES = []
 SUFFIX_LIST = []
 
-from padocc.core import ProjectOperation
+from padocc.core import ProjectOperation, LoggedOperation

### Public Validation methods visible across PADOCC

@@ -420,6 +422,211 @@ def validate_selection(
     else:
         return compare_data(name, tbox, cbox, logger=logger, bypass=bypass)

+def _count_duplicates(arr: list, source_num: int = None):
+    """
+    Count the frequency of each item in a list.
+    If source_num is given, return the values whose frequency is
+    below source_num, i.e. items not present in all source arrays;
+    otherwise return the full frequency mapping.
+    """
+
+    freq_items = {}
+    for item in arr:
+        if item in freq_items:
+            freq_items[item] += 1
+        else:
+            freq_items[item] = 1
+
+    if source_num is None:
+        return freq_items
+    else:
+        missing = []
+        for item, value in freq_items.items():
+            if value < source_num:
+                missing.append(item)
+        return missing


+class ValidateDatasets(LoggedOperation):
+    def __init__(
+            self,
+            datasets: list,
+            identifier: str,
+            logger=None,
+            label: str = None,
+            fh: str = None,
+            logid: str = None,
+            verbose: bool = None,
+        ):
+        """
+        Initialiser for the ValidateDatasets class.
+        Given a list of xarray.Dataset objects, all methods applied to
+        all datasets should give the same values as an output - the
+        outputs should be equivalent.
+        These dataset objects should be identical, just from different sources.
+        """
+
+        self._identifier = identifier
+        self._datasets = datasets
+
+        super().__init__(
+            logger,
+            label=label,
+            fh=fh,
+            logid=logid,
+            verbose=verbose
+        )
+
+    def __str__(self):
+        return f'<PADOCC Validator: {self._identifier}>'

+    def validate_all(self, allowances: dict = None):
+        """
+        Run all validation steps on this set of datasets.
+        """
+
+        allowances = allowances or {}
+        ignore_vars, ignore_dims, ignore_globals = None, None, None
+
+        # Validate global attributes
+        if 'ignore_global_attrs' in allowances:
+            ignore_globals = {'ignore': allowances['ignore_global_attrs']}
+
+        self.validate_global_attrs(allowances=ignore_globals)
+
+        if 'ignore_variables' in allowances:
+            ignore_vars = {'ignore': allowances['ignore_variables']}
+        if 'ignore_dimensions' in allowances:
+            ignore_dims = {'ignore': allowances['ignore_dimensions']}
+
+        # Validate variables/dimensions
+        self.validate_variables(allowances=ignore_vars)
+        self.validate_dimensions(allowances=ignore_dims)

+    def validate_variables(self, allowances: dict = None):
+        """
+        Validate variables public method
+        """
+        self._validate_selector(allowances=allowances, selector='variables')
+
+    def validate_dimensions(self, allowances: dict = None):
+        """
+        Validate dimensions public method
+        """
+        self._validate_selector(allowances=allowances, selector='dimensions')

+    def _validate_selector(self, allowances: dict = None, selector: str = 'variables'):
+        """
+        Ensure all variables/dimensions are consistent across all datasets.
+        The allowances dict contains configurations for skipping some
+        variables, for example in the case of a virtual dimension.
+        allowances:
+          ignore: [list to ignore]
+        """
+        ignore_vars = []
+
+        allowances = allowances or {}
+        if 'ignore' in allowances:
+            ignore_vars = allowances['ignore']
+
+        compare_vars = [[] for _ in range(len(self._datasets))]
+        total_list = []
+        for index, d in enumerate(self._datasets):
+
+            vset = getattr(d, selector)
+
+            for var in vset:
+                if var in ignore_vars:
+                    continue
+                compare_vars[index].append(var)
+            total_list.extend(compare_vars[index])
+
+        # Check each list has the same number of variables.
+        if len(total_list) != len(compare_vars[0])*len(compare_vars):
+            raise VariableMismatchError(
+                f'The number of {selector} between datasets does not match: '
+                f'Datasets have {[len(c) for c in compare_vars]} {selector} '
+                'respectively.'
+            )
+
+        # Check all variables are present in all datasets.
+        missing = _count_duplicates(total_list, source_num=len(self._datasets))
+        if missing:
+            raise VariableMismatchError(
+                f'Inconsistent {selector} between datasets - {selector} '
+                f'not present in all files: {missing}'
+            )
+
+        # Check variables appear in the same order in all datasets,
+        # using the groupby all-equal idiom at each aligned position.
+        in_order = True
+        for vset in zip(*compare_vars):
+            groups = groupby(vset)
+            is_equal = next(groups, True) and not next(groups, False)
+            in_order = in_order and is_equal
+
+        # Warning for different ordering only.
+        if not in_order:
+            self.logger.warning(
+                f'{selector} present in a different order between datasets'
+            )

+    def validate_global_attrs(self, allowances: dict = None):
+        """
+        Validate the set of global attributes across all datasets
+        """
+
+        allowances = allowances or {}
+        ignore = []
+        if 'ignore' in allowances:
+            ignore = allowances['ignore']
+
+        attrset = []
+        for d in self._datasets:
+            attrset.append(d.attrs)
+
+        self._validate_attrs(attrset, source='global.', ignore=ignore)


+    def _validate_attrs(self, attrset: list, source: str = '', ignore: list = None):
+        """
+        Ensure all values across the sets of attributes are consistent
+        """
+
+        ignore = ignore or []
+        for attr in attrset[0].keys():
+
+            # Try extracting this attribute from all attribute sets.
+            try:
+                set_of_values = [a[attr] for a in attrset]
+            except KeyError:
+                # A missing dict key raises KeyError (not IndexError).
+                if attr not in ignore:
+                    raise ValueError(
+                        f'Attribute {source}{attr} not present in all datasets'
+                    )
+                # Ignored attribute missing from some datasets - skip it.
+                continue
+
+            for s in set_of_values[1:]:
+                if not np.all(s == set_of_values[0]):
+                    raise ValueError(
+                        f'Attribute {source}{attr} is not equal across all datasets: '
+                        f'Found values: {set_of_values}'
+                    )

+    def validate_shapes(self, allowances: dict = None):
+        """
+        Ensure all variable shapes are consistent across all datasets.
+        The allowances dict contains configurations for skipping some
+        shape tests, for example in the case of a virtual dimension.
+        """
+        pass
+
+    def validate_data(self, allowances: dict = None):
+        """
+        Perform data validations using the growbox method for all datasets.
+        """
+        pass


 class ValidateOperation(ProjectOperation):
     """
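The ValidateDatasets class added above compares a set of supposedly identical datasets from different sources, checking that global attributes, variables and dimensions agree. A minimal usage sketch, assuming two xarray.Dataset objects; the file paths and the 'history' allowance value are placeholders:

    import xarray as xr
    from padocc.phases.validate_new import ValidateDatasets

    control = xr.open_dataset('control.nc')   # placeholder paths
    product = xr.open_dataset('product.nc')

    validator = ValidateDatasets(
        [control, product],
        identifier='my-dataset',
        label='validate-example',
    )
    # Allow 'history' to be absent from some datasets without failing.
    validator.validate_all(allowances={'ignore_global_attrs': ['history']})

The presence check is driven by the _count_duplicates helper: _count_duplicates(['a', 'b', 'a'], source_num=2) returns ['b'], since 'b' appears in only one of the two source lists.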
15 changes: 15 additions & 0 deletions padocc/tests/test_init.py
@@ -2,6 +2,21 @@

 WORKDIR = 'padocc/tests/auto_testdata_dir'
 
+infile = 'padocc/tests/data/myfile.csv'
+# Input CSV has Identifier, Path/To/Datasets, {updates}, {removals}
+
+groupID = 'padocc-test-suite'
+workdir = '/home/username/padocc-workdir'
+
+mygroup = GroupOperation(
+    groupID,
+    workdir=workdir,
+    label='test_group'
+)
+
+mygroup.init_from_file(infile)
+
+
 class TestInit:
 
     def test_init_basic(self, wd=WORKDIR):
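The module-level setup above builds a test group from a CSV file (GroupOperation is presumably imported on the file's first line, outside this hunk). Going by the in-line comment describing the columns (Identifier, Path/To/Datasets, {updates}, {removals}), a row of that CSV might look like the following; the identifier, glob pattern and empty update/removal dicts are hypothetical:

    my-dataset-id,/path/to/datasets/*.nc,{},{}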
