diff --git a/padocc/core/filehandlers.py b/padocc/core/filehandlers.py
index 6a485b3..37156cf 100644
--- a/padocc/core/filehandlers.py
+++ b/padocc/core/filehandlers.py
@@ -28,7 +28,7 @@ class FileIOMixin(LoggedOperation):
        no attributes passed to either of these.
 
        fh.create_file()
-       fh.save_file()
+       fh.close()
 
     3. Get/set:
@@ -126,7 +126,7 @@ def create_file(self):
         else:
             self.logger.info(f'DRYRUN: Skipped creating "{self._file}"')
 
-    def save_file(self):
+    def close(self):
         """
         Wrapper for _set_content method
         """
diff --git a/padocc/core/project.py b/padocc/core/project.py
index 588370a..5ffa226 100644
--- a/padocc/core/project.py
+++ b/padocc/core/project.py
@@ -253,10 +253,10 @@ def update_status(
 
     def save_files(self):
         # Add all files here.
-        self.base_cfg.save_file()
-        self.detail_cfg.save_file()
-        self.allfiles.save_file()
-        self.status_log.save_file()
+        self.base_cfg.close()
+        self.detail_cfg.close()
+        self.allfiles.close()
+        self.status_log.close()
 
     def _configure_filelist(self):
         pattern = self.base_cfg['pattern']
diff --git a/padocc/operations/group.py b/padocc/operations/group.py
index 2f359bd..0db1239 100644
--- a/padocc/operations/group.py
+++ b/padocc/operations/group.py
@@ -315,11 +315,11 @@ def add_project(self):
 
     def _save_proj_codes(self):
         for pc in self.proj_codes.keys():
-            self.proj_codes[pc].save_file()
+            self.proj_codes[pc].close()
 
     def save_files(self):
-        self.blacklist_codes.save_file()
-        self.datasets.save_file()
+        self.blacklist_codes.close()
+        self.datasets.close()
         self._save_proj_codes()
 
     def _add_proj_codeset(self, name : str, newcodes : list):
@@ -495,7 +495,7 @@ def _create_job_array(
         ]
 
         sbatch.update(sbatch_contents)
-        sbatch.save_file()
+        sbatch.close()
 
         if self._dryrun:
             self.logger.info('DRYRUN: sbatch command: ')
diff --git a/padocc/phases/compute.py b/padocc/phases/compute.py
index a3408c6..78a3d52 100644
--- a/padocc/phases/compute.py
+++ b/padocc/phases/compute.py
@@ -121,7 +121,7 @@ def run(self, nfile: str, filehandler=None, extension=None, **kwargs) -> dict:
 
         if filehandler:
             filehandler.set(tdict)
-            filehandler.save_file()
+            filehandler.close()
         return tdict, ctype
 
     def _convert_kerchunk(self, nfile: str, ctype, **kwargs) -> None:
@@ -326,12 +326,12 @@ def _run_with_timings(self, func):
             detail['timings']['compute_actual'] = compute_time
 
             self.detail_cfg.set(detail)
-            self.detail_cfg.save_file()
+            self.detail_cfg.close()
         return 'Success'
 
     def save_files(self):
         super().save_files()
-        self.temp_zattrs.save_file()
+        self.temp_zattrs.close()
 
     @property
     def outpath(self):
@@ -703,7 +703,7 @@ def create_refs(self) -> None:
                 if not self.quality_required:
                     self._perform_shape_checks(ref)
                 CacheFile.set(ref)
-                CacheFile.save_file()
+                CacheFile.close()
             ctypes.append(ctype)
 
         self.success = converter.success
@@ -871,7 +871,7 @@ def _data_to_json(self, refs: dict) -> None:
         if not self.partial:
             self.logger.info(f'Written to JSON file - {self.outfile}')
-            self.kfile.save_file()
+            self.kfile.close()
         else:
             self.logger.info(f'Skipped writing to JSON file - {self.outfile}')
diff --git a/padocc/phases/scan.py b/padocc/phases/scan.py
index ba21d51..c5a6d7d 100644
--- a/padocc/phases/scan.py
+++ b/padocc/phases/scan.py
@@ -360,7 +360,7 @@ def _compile_outputs(self, std_vars, cpf, volms, timings, ctypes, escape=None, i
             existing_details = details
 
         self.detail_cfg.set(existing_details)
-        self.detail_cfg.save_file()
+        self.detail_cfg.close()
 
 if __name__ == '__main__':
     print('Kerchunk Pipeline Config Scanner - run using master scripts')
\ No newline at end of file
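Note on the rename above: the filehandler write-out method changes from save_file() to close() across the core and phase modules. The sketch below is a self-contained, hypothetical stand-in (DummyJSONHandler is not part of padocc) that mirrors the create_file()/set()/close() lifecycle described in the FileIOMixin docstring, purely to illustrate the caller-facing pattern after the rename.

    import json
    import os

    class DummyJSONHandler:
        # Illustrative stand-in only; mirrors the FileIOMixin call pattern.
        def __init__(self, path: str):
            self._file = path
            self._value = None

        def create_file(self):
            # Create the backing file if it does not already exist.
            if not os.path.isfile(self._file):
                with open(self._file, 'w') as f:
                    f.write('{}')

        def set(self, value):
            # Stage new content in memory.
            self._value = value

        def close(self):
            # Formerly save_file(): write the staged content to disk.
            with open(self._file, 'w') as f:
                json.dump(self._value, f)

    fh = DummyJSONHandler('detail-cfg.json')
    fh.create_file()
    fh.set({'version': 1})
    fh.close()
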
diff --git a/padocc/phases/validate_new.py b/padocc/phases/validate_new.py
index 68b2143..08c4fca 100644
--- a/padocc/phases/validate_new.py
+++ b/padocc/phases/validate_new.py
@@ -14,6 +14,8 @@
 import logging
 import math
 import re
+from functools import reduce
+from itertools import groupby
 
 from padocc.core.errors import (
     ShapeMismatchError,
@@ -37,7 +39,7 @@
 SUFFIXES = []
 SUFFIX_LIST = []
 
-from padocc.core import ProjectOperation
+from padocc.core import ProjectOperation, LoggedOperation
 
 ### Public Validation methods visible across PADOCC
 
@@ -420,6 +422,211 @@ def validate_selection(
     else:
         return compare_data(name, tbox, cbox, logger=logger, bypass=bypass)
 
+def _count_duplicates(arr: list, source_num: int = None):
+    """
+    Count the number of duplicates in a list
+    compared to the source number - return the values
+    that are not present in all source arrays.
+    """
+
+    freq_items = {}
+    for item in arr:
+        if item in freq_items:
+            freq_items[item] += 1
+        else:
+            freq_items[item] = 1
+
+    if source_num is None:
+        return freq_items
+    else:
+        missing = []
+        for item, value in freq_items.items():
+            if value < source_num:
+                missing.append(item)
+        return missing
+
+
+class ValidateDatasets(LoggedOperation):
+    def __init__(
+        self,
+        datasets: list,
+        identifier: str,
+        logger = None,
+        label: str = None,
+        fh: str = None,
+        logid: str = None,
+        verbose: bool = None,
+    ):
+        """
+        Initiator for the ValidateDatasets class.
+        Given a list of xarray.Dataset objects, all methods applied to
+        all datasets should give the same values as an output - the
+        outputs should be equivalent.
+
+        These dataset objects should be identical, just from different sources.
+        """
+
+        self._identifier = identifier
+        self._datasets = datasets
+
+        super().__init__(
+            logger,
+            label=label,
+            fh=fh,
+            logid=logid,
+            verbose=verbose
+        )
+
+    def __str__(self):
+        return f'<ValidateDatasets: {self._identifier}>'
+
+    def validate_all(self, allowances: dict = None):
+        """
+        Run all validation steps on this set of datasets.
+        """
+
+        allowances = allowances or {}
+        ignore_vars, ignore_dims, ignore_globals = None, None, None
+
+        # Validate global attributes
+        if 'ignore_global_attrs' in allowances:
+            ignore_globals = {'ignore': allowances['ignore_global_attrs']}
+
+        self.validate_global_attrs(allowances=ignore_globals)
+
+        if 'ignore_variables' in allowances:
+            ignore_vars = {'ignore': allowances['ignore_variables']}
+        if 'ignore_dimensions' in allowances:
+            ignore_dims = {'ignore': allowances['ignore_dimensions']}
+
+        # Validate variables/dimensions
+        self.validate_variables(allowances=ignore_vars)
+        self.validate_dimensions(allowances=ignore_dims)
+
+    def validate_variables(self, allowances: dict = None):
+        """
+        Validate variables public method
+        """
+        self._validate_selector(allowances=allowances, selector='variables')
+
+    def validate_dimensions(self, allowances: dict = None):
+        """
+        Validate dimensions public method
+        """
+        self._validate_selector(allowances=allowances, selector='dimensions')
+
+    def _validate_selector(self, allowances: dict = None, selector: str = 'variables'):
+        """
+        Ensure all variables/dimensions are consistent across all datasets.
+        The allowances dict contains configurations for skipping some variables,
+        for example in the case of a virtual dimension.
+
+        allowances:
+          ignore: [list to ignore]
+        """
+        ignore_vars = []
+
+        allowances = allowances or {}
+        if 'ignore' in allowances:
+            ignore_vars = allowances['ignore']
+
+        compare_vars = [[] for d in self._datasets]
+        total_list = []
+        for index, d in enumerate(self._datasets):
+
+            vset = getattr(d, selector)
+
+            for var in vset:
+                if var in ignore_vars:
+                    continue
+                compare_vars[index].append(var)
+            total_list.extend(compare_vars[index])
+
+        # Check each list has the same number of variables.
+        if len(total_list) != len(compare_vars[0])*len(compare_vars):
+            raise VariableMismatchError(
+                f'The number of {selector} between datasets does not match: '
+                f'Datasets have {[len(c) for c in compare_vars]} {selector} '
+                'respectively.'
+            )
+
+        # Check all variables are present in all datasets.
+        missing = _count_duplicates(total_list, source_num=len(self._datasets))
+        if missing:
+            raise VariableMismatchError(
+                f'Inconsistent {selector} between datasets - {selector} '
+                f'not present in all files: {missing}'
+            )
+
+        # Check variables appear in the same order in all datasets
+        in_order = True
+        for vset in zip(*compare_vars):
+            vars = groupby(vset)
+            is_equal = next(vars, True) and not next(vars, False)
+            in_order = in_order and is_equal
+
+        # Warning for different ordering only.
+        if not in_order:
+            self.logger.warning(
+                f'{selector} present in a different order between datasets'
+            )
+
+    def validate_global_attrs(self, allowances: dict = None):
+        """
+        Validate the set of global attributes across all datasets
+        """
+
+        allowances = allowances or {}
+        ignore = []
+        if 'ignore' in allowances:
+            ignore = allowances['ignore']
+
+        attrset = []
+        for d in self._datasets:
+            attrset.append(d.attrs)
+
+        self._validate_attrs(attrset, source='global.', ignore=ignore)
+
+    def _validate_attrs(self, attrset: list, source: str = '', ignore: list = None):
+        """
+        Ensure all values across the sets of attributes are consistent
+        """
+
+        ignore = ignore or []
+        for attr in attrset[0].keys():
+
+            # Try extracting this attribute from all attribute sets.
+            try:
+                set_of_values = [a[attr] for a in attrset]
+            except KeyError:
+                if attr not in ignore:
+                    raise ValueError(
+                        f'Attribute {source}{attr} not present in all datasets'
+                    )
+                continue
+
+            for s in set_of_values[1:]:
+                if not np.all(s == set_of_values[0]):
+                    raise ValueError(
+                        f'Attribute {source}{attr} is not equal across all datasets: '
+                        f'Found values: {set_of_values}'
+                    )
+
+    def validate_shapes(self, allowances: dict = None):
+        """
+        Ensure all variable shapes are consistent across all datasets.
+        The allowances dict contains configurations for skipping some shape tests,
+        for example in the case of a virtual dimension.
+        """
+        pass
+
+    def validate_data(self, allowances: dict = None):
+        """
+        Perform data validations using the growbox method for all datasets.
+        """
+        pass
+
 
 class ValidateOperation(ProjectOperation):
     """
diff --git a/padocc/tests/test_init.py b/padocc/tests/test_init.py
index cb8f677..7d1e09e 100644
--- a/padocc/tests/test_init.py
+++ b/padocc/tests/test_init.py
@@ -2,6 +2,21 @@
 
 WORKDIR = 'padocc/tests/auto_testdata_dir'
 
+infile = 'padocc/tests/data/myfile.csv'
+# Input CSV has Identifier, Path/To/Datasets, {updates}, {removals}
+
+groupID = 'padocc-test-suite'
+workdir = '/home/username/padocc-workdir'
+
+mygroup = GroupOperation(
+    groupID,
+    workdir=workdir,
+    label='test_group'
+)
+
+mygroup.init_from_file(infile)
+
+
 class TestInit:
 
     def test_init_basic(self, wd=WORKDIR):
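
Usage sketch for the new ValidateDatasets class added in validate_new.py above, assuming xarray and numpy are installed and the class is importable from padocc.phases.validate_new; the toy datasets and the 'history' allowance are illustrative only. Note that validate_dimensions() reads a `dimensions` attribute from each dataset object (as netCDF4.Dataset provides), whereas xarray Datasets expose `dims`, so only the variable and global-attribute checks are exercised here.

    import numpy as np
    import xarray as xr

    from padocc.phases.validate_new import ValidateDatasets

    # Two equivalent datasets, as if opened from different sources.
    data = np.arange(12.0).reshape(3, 4)
    ds_a = xr.Dataset({'tas': (('time', 'lat'), data)}, attrs={'source': 'demo'})
    ds_b = ds_a.copy(deep=True)

    validator = ValidateDatasets(
        [ds_a, ds_b],
        identifier='demo-pair',
        label='validate_demo'
    )

    # Each check raises VariableMismatchError/ValueError on inconsistency
    # and is silent on success.
    validator.validate_variables()
    validator.validate_global_attrs(allowances={'ignore': ['history']})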