Various changes to several scripts
dwest77a committed Nov 11, 2024
1 parent 3b4d598 commit af48b71
Showing 7 changed files with 239 additions and 17 deletions.
4 changes: 2 additions & 2 deletions padocc/core/filehandlers.py
@@ -28,7 +28,7 @@ class FileIOMixin(LoggedOperation):
     no attributes passed to either of these.
         fh.create_file()
-        fh.save_file()
+        fh.close()
     3. Get/set:
Expand Down Expand Up @@ -126,7 +126,7 @@ def create_file(self):
         else:
             self.logger.info(f'DRYRUN: Skipped creating "{self._file}"')
 
-    def save_file(self):
+    def close(self):
        """
        Wrapper for _set_content method
        """
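The rename applied in this file (the public save_file() method becomes close()) propagates to every call site in the files below. A minimal sketch of the resulting filehandler lifecycle, assuming a concrete FileIOMixin subclass; the JSONFileHandler name and its constructor arguments are hypothetical, only create_file/set/close come from the diff:

    # Hypothetical concrete filehandler built on FileIOMixin.
    fh = JSONFileHandler(workdir, 'base_cfg', logger=logger)
    fh.create_file()   # creates the file on disk; logs and skips under dryrun
    fh.set(content)    # stage new content in memory
    fh.close()         # formerly save_file(); wraps _set_content to write out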
8 changes: 4 additions & 4 deletions padocc/core/project.py
@@ -253,10 +253,10 @@ def update_status(

     def save_files(self):
         # Add all files here.
-        self.base_cfg.save_file()
-        self.detail_cfg.save_file()
-        self.allfiles.save_file()
-        self.status_log.save_file()
+        self.base_cfg.close()
+        self.detail_cfg.close()
+        self.allfiles.close()
+        self.status_log.close()
 
     def _configure_filelist(self):
         pattern = self.base_cfg['pattern']
8 changes: 4 additions & 4 deletions padocc/operations/group.py
@@ -315,11 +315,11 @@ def add_project(self):

     def _save_proj_codes(self):
         for pc in self.proj_codes.keys():
-            self.proj_codes[pc].save_file()
+            self.proj_codes[pc].close()
 
     def save_files(self):
-        self.blacklist_codes.save_file()
-        self.datasets.save_file()
+        self.blacklist_codes.close()
+        self.datasets.close()
         self._save_proj_codes()
 
     def _add_proj_codeset(self, name : str, newcodes : list):
@@ -495,7 +495,7 @@ def _create_job_array(
         ]
 
         sbatch.update(sbatch_contents)
-        sbatch.save_file()
+        sbatch.close()
 
         if self._dryrun:
             self.logger.info('DRYRUN: sbatch command: ')
10 changes: 5 additions & 5 deletions padocc/phases/compute.py
@@ -121,7 +121,7 @@ def run(self, nfile: str, filehandler=None, extension=None, **kwargs) -> dict:

         if filehandler:
             filehandler.set(tdict)
-            filehandler.save_file()
+            filehandler.close()
         return tdict, ctype
 
     def _convert_kerchunk(self, nfile: str, ctype, **kwargs) -> None:
@@ -326,12 +326,12 @@ def _run_with_timings(self, func):
         detail['timings']['compute_actual'] = compute_time
 
         self.detail_cfg.set(detail)
-        self.detail_cfg.save_file()
+        self.detail_cfg.close()
         return 'Success'
 
     def save_files(self):
         super().save_files()
-        self.temp_zattrs.save_file()
+        self.temp_zattrs.close()
 
     @property
     def outpath(self):
@@ -703,7 +703,7 @@ def create_refs(self) -> None:
             if not self.quality_required:
                 self._perform_shape_checks(ref)
             CacheFile.set(ref)
-            CacheFile.save_file()
+            CacheFile.close()
             ctypes.append(ctype)
 
         self.success = converter.success
@@ -871,7 +871,7 @@ def _data_to_json(self, refs: dict) -> None:

         if not self.partial:
             self.logger.info(f'Written to JSON file - {self.outfile}')
-            self.kfile.save_file()
+            self.kfile.close()
         else:
             self.logger.info(f'Skipped writing to JSON file - {self.outfile}')

2 changes: 1 addition & 1 deletion padocc/phases/scan.py
@@ -360,7 +360,7 @@ def _compile_outputs(self, std_vars, cpf, volms, timings, ctypes, escape=None, i
             existing_details = details
 
         self.detail_cfg.set(existing_details)
-        self.detail_cfg.save_file()
+        self.detail_cfg.close()
 
 if __name__ == '__main__':
     print('Kerchunk Pipeline Config Scanner - run using master scripts')
209 changes: 208 additions & 1 deletion padocc/phases/validate_new.py
@@ -14,6 +14,8 @@
 import logging
 import math
 import re
+from functools import reduce
+from itertools import groupby

 from padocc.core.errors import (
     ShapeMismatchError,
@@ -37,7 +39,7 @@
 SUFFIXES = []
 SUFFIX_LIST = []
 
-from padocc.core import ProjectOperation
+from padocc.core import ProjectOperation, LoggedOperation

### Public Validation methods visible across PADOCC

@@ -420,6 +422,211 @@ def validate_selection(
     else:
         return compare_data(name, tbox, cbox, logger=logger, bypass=bypass)

+def _count_duplicates(arr: list, source_num: int = None):
+    """
+    Count the frequency of each item in a list.
+    If source_num is given, return the values whose frequency is
+    below source_num, i.e. items not present in all source arrays;
+    otherwise return the full frequency mapping.
+    """
+
+    freq_items = {}
+    for item in arr:
+        if item in freq_items:
+            freq_items[item] += 1
+        else:
+            freq_items[item] = 1
+
+    if source_num is None:
+        return freq_items
+    else:
+        missing = []
+        for item, value in freq_items.items():
+            if value < source_num:
+                missing.append(item)
+        return missing


+class ValidateDatasets(LoggedOperation):
+    def __init__(
+            self,
+            datasets: list,
+            identifier: str,
+            logger=None,
+            label: str = None,
+            fh: str = None,
+            logid: str = None,
+            verbose: bool = None,
+        ):
+        """
+        Initialiser for the ValidateDatasets class.
+        Given a list of xarray.Dataset objects, all methods applied to
+        all datasets should give the same values as an output - the
+        outputs should be equivalent.
+        These dataset objects should be identical, just from different sources.
+        """
+
+        self._identifier = identifier
+        self._datasets = datasets
+
+        super().__init__(
+            logger,
+            label=label,
+            fh=fh,
+            logid=logid,
+            verbose=verbose
+        )
+
+    def __str__(self):
+        return f'<PADOCC Validator: {self._identifier}>'

+    def validate_all(self, allowances: dict = None):
+        """
+        Run all validation steps on this set of datasets.
+        """
+
+        allowances = allowances or {}
+        ignore_vars, ignore_dims, ignore_globals = None, None, None
+
+        # Validate global attributes
+        if 'ignore_global_attrs' in allowances:
+            ignore_globals = {'ignore': allowances['ignore_global_attrs']}
+
+        self.validate_global_attrs(allowances=ignore_globals)
+
+        if 'ignore_variables' in allowances:
+            ignore_vars = {'ignore': allowances['ignore_variables']}
+        if 'ignore_dimensions' in allowances:
+            ignore_dims = {'ignore': allowances['ignore_dimensions']}
+
+        # Validate variables/dimensions
+        self.validate_variables(allowances=ignore_vars)
+        self.validate_dimensions(allowances=ignore_dims)

+    def validate_variables(self, allowances: dict = None):
+        """
+        Validate variables public method
+        """
+        self._validate_selector(allowances=allowances, selector='variables')
+
+    def validate_dimensions(self, allowances: dict = None):
+        """
+        Validate dimensions public method
+        """
+        self._validate_selector(allowances=allowances, selector='dimensions')

+    def _validate_selector(self, allowances: dict = None, selector: str = 'variables'):
+        """
+        Ensure all variables/dimensions are consistent across all datasets.
+        The allowances dict contains configurations for skipping some
+        variables, for example in the case of a virtual dimension.
+        allowances:
+          ignore: [list to ignore]
+        """
+        ignore_vars = []
+
+        allowances = allowances or {}
+        if 'ignore' in allowances:
+            ignore_vars = allowances['ignore']
+
+        compare_vars = [[] for _ in range(len(self._datasets))]
+        total_list = []
+        for index, d in enumerate(self._datasets):
+
+            vset = getattr(d, selector)
+
+            for var in vset:
+                if var in ignore_vars:
+                    continue
+                compare_vars[index].append(var)
+            total_list.extend(compare_vars[index])
+
+        # Check each list has the same number of variables.
+        if len(total_list) != len(compare_vars[0])*len(compare_vars):
+            raise VariableMismatchError(
+                f'The number of {selector} between datasets does not match: '
+                f'Datasets have {[len(c) for c in compare_vars]} {selector} '
+                'respectively.'
+            )
+
+        # Check all variables are present in all datasets.
+        missing = _count_duplicates(total_list, source_num=len(self._datasets))
+        if missing:
+            raise VariableMismatchError(
+                f'Inconsistent {selector} between datasets - {selector} '
+                f'not present in all files: {missing}'
+            )
+
+        # Check variables appear in the same order in all datasets,
+        # using the groupby all-equal idiom at each aligned position.
+        in_order = True
+        for vset in zip(*compare_vars):
+            groups = groupby(vset)
+            is_equal = next(groups, True) and not next(groups, False)
+            in_order = in_order and is_equal
+
+        # Warning for different ordering only.
+        if not in_order:
+            self.logger.warning(
+                f'{selector} present in a different order between datasets'
+            )

+    def validate_global_attrs(self, allowances: dict = None):
+        """
+        Validate the set of global attributes across all datasets
+        """
+
+        allowances = allowances or {}
+        ignore = []
+        if 'ignore' in allowances:
+            ignore = allowances['ignore']
+
+        attrset = []
+        for d in self._datasets:
+            attrset.append(d.attrs)
+
+        self._validate_attrs(attrset, source='global.', ignore=ignore)


+    def _validate_attrs(self, attrset: list, source: str = '', ignore: list = None):
+        """
+        Ensure all values across the sets of attributes are consistent
+        """
+
+        ignore = ignore or []
+        for attr in attrset[0].keys():
+
+            # Try extracting this attribute from all attribute sets.
+            try:
+                set_of_values = [a[attr] for a in attrset]
+            except KeyError:
+                # A missing dict key raises KeyError (not IndexError).
+                if attr not in ignore:
+                    raise ValueError(
+                        f'Attribute {source}{attr} not present in all datasets'
+                    )
+                # Ignored attribute missing from some datasets - skip it.
+                continue
+
+            for s in set_of_values[1:]:
+                if not np.all(s == set_of_values[0]):
+                    raise ValueError(
+                        f'Attribute {source}{attr} is not equal across all datasets: '
+                        f'Found values: {set_of_values}'
+                    )

+    def validate_shapes(self, allowances: dict = None):
+        """
+        Ensure all variable shapes are consistent across all datasets.
+        The allowances dict contains configurations for skipping some
+        shape tests, for example in the case of a virtual dimension.
+        """
+        pass
+
+    def validate_data(self, allowances: dict = None):
+        """
+        Perform data validations using the growbox method for all datasets.
+        """
+        pass


 class ValidateOperation(ProjectOperation):
     """
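The ValidateDatasets class added above compares a set of supposedly identical datasets from different sources, checking that global attributes, variables and dimensions agree. A minimal usage sketch, assuming two xarray.Dataset objects; the file paths and the 'history' allowance value are placeholders:

    import xarray as xr
    from padocc.phases.validate_new import ValidateDatasets

    control = xr.open_dataset('control.nc')   # placeholder paths
    product = xr.open_dataset('product.nc')

    validator = ValidateDatasets(
        [control, product],
        identifier='my-dataset',
        label='validate-example',
    )
    # Allow 'history' to be absent from some datasets without failing.
    validator.validate_all(allowances={'ignore_global_attrs': ['history']})

The presence check is driven by the _count_duplicates helper: _count_duplicates(['a', 'b', 'a'], source_num=2) returns ['b'], since 'b' appears in only one of the two source lists.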
15 changes: 15 additions & 0 deletions padocc/tests/test_init.py
@@ -2,6 +2,21 @@

 WORKDIR = 'padocc/tests/auto_testdata_dir'
 
+infile = 'padocc/tests/data/myfile.csv'
+# Input CSV has Identifier, Path/To/Datasets, {updates}, {removals}
+
+groupID = 'padocc-test-suite'
+workdir = '/home/username/padocc-workdir'
+
+mygroup = GroupOperation(
+    groupID,
+    workdir=workdir,
+    label='test_group'
+)
+
+mygroup.init_from_file(infile)
+
+
 class TestInit:
 
     def test_init_basic(self, wd=WORKDIR):
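The module-level setup above builds a test group from a CSV file (GroupOperation is presumably imported on the file's first line, outside this hunk). Going by the in-line comment describing the columns (Identifier, Path/To/Datasets, {updates}, {removals}), a row of that CSV might look like the following; the identifier, glob pattern and empty update/removal dicts are hypothetical:

    my-dataset-id,/path/to/datasets/*.nc,{},{}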
