remove load_commondata_instance, use load everywhere; freeze metadata
scarlehoff committed Feb 14, 2024
1 parent 49e8f23 commit 42f7928
Showing 7 changed files with 92 additions and 84 deletions.
124 changes: 69 additions & 55 deletions validphys2/src/validphys/commondataparser.py
@@ -49,10 +49,17 @@

from reportengine.compat import yaml
from validphys.coredata import KIN_NAMES, CommonData
from validphys.datafiles import new_to_legacy_map
from validphys.datafiles import new_to_legacy_map, path_commondata
from validphys.plotoptions.plottingoptions import PlottingOptions, labeler_functions
from validphys.utils import parse_yaml_inp

# JCM:
# Some notes for developers
# The usage of `frozen` in the definitions of the dataclasses is not strictly necessary;
# however, changing the metadata can have side effects in many parts of validphys.
# By freezing the overall class (and leaving only specific attributes unfrozen) we get
# more granular control. Please use a setter (``object.__setattr__``) to modify a
# frozen class instead of removing ``frozen``.
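
A minimal sketch of the setter pattern the note above asks for (the `Example` class is hypothetical, not part of this commit):

```python
import dataclasses
from typing import Optional

@dataclasses.dataclass(frozen=True)
class Example:
    name: str
    cached: Optional[str] = None

ex = Example(name="DIS")
try:
    ex.cached = "value"  # frozen dataclasses forbid normal attribute assignment
except dataclasses.FrozenInstanceError:
    pass
# the escape hatch used throughout this module:
object.__setattr__(ex, "cached", "value")
assert ex.cached == "value"
```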

EXT = "pineappl.lz4"
_INDEX_NAME = "entry"

@@ -161,24 +168,13 @@ def ValidOperation(op_str: Optional[str]) -> str:

@dataclasses.dataclass
class ValidApfelComb:
"""Some of the grids might have been converted from apfelcomb and introduce hacks.
These are the allowed hacks:
- repetition_flag:
list of fktables which might need to be repeated
necessary to apply c-factors in compound observables
- normalization:
mapping from each single fktable that needs to be normalized to the corresponding factor
note that when they are global factors they are promoted to conversion_factor
- shifts:
this flag is left here for compatibility purposes but has been moved to TheoryMeta
"""

# TODO: to be removed
repetition_flag: Optional[list[str]] = None
normalization: Optional[dict] = None
shifts: Optional[dict] = None


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class TheoryMeta:
"""Contains the necessary information to load the associated fktables
@@ -228,16 +224,17 @@ class TheoryMeta:

comment: Optional[str] = None

# The following options are transitional so that the old yamldb files can be used
# The following options are transitional and will eventually be removed
apfelcomb: Optional[ValidApfelComb] = None
appl: Optional[bool] = False
target_dataset: Optional[str] = None

def __post_init__(self):
"""If a ``shifts`` flag is found in the apfelcomb object, move it outside"""
if self.apfelcomb is not None:
log.warning(
f"Apfelcomb key is being used to read {self.FK_tables}, please update the commondata file"
)
if self.apfelcomb.shifts is not None and self.shifts is None:
self.shifts = self.apfelcomb.shifts
object.__setattr__(self, 'shifts', self.apfelcomb.shifts)
self.apfelcomb.shifts = None

def fktables_to_paths(self, grids_folder):
@@ -274,7 +271,7 @@ def __hash__(self):
## Theory end


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class Variant:
"""The new commondata format allow the usage of variants
A variant can overwrite a number of keys, as defined by this dataclass
@@ -289,7 +286,7 @@ class Variant:


### Kinematic data
@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class ValidVariable:
"""Defines the variables"""

@@ -310,7 +307,7 @@ def apply_label(self, value):
return tmp


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class ValidKinematics:
"""Contains the metadata necessary to load the kinematics of the dataset.
The variables should be a dictionary with the key naming the variable
@@ -352,7 +349,7 @@ def apply_label(self, var, value):


### Observable and dataset definitions
@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class ObservableMetaData:
observable_name: str
observable: dict
@@ -377,9 +374,11 @@ class ObservableMetaData:
variants: Optional[ValidVariants] = dataclasses.field(default_factory=dict)
applied_variant: Optional[str] = None
ported_from: Optional[str] = None
_parent: Optional[
Any
] = None # Note that an observable without a parent will fail in many different ways

# Derived quantities:
# Note that an observable without a parent will fail in many different ways
_parent: Optional[Any] = None
_plotting_options: Optional[Any] = None

def __post_init__(self):
"""
@@ -396,7 +395,7 @@ def __post_init__(self):
else:
self.kinematic_coverage += [f"extra_{i}" for i in range(diff_to_3)]

self.process_type = self.process_type.upper()
object.__setattr__(self, 'process_type', self.process_type.upper())
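
In isolation, the coverage padding above behaves like this sketch (hypothetical values; the real code operates on `self.kinematic_coverage`):

```python
# kinematics are stored as three variables (k1, k2, k3), so pad short lists
kinematic_coverage = ["x", "Q2"]
diff_to_3 = 3 - len(kinematic_coverage)
kinematic_coverage += [f"extra_{i}" for i in range(diff_to_3)]
assert kinematic_coverage == ["x", "Q2", "extra_0"]
```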

def check(self):
"""Various checks to apply manually to the observable before it is used anywhere
@@ -680,17 +679,22 @@ def _plotting_options_set(self):

return self.plotting

@cached_property
@property
def plotting_options(self):
try:
return self._plotting_options_set()
except Exception as e:
# There are many chances for failure here
log.error(f"Failure for: {self.name}")
raise e
# Using __setattr__ instead of a cached_property
# in order for the settings to propagate to the variants
if self._plotting_options is None:
try:
tmp = self._plotting_options_set()
except Exception as e:
# There are many chances for failure here
log.error(f"Failure for: {self.name}")
raise e
object.__setattr__(self, "_plotting_options", tmp)
return self._plotting_options
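
The same lazy-initialisation idiom in isolation (hypothetical `Lazy` class): the cached value is written into a real dataclass field with `object.__setattr__`, instead of the instance `__dict__` entry a `cached_property` would create, which is presumably what lets the computed settings propagate to the variants:

```python
import dataclasses
from typing import Optional

@dataclasses.dataclass(frozen=True)
class Lazy:
    raw: int
    _doubled: Optional[int] = None

    @property
    def doubled(self):
        if self._doubled is None:
            # compute once, then store the result on the frozen instance
            object.__setattr__(self, "_doubled", self.raw * 2)
        return self._doubled

lazy = Lazy(raw=21)
assert lazy.doubled == 42
assert lazy._doubled == 42  # the result is now ordinary dataclass state
```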


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class ValidReference:
"""Holds literature information for the dataset"""

@@ -700,7 +704,7 @@ class ValidReference:
tables: list[int] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class SetMetaData:
"""Metadata of the whole set"""

@@ -712,39 +716,49 @@ class SetMetaData:
arXiv: Optional[ValidReference] = None
iNSPIRE: Optional[ValidReference] = None
hepdata: Optional[ValidReference] = None
_folder: Optional[Path] = None

@property
def folder(self):
# TODO: at the moment the folder is set manually by the parser of the metadata
# since the new commondata is still not installed (or declared in the profile)
return self._folder
# return _folder_data / self.setname
return path_commondata / self.setname

@cached_property
def allowed_observables(self):
"""
Returns the implemented observables as a {observable_name.upper(): observable} dictionary
"""
return {o.observable_name.upper(): o for o in self.implemented_observables}

def select_observable(self, obs_name_raw):
"""Check whether the observable is implemented and return said observable"""
# TODO: should we check that we don't have two observables with the same name?
obs_name = obs_name_raw.lower().strip()
for observable in self.implemented_observables:
if observable.observable_name.lower().strip() == obs_name:
# Not very happy with this but not sure how to do in a better way?
observable._parent = self
observable.check()
return observable
raise ValueError(f"The selected observable {obs_name} does not exist in {self.setname}")
obs_name = obs_name_raw.upper()
try:
observable = self.allowed_observables[obs_name]
except KeyError:
raise ValueError(
f"The selected observable {obs_name_raw} does not exist in {self.setname}"
)

# Now burn the _parent key into the observable and apply checks
object.__setattr__(observable, "_parent", self)
observable.check()
return observable
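
A sketch of the intended call pattern (the set and observable names are placeholders, not taken from this commit):

```python
from validphys.datafiles import path_commondata
from validphys.utils import parse_yaml_inp

metadata_file = path_commondata / "SOME_SET" / "metadata.yaml"  # placeholder
set_metadata = parse_yaml_inp(metadata_file, SetMetaData)
obs = set_metadata.select_observable("some_obs")  # the name is upper-cased internally
assert obs._parent is set_metadata
```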

###

@lru_cache
def _parse_entire_set_metadata(metadata_file):
"""Read the metadata file"""
return parse_yaml_inp(metadata_file, SetMetaData)


@lru_cache
def parse_new_metadata(metadata_file, observable_name, variant=None):
"""Given a metadata file in the new format and the specific observable to be read
load and parse the metadata and select the observable.
If any variants are selected, apply them.
load and parse the metadata and select the observable. If any variants are selected, apply them.
The triplet (metadata_file, observable_name, variant) defines unequivocally the information
to be parsed from the commondata library
"""
# Note: we are re-loading many times the same yaml file, possibly a good target for lru_cache
set_metadata = parse_yaml_inp(metadata_file, SetMetaData)
set_metadata._folder = metadata_file.parent
set_metadata = _parse_entire_set_metadata(metadata_file)

# Select one observable from the entire metadata
metadata = set_metadata.select_observable(observable_name)
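
Since both the set-level parse and this function are wrapped in `lru_cache`, asking twice for the same (metadata_file, observable_name, variant) triple should return the very same object (a sketch with a placeholder path):

```python
from validphys.datafiles import path_commondata

metadata_file = path_commondata / "SOME_SET" / "metadata.yaml"  # placeholder
md1 = parse_new_metadata(metadata_file, "SOME_OBS")
md2 = parse_new_metadata(metadata_file, "SOME_OBS")
assert md1 is md2  # served from the cache; the yaml file is read only once
```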
4 changes: 2 additions & 2 deletions validphys2/src/validphys/commondatawriter.py
@@ -25,7 +25,7 @@ def write_commondata_data(commondata, buffer):
>>> from io import StringIO
>>> l = Loader()
>>> cd = l.check_commondata("NMC").load_commondata_instance()
>>> cd = l.check_commondata("NMC").load()
>>> sio = StringIO()
>>> write_commondata_data(cd,sio)
>>> print(sio.getvalue())
@@ -65,7 +65,7 @@ def write_systype_data(commondata, buffer):
>>> from io import StringIO
>>> l = Loader()
>>> cd = l.check_commondata("NMC").load_commondata_instance()
>>> cd = l.check_commondata("NMC").load()
>>> sio = StringIO()
>>> write_systype_data(cd,sio)
>>> print(sio.getvalue())
19 changes: 4 additions & 15 deletions validphys2/src/validphys/core.py
@@ -22,8 +22,7 @@
from validphys import filters, lhaindex
from validphys.commondataparser import (
get_plot_kinlabels,
parse_commondata,
parse_commondata_new,
load_commondata,
peek_commondata_metadata,
)
from validphys.fkparser import load_fktable, parse_cfactor
@@ -257,7 +256,7 @@ def __init__(self, name, metadata, legacy=False, datafile=None, sysfile=None, pl
def name(self):
return self.metadata.name

@property
@functools.cached_property
def nsys(self):
if self.legacy:
return self.metadata.nsys
@@ -275,7 +274,7 @@ def process_type(self):

@property
def metadata(self):
if self._metadata is None:
if self.legacy:
self._metadata = peek_commondata_metadata(self.datafile)
return self._metadata

@@ -291,20 +290,10 @@ def __str__(self):
def __iter__(self):
return iter((self.datafile, self.sysfile, self.plotfiles))

# TODO: one of the two functions below needs to go
@functools.lru_cache()
def load(self):
if self.legacy:
return parse_commondata(self.datafile, self.sysfile, self.name)
else:
return parse_commondata_new(self.metadata)

def load_commondata_instance(self):
"""
load a validphys.core.CommonDataSpec to validphys.core.CommonData
"""
from validphys.commondataparser import load_commondata

return load_commondata(self)
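
With `functools.lru_cache` on the method, repeated loads through the same spec are served from the cache; a sketch mirroring the doctest usage in commondatawriter above (assumes the "NMC" set is available):

```python
from validphys.loader import Loader

l = Loader()
spec = l.check_commondata("NMC")
cd1 = spec.load()
cd2 = spec.load()
assert cd1 is cd2  # cached per CommonDataSpec instance
```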

@property
@@ -613,7 +602,7 @@ def load_commondata_instance(self):
"""
commodata_list = []
for dataset in self.datasets:
cd = dataset.commondata.load_commondata_instance()
cd = dataset.commondata.load()
if dataset.cuts is None:
commodata_list.append(cd)
else:
16 changes: 8 additions & 8 deletions validphys2/src/validphys/cuts/filters.yaml
@@ -52,14 +52,6 @@
linearly dependent on the others
rule: "y_ttBar<1.82"

# - dataset: ATLAS1JET11
# reason: |
# We keep only the first rapidity bin since the chi2 to the whole set of
# rapidity bins is too large (known problems with correlation matrix).
# This cut will be removed in the new bunch of fits that we are going to do
# for the NNLO jet project.
# rule: eta<0.3

- dataset: CMS_1JET_8TEV_PTY
reason: |
We keep only the bins with pTjet>74 GeV because fixed-order theory does
@@ -404,3 +396,11 @@
# - dataset: ATLASZPT7TEV
# reason: Avoid the region where EWK corrections are important.
# rule: "p_T2 <= 500**2"
#
# - dataset: ATLAS1JET11
# reason: |
# We keep only the first rapidity bin since the chi2 to the whole set of
# rapidity bins is too large (known problems with correlation matrix).
# This cut will be removed in the new bunch of fits that we are going to do
# for the NNLO jet project.
# rule: eta<0.3
5 changes: 2 additions & 3 deletions validphys2/src/validphys/datafiles/__init__.py
@@ -4,12 +4,11 @@
from reportengine.compat import yaml

path_vpdata = pathlib.Path(__file__).parent
path_commondata = pathlib.Path(__file__).with_name('commondata')
path_new_commondata = pathlib.Path(__file__).with_name('new_commondata')
path_commondata = path_vpdata / "new_commondata"
path_theorydb = pathlib.Path(__file__).with_name('theory.db')

# VP should not have access to this file, only to the products
_path_legacy_mapping = path_new_commondata / "dataset_names.yml"
_path_legacy_mapping = path_commondata / "dataset_names.yml"
legacy_to_new_mapping = yaml.YAML().load(_path_legacy_mapping)
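
For reference, a sketch of how downstream code might consult these module-level paths and the legacy-name mapping (the "NMC" key and the shape of its value are assumptions):

```python
from validphys.datafiles import legacy_to_new_mapping, path_commondata

print(path_commondata)  # .../validphys/datafiles/new_commondata
# map a legacy dataset name to its new-format equivalent, if present
print(legacy_to_new_mapping.get("NMC"))
```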


2 changes: 1 addition & 1 deletion validphys2/src/validphys/pseudodata.py
@@ -314,7 +314,7 @@ def level0_commondata_wc(data, fakepdf):
# ==== Load validphys.coredata.CommonData instance with cuts ====#

for dataset in data.datasets:
commondata_wc = dataset.commondata.load_commondata_instance()
commondata_wc = dataset.commondata.load()
if dataset.cuts is not None:
cuts = dataset.cuts.load()
commondata_wc = commondata_wc.with_cuts(cuts)
6 changes: 6 additions & 0 deletions validphys2/src/validphys/tests/test_pseudodata.py
@@ -72,9 +72,15 @@ def test_no_savepseudodata():
func(fit=FIT)


from time import time
def test_read_matches_recreate():
a = time()
reads = API.read_fit_pseudodata(fit=PSEUDODATA_FIT)
b = time()
print("\nREAD time:", b-a)
recreates = API.recreate_fit_pseudodata(fit=PSEUDODATA_FIT)
c = time()
print("RECREATE time:", c-b)
for read, recreate in zip(reads, recreates):
# We ignore the absolute ordering of the dataframes and just check
# that they contain identical elements.
