remove load_commondata_instance, use load everywhere; freeze metadata
scarlehoff committed Feb 14, 2024
1 parent 49e8f23 commit 42f7928
Showing 7 changed files with 92 additions and 84 deletions.
124 changes: 69 additions & 55 deletions validphys2/src/validphys/commondataparser.py
@@ -49,10 +49,17 @@

from reportengine.compat import yaml
from validphys.coredata import KIN_NAMES, CommonData
from validphys.datafiles import new_to_legacy_map
from validphys.datafiles import new_to_legacy_map, path_commondata
from validphys.plotoptions.plottingoptions import PlottingOptions, labeler_functions
from validphys.utils import parse_yaml_inp

# JCM:
# Some notes for developers
# The usage of `frozen` in the definitions of the dataclasses is not strictly necessary;
# however, changing the metadata can have side effects in many parts of validphys.
# By freezing the overall class (and leaving only specific attributes unfrozen) we get
# more granular control. Please use a setter (``object.__setattr__``) to modify a
# frozen class instead of removing ``frozen``.
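
A minimal sketch of the setter pattern the note above asks for (the `Example` class is hypothetical, not part of this commit):

```python
import dataclasses
from typing import Optional

@dataclasses.dataclass(frozen=True)
class Example:
    name: str
    cached: Optional[str] = None

ex = Example(name="DIS")
try:
    ex.cached = "value"  # frozen dataclasses forbid normal attribute assignment
except dataclasses.FrozenInstanceError:
    pass
# the escape hatch used throughout this module:
object.__setattr__(ex, "cached", "value")
assert ex.cached == "value"
```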

EXT = "pineappl.lz4"
_INDEX_NAME = "entry"

@@ -161,24 +168,13 @@ def ValidOperation(op_str: Optional[str]) -> str:

@dataclasses.dataclass
class ValidApfelComb:
"""Some of the grids might have been converted from apfelcomb and introduce hacks.
These are the allowed hacks:
- repetition_flag:
list of fktables which might need to be repeated
necessary to apply c-factors in compound observables
- normalization:
mapping from each single fktable that needs to be normalized to the corresponding factor
note that when they are global factors they are promoted to conversion_factor
- shifts:
this flag is left here for compatibility purposes but has been moved to TheoryMeta
"""

# TODO: to be removed
repetition_flag: Optional[list[str]] = None
normalization: Optional[dict] = None
shifts: Optional[dict] = None


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class TheoryMeta:
"""Contains the necessary information to load the associated fktables
@@ -228,16 +224,17 @@ class TheoryMeta:

comment: Optional[str] = None

# The following options are transitional so that the old yamldb files can be used
# The following options are transitional and will eventually be removed
apfelcomb: Optional[ValidApfelComb] = None
appl: Optional[bool] = False
target_dataset: Optional[str] = None

def __post_init__(self):
"""If a ``shifts`` flag is found in the apfelcomb object, move it outside"""
if self.apfelcomb is not None:
log.warning(
f"Apfelcomb key is being used to read {self.FK_tables}, please update the commondata file"
)
if self.apfelcomb.shifts is not None and self.shifts is None:
self.shifts = self.apfelcomb.shifts
object.__setattr__(self, 'shifts', self.apfelcomb.shifts)
self.apfelcomb.shifts = None

def fktables_to_paths(self, grids_folder):
@@ -274,7 +271,7 @@ def __hash__(self):
## Theory end


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class Variant:
"""The new commondata format allow the usage of variants
A variant can overwrite a number of keys, as defined by this dataclass
@@ -289,7 +286,7 @@ class Variant:


### Kinematic data
@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class ValidVariable:
"""Defines the variables"""

@@ -310,7 +307,7 @@ def apply_label(self, value):
return tmp


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class ValidKinematics:
"""Contains the metadata necessary to load the kinematics of the dataset.
The variables should be a dictionary with the key naming the variable
@@ -352,7 +349,7 @@ def apply_label(self, var, value):


### Observable and dataset definitions
@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class ObservableMetaData:
observable_name: str
observable: dict
@@ -377,9 +374,11 @@ class ObservableMetaData:
variants: Optional[ValidVariants] = dataclasses.field(default_factory=dict)
applied_variant: Optional[str] = None
ported_from: Optional[str] = None
_parent: Optional[
Any
] = None # Note that an observable without a parent will fail in many different ways

# Derived quantities:
# Note that an observable without a parent will fail in many different ways
_parent: Optional[Any] = None
_plotting_options: Optional[Any] = None

def __post_init__(self):
"""
@@ -396,7 +395,7 @@ def __post_init__(self):
else:
self.kinematic_coverage += [f"extra_{i}" for i in range(diff_to_3)]

self.process_type = self.process_type.upper()
object.__setattr__(self, 'process_type', self.process_type.upper())
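
In isolation, the coverage padding above behaves like this sketch (hypothetical values; the real code operates on `self.kinematic_coverage`):

```python
# kinematics are stored as three variables (k1, k2, k3), so pad short lists
kinematic_coverage = ["x", "Q2"]
diff_to_3 = 3 - len(kinematic_coverage)
kinematic_coverage += [f"extra_{i}" for i in range(diff_to_3)]
assert kinematic_coverage == ["x", "Q2", "extra_0"]
```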

def check(self):
"""Various checks to apply manually to the observable before it is used anywhere
@@ -680,17 +679,22 @@ def _plotting_options_set(self):

return self.plotting

@cached_property
@property
def plotting_options(self):
try:
return self._plotting_options_set()
except Exception as e:
# There are many chances for failure here
log.error(f"Failure for: {self.name}")
raise e
# Using __setattr__ instead of a cached_property
# in order for the settings to propagate to the variants
if self._plotting_options is None:
try:
tmp = self._plotting_options_set()
except Exception as e:
# There are many chances for failure here
log.error(f"Failure for: {self.name}")
raise e
object.__setattr__(self, "_plotting_options", tmp)
return self._plotting_options
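
The same lazy-initialisation idiom in isolation (hypothetical `Lazy` class): the cached value is written into a real dataclass field with `object.__setattr__`, instead of the instance `__dict__` entry a `cached_property` would create, which is presumably what lets the computed settings propagate to the variants:

```python
import dataclasses
from typing import Optional

@dataclasses.dataclass(frozen=True)
class Lazy:
    raw: int
    _doubled: Optional[int] = None

    @property
    def doubled(self):
        if self._doubled is None:
            # compute once, then store the result on the frozen instance
            object.__setattr__(self, "_doubled", self.raw * 2)
        return self._doubled

lazy = Lazy(raw=21)
assert lazy.doubled == 42
assert lazy._doubled == 42  # the result is now ordinary dataclass state
```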


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class ValidReference:
"""Holds literature information for the dataset"""

@@ -700,7 +704,7 @@ class ValidReference:
tables: list[int] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class SetMetaData:
"""Metadata of the whole set"""

@@ -712,39 +716,49 @@ class SetMetaData:
arXiv: Optional[ValidReference] = None
iNSPIRE: Optional[ValidReference] = None
hepdata: Optional[ValidReference] = None
_folder: Optional[Path] = None

@property
def folder(self):
# TODO: at the moment the folder is set manually by the parser of the metadata
# since the new commondata is still not installed (or declared in the profile)
return self._folder
# return _folder_data / self.setname
return path_commondata / self.setname

@cached_property
def allowed_observables(self):
"""
Returns the implemented observables as a {observable_name.upper(): observable} dictionary
"""
return {o.observable_name.upper(): o for o in self.implemented_observables}

def select_observable(self, obs_name_raw):
"""Check whether the observable is implemented and return said observable"""
# TODO: should we check that we don't have two observables with the same name?
obs_name = obs_name_raw.lower().strip()
for observable in self.implemented_observables:
if observable.observable_name.lower().strip() == obs_name:
# Not very happy with this but not sure how to do in a better way?
observable._parent = self
observable.check()
return observable
raise ValueError(f"The selected observable {obs_name} does not exist in {self.setname}")
obs_name = obs_name_raw.upper()
try:
observable = self.allowed_observables[obs_name]
except KeyError:
raise ValueError(
f"The selected observable {obs_name_raw} does not exist in {self.setname}"
)

# Now burn the _parent key into the observable and apply checks
object.__setattr__(observable, "_parent", self)
observable.check()
return observable
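
A sketch of the intended call pattern (the set and observable names are placeholders, not taken from this commit):

```python
from validphys.datafiles import path_commondata
from validphys.utils import parse_yaml_inp

metadata_file = path_commondata / "SOME_SET" / "metadata.yaml"  # placeholder
set_metadata = parse_yaml_inp(metadata_file, SetMetaData)
obs = set_metadata.select_observable("some_obs")  # the name is upper-cased internally
assert obs._parent is set_metadata
```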

###

@lru_cache
def _parse_entire_set_metadata(metadata_file):
"""Read the metadata file"""
return parse_yaml_inp(metadata_file, SetMetaData)


@lru_cache
def parse_new_metadata(metadata_file, observable_name, variant=None):
"""Given a metadata file in the new format and the specific observable to be read
load and parse the metadata and select the observable.
If any variants are selected, apply them.
load and parse the metadata and select the observable. If any variants are selected, apply them.
The triplet (metadata_file, observable_name, variant) defines unequivocally the information
to be parsed from the commondata library
"""
# Note: we are re-loading many times the same yaml file, possibly a good target for lru_cache
set_metadata = parse_yaml_inp(metadata_file, SetMetaData)
set_metadata._folder = metadata_file.parent
set_metadata = _parse_entire_set_metadata(metadata_file)

# Select one observable from the entire metadata
metadata = set_metadata.select_observable(observable_name)
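
Since both the set-level parse and this function are wrapped in `lru_cache`, asking twice for the same (metadata_file, observable_name, variant) triple should return the very same object (a sketch with a placeholder path):

```python
from validphys.datafiles import path_commondata

metadata_file = path_commondata / "SOME_SET" / "metadata.yaml"  # placeholder
md1 = parse_new_metadata(metadata_file, "SOME_OBS")
md2 = parse_new_metadata(metadata_file, "SOME_OBS")
assert md1 is md2  # served from the cache; the yaml file is read only once
```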
4 changes: 2 additions & 2 deletions validphys2/src/validphys/commondatawriter.py
@@ -25,7 +25,7 @@ def write_commondata_data(commondata, buffer):
>>> from io import StringIO
>>> l = Loader()
>>> cd = l.check_commondata("NMC").load_commondata_instance()
>>> cd = l.check_commondata("NMC").load()
>>> sio = StringIO()
>>> write_commondata_data(cd,sio)
>>> print(sio.getvalue())
@@ -65,7 +65,7 @@ def write_systype_data(commondata, buffer):
>>> from io import StringIO
>>> l = Loader()
>>> cd = l.check_commondata("NMC").load_commondata_instance()
>>> cd = l.check_commondata("NMC").load()
>>> sio = StringIO()
>>> write_systype_data(cd,sio)
>>> print(sio.getvalue())
19 changes: 4 additions & 15 deletions validphys2/src/validphys/core.py
@@ -22,8 +22,7 @@
from validphys import filters, lhaindex
from validphys.commondataparser import (
get_plot_kinlabels,
parse_commondata,
parse_commondata_new,
load_commondata,
peek_commondata_metadata,
)
from validphys.fkparser import load_fktable, parse_cfactor
@@ -257,7 +256,7 @@ def __init__(self, name, metadata, legacy=False, datafile=None, sysfile=None, pl
def name(self):
return self.metadata.name

@property
@functools.cached_property
def nsys(self):
if self.legacy:
return self.metadata.nsys
@@ -275,7 +274,7 @@ def process_type(self):

@property
def metadata(self):
if self._metadata is None:
if self.legacy:
self._metadata = peek_commondata_metadata(self.datafile)
return self._metadata

@@ -291,20 +290,10 @@ def __str__(self):
def __iter__(self):
return iter((self.datafile, self.sysfile, self.plotfiles))

# TODO: one of the two functions below needs to go
@functools.lru_cache()
def load(self):
if self.legacy:
return parse_commondata(self.datafile, self.sysfile, self.name)
else:
return parse_commondata_new(self.metadata)

def load_commondata_instance(self):
"""
load a validphys.core.CommonDataSpec to validphys.core.CommonData
"""
from validphys.commondataparser import load_commondata

return load_commondata(self)
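
With `functools.lru_cache` on the method, repeated loads through the same spec are served from the cache; a sketch mirroring the doctest usage in commondatawriter above (assumes the "NMC" set is available):

```python
from validphys.loader import Loader

l = Loader()
spec = l.check_commondata("NMC")
cd1 = spec.load()
cd2 = spec.load()
assert cd1 is cd2  # cached per CommonDataSpec instance
```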

@property
@@ -613,7 +602,7 @@ def load_commondata_instance(self):
"""
commodata_list = []
for dataset in self.datasets:
cd = dataset.commondata.load_commondata_instance()
cd = dataset.commondata.load()
if dataset.cuts is None:
commodata_list.append(cd)
else:
16 changes: 8 additions & 8 deletions validphys2/src/validphys/cuts/filters.yaml
@@ -52,14 +52,6 @@
linearly dependent on the others
rule: "y_ttBar<1.82"

# - dataset: ATLAS1JET11
# reason: |
# We keep only the first rapidity bin since the chi2 to the whole set of
# rapidity bins is too large (known problems with correlation matrix).
# This cut will be removed in the new bunch of fits that we are going to do
# for the NNLO jet project.
# rule: eta<0.3

- dataset: CMS_1JET_8TEV_PTY
reason: |
We keep only the bins with pTjet>74 GeV because fixed-order theory does
@@ -404,3 +396,11 @@
# - dataset: ATLASZPT7TEV
# reason: Avoid the region where EWK corrections are important.
# rule: "p_T2 <= 500**2"
#
# - dataset: ATLAS1JET11
# reason: |
# We keep only the first rapidity bin since the chi2 to the whole set of
# rapidity bins is too large (known problems with correlation matrix).
# This cut will be removed in the new bunch of fits that we are going to do
# for the NNLO jet project.
# rule: eta<0.3
5 changes: 2 additions & 3 deletions validphys2/src/validphys/datafiles/__init__.py
@@ -4,12 +4,11 @@
from reportengine.compat import yaml

path_vpdata = pathlib.Path(__file__).parent
path_commondata = pathlib.Path(__file__).with_name('commondata')
path_new_commondata = pathlib.Path(__file__).with_name('new_commondata')
path_commondata = path_vpdata / "new_commondata"
path_theorydb = pathlib.Path(__file__).with_name('theory.db')

# VP should not have access to this file, only to the products
_path_legacy_mapping = path_new_commondata / "dataset_names.yml"
_path_legacy_mapping = path_commondata / "dataset_names.yml"
legacy_to_new_mapping = yaml.YAML().load(_path_legacy_mapping)
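
For reference, a sketch of how downstream code might consult these module-level paths and the legacy-name mapping (the "NMC" key and the shape of its value are assumptions):

```python
from validphys.datafiles import legacy_to_new_mapping, path_commondata

print(path_commondata)  # .../validphys/datafiles/new_commondata
# map a legacy dataset name to its new-format equivalent, if present
print(legacy_to_new_mapping.get("NMC"))
```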


2 changes: 1 addition & 1 deletion validphys2/src/validphys/pseudodata.py
@@ -314,7 +314,7 @@ def level0_commondata_wc(data, fakepdf):
# ==== Load validphys.coredata.CommonData instance with cuts ====#

for dataset in data.datasets:
commondata_wc = dataset.commondata.load_commondata_instance()
commondata_wc = dataset.commondata.load()
if dataset.cuts is not None:
cuts = dataset.cuts.load()
commondata_wc = commondata_wc.with_cuts(cuts)
6 changes: 6 additions & 0 deletions validphys2/src/validphys/tests/test_pseudodata.py
@@ -72,9 +72,15 @@ def test_no_savepseudodata():
func(fit=FIT)


from time import time
def test_read_matches_recreate():
a = time()
reads = API.read_fit_pseudodata(fit=PSEUDODATA_FIT)
b = time()
print("\nREAD time:", b-a)
recreates = API.recreate_fit_pseudodata(fit=PSEUDODATA_FIT)
c = time()
print("RECREATE time:", c-b)
for read, recreate in zip(reads, recreates):
# We ignore the absolute ordering of the dataframes and just check
# that they contain identical elements.
