From 5367ccfe0fc6de716011a1e185503fbeae7d61ff Mon Sep 17 00:00:00 2001 From: juacrumar Date: Sat, 23 Nov 2024 16:59:55 -0500 Subject: [PATCH] first version of the nnpdf data package , with versioning add utility function to read metadata just from dataset name deprecate a bunch of functions fix include --- deprecated_functions.py | 138 ++++++++++ doc/sphinx/source/vp/customplots.rst | 6 +- doc/sphinx/source/vp/pydataobjs.rst | 12 +- nnpdf_data/examples_of_use.py | 34 +++ nnpdf_data/nnpdf_data/__init__.py | 78 +----- .../nnpdf_data}/commondataparser.py | 201 +++----------- nnpdf_data/nnpdf_data/coredata.py | 246 ++++++++++++++++++ .../nnpdf_data}/process_options.py | 0 nnpdf_data/nnpdf_data/utils.py | 12 +- .../nnpdf_data/validphys_compatibility.py | 116 +++++++++ nnpdf_data/pyproject.toml | 9 +- validphys2/src/validphys/commondata.py | 17 +- validphys2/src/validphys/core.py | 18 +- validphys2/src/validphys/coredata.py | 231 ---------------- validphys2/src/validphys/covmats.py | 14 +- validphys2/src/validphys/dataplots.py | 2 +- validphys2/src/validphys/filters.py | 7 +- validphys2/src/validphys/loader.py | 7 +- validphys2/src/validphys/pineparser.py | 2 +- validphys2/src/validphys/plotoptions/core.py | 2 +- validphys2/src/validphys/pseudodata.py | 12 +- .../validphys/tests/test_commondataparser.py | 15 +- .../src/validphys/tests/test_covmats.py | 4 +- validphys2/src/validphys/utils.py | 8 - 24 files changed, 665 insertions(+), 526 deletions(-) create mode 100644 deprecated_functions.py create mode 100644 nnpdf_data/examples_of_use.py rename {validphys2/src/validphys => nnpdf_data/nnpdf_data}/commondataparser.py (87%) create mode 100644 nnpdf_data/nnpdf_data/coredata.py rename {validphys2/src/validphys => nnpdf_data/nnpdf_data}/process_options.py (100%) create mode 100644 nnpdf_data/nnpdf_data/validphys_compatibility.py diff --git a/deprecated_functions.py b/deprecated_functions.py new file mode 100644 index 0000000000..930271e720 --- /dev/null +++ 
b/deprecated_functions.py @@ -0,0 +1,138 @@ +""" +Note: this module will be removed after the next tag, don't use anything from here +""" + +import dataclasses +import logging +from operator import attrgetter + +import pandas as pd + +from nnpdf_data.coredata import CommonData + +log = logging.getLogger(__name__) + +log.warning( + "You are loading deprecated functionality that use the old commondata parser. This is no longer supported and will be removed in the near future" +) + + +### Old commondata: +### All code below this line is deprecated and will be removed +def load_commondata_old(commondatafile, systypefile, setname): + """Parse a commondata file and a systype file into a CommonData. + + Parameters + ---------- + commondatafile : file or path to file + systypefile : file or path to file + + Returns + ------- + commondata : CommonData + An object containing the data and information from the commondata + and systype files. + """ + # First parse commondata file + commondatatable = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None) + # Remove NaNs + # TODO: replace commondata files with bad formatting + # Build header + commondataheader = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"] + nsys = (commondatatable.shape[1] - len(commondataheader)) // 2 + + commondataheader += ["ADD", "MULT"] * nsys + commondatatable.columns = commondataheader + commondatatable.set_index("entry", inplace=True) + ndata = len(commondatatable) + commondataproc = commondatatable["process"][1] + # Check for consistency with commondata metadata + cdmetadata = peek_commondata_metadata(commondatafile) + if (nsys, ndata) != attrgetter("nsys", "ndata")(cdmetadata): + raise ValueError(f"Commondata table information does not match metadata for {setname}") + + # Now parse the systype file + systypetable = parse_systypes(systypefile) + + # Populate CommonData object + return CommonData( + setname=setname, + ndata=ndata, + commondataproc=commondataproc, + nkin=3, + 
nsys=nsys, + commondata_table=commondatatable, + systype_table=systypetable, + legacy=True, + ) + + +def parse_systypes(systypefile): + """Parses a systype file and returns a pandas dataframe.""" + systypeheader = ["sys_index", "treatment", "name"] + try: + systypetable = pd.read_csv( + systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None + ) + systypetable.dropna(axis="columns", inplace=True) + # Some datasets e.g. CMSWCHARMRAT have no systematics + except pd.errors.EmptyDataError: + systypetable = pd.DataFrame(columns=systypeheader) + + systypetable.set_index("sys_index", inplace=True) + + return systypetable + + +@dataclasses.dataclass(frozen=True) +class CommonDataMetadata: + """Contains metadata information about the data being read""" + + name: str + nsys: int + ndata: int + process_type: str + + +def peek_commondata_metadata(commondatafilename): + """Read some of the properties of the commondata object as a CommonData Metadata""" + with open(commondatafilename) as f: + try: + l = f.readline() + name, nsys_str, ndata_str = l.split() + l = f.readline() + process_type_str = l.split()[1] + except Exception: + log.error(f"Error processing {commondatafilename}") + raise + + return CommonDataMetadata( + name, int(nsys_str), int(ndata_str), get_kinlabel_key(process_type_str) + ) + + +def get_plot_kinlabels(commondata): + """Return the LaTex kinematic labels for a given Commondata""" + key = commondata.process_type + + # TODO: the keys in KINLABEL_LATEX need to be updated for the new commondata + return KINLABEL_LATEX.get(key, key) + + +def get_kinlabel_key(process_label): + """ + Since there is no 1:1 correspondence between latex keys and the old libNNPDF names + we match the longest key such that the proc label starts with it. 
+ """ + l = process_label + try: + if process_label == "EWK_RAP_ASY": + # TODO this function is disappearing in this PR + l = "EWK_RAP" + return next(k for k in sorted(KINLABEL_LATEX, key=len, reverse=True) if l.startswith(k)) + except StopIteration as e: + raise ValueError( + "Could not find a set of kinematic " + "variables matching the process %s Check the " + "labels defined in commondata.cc. " % (l) + ) from e diff --git a/doc/sphinx/source/vp/customplots.rst b/doc/sphinx/source/vp/customplots.rst index f0864b6284..d91a18fdb5 100644 --- a/doc/sphinx/source/vp/customplots.rst +++ b/doc/sphinx/source/vp/customplots.rst @@ -65,7 +65,7 @@ There are two ways to take advantage of resources produced using the * Using extra modules: Additional Python modules or files can be passed to ``validphys`` using the ``--extra-modules`` (or ``-x``) flag. The functions in these modules then act ``validphys`` providers and can take - resources from ``validpys`` as input. This approach allows the + resources from ``validphys`` as input. This approach allows the immediate use of runcards or the default styles. One limitation is that there is currently no way of adding production rules or parsers in this way. Prefer this for actions that are too difficult to upstream to @@ -76,7 +76,7 @@ There are two ways to take advantage of resources produced using the from matplotlib.figure import Figure from reportengine.figure import figure - from validphys.commondataparser import load_commondata + from nnpdf_data.commondataparser import load_commondata # A simple plot that probably should be in validphys to begin with. @@ -103,7 +103,7 @@ There are two ways to take advantage of resources produced using the -Note that both of these come at the cost of risking future breakage +Note that both of these come at the cost of risking future breakage somewhat as we don't guarantee any sort of stability on the internal interfaces.
diff --git a/doc/sphinx/source/vp/pydataobjs.rst b/doc/sphinx/source/vp/pydataobjs.rst index 8c2693135f..f992bca103 100644 --- a/doc/sphinx/source/vp/pydataobjs.rst +++ b/doc/sphinx/source/vp/pydataobjs.rst @@ -143,8 +143,8 @@ Loading CommonData ------------------ The underlying functions for loading CommonData can be found in -:py:mod:`validphys.commondataparser`. The data is loaded -as :py:class:`validphys.coredata.CommonData`, which uses the +:py:mod:`nnpdf_data.commondataparser`. The data is loaded +as :py:class:`nnpdf_data.coredata.CommonData`, which uses the `dataclasses `_ module which automatically generates some special methods for the class. The underlying data is stored as DataFrames, and so can be used @@ -153,7 +153,7 @@ with the standard pandas machinery:: import pandas as pd from validphys.api import API - from validphys.commondataparser import load_commondata + from nnpdf_data.commondataparser import load_commondata # define dataset settings ds_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10} # first get the CommonDataSpec @@ -162,11 +162,11 @@ with the standard pandas machinery:: assert isinstance(lcd.central_values, pd.Series) assert isinstance(lcd.systematics_table, pd.DataFrame) -The :py:class:`validphys.coredata.CommonData` class has a method which returns +The :py:class:`nnpdf_data.coredata.CommonData` class has a method which returns a new instance of the class with cuts applied:: from validphys.api import API - from validphys.commondataparser import load_commondata + from nnpdf_data.commondataparser import load_commondata # define dataset and additional settings ds_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10} inp = { @@ -193,7 +193,7 @@ more convenient than calling the underlying functions:: Loading Covariance Matrices --------------------------- -Functions which take :py:class:`validphys.coredata.CommonData` s and return +Functions which take :py:class:`nnpdf_data.coredata.CommonData` s and return 
covariance matrices can be found in :py:mod:`validphys.covmats`. As with the commondata the functions can be called in scripts directly:: diff --git a/nnpdf_data/examples_of_use.py b/nnpdf_data/examples_of_use.py new file mode 100644 index 0000000000..71063c8843 --- /dev/null +++ b/nnpdf_data/examples_of_use.py @@ -0,0 +1,34 @@ +""" + This file contains examples of use of ``nnpdf_data`` as a library. + This library is currently in pre-alpha form and should not be considered stable. + + The functions and examples in this file will be eventually removed but might become + part of the library as an external user-facing interface. + + There is currently no user-facing interface so no stability is expected. +""" + +from nnpdf_data import path_commondata +from nnpdf_data.commondataparser import parse_new_metadata + + +def parse_dataset(dataset, variant=None): + """Given a dataset name, read the metadata of the corresponding observable. + A variant can be given. + + The output is an ``ObservableMetaData`` object, with references to all files + that form the dataset but none of them is loaded. + This can then be used to _load_ the dataset using load_commondata.
+ + Example + ------- + >>> from nnpdf_data.commondataparser import load_commondata + >>> cd_meta = parse_dataset("LHCB_Z0_7TEV_DIELECTRON_Y") + >>> cd = load_commondata(cd_meta) + >>> print(cd) + CommonData(setname='LHCB_Z0_7TEV_DIELECTRON_Y', ndata=9, commondataproc='DY_Z_Y', nkin=3, nsys=11, legacy=False, legacy_names=['LHCBZ940PB'], kin_variables=['y', 'm_Z2', 'sqrts']) + """ + setname, observable = dataset.rsplit("_", 1) + metadata_file = path_commondata / setname / "metadata.yaml" + metadata = parse_new_metadata(metadata_file, observable, variant=variant) + return metadata diff --git a/nnpdf_data/nnpdf_data/__init__.py b/nnpdf_data/nnpdf_data/__init__.py index f12d3e0abc..515561f2c5 100644 --- a/nnpdf_data/nnpdf_data/__init__.py +++ b/nnpdf_data/nnpdf_data/__init__.py @@ -1,76 +1,22 @@ -from functools import lru_cache import pathlib -import yaml +from ._version import __version__ +from .commondataparser import parse_new_metadata +from .validphys_compatibility import legacy_to_new_map, legacy_to_new_mapping, new_to_legacy_map path_vpdata = pathlib.Path(__file__).parent path_commondata = path_vpdata / "commondata" - -# VP should not have access to this file, only to the products -_path_legacy_mapping = path_commondata / "dataset_names.yml" theory_cards = path_vpdata / "theory_cards" -with open(_path_legacy_mapping) as file: - _legacy_to_new_mapping_raw = yaml.load(file, yaml.Loader) -# Convert strings into a dictionary -legacy_to_new_mapping = { - k: ({"dataset": v} if isinstance(v, str) else v) for k, v in _legacy_to_new_mapping_raw.items() -} - - -@lru_cache -def legacy_to_new_map(dataset_name, sys=None): - """Find the new dataset name and variant corresponding to an old dataset - and systematics choice""" - if dataset_name not in legacy_to_new_mapping: - return dataset_name, None - - new_name = legacy_to_new_mapping[dataset_name] - variant = new_name.get("variant") - new_name = new_name["dataset"] - if sys is not None: - if variant is None: - raise KeyError( 
- f"I cannot translate the combination of {dataset_name} and sys: {sys}. Please report this." - ) - variant += f"_{sys}" - - return new_name, variant - - -@lru_cache -def new_to_legacy_map(dataset_name, variant_used): - """Loop over the dictionary and find the right dataset. - - Since it is posible to have more than 1 dataset mapped to the same new one, - returns a list of everything that matches. - - This function will loop over the entire dictionary of mappings and selects - 1. All datasets that match exactly what's in the runcard (dataset & variant): exact_matches - 2. All datasets that match the dataset name: matches - If there are any `exact_matches`, it will return only those; otherwise, return all `matches` - if there are no `matches` at all, return None - """ - - matches = [] - exact_matches = [] - - for old_name, new_info in legacy_to_new_mapping.items(): - new_name = new_info["dataset"] - variant = new_info.get("variant") - if new_name == dataset_name: - matches.append(old_name) - # if it's a nuclear DIS data promote legacy to be legacy_dw - if "_DW_" in old_name and variant_used == "legacy": - variant = "legacy_dw" +def load_dataset_metadata(dataset_name, variant=None): + """Given a dataset name, return the metadata""" - if variant_used == variant: - exact_matches.append(old_name) + # Compatibility with old nnpdf names, these two lines + # might disappear at any given point + if variant is None: + dataset_name, variant = legacy_to_new_map(dataset_name) - # If we found exact matches, return those and stop looking - if exact_matches: - return exact_matches - elif matches: - return matches - return None + setname, observable = dataset_name.rsplit("_", 1) + metadata_file = path_commondata / setname / "metadata.yaml" + return parse_new_metadata(metadata_file, observable, variant=variant) diff --git a/validphys2/src/validphys/commondataparser.py b/nnpdf_data/nnpdf_data/commondataparser.py similarity index 87% rename from 
validphys2/src/validphys/commondataparser.py rename to nnpdf_data/nnpdf_data/commondataparser.py index 9f96d678cf..5d3d151fc4 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/nnpdf_data/nnpdf_data/commondataparser.py @@ -1,6 +1,6 @@ """ This module implements parsers for commondata and its associated metadata and uncertainties files -into useful structures that can be fed to the main :py:class:`validphys.coredata.CommonData` class. +into useful structures that can be fed to the main :py:class:`nnpdf_data.coredata.CommonData` class. A CommonData file is completely defined by a dataset name (which defines the folder in which the information is) @@ -13,7 +13,6 @@ The definition of all information for a given dataset (and all its observable) is in the ``metadata.yaml`` file and its ``implemented_observables``. - This module defines a number of parsers using the ``validobj`` library. The full ``metadata.yaml`` is read as a ``SetMetaData`` object @@ -28,18 +27,18 @@ Inside the ``ObservableMetaData`` we can find: - ``TheoryMeta``: contains the necessary information to read the (new style) fktables - ``KinematicsMeta``: containins metadata about the kinematics - - ``PlottingOptions``: plotting style and information for validphys - ``Variant``: variant to be used + - ``PlottingOptions``: plotting style and information for validphys, only utilized if + validphys is also installed. The CommonMetaData defines how the CommonData file is to be loaded, by modifying the CommonMetaData using one of the loaded Variants one can change the resulting -:py:class:`validphys.coredata.CommonData` object. +:py:class:`nnpdf_data.coredata.CommonData` object. 
""" import dataclasses from functools import cache, cached_property import logging -from operator import attrgetter from pathlib import Path from typing import Any, Optional @@ -48,12 +47,23 @@ from validobj import ValidationError, parse_input from validobj.custom import Parser -from nnpdf_data import new_to_legacy_map, path_commondata -from nnpdf_data.utils import parse_yaml_inp -from validphys.coredata import KIN_NAMES, CommonData -from validphys.plotoptions.plottingoptions import PlottingOptions, labeler_functions -from validphys.process_options import ValidProcess -from validphys.utils import yaml_fast +from .coredata import KIN_NAMES, CommonData +from .process_options import ValidProcess +from .utils import parse_yaml_inp, quick_yaml_load +from .validphys_compatibility import new_to_legacy_map, path_commondata + +try: + from validphys.plotoptions.plottingoptions import PlottingOptions, labeler_functions + + VP_AVAILABLE = True +except ModuleNotFoundError: + # if validphys is not available, the __old__ plotting options from validphys + # which we only still have because the world is a dark and horrible place + # won't be loaded. Instead, the following file is loaded. 
+ from .validphys_compatibility import PlottingOptions, labeler_functions + + VP_AVAILABLE = False + # JCM: # Some notes for developers @@ -181,12 +191,18 @@ def ValidOperation(op_str: Optional[str]) -> str: if op_str is None: op_str = "NONE" ret = op_str.upper() + # TODO: move accepted operations to this module so that the convolution receives an operation to apply # instead of an operation to understand - from validphys.convolution import OP + try: + from validphys.convolution import OP + + if ret not in OP: + raise ValidationError(f"The operation '{op_str}' is not implemented in validphys") + except ModuleNotFoundError: + # Don't perform any checks if VP is not available + pass - if ret not in OP: - raise ValidationError(f"The operation '{op_str}' is not implemented in validphys") return str(ret) @@ -214,7 +230,7 @@ class TheoryMeta: Example ------- - >>> from validphys.commondataparser import TheoryMeta + >>> from nnpdf_data.commondataparser import TheoryMeta ... from validobj import parse_input ... from ruamel.yaml import YAML ... 
theory_raw = ''' @@ -249,7 +265,7 @@ def parser(cls, yaml_file): """The yaml databases in the server use "operands" as key instead of "FK_tables" """ if not yaml_file.exists(): raise FileNotFoundError(yaml_file) - meta = yaml_fast.load(yaml_file.read_text()) + meta = quick_yaml_load(yaml_file) # Make sure the operations are upper-cased for compound-compatibility meta["operation"] = "NULL" if meta["operation"] is None else meta["operation"].upper() if "operands" in meta: @@ -509,7 +525,7 @@ def load_data_central(self): if self.is_nnpdf_special: data = np.zeros(self.ndata) else: - datayaml = yaml_fast.load(self.path_data_central) + datayaml = quick_yaml_load(self.path_data_central) data = datayaml["data_central"] if len(data) != self.ndata: @@ -538,7 +554,7 @@ def load_uncertainties(self): all_df = [] for ufile in self.paths_uncertainties: - uncyaml = yaml_fast.load(ufile) + uncyaml = quick_yaml_load(ufile) mindex = pd.MultiIndex.from_tuples( [(k, v["treatment"], v["type"]) for k, v in uncyaml["definitions"].items()], names=["name", "treatment", "type"], @@ -574,7 +590,7 @@ def load_kinematics(self, fill_to_three=True, drop_minmax=True): a dataframe containing the kinematics """ kinematics_file = self.path_kinematics - kinyaml = yaml_fast.load(kinematics_file) + kinyaml = quick_yaml_load(kinematics_file) kin_dict = {} for bin_index, dbin in enumerate(kinyaml["bins"], start=1): @@ -658,6 +674,11 @@ def digest_plotting_variable(self, variable): These might be variables included as part of the kinematics or extra labels defined in the plotting dictionary. 
""" + if not VP_AVAILABLE: + raise ModuleNotFoundError( + "validphys, from the full nnpdf package, needs to be installed to use this functionality" + ) + # If it is part of the coverage, just return the relevant KN if variable in self.kinematic_coverage: fig_idx = self.kinematic_coverage.index(variable) @@ -841,7 +862,7 @@ def parse_new_metadata(metadata_file, observable_name, variant=None): return metadata -def load_commondata_new(metadata): +def load_commondata(metadata): """ TODO: update this docstring since now the load_commondata_new takes the information from @@ -940,143 +961,3 @@ def load_commondata_new(metadata): legacy_names=legacy_names, kin_variables=metadata.kinematic_coverage, ) - - -########################################### - - -@cache -def load_commondata(spec): - """ - Load the data corresponding to a CommonDataSpec object. - Returns an instance of CommonData - """ - if spec.legacy: - commondatafile = spec.datafile - setname = spec.name - systypefile = spec.sysfile - - return load_commondata_old(commondatafile, systypefile, setname) - - return load_commondata_new(spec.metadata) - - -### Old commondata: -### All code below this line is deprecated and will be removed -def load_commondata_old(commondatafile, systypefile, setname): - """Parse a commondata file and a systype file into a CommonData. - - Parameters - ---------- - commondatafile : file or path to file - systypefile : file or path to file - - Returns - ------- - commondata : CommonData - An object containing the data and information from the commondata - and systype files. 
- """ - # First parse commondata file - commondatatable = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None) - # Remove NaNs - # TODO: replace commondata files with bad formatting - # Build header - commondataheader = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"] - nsys = (commondatatable.shape[1] - len(commondataheader)) // 2 - - commondataheader += ["ADD", "MULT"] * nsys - commondatatable.columns = commondataheader - commondatatable.set_index("entry", inplace=True) - ndata = len(commondatatable) - commondataproc = commondatatable["process"][1] - # Check for consistency with commondata metadata - cdmetadata = peek_commondata_metadata(commondatafile) - if (nsys, ndata) != attrgetter("nsys", "ndata")(cdmetadata): - raise ValueError(f"Commondata table information does not match metadata for {setname}") - - # Now parse the systype file - systypetable = parse_systypes(systypefile) - - # Populate CommonData object - return CommonData( - setname=setname, - ndata=ndata, - commondataproc=commondataproc, - nkin=3, - nsys=nsys, - commondata_table=commondatatable, - systype_table=systypetable, - legacy=True, - ) - - -def parse_systypes(systypefile): - """Parses a systype file and returns a pandas dataframe.""" - systypeheader = ["sys_index", "treatment", "name"] - try: - systypetable = pd.read_csv( - systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None - ) - systypetable.dropna(axis="columns", inplace=True) - # Some datasets e.g. 
CMSWCHARMRAT have no systematics - except pd.errors.EmptyDataError: - systypetable = pd.DataFrame(columns=systypeheader) - - systypetable.set_index("sys_index", inplace=True) - - return systypetable - - -@dataclasses.dataclass(frozen=True) -class CommonDataMetadata: - """Contains metadata information about the data being read""" - - name: str - nsys: int - ndata: int - process_type: str - - -def peek_commondata_metadata(commondatafilename): - """Read some of the properties of the commondata object as a CommonData Metadata""" - with open(commondatafilename) as f: - try: - l = f.readline() - name, nsys_str, ndata_str = l.split() - l = f.readline() - process_type_str = l.split()[1] - except Exception: - log.error(f"Error processing {commondatafilename}") - raise - - return CommonDataMetadata( - name, int(nsys_str), int(ndata_str), get_kinlabel_key(process_type_str) - ) - - -def get_plot_kinlabels(commondata): - """Return the LaTex kinematic labels for a given Commondata""" - key = commondata.process_type - - # TODO: the keys in KINLABEL_LATEX need to be updated for the new commondata - return KINLABEL_LATEX.get(key, key) - - -def get_kinlabel_key(process_label): - """ - Since there is no 1:1 correspondence between latex keys and the old libNNPDF names - we match the longest key such that the proc label starts with it. - """ - l = process_label - try: - if process_label == "EWK_RAP_ASY": - # TODO this function is disappearing in this PR - l = "EWK_RAP" - return next(k for k in sorted(KINLABEL_LATEX, key=len, reverse=True) if l.startswith(k)) - except StopIteration as e: - raise ValueError( - "Could not find a set of kinematic " - "variables matching the process %s Check the " - "labels defined in commondata.cc. 
" % (l) - ) from e diff --git a/nnpdf_data/nnpdf_data/coredata.py b/nnpdf_data/nnpdf_data/coredata.py new file mode 100644 index 0000000000..233f1f59df --- /dev/null +++ b/nnpdf_data/nnpdf_data/coredata.py @@ -0,0 +1,246 @@ +""" + The all-important CommonData object +""" + +import dataclasses +from typing import Optional + +import numpy as np +import pandas as pd + +from .utils import yaml + +KIN_NAMES = ["kin1", "kin2", "kin3"] + + +def generate_path_filtered_data(fit_path, setname): + """Utility to ensure that both the loader and tools like setupfit utilize the same convention + to generate the names of generated pseudodata""" + data_path = fit_path / "filter" / setname / f"filtered_data_{setname}.yaml" + unc_path = data_path.with_name(f"filtered_uncertainties_{setname}.yaml") + return data_path, unc_path + + +@dataclasses.dataclass(eq=False) +class CommonData: + """ + Data contained in Commondata files, relevant cuts applied. + + Parameters + ---------- + + setname : str + Name of the dataset + + ndata : int + Number of data points + + commondataproc : str + Process type, one of 21 options + + nkin : int + Number of kinematics specified + + nsys : int + Number of systematics + + commondata_table : pd.DataFrame + Pandas dataframe containing the commondata + + systype_table : pd.DataFrame + Pandas dataframe containing the systype index + for each systematic alongside the uncertainty + type (ADD/MULT/RAND) and name + (CORR/UNCORR/THEORYCORR/SKIP) + + systematics_table: pd.DataFrame + Panda dataframe containing the table of systematics + """ + + setname: str + ndata: int + commondataproc: str + nkin: int + nsys: int + commondata_table: pd.DataFrame = dataclasses.field(repr=False) + systype_table: pd.DataFrame = dataclasses.field(repr=False) + legacy: bool = False + systematics_table: Optional[pd.DataFrame] = dataclasses.field(init=None, repr=False) + legacy_names: Optional[list] = None + kin_variables: Optional[list] = None + + def __post_init__(self): + 
self.systematics_table = self.commondata_table.drop( + columns=["process", "data", "stat"] + KIN_NAMES + ) + # TODO: set for now commondataproc as a string + self.commondataproc = str(self.commondataproc) + + def with_cuts(self, cuts): + """A method to return a CommonData object where + an integer mask has been applied, keeping only data + points which pass cuts. + + Note if the first data point passes cuts, the first entry + of ``cuts`` should be ``0``. + + Parameters + ---------- + cuts: list or validphys.core.Cuts or None + """ + # Ensure that the cuts we're applying apply to this dataset + # only check, however, if the cuts is of type :py:class:`validphys.core.Cuts` + if hasattr(cuts, "name") and self.setname != cuts.name: + raise ValueError( + f"The cuts provided are for {cuts.name} which does not apply " + f"to this commondata file: {self.setname}" + ) + + if hasattr(cuts, "load"): + cuts = cuts.load() + if cuts is None: + return self + + # We must shift the cuts up by 1 since a cut of 0 implies the first data point + # while commondata indexing starts at 1. + cuts = list(map(lambda x: x + 1, cuts)) + + newndata = len(cuts) + new_commondata_table = self.commondata_table.loc[cuts] + return dataclasses.replace(self, ndata=newndata, commondata_table=new_commondata_table) + + @property + def kinematics(self): + return self.commondata_table[KIN_NAMES] + + def get_kintable(self): + return self.kinematics.values + + @property + def central_values(self): + return self.commondata_table["data"] + + def with_central_value(self, cv): + tb = self.commondata_table.copy() + tb["data"] = cv + return dataclasses.replace(self, commondata_table=tb) + + def get_cv(self): + return self.central_values.values + + @property + def stat_errors(self): + return self.commondata_table["stat"] + + @property + def multiplicative_errors(self): + """Returns the systematics which are multiplicative (systype is MULT) + in a percentage format, with SKIP uncertainties removed.
+ + """ + mult_systype = self.systype_table[self.systype_table["treatment"] == "MULT"] + mult_table = self.systematics_table.filter(like="MULT") + + if self.legacy: + # Needed in legacy because every uncertainty appears as both mult and add + # so it is necessary to select the uncertainties that are to be considered as MULT/ADD + # Minus 1 because iloc starts from 0, while the systype counting starts from 1 + mult_table = mult_table.iloc[:, mult_systype.index - 1] + + mult_table.columns = mult_systype["name"].to_numpy() + return mult_table.loc[:, mult_table.columns != "SKIP"] + + @property + def additive_errors(self): + """Returns the systematics which are additive (systype is ADD) as + absolute uncertainties (same units as data), with SKIP uncertainties + removed. + + """ + add_systype = self.systype_table[self.systype_table["treatment"] == "ADD"] + add_table = self.systematics_table.filter(like="ADD") + + if self.legacy: + # Minus 1 because iloc starts from 0, while the systype counting starts from 1 + add_table = add_table.iloc[:, add_systype.index - 1] + + add_table.columns = add_systype["name"].to_numpy() + return add_table.loc[:, add_table.columns != "SKIP"] + + def systematic_errors(self, central_values=None): + """Returns all systematic errors as absolute uncertainties, with a + single column for each uncertainty. Converts + :py:attr:`multiplicative_errors` to units of data and then appends + onto :py:attr:`additive_errors`. By default uses the experimental + central values to perform conversion, but the user can supply a + 1-D array of central values, with length :py:attr:`self.ndata`, to use + instead of the experimental central values to calculate the absolute + contribution of the multiplicative systematics. + + Parameters + ---------- + central_values: None, np.array + 1-D array containing alternative central values to combine with + multiplicative uncertainties. This array must have length equal + to :py:attr:`self.ndata`.
By default ``central_values`` is None, and + the central values of the commondata are used. + + Returns + ------- + systematic_errors: pd.DataFrame + Dataframe containing systematic errors. + + """ + if central_values is None: + central_values = self.central_values.to_numpy() + converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100 + return pd.concat((self.additive_errors, converted_mult_errors), axis=1) + + def export_data(self, buffer): + """Exports the central data defined by this commondata instance to the given buffer""" + ret = {"data_central": self.central_values.tolist()} + yaml.safe_dump(ret, buffer) + + def export_uncertainties(self, buffer): + """Exports the uncertainties defined by this commondata instance to the given buffer""" + definitions = {} + for idx, row in self.systype_table.iterrows(): + if row["name"] != "SKIP": + definitions[f"sys_{idx}"] = {"treatment": row["treatment"], "type": row["name"]} + + # Order the definitions by treatment as ADD, MULT + # TODO: make it so that it corresponds to the original order exactly + sorted_definitions = { + k: v for k, v in sorted(definitions.items(), key=lambda item: item[1]["treatment"]) + } + bins = [] + for idx, row in self.systematic_errors().iterrows(): + tmp = {"stat": float(self.stat_errors[idx])} + # Hope things come in the right order... 
+ for key_name, val in zip(sorted_definitions, row): + tmp[key_name] = float(val) + + bins.append(tmp) + + sorted_definitions["stat"] = { + "description": "Uncorrelated statistical uncertainties", + "treatment": "ADD", + "type": "UNCORR", + } + ret = {"definitions": sorted_definitions, "bins": bins} + yaml.safe_dump(ret, buffer) + + def export(self, folder_path): + """Wrapper around export_data and export_uncertainties + to write both uncertainties and data after filtering to a given folder + """ + folder_path.mkdir(exist_ok=True) + # Get the same names as one would use for the filters + data_path, unc_path = generate_path_filtered_data(folder_path, self.setname) + # And attach it to the given folder + data_path = folder_path / data_path.name + unc_path = folder_path / unc_path.name + # Export data and uncertainties + self.export_data(data_path.open("w", encoding="utf-8")) + self.export_uncertainties(unc_path.open("w", encoding="utf-8")) + return data_path, unc_path diff --git a/validphys2/src/validphys/process_options.py b/nnpdf_data/nnpdf_data/process_options.py similarity index 100% rename from validphys2/src/validphys/process_options.py rename to nnpdf_data/nnpdf_data/process_options.py diff --git a/nnpdf_data/nnpdf_data/utils.py b/nnpdf_data/nnpdf_data/utils.py index b07fe4f399..95383b212c 100644 --- a/nnpdf_data/nnpdf_data/utils.py +++ b/nnpdf_data/nnpdf_data/utils.py @@ -6,11 +6,21 @@ yaml_rt = YAML(typ="rt") try: + # If libyaml is available, use the C loader Loader = yaml.CLoader except AttributeError: + # fallback to the slow loader Loader = yaml.Loader +def quick_yaml_load(filepath): + """If libyaml is available, use the C loader to speed up some of the read + https://pyyaml.org/wiki/LibYAML + libyaml is available for most linux distributions + """ + return yaml.load(filepath.read_text(encoding="utf-8"), Loader=Loader) + + def parse_yaml_inp(input_yaml, spec): """ Helper function to parse yaml using the `validobj` library and print @@ -19,7 +29,7 @@ def 
parse_yaml_inp(input_yaml, spec): https://validobj.readthedocs.io/en/latest/examples.html#yaml-line-numbers """ input_yaml = pathlib.Path(input_yaml) - inp = yaml.load(input_yaml.read_text(encoding="utf-8"), Loader=Loader) + inp = quick_yaml_load(input_yaml) try: return parse_input(inp, spec) except ValidationError as e: diff --git a/nnpdf_data/nnpdf_data/validphys_compatibility.py b/nnpdf_data/nnpdf_data/validphys_compatibility.py new file mode 100644 index 0000000000..19fd445871 --- /dev/null +++ b/nnpdf_data/nnpdf_data/validphys_compatibility.py @@ -0,0 +1,116 @@ +""" + This file exists solely for me to be able to upload a package to PyPI for the nnpdf data which does not depend + on the rest of the NNPDF code. + This file should not be modified. Everything in this file is deprecated and should be removed, and the only reason + this is needed is because we are still mixing new and old data. + + This also means that _things_ that would've been loaded as a kinematic transformation or result transformation + are loaded as boring strings that cannot do anything, as it should be. 
+""" + +import dataclasses +from functools import cache +import pathlib +import typing + +import ruamel.yaml as yaml + +labeler_functions = [] + + +@dataclasses.dataclass +class PlottingOptions: + func_labels: dict = dataclasses.field(default_factory=dict) + dataset_label: typing.Optional[str] = None + experiment: typing.Optional[str] = None + nnpdf31_process: typing.Optional[str] = None + data_reference: typing.Optional[str] = None + theory_reference: typing.Optional[str] = None + process_description: typing.Optional[str] = None + y_label: typing.Optional[str] = None + x_label: typing.Optional[str] = None + kinematics_override: typing.Optional[str] = None + result_transform: typing.Optional[str] = None + x: typing.Optional[str] = None + plot_x: typing.Optional[str] = None + x_scale: typing.Optional[str] = None + y_scale: typing.Optional[str] = None + line_by: typing.Optional[list] = None + figure_by: typing.Optional[list] = None + extra_labels: typing.Optional[typing.Mapping[str, typing.List]] = None + normalize: typing.Optional[dict] = None + # Note that this "PlottingOptions" start already digested, because it actually does nothing! 
+ already_digested: typing.Optional[bool] = True + + +########## Legacy names compatibility +# The functions and variables below exist solely for reproducibility of old NNPDF results +# and no external code or new feature should depend on them as they might be removed at any point +# with no previous warning + +path_vpdata = pathlib.Path(__file__).parent +path_commondata = path_vpdata / "commondata" +_path_legacy_mapping = path_commondata / "dataset_names.yml" +_legacy_to_new_mapping_raw = yaml.YAML().load(_path_legacy_mapping) +# Convert strings into a dictionary +legacy_to_new_mapping = { + k: ({"dataset": v} if isinstance(v, str) else v) for k, v in _legacy_to_new_mapping_raw.items() +} + + +@cache +def legacy_to_new_map(dataset_name, sys=None): + """Find the new dataset name and variant corresponding to an old dataset + and systematics choice""" + if dataset_name not in legacy_to_new_mapping: + return dataset_name, None + + new_name = legacy_to_new_mapping[dataset_name] + variant = new_name.get("variant") + new_name = new_name["dataset"] + if sys is not None: + if variant is None: + raise KeyError( + f"I cannot translate the combination of {dataset_name} and sys: {sys}. Please report this." + ) + variant += f"_{sys}" + + return new_name, variant + + +@cache +def new_to_legacy_map(dataset_name, variant_used): + """Loop over the dictionary and find the right dataset. + + Since it is possible to have more than 1 dataset mapped to the same new one, + returns a list of everything that matches. + + This function will loop over the entire dictionary of mappings and selects + 1. All datasets that match exactly what's in the runcard (dataset & variant): exact_matches + 2. 
All datasets that match the dataset name: matches + If there are any `exact_matches`, it will return only those; otherwise, return all `matches` + if there are no `matches` at all, return None + """ + + matches = [] + exact_matches = [] + + for old_name, new_info in legacy_to_new_mapping.items(): + new_name = new_info["dataset"] + variant = new_info.get("variant") + + if new_name == dataset_name: + matches.append(old_name) + # if it's a nuclear DIS data promote legacy to be legacy_dw + if "_DW_" in old_name and variant_used == "legacy": + variant = "legacy_dw" + + if variant_used == variant: + exact_matches.append(old_name) + + # If we found exact matches, return those and stop looking + if exact_matches: + return exact_matches + elif matches: + return matches + return None diff --git a/nnpdf_data/pyproject.toml b/nnpdf_data/pyproject.toml index 11609e12ed..2e7090fe1e 100644 --- a/nnpdf_data/pyproject.toml +++ b/nnpdf_data/pyproject.toml @@ -28,21 +28,22 @@ exclude = [ # Data files include = [ # The version file is ignored by git so it needs to be explicitly included - "nnpdf_data/_version.py", + {path = "nnpdf_data/_version.py", format = ["sdist", "wheel"]} ] [tool.poetry.dependencies] python = "^3.9" "ruamel.yaml" = "*" +validobj = "*" +pandas = "*" +numpy = "*" # Required to run filters: `filter_files_dependencies` -pandas = {version = "*", optional = true} -numpy = {version = "*", optional = true} scipy = {version = "*", optional = true} openpyxl = {version = "*", optional = true} uproot = {version = "*", optional = true} [tool.poetry.extras] -filter = ["openpyxl", "numpy", "scipy", "pandas", "uproot"] +filter = ["openpyxl", "scipy", "uproot"] [tool.poetry-dynamic-versioning] enable = true diff --git a/validphys2/src/validphys/commondata.py b/validphys2/src/validphys/commondata.py index 1bbd7c0c2f..7c68c0784f 100644 --- a/validphys2/src/validphys/commondata.py +++ b/validphys2/src/validphys/commondata.py @@ -2,15 +2,18 @@ commondata.py Module containing actions 
which return loaded commondata, leverages utils -found in :py:mod:`validphys.commondataparser`, and returns objects from -:py:mod:`validphys.coredata` +found in :py:mod:`nnpdf_data.commondataparser`, and returns objects from +:py:mod:`nnpdf_data.coredata` """ -from reportengine import collect -from validphys.commondataparser import load_commondata + import functools -@functools.lru_cache +from nnpdf_data.commondataparser import load_commondata +from reportengine import collect + + +@functools.cache def loaded_commondata_with_cuts(commondata, cuts): """Load the commondata and apply cuts. @@ -23,10 +26,10 @@ def loaded_commondata_with_cuts(commondata, cuts): Returns ------- - loaded_cut_commondata: validphys.coredata.CommonData + loaded_cut_commondata: nnpdf_data.coredata.CommonData """ - lcd = load_commondata(commondata) + lcd = load_commondata(commondata.metadata) return lcd.with_cuts(cuts) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index ceb0343feb..94d148ed42 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -14,6 +14,7 @@ import numpy as np from ruamel.yaml import error +from nnpdf_data.commondataparser import load_commondata from nnpdf_data.theorydbutils import fetch_theory from reportengine import namespaces from reportengine.baseexceptions import AsInputError @@ -21,7 +22,6 @@ # TODO: There is a bit of a circular dependency between filters.py and this. # Maybe move the cuts logic to its own module? 
from validphys import filters, lhaindex -from validphys.commondataparser import get_plot_kinlabels, load_commondata, peek_commondata_metadata from validphys.fkparser import load_fktable, parse_cfactor from validphys.hyperoptplot import HyperoptTrial from validphys.lhapdfset import LHAPDFSet @@ -332,8 +332,6 @@ def process_type(self): @property def metadata(self): - if self.legacy: - self._metadata = peek_commondata_metadata(self.datafile) return self._metadata @functools.cached_property @@ -352,18 +350,16 @@ def __str__(self): def __iter__(self): return iter((self.datafile, self.sysfile, self.plotfiles)) + @functools.cache def load(self): """ load a validphys.core.CommonDataSpec to validphys.core.CommonData """ - return load_commondata(self) + return load_commondata(self.metadata) @property def plot_kinlabels(self): - if self.legacy: - return get_plot_kinlabels(self) - else: - return self.metadata.kinlabels + return self.metadata.kinlabels class DataSetInput(TupleComp): @@ -462,7 +458,7 @@ def __init__(self, inputs, threshold): self.threshold = threshold super().__init__(self.inputs, self.threshold) - @functools.lru_cache + @functools.cache def load(self): # TODO: Update this when a suitable interace becomes available from validphys.convolution import central_predictions @@ -473,7 +469,7 @@ def load(self): exp_err = np.sqrt( np.diag( covmat_from_systematics( - load_commondata(first_ds.commondata).with_cuts(first_ds.cuts), + load_commondata(first_ds.commondata.metadata).with_cuts(first_ds.cuts), first_ds, # DataSetSpec has weight attr use_weights_in_covmat=False, # Don't weight covmat ) @@ -663,7 +659,7 @@ def load_commondata(self): def load_commondata_instance(self): """ - Given Experiment load list of validphys.coredata.CommonData + Given Experiment load list of nnpdf_data.coredata.CommonData objects with cuts already applied """ commodata_list = [] diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 
371f9837b1..13dfd076f0 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -5,14 +5,10 @@ import dataclasses import logging -from typing import Optional import numpy as np import pandas as pd -from validphys.utils import generate_path_filtered_data, yaml_safe - -KIN_NAMES = ["kin1", "kin2", "kin3"] log = logging.getLogger(__name__) @@ -250,230 +246,3 @@ class CFactorData: description: str central_value: np.array uncertainty: np.array - - -@dataclasses.dataclass(eq=False) -class CommonData: - """ - Data contained in Commondata files, relevant cuts applied. - - Parameters - ---------- - - setname : str - Name of the dataset - - ndata : int - Number of data points - - commondataproc : str - Process type, one of 21 options - - nkin : int - Number of kinematics specified - - nsys : int - Number of systematics - - commondata_table : pd.DataFrame - Pandas dataframe containing the commondata - - systype_table : pd.DataFrame - Pandas dataframe containing the systype index - for each systematic alongside the uncertainty - type (ADD/MULT/RAND) and name - (CORR/UNCORR/THEORYCORR/SKIP) - - systematics_table: pd.DataFrame - Panda dataframe containing the table of systematics - """ - - setname: str - ndata: int - commondataproc: str - nkin: int - nsys: int - commondata_table: pd.DataFrame = dataclasses.field(repr=False) - systype_table: pd.DataFrame = dataclasses.field(repr=False) - legacy: bool = False - systematics_table: Optional[pd.DataFrame] = dataclasses.field(init=None, repr=False) - legacy_names: Optional[list] = None - kin_variables: Optional[list] = None - - def __post_init__(self): - self.systematics_table = self.commondata_table.drop( - columns=["process", "data", "stat"] + KIN_NAMES - ) - # TODO: set for now commondataproc as a string - self.commondataproc = str(self.commondataproc) - - def with_cuts(self, cuts): - """A method to return a CommonData object where - an integer mask has been applied, keeping only data - points 
which pass cuts. - - Note if the first data point passes cuts, the first entry - of ``cuts`` should be ``0``. - - Paramters - --------- - cuts: list or validphys.core.Cuts or None - """ - # Ensure that the cuts we're applying applies to this dataset - # only check, however, if the cuts is of type :py:class:`validphys.core.Cuts` - if hasattr(cuts, "name") and self.setname != cuts.name: - raise ValueError( - f"The cuts provided are for {cuts.name} which does not apply " - f"to this commondata file: {self.setname}" - ) - - if hasattr(cuts, "load"): - cuts = cuts.load() - if cuts is None: - return self - - # We must shift the cuts up by 1 since a cut of 0 implies the first data point - # while commondata indexing starts at 1. - cuts = list(map(lambda x: x + 1, cuts)) - - newndata = len(cuts) - new_commondata_table = self.commondata_table.loc[cuts] - return dataclasses.replace(self, ndata=newndata, commondata_table=new_commondata_table) - - @property - def kinematics(self): - return self.commondata_table[KIN_NAMES] - - def get_kintable(self): - return self.kinematics.values - - @property - def central_values(self): - return self.commondata_table["data"] - - def with_central_value(self, cv): - tb = self.commondata_table.copy() - tb["data"] = cv - return dataclasses.replace(self, commondata_table=tb) - - def get_cv(self): - return self.central_values.values - - @property - def stat_errors(self): - return self.commondata_table["stat"] - - @property - def multiplicative_errors(self): - """Returns the systematics which are multiplicative (systype is MULT) - in a percentage format, with SKIP uncertainties removed. 
- - """ - mult_systype = self.systype_table[self.systype_table["treatment"] == "MULT"] - mult_table = self.systematics_table.filter(like="MULT") - - if self.legacy: - # Needed in legacy because every uncertainty appears as both mult and add - # so it is necessary to select the uncertainties that are to be consireded as MULT/ADD - # Minus 1 because iloc starts from 0, while the systype counting starts from 1 - mult_table = mult_table.iloc[:, mult_systype.index - 1] - - mult_table.columns = mult_systype["name"].to_numpy() - return mult_table.loc[:, mult_table.columns != "SKIP"] - - @property - def additive_errors(self): - """Returns the systematics which are additive (systype is ADD) as - absolute uncertainties (same units as data), with SKIP uncertainties - removed. - - """ - add_systype = self.systype_table[self.systype_table["treatment"] == "ADD"] - add_table = self.systematics_table.filter(like="ADD") - - if self.legacy: - # Minus 1 because iloc starts from 0, while the systype counting starts from 1 - add_table = add_table.iloc[:, add_systype.index - 1] - - add_table.columns = add_systype["name"].to_numpy() - return add_table.loc[:, add_table.columns != "SKIP"] - - def systematic_errors(self, central_values=None): - """Returns all systematic errors as absolute uncertainties, with a - single column for each uncertainty. Converts - :py:attr:`multiplicative_errors` to units of data and then appends - onto :py:attr:`additive_errors`. By default uses the experimental - central values to perform conversion, but the user can supply a - 1-D array of central values, with length :py:attr:`self.ndata`, to use - instead of the experimental central values to calculate the absolute - contribution of the multiplicative systematics. - - Parameters - ---------- - central_values: None, np.array - 1-D array containing alternative central values to combine with - multiplicative uncertainties. This array must have length equal - to :py:attr:`self.ndata`. 
By default ``central_values`` is None, and - the central values of the commondata are used. - - Returns - ------- - systematic_errors: pd.DataFrame - Dataframe containing systematic errors. - - """ - if central_values is None: - central_values = self.central_values.to_numpy() - converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100 - return pd.concat((self.additive_errors, converted_mult_errors), axis=1) - - def export_data(self, buffer): - """Exports the central data defined by this commondata instance to the given buffer""" - ret = {"data_central": self.central_values.tolist()} - yaml_safe.dump(ret, buffer) - - def export_uncertainties(self, buffer): - """Exports the uncertainties defined by this commondata instance to the given buffer""" - definitions = {} - for idx, row in self.systype_table.iterrows(): - if row["name"] != "SKIP": - definitions[f"sys_{idx}"] = {"treatment": row["treatment"], "type": row["name"]} - - # Order the definitions by treatment as ADD, MULT - # TODO: make it so that it corresponds to the original order exactly - sorted_definitions = { - k: v for k, v in sorted(definitions.items(), key=lambda item: item[1]["treatment"]) - } - bins = [] - for idx, row in self.systematic_errors().iterrows(): - tmp = {"stat": float(self.stat_errors[idx])} - # Hope things come in the right order... 
- for key_name, val in zip(sorted_definitions, row): - tmp[key_name] = float(val) - - bins.append(tmp) - - sorted_definitions["stat"] = { - "description": "Uncorrelated statistical uncertainties", - "treatment": "ADD", - "type": "UNCORR", - } - ret = {"definitions": sorted_definitions, "bins": bins} - yaml_safe.dump(ret, buffer) - - def export(self, folder_path): - """Wrapper around export_data and export_uncertainties - to write both uncertainties and data after filtering to a given folder - """ - folder_path.mkdir(exist_ok=True) - # Get the same names as one would use for the filters - data_path, unc_path = generate_path_filtered_data(folder_path, self.setname) - # And attach it to the given folder - data_path = folder_path / data_path.name - unc_path = folder_path / unc_path.name - # Export data and uncertainties - with open(data_path, "w") as file: - self.export_data(file) - with open(unc_path, "w") as file: - self.export_uncertainties(file) - return data_path, unc_path diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index ca537dc03a..e94c03238d 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -35,7 +35,7 @@ def covmat_from_systematics( _central_values=None, ): """Take the statistical uncertainty and systematics table from - a :py:class:`validphys.coredata.CommonData` object and + a :py:class:`nnpdf_data.coredata.CommonData` object and construct the covariance matrix accounting for correlations between systematics. @@ -72,7 +72,7 @@ def covmat_from_systematics( Parameters ---------- - loaded_commondata_with_cuts : validphys.coredata.CommonData + loaded_commondata_with_cuts : nnpdf_data.coredata.CommonData CommonData which stores information about systematic errors, their treatment and description. 
dataset_input: validphys.core.DataSetInput @@ -131,7 +131,7 @@ def dataset_inputs_covmat_from_systematics( _list_of_central_values=None, _only_additive=False, ): - """Given a list containing :py:class:`validphys.coredata.CommonData` s, + """Given a list containing :py:class:`nnpdf_data.coredata.CommonData` s, construct the full covariance matrix. This is similar to :py:meth:`covmat_from_systematics` @@ -142,7 +142,7 @@ def dataset_inputs_covmat_from_systematics( Parameters ---------- - dataset_inputs_loaded_cd_with_cuts : list[validphys.coredata.CommonData] + dataset_inputs_loaded_cd_with_cuts : list[nnpdf_data.coredata.CommonData] list of CommonData objects. data_input: list[validphys.core.DataSetInput] Settings for each dataset, each element contains the weight for the @@ -265,7 +265,7 @@ def t0_covmat_from_systematics( Parameters ---------- - loaded_commondata_with_cuts: validphys.coredata.CommonData + loaded_commondata_with_cuts: nnpdf_data.coredata.CommonData commondata object for which to generate the covmat. dataset_input: validphys.core.DataSetInput Dataset settings, contains the weight for the current dataset. @@ -307,7 +307,7 @@ def dataset_inputs_t0_covmat_from_systematics( Parameters ---------- - dataset_inputs_loaded_cd_with_cuts: list[validphys.coredata.CommonData] + dataset_inputs_loaded_cd_with_cuts: list[nnpdf_data.coredata.CommonData] The CommonData for all datasets defined in ``dataset_inputs``. data_input: list[validphys.core.DataSetInput] Settings for each dataset, each element contains the weight for the @@ -483,7 +483,7 @@ def generate_exp_covmat( Parameters ---------- - dataset_inputs: list[validphys.coredata.CommonData] + dataset_inputs: list[nnpdf_data.coredata.CommonData] list of CommonData objects. 
data: list[validphys.core.DataSetInput] Settings for each dataset, each element contains the weight for the diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py index 37800b8be3..759c89f210 100644 --- a/validphys2/src/validphys/dataplots.py +++ b/validphys2/src/validphys/dataplots.py @@ -18,6 +18,7 @@ import pandas as pd import scipy.stats as stats +from nnpdf_data.coredata import KIN_NAMES from reportengine import collect from reportengine.checks import CheckError, check, make_argcheck, make_check from reportengine.figure import figure, figuregen @@ -25,7 +26,6 @@ from validphys import plotutils from validphys.checks import check_not_using_pdferr from validphys.core import CutsPolicy, MCStats, cut_mask -from validphys.coredata import KIN_NAMES from validphys.plotoptions.core import get_info, kitable, transform_result from validphys.results import chi2_stat_labels, chi2_stats from validphys.sumrules import POL_LIMS, partial_polarized_sum_rules diff --git a/validphys2/src/validphys/filters.py b/validphys2/src/validphys/filters.py index d148929684..510d9dd387 100644 --- a/validphys2/src/validphys/filters.py +++ b/validphys2/src/validphys/filters.py @@ -12,10 +12,11 @@ import numpy as np +from nnpdf_data.coredata import generate_path_filtered_data +from nnpdf_data.process_options import PROCESSES from reportengine.checks import check, make_check import validphys.cuts -from validphys.process_options import PROCESSES -from validphys.utils import generate_path_filtered_data, yaml_safe +from validphys.utils import yaml_safe log = logging.getLogger(__name__) @@ -707,7 +708,7 @@ def get_cuts_for_dataset(commondata, rules) -> list: Parameters ---------- - commondata: validphys.coredata.CommonData + commondata: nnpdf_data.coredata.CommonData rules: List[Rule] A list of Rule objects specifying the filters. 
diff --git a/validphys2/src/validphys/loader.py b/validphys2/src/validphys/loader.py index e8312b6c10..607d8e6497 100644 --- a/validphys2/src/validphys/loader.py +++ b/validphys2/src/validphys/loader.py @@ -19,9 +19,10 @@ import requests from nnpdf_data import legacy_to_new_mapping, path_vpdata +from nnpdf_data.commondataparser import parse_new_metadata, parse_set_metadata +from nnpdf_data.coredata import generate_path_filtered_data from reportengine import filefinder from validphys import lhaindex -from validphys.commondataparser import load_commondata_old, parse_new_metadata, parse_set_metadata from validphys.core import ( PDF, CommonDataSpec, @@ -37,7 +38,7 @@ PositivitySetSpec, TheoryIDSpec, ) -from validphys.utils import generate_path_filtered_data, tempfile_cleaner, yaml_safe +from validphys.utils import tempfile_cleaner, yaml_safe log = logging.getLogger(__name__) NNPDF_DIR = "NNPDF" @@ -198,6 +199,8 @@ def _use_fit_commondata_old_format_to_new_format(setname, file_path): (e.g., a closure test ran for NNPDF4.0) and creates a new-format version in a temporary folder to be read by the commondata. 
Note that this does not modify the fit""" + from .deprecated_functions import load_commondata_old + if not file_path.exists(): raise DataNotFoundError(f"Data for {setname} at {file_path} not found") diff --git a/validphys2/src/validphys/pineparser.py b/validphys2/src/validphys/pineparser.py index d7dd4488a8..8f6825a26c 100644 --- a/validphys2/src/validphys/pineparser.py +++ b/validphys2/src/validphys/pineparser.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd -from validphys.commondataparser import EXT, TheoryMeta +from nnpdf_data.commondataparser import EXT, TheoryMeta from validphys.coredata import FKTableData log = logging.getLogger(__name__) diff --git a/validphys2/src/validphys/plotoptions/core.py b/validphys2/src/validphys/plotoptions/core.py index 985be5cf1c..e55ff1a6db 100644 --- a/validphys2/src/validphys/plotoptions/core.py +++ b/validphys2/src/validphys/plotoptions/core.py @@ -7,11 +7,11 @@ import numpy as np import pandas as pd +from nnpdf_data.coredata import CommonData from nnpdf_data.utils import parse_yaml_inp from reportengine.floatformatting import format_number from reportengine.utils import ChainMap from validphys.core import CommonDataSpec, DataSetSpec -from validphys.coredata import CommonData from validphys.plotoptions.plottingoptions import PlottingOptions, default_labels, labeler_functions from validphys.plotoptions.utils import apply_to_all_columns diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index f850fa2ea5..8a709e5f08 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -131,7 +131,7 @@ def make_replica( max_tries=int(1e6), resample_negative_pseudodata=True, ): - """Function that takes in a list of :py:class:`validphys.coredata.CommonData` + """Function that takes in a list of :py:class:`nnpdf_data.coredata.CommonData` objects and returns a pseudodata replica accounting for possible correlations between systematic uncertainties. 
@@ -141,7 +141,7 @@ def make_replica( Parameters --------- - groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`validphys.coredata.CommonData`] + groups_dataset_inputs_loaded_cd_with_cuts: list[:py:class:`nnpdf_data.coredata.CommonData`] List of CommonData objects which stores information about systematic errors, their treatment and description, for each dataset. @@ -296,7 +296,7 @@ def level0_commondata_wc(data, fakepdf): Returns ------- list - list of validphys.coredata.CommonData instances corresponding to + list of nnpdf_data.coredata.CommonData instances corresponding to all datasets within one experiment. The central value is replaced by Level 0 fake data. @@ -311,7 +311,7 @@ def level0_commondata_wc(data, fakepdf): level0_commondata_instances_wc = [] - # ==== Load validphys.coredata.CommonData instance with cuts ====# + # ==== Load nnpdf_data.coredata.CommonData instance with cuts ====# for dataset in data.datasets: commondata_wc = dataset.commondata.load() @@ -360,7 +360,7 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul data : validphys.core.DataGroupSpec level0_commondata_wc : list - list of validphys.coredata.CommonData instances corresponding to + list of nnpdf_data.coredata.CommonData instances corresponding to all datasets within one experiment. The central value is replaced by Level 0 fake data. Cuts already applied. @@ -372,7 +372,7 @@ def make_level1_data(data, level0_commondata_wc, filterseed, data_index, sep_mul Returns ------- list - list of validphys.coredata.CommonData instances corresponding to + list of nnpdf_data.coredata.CommonData instances corresponding to all datasets within one experiment. The central value is replaced by Level 1 fake data. 
diff --git a/validphys2/src/validphys/tests/test_commondataparser.py b/validphys2/src/validphys/tests/test_commondataparser.py index a1ce493652..41ec22932a 100644 --- a/validphys2/src/validphys/tests/test_commondataparser.py +++ b/validphys2/src/validphys/tests/test_commondataparser.py @@ -2,8 +2,8 @@ import pandas as pd import pytest +from nnpdf_data.commondataparser import load_commondata from validphys.api import API -from validphys.commondataparser import load_commondata from validphys.covmats_utils import construct_covmat from validphys.loader import FallbackLoader as Loader from validphys.tests.conftest import FIT, THEORYID_NEW @@ -12,7 +12,7 @@ def test_basic_commondata_loading(): l = Loader() cd = l.check_commondata(setname="SLAC_NC_NOTFIXED_D_EM-F2", variant="legacy_dw") - res = load_commondata(cd) + res = load_commondata(cd.metadata) # Test commondata loading assert res.ndata == 211 assert isinstance(res.commondata_table, pd.DataFrame) @@ -30,7 +30,7 @@ def test_basic_commondata_loading(): emptysyscd = l.check_posset( theoryID=THEORYID_NEW, setname='NNPDF_POS_2P24GEV_XDQ', postlambda=1e-10, rules=rules ) - emptysysres = load_commondata(emptysyscd.commondata) + emptysysres = load_commondata(emptysyscd.commondata.metadata) assert emptysysres.nsys == 0 assert emptysysres.systype_table.empty is True @@ -40,7 +40,7 @@ def test_commondata_with_cuts(): setname = "NMC_NC_NOTFIXED_P_EM-SIGMARED" cd = l.check_commondata(setname=setname, variant="legacy") - loaded_cd = load_commondata(cd) + loaded_cd = load_commondata(cd.metadata) fit_cuts = l.check_fit_cuts(fit=FIT, commondata=cd) internal_cuts = l.check_internal_cuts(cd, API.rules(theoryid=THEORYID_NEW, use_cuts="internal")) @@ -110,9 +110,10 @@ def test_commondata_load_write_load(tmp): original_covmat = construct_covmat(original_stats, original_data.systematic_errors(fake_data)) np.testing.assert_allclose(new_covmat, original_covmat) + def test_variant_nnpdf_metadata(): """Tests the undocumented feature of a 
variant which updates the key `experiment` - within the nnpdf_metadata + within the nnpdf_metadata """ l = Loader() set_name = "SLAC_NC_NOTFIXED_D_EM-F2" @@ -124,7 +125,9 @@ def test_variant_nnpdf_metadata(): pcd2 = cd2.metadata.plotting_options # ensure the nnpdf_metadata and the plotting are changed - assert cd1.metadata.nnpdf_metadata["experiment"] != cd2.metadata.nnpdf_metadata["experiment"] + assert ( + cd1.metadata.nnpdf_metadata["experiment"] != cd2.metadata.nnpdf_metadata["experiment"] + ) assert pcd2.experiment != pcd1.experiment # but the real experiment is the same assert cd1.metadata.experiment == cd2.metadata.experiment diff --git a/validphys2/src/validphys/tests/test_covmats.py b/validphys2/src/validphys/tests/test_covmats.py index cb1dc0a1bf..e533c74f33 100644 --- a/validphys2/src/validphys/tests/test_covmats.py +++ b/validphys2/src/validphys/tests/test_covmats.py @@ -7,8 +7,8 @@ import numpy as np import pytest +from nnpdf_data.commondataparser import load_commondata from validphys.api import API -from validphys.commondataparser import load_commondata from validphys.covmats import dataset_t0_predictions, reorder_thcovmat_as_expcovmat, sqrt_covmat from validphys.tests.conftest import DATA, HESSIAN_PDF, PDF, THEORYID_NEW @@ -149,7 +149,7 @@ def test_single_datapoint(single_data_single_point_internal_cuts_config): t0_predictions = dataset_t0_predictions(t0ds, t0set) cd = API.commondata(**single_data_single_point_internal_cuts_config) - ld = load_commondata(cd) + ld = load_commondata(cd.metadata) # Ensure the dataset is only a single datapoint assert ld.ndata == 1 ld.systematic_errors(t0_predictions) diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py index 042d8b3925..425ca44a2f 100644 --- a/validphys2/src/validphys/utils.py +++ b/validphys2/src/validphys/utils.py @@ -11,14 +11,6 @@ yaml_fast = YAML(typ='safe', pure=False) # uses Cparser if available (not faster than yaml_safe) -def generate_path_filtered_data(fit_path, 
setname): - """Utility to ensure that both the loader and tools like setupfit utilize the same convention - to generate the names of generated pseudodata""" - data_path = fit_path / "filter" / setname / f"filtered_data_{setname}.yaml" - unc_path = data_path.with_name(f"filtered_uncertainties_{setname}.yaml") - return data_path, unc_path - - @contextlib.contextmanager def tempfile_cleaner(root, exit_func, exc, prefix=None, **kwargs): """A context manager to handle temporary directory creation and