From 5f77c9814d942250371fb8959869370818a8aa48 Mon Sep 17 00:00:00 2001 From: Emre_Yorat <62134151+Emre-Yorat89@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:59:39 +0300 Subject: [PATCH] Retrieve databundle light data size check (#911) * retrieve_databundle size check commit #1 * retrieve_databundle size check commit #2 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revised get_best_bundles_by_category() function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * retrieve_databundle_light PR update * release note is added to PR911 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Davide Fioriti <67809479+davide-f@users.noreply.github.com> --- doc/release_notes.rst | 2 ++ scripts/retrieve_databundle_light.py | 37 ++++++++++++++++------------ 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 5a48de0c3..3f50f3920 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -18,6 +18,8 @@ E.g. if a new rule becomes available describe how to use it `snakemake -j1 run_t * Function added in clean_osm_data script to allow the use of custom network data instead or on-top of OSM data. 
`PR #842 <'https://github.com/pypsa-meets-earth/pypsa-earth/pull/842>`__ +* Improve retrieve_databundle to prioritize smallest databundles `PR #911 <https://github.com/pypsa-meets-earth/pypsa-earth/pull/911>`__ + PyPSA-Earth 0.2.3 ================= diff --git a/scripts/retrieve_databundle_light.py b/scripts/retrieve_databundle_light.py index 7b9b6e125..c05586642 100644 --- a/scripts/retrieve_databundle_light.py +++ b/scripts/retrieve_databundle_light.py @@ -85,6 +85,7 @@ import re from zipfile import ZipFile +import pandas as pd import yaml from _helpers import ( configure_logging, @@ -483,28 +484,32 @@ def get_best_bundles_by_category( List of bundles to download """ # dictionary with the number of match by configuration for tutorial/non-tutorial configurations - dict_n_matched = { - bname: config_bundles[bname]["n_matched"] - for bname in config_bundles - if config_bundles[bname]["category"] == category - and config_bundles[bname].get("tutorial", False) == tutorial - and _check_disabled_by_opt(config_bundles[bname], config_enable) != ["all"] - } + df_matches = pd.DataFrame(columns=["bundle_name", "bundle_size", "n_matched"]) + + for bname, bvalue in config_bundles.items(): + if ( + bvalue["category"] == category + and bvalue.get("tutorial", False) == tutorial + and _check_disabled_by_opt(bvalue, config_enable) != ["all"] + ): + df_matches.loc[bname] = [ + bname, + len(bvalue["countries"]), + bvalue["n_matched"], + ] - returned_bundles = [] + df_matches["neg_bundle_size"] = -df_matches["bundle_size"] + df_matches.sort_values( + by=["n_matched", "neg_bundle_size"], inplace=True, ascending=False + ) - # check if non-empty dictionary - if dict_n_matched: - # if non-empty, then pick bundles until all countries are selected - # or no more bundles are found - dict_sort = sorted(dict_n_matched.items(), key=lambda d: d[1]) + returned_bundles = [] + if not df_matches.empty: current_matched_countries = [] remaining_countries = set(country_list) - for d_val in dict_sort: - bname = d_val[0] - + for bname in df_matches.index:
cbundle_list = set(config_bundles[bname]["countries"]) # list of countries in the bundle that are not yet matched