From 5f77c9814d942250371fb8959869370818a8aa48 Mon Sep 17 00:00:00 2001 From: Emre_Yorat <62134151+Emre-Yorat89@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:59:39 +0300 Subject: [PATCH] Retrieve databundle light data size check (#911) * retrieve_databundle size check commit #1 * retrieve_databundle size check commit #2 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revised get_best_bundles_by_category() function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * retrieve_databundle_light PR update * release note is added to PR911 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Davide Fioriti <67809479+davide-f@users.noreply.github.com> --- doc/release_notes.rst | 2 ++ scripts/retrieve_databundle_light.py | 37 ++++++++++++++++------------ 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 5a48de0c3..3f50f3920 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -18,6 +18,8 @@ E.g. if a new rule becomes available describe how to use it `snakemake -j1 run_t * Function added in clean_osm_data script to allow the use of custom network data instead or on-top of OSM data. 
`PR #842 <'https://github.com/pypsa-meets-earth/pypsa-earth/pull/842>`__ +* Improve retrieve_databundle to prioritize smallest databundles `PR #911 <https://github.com/pypsa-meets-earth/pypsa-earth/pull/911>`__ + PyPSA-Earth 0.2.3 ================= diff --git a/scripts/retrieve_databundle_light.py b/scripts/retrieve_databundle_light.py index 7b9b6e125..c05586642 100644 --- a/scripts/retrieve_databundle_light.py +++ b/scripts/retrieve_databundle_light.py @@ -85,6 +85,7 @@ import re from zipfile import ZipFile +import pandas as pd import yaml from _helpers import ( configure_logging, @@ -483,28 +484,32 @@ def get_best_bundles_by_category( List of bundles to download """ # dictionary with the number of match by configuration for tutorial/non-tutorial configurations - dict_n_matched = { - bname: config_bundles[bname]["n_matched"] - for bname in config_bundles - if config_bundles[bname]["category"] == category - and config_bundles[bname].get("tutorial", False) == tutorial - and _check_disabled_by_opt(config_bundles[bname], config_enable) != ["all"] - } + df_matches = pd.DataFrame(columns=["bundle_name", "bundle_size", "n_matched"]) + + for bname, bvalue in config_bundles.items(): + if ( + bvalue["category"] == category + and bvalue.get("tutorial", False) == tutorial + and _check_disabled_by_opt(bvalue, config_enable) != ["all"] + ): + df_matches.loc[bname] = [ + bname, + len(bvalue["countries"]), + bvalue["n_matched"], + ] - returned_bundles = [] + df_matches["neg_bundle_size"] = -df_matches["bundle_size"] + df_matches.sort_values( + by=["n_matched", "neg_bundle_size"], inplace=True, ascending=False + ) - # check if non-empty dictionary - if dict_n_matched: - # if non-empty, then pick bundles until all countries are selected - # or no more bundles are found - dict_sort = sorted(dict_n_matched.items(), key=lambda d: d[1]) + returned_bundles = [] + if not df_matches.empty: current_matched_countries = [] remaining_countries = set(country_list) - for d_val in dict_sort: - bname = d_val[0] - + for bname in df_matches.index:
cbundle_list = set(config_bundles[bname]["countries"]) # list of countries in the bundle that are not yet matched