From c8ba3d42dea82a418f7afdc58717457176a7e5ad Mon Sep 17 00:00:00 2001
From: Matt Fisher
Date: Tue, 20 Aug 2024 17:51:16 -0600
Subject: [PATCH] Remove random collection sampling from test module

---
 tests/integration/test_onprem_download.py | 82 +++++++++++++----------
 1 file changed, 48 insertions(+), 34 deletions(-)

diff --git a/tests/integration/test_onprem_download.py b/tests/integration/test_onprem_download.py
index a1641463..999a91ee 100644
--- a/tests/integration/test_onprem_download.py
+++ b/tests/integration/test_onprem_download.py
@@ -1,14 +1,13 @@
 import logging
 import os
-import random
 import shutil
 import unittest
-from collections import TypedDict
 from pathlib import Path
+from typing import TypedDict
 
 import earthaccess
 import pytest
-from earthaccess import Auth, DataCollections, DataGranules, Store
+from earthaccess import Auth, DataGranules, Store
 
 from .sample import get_sample_granules
 
@@ -16,10 +15,10 @@
 
 class TestParam(TypedDict):
-    daac_name: str
+    provider_name: str
 
     # How many of the top collections we will test, e.g. top 3 collections
-    top_n_collections: int
+    n_for_top_collections: int
 
     # How many granules we will query
     granules_count: int
 
@@ -33,26 +32,26 @@
 
 daacs_list: list[TestParam] = [
     {
-        "short_name": "NSIDC",
-        "top_n_collections": 3,
-        "granules_count": 100,
-        "granules_sample_size": 2,
-        "granules_max_size_mb": 100,
-    },
-    {
-        "short_name": "GES_DISC",
-        "top_n_collections": 2,
-        "granules_count": 100,
-        "granules_sample_size": 2,
-        "granules_max_size_mb": 130,
-    },
-    {
-        "short_name": "LPDAAC",
-        "top_n_collections": 2,
+        "provider_name": "NSIDC_ECS",
+        "n_for_top_collections": 3,
         "granules_count": 100,
         "granules_sample_size": 2,
         "granules_max_size_mb": 100,
     },
+    # {
+    #     "provider_name": "GES_DISC",
+    #     "n_for_top_collections": 2,
+    #     "granules_count": 100,
+    #     "granules_sample_size": 2,
+    #     "granules_max_size_mb": 130,
+    # },
+    # {
+    #     "provider_name": "LPDAAC",
+    #     "n_for_top_collections": 2,
+    #     "granules_count": 100,
+    #     "granules_sample_size": 2,
+    #     "granules_max_size_mb": 100,
+    # },
 ]
 
 assertions = unittest.TestCase("__init__")
@@ -70,7 +69,25 @@
     store = Store(auth)
 
 
+def top_collections_for_provider(provider: str, *, n: int) -> list[str]:
+    """Return the top n collections for this provider.
+
+    The local cache is the source for this list. Run
+    `./popular_collections/generate.py` to refresh it!
+
+    TODO: Skip or exclude collections that have a EULA; filter them out in this
+    function, or use a pytest skip/xfail mark?
+    """
+    popular_collections_dir = Path(__file__).parent / "popular_collections"
+    popular_collections_file = popular_collections_dir / f"{provider}.txt"
+    with open(popular_collections_file) as f:
+        popular_collections = f.read().splitlines()
+
+    return popular_collections[:n]
+
+
 def supported_collection(data_links):
+    """Return False if any of this collection's data links are unsupported (e.g. PO.DAAC Drive)."""
     for url in data_links:
         if "podaac-tools.jpl.nasa.gov/drive" in url:
             return False
@@ -80,23 +97,20 @@ def supported_collection(data_links):
 
 @pytest.mark.parametrize("daac", daacs_list)
 def test_earthaccess_can_download_onprem_collection_granules(daac):
     """Tests that we can download on-premises collections using HTTPS links."""
-    daac_shortname = daac["short_name"]
-    collections_count = daac["collections_count"]
-    collections_sample_size = daac["collections_sample_size"]
+    provider = daac["provider_name"]
+    n_for_top_collections = daac["n_for_top_collections"]
+
     granules_count = daac["granules_count"]
     granules_sample_size = daac["granules_sample_size"]
     granules_max_size = daac["granules_max_size_mb"]
-    collection_query = DataCollections().data_center(daac_shortname).cloud_hosted(False)
-    hits = collection_query.hits()
-    logger.info(f"On-premises collections for {daac_shortname}: {hits}")
-    collections = collection_query.get(collections_count)
-    assertions.assertGreater(len(collections), collections_sample_size)
-    # We sample n cloud hosted collections from the results
-    random_collections = random.sample(collections, collections_sample_size)
-    logger.info(f"Sampled {len(random_collections)} collections")
-    for collection in random_collections:
-        concept_id = collection.concept_id()
+    top_collections = top_collections_for_provider(
+        provider,
+        n=n_for_top_collections,
+    )
+    logger.info(f"On-premises collections for {provider}: {len(top_collections)}")
+
+    for concept_id in top_collections:
         granule_query = DataGranules().concept_id(concept_id)
         total_granules = granule_query.hits()
         granules = granule_query.get(granules_count)
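Note on the data contract this patch introduces: top_collections_for_provider()
reads a plain-text cache file per provider at
tests/integration/popular_collections/<PROVIDER>.txt and slices the head of the
list, so the most popular collections must come first. A minimal sketch of that
contract (the file path comes from the patch; the concept IDs below are made up
for illustration):

    from pathlib import Path

    # Illustrative cache file contents, one CMR collection concept ID per
    # line, most popular first (IDs are invented):
    #
    #     C1000000001-NSIDC_ECS
    #     C1000000002-NSIDC_ECS
    #     C1000000003-NSIDC_ECS
    #     C1000000004-NSIDC_ECS

    cache = Path("tests/integration/popular_collections/NSIDC_ECS.txt")
    popular_collections = cache.read_text().splitlines()

    # With n_for_top_collections=3, the helper yields the three most
    # popular collections for the provider:
    top_3 = popular_collections[:3]
    assert len(top_3) == 3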
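Since ./popular_collections/generate.py is referenced in the docstring but not
included in this patch, here is a hypothetical sketch of how such a cache file
could be produced with earthaccess itself. The popularity ordering is an
assumption (the real script may rank collections by a usage metric this sketch
does not have):

    from pathlib import Path

    import earthaccess

    PROVIDER = "NSIDC_ECS"  # assumption: matches TestParam["provider_name"]

    # Find this provider's on-premises (non-cloud-hosted) collections in CMR.
    collections = earthaccess.search_datasets(provider=PROVIDER, cloud_hosted=False)

    # Cache one concept ID per line -- the format that
    # top_collections_for_provider() reads back.
    out = Path(__file__).parent / "popular_collections" / f"{PROVIDER}.txt"
    out.write_text("\n".join(c.concept_id() for c in collections) + "\n")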