Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download all files from a Catalog at once #48 #57

Merged
merged 11 commits into from
Jun 13, 2024
146 changes: 146 additions & 0 deletions sospice/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import matplotlib.colors as mcolors
import pandas as pd
import numpy as np
from parfive import Downloader
import warnings

from astropy.utils.data import download_file

Expand Down Expand Up @@ -451,3 +453,147 @@ def plot_fov(self, ax, **kwargs):
fontsize="xx-large",
borderaxespad=2,
)

def _get_file_url_from_base_url(self, file_metadata, base_url):
"""
Get URL for a file located under some base URL

Parameters
----------
base_url: str
Base URL

Return
------
str
File URL

Notes:

* There is no guarantee that the URL corresponds to an existing location
* The base URL can be a path on disk, but paths are built using "/"
and this might not work on all operating systems.
"""
if not base_url.endswith("/"):
base_url += "/"
return base_url + file_metadata["FILE_PATH"] + "/" + file_metadata["FILENAME"]

def _get_file_url(self, file_metadata=None, base_url=None, release=None):
"""
Get file URL, from a release, from some other online file tree, or from SOAR if no parameter has been provided

Parameters
----------
base_url: str
Base URL for file
release: Release or str
Release to download file from. This can be a Release object, or a string for the release tag.

Return
------
str
File URL
"""
if release is not None:
if type(release) is str:
release = Release(release)
url = self._get_file_url_from_base_url(release.url)
elif base_url is not None:
url = self._get_file_url_from_base_url(base_url)
else:
url = "http://soar.esac.esa.int/soar-sl-tap/data"
url += "?retrieval_type=ALL_PRODUCTS"
url += "&QUERY=SELECT+filepath,filename+FROM+soar.v_sc_repository_file"
url += f"+WHERE+filename='{file_metadata.FILENAME}'"
return url

def _process_downloads(
self, row, base_dir=None, base_url=None, keep_tree=True, downloader=None
):
"""
Process individual rows for downloading.

Parameters
----------
row: pd.Series
single row containing file metadata
base_dir: Path or str
Base directory to download file to
base_url: str
Base URL for file
keep_tree: bool
Keep tree directory structure (by level and date)
downloader: parfive.Downloader
If provided, enqueue file for download instead of downloading it.
To download enqueued files, run `downloader.download()`


Return
------
parfive.Result
Download result (or None if file has only been enqueued)
"""
url = self._get_file_url(
file_metadata=row, base_url=base_url, release=self.release_tag
)
if keep_tree:
destination = Path(base_dir) / row["FILE_PATH"]
destination.mkdir(parents=True, exist_ok=True)
else:
destination = Path(base_dir)
do_download = False
if downloader is None:
downloader = Downloader(overwrite=False)
do_download = True
downloader.enqueue_file(url, destination, row["FILENAME"])
if do_download:
result = downloader.download()
return result
return None

def download_files(
    self,
    base_dir,
    base_url=None,
    keep_tree=True,
    downloader=None,
    max_download=1000,
):
    """
    Download all files from Catalog.

    The catalog is obtained from `read_catalog()` and each of its first
    `max_download` rows is handed to `_process_downloads`.

    Parameters
    ----------
    base_dir: Path or str
        Base directory to download file to
    base_url: str
        Base URL for file
    keep_tree: bool
        Keep tree directory structure (by level and date)
    downloader: parfive.Downloader
        If provided, enqueue file for download instead of downloading it.
        To download enqueued files, run `downloader.download()`
    max_download: int or None
        Maximum number of files to process (default: 1000).
        User can override it by changing the value; None removes the limit.
        A warning is issued when the limit is above 1000 or removed.

    Return
    ------
    pandas.Series
        Result of the row-wise `apply` (normally one entry per processed
        row): the download result for each file, or None for files that
        have only been enqueued.
    """
    get_catalog = self.read_catalog()
    # None means "no limit"; it must be tested first as it cannot be compared to an int
    if max_download is None or max_download > 1000:
        warnings.warn(
            "You are overriding the default max_download: This might cause performance issues.",
            UserWarning,
        )

    # A slice ending with None selects all rows
    processed_downloads = get_catalog.iloc[:max_download].apply(
        lambda row: self._process_downloads(
            row, base_dir, base_url, keep_tree, downloader
        ),
        axis=1,
    )

    return processed_downloads
34 changes: 33 additions & 1 deletion sospice/catalog/tests/test_catalog.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pytest
from datetime import datetime
import pandas as pd

from pathlib import Path
import shutil
from parfive import Downloader

from ..catalog import Catalog

Expand All @@ -11,6 +13,11 @@ def catalog2():
return Catalog(release_tag="2.0")


@pytest.fixture
def filename():  # noqa: F811
    """Provide a sample FITS file name for tests."""
    sample_file_name = "solo_L2_spice-n-exp_20220305T072522_V01_100663707-014.fits"
    return sample_file_name


@pytest.fixture
def catalog3():
    """Provide a Catalog created with release tag "3.0"."""
    release_3_catalog = Catalog(release_tag="3.0")
    return release_3_catalog
Expand All @@ -26,6 +33,11 @@ def catalog_empty():
return Catalog()


@pytest.fixture
def max_download():
    """Provide the number of files that download tests are limited to."""
    file_count_limit = 2
    return file_count_limit


@pytest.fixture
def catalog_df():
df = pd.DataFrame(
Expand Down Expand Up @@ -189,3 +201,23 @@ def test_mid_time(self, catalog2):
).total_seconds()
< 1 # noqa: W503
)

def test_download_files(self, catalog2, max_download, filename):  # noqa: F811
    """
    Check `Catalog.download_files`, with and without a user-provided downloader.

    First, `max_download` files are downloaded without keeping the directory
    tree, and the path of the first downloaded file is checked. Then one file
    is enqueued in a user-provided downloader (without being downloaded).
    """
    base_dir = Path("./local/test_download_file")
    # Start from a clean directory, so that previous runs do not interfere
    if base_dir.exists():
        shutil.rmtree(base_dir)
    # Use the fixture value for the limit, as the assertion below compares to it
    result = catalog2.download_files(
        base_dir, max_download=max_download, keep_tree=False
    )
    assert len(result) == max_download

    if len(result) > 0:
        # With keep_tree=False, the file is expected directly under base_dir
        first_file_path = result[0][0]
        expected_first_file_path = (
            base_dir / first_file_path.split("/")[-1]
        ).as_posix()
        assert first_file_path == expected_first_file_path

    # With a user-provided downloader, the file is only enqueued
    downloader = Downloader(overwrite=False)
    catalog2.download_files(base_dir, max_download=1, downloader=downloader)

    assert downloader.queued_downloads == 1