Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download all files from a Catalog at once #48 #57

Merged
merged 11 commits into from
Jun 13, 2024
23 changes: 13 additions & 10 deletions docs/source/catalog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,22 +82,25 @@ The file can then be put at any location on disk (optionally keeping the origina

.. code-block:: python

metadata.download_file() # no argument → download from SOAR
metadata.download_file()
# no argument → download from SOAR, to current directory

These downloads are internally managed using the ``parfive`` package, which provides the option to enqueue different files for download, and then to run the downloads in parallel. This example shows how to download the files corresponding to the first 10 rows of ``results`` from release 2.0 (from which the catalog has been extracted):
In the same way, it is possible to download files from a ``Catalog`` object (usually a selection of rows of a data release, not a full data release!). There is a limit on the number of files to be downloaded, ``max_download``, which can be overridden if necessary.

.. code-block:: python

catalog.download_files()

These downloads are internally managed using the ``parfive`` package, which provides the option to enqueue files before launching the (parallel) download of these files.

This example shows how to download the files corresponding to two different ``Catalog`` objects:

.. code-block:: python

from parfive import Downloader
downloader = Downloader()
result.iloc[:10].apply(
lambda row: FileMetadata(row).download_file(
"/tmp/spice-files", # base directory
release="2.0",
downloader=downloader
),
axis=1
)
cat1.download_files(downloader=downloader)
cat2.download_files(downloader=downloader)
downloader.download()

In any case, files are not re-downloaded if they already exist (please remove them before re-downloading them if an update is really necessary); released files should never be modified anyway (although there will probably be newer versions in the following releases).
Expand Down
62 changes: 61 additions & 1 deletion sospice/catalog/catalog.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from dataclasses import dataclass
from pathlib import Path
from itertools import cycle

import matplotlib.colors as mcolors
import pandas as pd
import numpy as np
import warnings

from parfive import Downloader
from astropy.utils.data import download_file

from .release import Release
Expand Down Expand Up @@ -451,3 +452,62 @@ def plot_fov(self, ax, **kwargs):
fontsize="xx-large",
borderaxespad=2,
)

def download_files(
    self,
    base_dir=".",
    base_url=None,
    release=None,
    keep_tree=True,
    downloader=None,
    max_download=None,
):
    """
    Download all files from Catalog.

    Parameters
    ----------
    base_dir: Path or str
        Base directory to download files to
    base_url: str
        Base URL for files
    release: Release or str
        Release to download files from
    keep_tree: bool
        Keep tree directory structure (by level and date)
    downloader: parfive.Downloader
        If provided, enqueue files for download instead of downloading them.
        To download enqueued files, run `downloader.download()`
    max_download: int
        Maximum number of files to be downloaded (default: 1000).
        Only the first `max_download` rows of the catalog are considered.
        A warning is emitted if the value is above the default.

    Returns
    -------
    parfive.Results
        Download result (or None if files have only been enqueued)

    Raises
    ------
    ValueError
        If `max_download` is negative.
    """
    default_max_download = 1000
    if max_download is None:
        max_download = default_max_download
    elif max_download < 0:
        # A negative bound would make the slice below drop rows from the
        # end of the catalog instead of limiting the number of downloads.
        raise ValueError(f"max_download must be non-negative, got {max_download}")
    elif max_download > default_max_download:
        # stacklevel=2 attributes the warning to the caller's line
        warnings.warn(
            "You are overriding the default max_download: This might cause performance issues.",
            stacklevel=2,
        )
    # Run the download here only when the caller did not supply a downloader;
    # otherwise files are just enqueued on the caller's downloader.
    do_download = downloader is None
    if do_download:
        downloader = Downloader(overwrite=False)
    # apply() is used for its side effect: each row enqueues one file
    self.iloc[:max_download].apply(
        lambda row: FileMetadata(row).download_file(
            base_dir=base_dir,
            base_url=base_url,
            release=release,
            keep_tree=keep_tree,
            downloader=downloader,
        ),
        axis=1,
    )
    if do_download:
        return downloader.download()
    return None
4 changes: 2 additions & 2 deletions sospice/catalog/file_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def cache_file(self, base_url=None, release=None, update=False):
return Path(filename)

def download_file(
self, base_dir, base_url=None, release=None, keep_tree=True, downloader=None
self, base_dir=".", base_url=None, release=None, keep_tree=True, downloader=None
):
"""
Download file, from a release, from some other online file tree,
Expand Down Expand Up @@ -215,7 +215,7 @@ def download_file(
if do_download:
result = downloader.download()
return result
return None
return

def get_wavelengths(self):
"""
Expand Down
46 changes: 45 additions & 1 deletion sospice/catalog/tests/test_catalog.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
import pytest
from datetime import datetime
import pandas as pd

from pathlib import Path
import shutil
from parfive import Downloader

from ..catalog import Catalog
from .test_release import release2 # noqa: F401


@pytest.fixture
def catalog2():
    """Catalog built with release tag "2.0"."""
    release_2_catalog = Catalog(release_tag="2.0")
    return release_2_catalog


@pytest.fixture
def filename():
    """Sample FITS file name made available to tests."""
    fits_name = "solo_L2_spice-n-exp_20220305T072522_V01_100663707-014.fits"
    return fits_name


@pytest.fixture
def catalog3():
    """Catalog built with release tag "3.0"."""
    release_3_catalog = Catalog(release_tag="3.0")
    return release_3_catalog
Expand All @@ -26,6 +34,11 @@ def catalog_empty():
return Catalog()


@pytest.fixture
def max_download():
    """Small file-count limit passed as `max_download` in download tests."""
    limit = 2
    return limit


@pytest.fixture
def catalog_df():
df = pd.DataFrame(
Expand Down Expand Up @@ -189,3 +202,34 @@ def test_mid_time(self, catalog2):
).total_seconds()
< 1 # noqa: W503
)

def test_download_files(
    self, release2, catalog2, max_download, filename  # noqa: F811
):
    """
    Check that `Catalog.download_files` downloads, or enqueues on a
    caller-supplied downloader, the expected number of files.
    """
    # NOTE(review): the `filename` fixture is requested but not used here.
    base_dir = Path("./local/test_download_file")
    # Start from an empty target directory
    if base_dir.exists():
        shutil.rmtree(base_dir)

    # No downloader supplied: files are downloaded immediately
    result = catalog2.download_files(
        base_dir, release=release2, max_download=max_download, keep_tree=False
    )
    assert len(result) == max_download
    # With keep_tree=False every file must sit directly in base_dir.
    # Comparing Path objects avoids depending on the "/" separator.
    for downloaded in result:
        assert Path(downloaded).parent == base_dir

    # Downloader supplied: files are only enqueued, up to max_download
    downloader = Downloader(overwrite=False)
    catalog2.download_files(
        base_dir, release=release2, max_download=1, downloader=downloader
    )
    assert downloader.queued_downloads == 1

    # A limit above the default is honoured
    downloader = Downloader(overwrite=False)
    catalog2.download_files(
        base_dir, release=release2, max_download=2000, downloader=downloader
    )
    assert downloader.queued_downloads == 2000

    # Default limit: more than a handful of files get enqueued
    downloader = Downloader(overwrite=False)
    catalog2.download_files(base_dir, release=release2, downloader=downloader)
    assert downloader.queued_downloads > 10