diff --git a/app/__init__.py b/app/__init__.py index 7bf6a7e..da06d20 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -23,3 +23,25 @@ ], ) logging.debug(f"Logging to {log_path}") + + +def strtobool(val) -> bool: + """ + Replaces deprecated https://docs.python.org/3.9/distutils/apiref.html#distutils.util.strtobool + The deprecation recommendation is to re-implement the function https://peps.python.org/pep-0632/ + + ------------------------------------------------------------ + + Convert a string representation of truth to true (1) or false (0). + + True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values + are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if + 'val' is anything else. + """ + val_str = str(val).lower() + if val_str in ("y", "yes", "t", "true", "on", "1"): + return True + elif val_str in ("n", "no", "f", "false", "off", "0"): + return False + else: + raise ValueError(f"invalid truth value {val!r}") diff --git a/app/downloads/.gitignore b/app/downloads/.gitignore index a1b5658..2c09a62 100644 --- a/app/downloads/.gitignore +++ b/app/downloads/.gitignore @@ -1,2 +1,2 @@ *.ipynb -cache/MultiQC +sources/MultiQC diff --git a/app/downloads/daily.py b/app/downloads/daily.py index 9e3f3a3..3ebe840 100644 --- a/app/downloads/daily.py +++ b/app/downloads/daily.py @@ -33,7 +33,10 @@ logger = logging.getLogger(__name__) -PYPI_HISTORIC_PATH = Path(__file__).parent / "sources" / "pypi-historic.csv" +SOURCES_DIR = Path(__file__).parent / "sources" +# Whether we can write back daily.csv and other pulled stats to keep under version control. +# Usually the code dir is not writable in the container environment. +SOURCES_IS_WRITABLE = os.access(SOURCES_DIR, os.W_OK) @click.command() @@ -45,20 +48,18 @@ def main(days: int | None): collect_daily_download_stats(days=days) -def collect_daily_download_stats(cache_dir: Path | None = None, days: int | None = None) -> pd.DataFrame: - cache_dir = cache_dir or Path(__file__).parent / "cache" - - df = _collect_daily_download_stats(cache_dir, days=days) +def collect_daily_download_stats(days: int | None = None) -> pd.DataFrame: + df = _collect_daily_download_stats(days=days) # Update the existing CSV. # Only add data where it's not already present. # For existing data, check that it's the same as the new data. keys = [k for k in df.keys() if k != "date"] - cache_path = cache_dir / "daily.csv" - if cache_path.exists(): - logger.info(f"Loading existing daily downloads stats from cache location {cache_path}") + csv_path = SOURCES_DIR / "daily.csv" + if csv_path.exists(): + logger.info(f"Loading existing daily downloads stats from cache location {csv_path}") existing_df = pd.read_csv( - cache_path, + csv_path, dtype={k: "date" if k == "date" else "Int64" for k in keys}, # Int64 is a nullable integer version of int64 ).set_index("date") # Fixing as nan and converting values to int @@ -68,12 +69,13 @@ def collect_daily_download_stats(cache_dir: Path | None = None, days: int | None full_df = existing_df.combine_first(df) else: full_df = df - logger.info(f"Saving daily downloads stats to {cache_path}") - full_df.to_csv(cache_path, index=True) + if SOURCES_IS_WRITABLE: + logger.info(f"Saving daily downloads stats to {csv_path}") + full_df.to_csv(csv_path, index=True) return df -def _collect_daily_download_stats(cache_dir: Path, days: int | None = None) -> pd.DataFrame: +def _collect_daily_download_stats(days: int | None = None) -> pd.DataFrame: logger.info("Collecting PyPI stats...") df = get_pypi(days=days) @@ -82,7 +84,7 @@ def _collect_daily_download_stats(cache_dir: Path, days: int | None = None) -> p df = df.merge(df_bioconda, on="date", how="outer").sort_values("date") logger.info("Collecting BioContainers (Quay mirror) stats...") - df_quay = get_biocontainers_quay(cache_dir, days=days) + df_quay = get_biocontainers_quay(days=days) df = df.merge(df_quay, on="date", how="outer").sort_values("date") logger.info("Collecting GitHub PRs...") @@ -90,7 +92,7 @@ def _collect_daily_download_stats(cache_dir: Path, days: int | None = None) -> p df = df.merge(df_prs, on="date", how="outer").sort_values("date") logger.info("Collecting GitHub modules...") - df_modules = github_modules(cache_dir, days=days) + df_modules = github_modules(days=days) df = df.merge(df_modules, on="date", how="outer").sort_values("date") today = pd.to_datetime("today").strftime("%Y-%m-%d") @@ -168,6 +170,7 @@ def get_pypi_historic(): ''', project_id=os.environ["GCP_PROJECT"]) ``` """ + PYPI_HISTORIC_PATH = SOURCES_DIR / "pypi-historic.csv" logger.info(f"Loading historic PyPI stats from {PYPI_HISTORIC_PATH}") df = pd.read_csv( PYPI_HISTORIC_PATH, @@ -238,7 +241,7 @@ def biocontainers_aws_total(): return count -def get_biocontainers_quay(cache_dir: Path, days: int | None = None): +def get_biocontainers_quay(days: int | None = None): """ For the last 3 months, total numbers of BioContainers Quay.io mirror downloads. """ @@ -254,7 +257,7 @@ def get_biocontainers_quay(cache_dir: Path, days: int | None = None): df.sort_values("date", inplace=True) df = df.set_index("date") if days is None or days > 90: - path = cache_dir / "biocontainers-quay-historic.csv" + path = SOURCES_DIR / "biocontainers-quay-historic.csv" if path.exists(): print(f"Previous Quay stats found at {path}, appending") existing_df = pd.read_csv(path, index_col="date", dtype={"count": "Int64"}) @@ -264,7 +267,8 @@ def get_biocontainers_quay(cache_dir: Path, days: int | None = None): df = df[~df.index.duplicated(keep="last")] # sort by date df.sort_index(inplace=True) - df.to_csv(path) + if SOURCES_IS_WRITABLE: + df.to_csv(path) df = pd.read_csv(path) df = df.set_index("date") print(f"Saved {path}") @@ -359,7 +363,7 @@ def get_github_prs(days: int | None = None): return df.set_index("date") -def github_modules(cache_dir: Path, days: int | None = None): +def github_modules(days: int | None = None): """ Daily and total new MultiQC modules. """ @@ -369,7 +373,8 @@ def github_modules(cache_dir: Path, days: int | None = None): from git import Repo repo_url = "https://github.com/MultiQC/MultiQC.git" - clone_path = cache_dir / "MultiQC" + tmp_dir = Path(os.getenv("TMPDIR", "/tmp")) + clone_path = tmp_dir / "MultiQC" if not clone_path.exists(): Repo.clone_from(repo_url, clone_path) logger.debug(f"{repo_url} cloned at {clone_path}") diff --git a/app/downloads/cache/biocontainers-quay-historic.csv b/app/downloads/sources/biocontainers-quay-historic.csv similarity index 98% rename from app/downloads/cache/biocontainers-quay-historic.csv rename to app/downloads/sources/biocontainers-quay-historic.csv index 37b39dd..8719fd9 100644 --- a/app/downloads/cache/biocontainers-quay-historic.csv +++ b/app/downloads/sources/biocontainers-quay-historic.csv @@ -180,3 +180,6 @@ date,count 2024-02-16,3556 2024-02-17,623 2024-02-18,455 +2024-02-19,2011 +2024-02-20,2054 +2024-02-21,1958 diff --git a/app/downloads/cache/daily.csv b/app/downloads/sources/daily.csv similarity index 100% rename from app/downloads/cache/daily.csv rename to app/downloads/sources/daily.csv diff --git a/app/main.py b/app/main.py index 0b048b9..29d1bbb 100644 --- a/app/main.py +++ b/app/main.py @@ -25,7 +25,7 @@ from plotly.graph_objs import Layout from sqlalchemy.exc import ProgrammingError -from app import __version__, db, models +from app import __version__, db, models, strtobool from app.downloads import daily logger = logging.getLogger(__name__) @@ -266,17 +266,16 @@ def _update_download_stats(): except ProgrammingError: logger.error("The table does not exist, will create and populate with historical data") existing_downloads = [] - cache_dir = Path(os.getenv("MULTIQC_API_CACHE_DIR", os.getenv("TMPDIR", "/tmp"))) if len(existing_downloads) == 0: # first time, populate historical data logger.info("Collecting historical downloads data...") - df = daily.collect_daily_download_stats(cache_dir=cache_dir) + df = daily.collect_daily_download_stats() logger.info(f"Adding {len(df)} historical entries to the table...") db.insert_download_stats(df) logger.info(f"Successfully populated {len(df)} historical entries") else: # recent days only n_days = 4 - logger.info(f"Updating data for the last {n_days} days...") - df = daily.collect_daily_download_stats(cache_dir=cache_dir, days=n_days) + logger.info(f"Updating downloads data for the last {n_days} days...") + df = daily.collect_daily_download_stats(days=n_days) logger.info(f"Adding {len(df)} recent entries to the table. Will update existing entries at the same date") db.insert_download_stats(df) logger.info(f"Successfully updated {len(df)} new daily download statistics") @@ -467,27 +466,5 @@ def plotly_image_response(plot, format: PlotlyImageFormats = PlotlyImageFormats. return Response(content=plot) -def strtobool(val) -> bool: - """ - Replaces deprecated https://docs.python.org/3.9/distutils/apiref.html#distutils.util.strtobool - The deprecation recommendation is to re-implement the function https://peps.python.org/pep-0632/ - - ------------------------------------------------------------ - - Convert a string representation of truth to true (1) or false (0). - - True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values - are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if - 'val' is anything else. - """ - val_str = str(val).lower() - if val_str in ("y", "yes", "t", "true", "on", "1"): - return True - elif val_str in ("n", "no", "f", "false", "off", "0"): - return False - else: - raise ValueError(f"invalid truth value {val!r}") - - if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/docker-compose.yml b/docker-compose.yml index 7b13d82..3ef857b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,6 @@ services: environment: # Set in .env GITHUB_TOKEN: $GITHUB_TOKEN - MULTIQC_API_CACHE_DIR: /code/app/downloads/cache # Matches the "db" service below DATABASE_URL: mysql+pymysql://root:1@db:3306/multiqc depends_on: