
Commit

Keep daily.csv and biocontainers-quay-historic.csv under sources, check if dir is writable
vladsavelyev committed Feb 22, 2024
1 parent 0ecdeec commit 5ef7c88
Showing 7 changed files with 54 additions and 48 deletions.
22 changes: 22 additions & 0 deletions app/__init__.py
@@ -23,3 +23,25 @@
],
)
logging.debug(f"Logging to {log_path}")


def strtobool(val) -> bool:
"""
Replaces deprecated https://docs.python.org/3.9/distutils/apiref.html#distutils.util.strtobool
The deprecation recommendation is to re-implement the function https://peps.python.org/pep-0632/
------------------------------------------------------------
Convert a string representation of truth to true (1) or false (0).
True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if
'val' is anything else.
"""
val_str = str(val).lower()
if val_str in ("y", "yes", "t", "true", "on", "1"):
return True
elif val_str in ("n", "no", "f", "false", "off", "0"):
return False
else:
raise ValueError(f"invalid truth value {val!r}")
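For illustration, a small usage sketch of this helper; the STRICT_MODE environment variable below is hypothetical:

```python
import os

from app import strtobool

# Accepts the usual spellings case-insensitively; non-string inputs are
# stringified first, so 0/1 and real booleans work too.
assert strtobool("Yes") is True
assert strtobool("off") is False
assert strtobool(1) is True

# Hypothetical env var, defaulting to False when unset.
strict_mode = strtobool(os.getenv("STRICT_MODE", "false"))
```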
2 changes: 1 addition & 1 deletion app/downloads/.gitignore
@@ -1,2 +1,2 @@
*.ipynb
cache/MultiQC
sources/MultiQC
43 changes: 24 additions & 19 deletions app/downloads/daily.py
@@ -33,7 +33,10 @@

logger = logging.getLogger(__name__)

PYPI_HISTORIC_PATH = Path(__file__).parent / "sources" / "pypi-historic.csv"
SOURCES_DIR = Path(__file__).parent / "sources"
# Whether we can write back daily.csv and other pulled stats to keep under version control.
# Usually the code dir is not writable in the container environment.
SOURCES_IS_WRITABLE = os.access(SOURCES_DIR, os.W_OK)


@click.command()
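As a quick aside, os.access(..., os.W_OK) asks whether the current process may write to the directory; a throwaway sketch of its behaviour (note it typically reports True regardless of mode bits when running as root):

```python
import os
import stat
import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
print(os.access(tmp, os.W_OK))          # True: we just created it
tmp.chmod(stat.S_IREAD | stat.S_IEXEC)  # drop the write bit
print(os.access(tmp, os.W_OK))          # False for a regular user
```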
@@ -45,20 +45,18 @@ def main(days: int | None):
collect_daily_download_stats(days=days)


def collect_daily_download_stats(cache_dir: Path | None = None, days: int | None = None) -> pd.DataFrame:
cache_dir = cache_dir or Path(__file__).parent / "cache"

df = _collect_daily_download_stats(cache_dir, days=days)
def collect_daily_download_stats(days: int | None = None) -> pd.DataFrame:
df = _collect_daily_download_stats(days=days)

# Update the existing CSV.
# Only add data where it's not already present.
# For existing data, check that it's the same as the new data.
keys = [k for k in df.keys() if k != "date"]
cache_path = cache_dir / "daily.csv"
if cache_path.exists():
logger.info(f"Loading existing daily downloads stats from cache location {cache_path}")
csv_path = SOURCES_DIR / "daily.csv"
if csv_path.exists():
logger.info(f"Loading existing daily downloads stats from cache location {csv_path}")
existing_df = pd.read_csv(
cache_path,
csv_path,
dtype={k: "date" if k == "date" else "Int64" for k in keys}, # Int64 is a nullable integer version of int64
).set_index("date")
# Fixing <NA> as nan and converting values to int
@@ -68,12 +69,13 @@ def collect_daily_download_stats(cache_dir: Path | None = None, days: int | None
full_df = existing_df.combine_first(df)
else:
full_df = df
logger.info(f"Saving daily downloads stats to {cache_path}")
full_df.to_csv(cache_path, index=True)
if SOURCES_IS_WRITABLE:
logger.info(f"Saving daily downloads stats to {csv_path}")
full_df.to_csv(csv_path, index=True)
return df
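For context, a toy example (made-up column name and numbers) of what combine_first does in the merge above: values already present in the existing CSV are kept, and only new dates are filled in from the fresh pull:

```python
import pandas as pd

existing_df = pd.DataFrame(
    {"pypi_downloads": [1200, 1100]},
    index=pd.Index(["2024-02-20", "2024-02-21"], name="date"),
)
new_df = pd.DataFrame(
    {"pypi_downloads": [9999, 1300]},
    index=pd.Index(["2024-02-21", "2024-02-22"], name="date"),
)
full_df = existing_df.combine_first(new_df)
# 2024-02-20 -> 1200  (existing only)
# 2024-02-21 -> 1100  (existing value kept; 9999 from the new pull is ignored)
# 2024-02-22 -> 1300  (new date appended)
```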


def _collect_daily_download_stats(cache_dir: Path, days: int | None = None) -> pd.DataFrame:
def _collect_daily_download_stats(days: int | None = None) -> pd.DataFrame:
logger.info("Collecting PyPI stats...")
df = get_pypi(days=days)

@@ -82,15 +84,15 @@ def _collect_daily_download_stats(cache_dir: Path, days: int | None = None) -> p
df = df.merge(df_bioconda, on="date", how="outer").sort_values("date")

logger.info("Collecting BioContainers (Quay mirror) stats...")
df_quay = get_biocontainers_quay(cache_dir, days=days)
df_quay = get_biocontainers_quay(days=days)
df = df.merge(df_quay, on="date", how="outer").sort_values("date")

logger.info("Collecting GitHub PRs...")
df_prs = get_github_prs(days=days)
df = df.merge(df_prs, on="date", how="outer").sort_values("date")

logger.info("Collecting GitHub modules...")
df_modules = github_modules(cache_dir, days=days)
df_modules = github_modules(days=days)
df = df.merge(df_modules, on="date", how="outer").sort_values("date")

today = pd.to_datetime("today").strftime("%Y-%m-%d")
@@ -168,6 +170,7 @@ def get_pypi_historic():
''', project_id=os.environ["GCP_PROJECT"])
```
"""
PYPI_HISTORIC_PATH = SOURCES_DIR / "pypi-historic.csv"
logger.info(f"Loading historic PyPI stats from {PYPI_HISTORIC_PATH}")
df = pd.read_csv(
PYPI_HISTORIC_PATH,
@@ -238,7 +241,7 @@ def biocontainers_aws_total():
return count


def get_biocontainers_quay(cache_dir: Path, days: int | None = None):
def get_biocontainers_quay(days: int | None = None):
"""
For the last 3 months, total numbers of BioContainers Quay.io mirror downloads.
"""
@@ -254,7 +257,7 @@ def get_biocontainers_quay(cache_dir: Path, days: int | None = None):
df.sort_values("date", inplace=True)
df = df.set_index("date")
if days is None or days > 90:
path = cache_dir / "biocontainers-quay-historic.csv"
path = SOURCES_DIR / "biocontainers-quay-historic.csv"
if path.exists():
print(f"Previous Quay stats found at {path}, appending")
existing_df = pd.read_csv(path, index_col="date", dtype={"count": "Int64"})
@@ -264,7 +267,8 @@ def get_biocontainers_quay(cache_dir: Path, days: int | None = None):
df = df[~df.index.duplicated(keep="last")]
# sort by date
df.sort_index(inplace=True)
df.to_csv(path)
if SOURCES_IS_WRITABLE:
df.to_csv(path)
df = pd.read_csv(path)
df = df.set_index("date")
print(f"Saved {path}")
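A toy illustration of the de-duplication step above (counts are made up, loosely echoing the CSV in this commit): when a freshly pulled date overlaps with one already saved, keep="last" prefers the fresh row:

```python
import pandas as pd

df = pd.DataFrame(
    {"count": [455, 2011, 2054, 2060]},
    index=pd.Index(
        ["2024-02-18", "2024-02-19", "2024-02-20", "2024-02-20"], name="date"
    ),
)
df = df[~df.index.duplicated(keep="last")]  # drop the older 2024-02-20 row
df.sort_index(inplace=True)
# 2024-02-18 -> 455, 2024-02-19 -> 2011, 2024-02-20 -> 2060
```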
@@ -359,7 +363,7 @@ def get_github_prs(days: int | None = None):
return df.set_index("date")


def github_modules(cache_dir: Path, days: int | None = None):
def github_modules(days: int | None = None):
"""
Daily and total new MultiQC modules.
"""
@@ -369,7 +373,8 @@ def github_modules(cache_dir: Path, days: int | None = None):
from git import Repo

repo_url = "https://github.com/MultiQC/MultiQC.git"
clone_path = cache_dir / "MultiQC"
tmp_dir = Path(os.getenv("TMPDIR", "/tmp"))
clone_path = tmp_dir / "MultiQC"
if not clone_path.exists():
Repo.clone_from(repo_url, clone_path)
logger.debug(f"{repo_url} cloned at {clone_path}")
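As a side note, a rough sketch of the clone-or-reuse pattern with GitPython; the pull-to-refresh branch is an assumption, since the rest of the function is collapsed in this view:

```python
import os
from pathlib import Path

from git import Repo  # GitPython

repo_url = "https://github.com/MultiQC/MultiQC.git"
clone_path = Path(os.getenv("TMPDIR", "/tmp")) / "MultiQC"

if not clone_path.exists():
    repo = Repo.clone_from(repo_url, clone_path)  # fresh clone into tmp
else:
    repo = Repo(clone_path)
    repo.remotes.origin.pull()  # assumed: refresh an existing checkout
```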
@@ -180,3 +180,6 @@ date,count
2024-02-16,3556
2024-02-17,623
2024-02-18,455
2024-02-19,2011
2024-02-20,2054
2024-02-21,1958
File renamed without changes.
31 changes: 4 additions & 27 deletions app/main.py
@@ -25,7 +25,7 @@
from plotly.graph_objs import Layout
from sqlalchemy.exc import ProgrammingError

from app import __version__, db, models
from app import __version__, db, models, strtobool
from app.downloads import daily

logger = logging.getLogger(__name__)
@@ -266,17 +266,16 @@ def _update_download_stats():
except ProgrammingError:
logger.error("The table does not exist, will create and populate with historical data")
existing_downloads = []
cache_dir = Path(os.getenv("MULTIQC_API_CACHE_DIR", os.getenv("TMPDIR", "/tmp")))
if len(existing_downloads) == 0: # first time, populate historical data
logger.info("Collecting historical downloads data...")
df = daily.collect_daily_download_stats(cache_dir=cache_dir)
df = daily.collect_daily_download_stats()
logger.info(f"Adding {len(df)} historical entries to the table...")
db.insert_download_stats(df)
logger.info(f"Successfully populated {len(df)} historical entries")
else: # recent days only
n_days = 4
logger.info(f"Updating data for the last {n_days} days...")
df = daily.collect_daily_download_stats(cache_dir=cache_dir, days=n_days)
logger.info(f"Updating downloads data for the last {n_days} days...")
df = daily.collect_daily_download_stats(days=n_days)
logger.info(f"Adding {len(df)} recent entries to the table. Will update existing entries at the same date")
db.insert_download_stats(df)
logger.info(f"Successfully updated {len(df)} new daily download statistics")
@@ -467,27 +466,5 @@ def plotly_image_response(plot, format: PlotlyImageFormats = PlotlyImageFormats.
return Response(content=plot)


def strtobool(val) -> bool:
"""
Replaces deprecated https://docs.python.org/3.9/distutils/apiref.html#distutils.util.strtobool
The deprecation recommendation is to re-implement the function https://peps.python.org/pep-0632/
------------------------------------------------------------
Convert a string representation of truth to true (1) or false (0).
True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if
'val' is anything else.
"""
val_str = str(val).lower()
if val_str in ("y", "yes", "t", "true", "on", "1"):
return True
elif val_str in ("n", "no", "f", "false", "off", "0"):
return False
else:
raise ValueError(f"invalid truth value {val!r}")


if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
1 change: 0 additions & 1 deletion docker-compose.yml
@@ -12,7 +12,6 @@ services:
environment:
# Set in .env
GITHUB_TOKEN: $GITHUB_TOKEN
MULTIQC_API_CACHE_DIR: /code/app/downloads/cache
# Matches the "db" service below
DATABASE_URL: mysql+pymysql://root:1@db:3306/multiqc
depends_on:
