diff --git a/chai_lab/data/io/rcsb.py b/chai_lab/data/io/rcsb.py
index ca3a8ac..3db6166 100644
--- a/chai_lab/data/io/rcsb.py
+++ b/chai_lab/data/io/rcsb.py
@@ -1,23 +1,23 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
-import logging
-import urllib.request
 from pathlib import Path
 
+from chai_lab.utils.paths import download_if_not_exists
+
 
 def download_cif_file(pdb_id: str, directory: Path) -> Path:
-    """Download the cif file for the given PDB ID from RCSB into the directory."""
+    """Download the cif file for the given PDB ID from RCSB into the directory.
+
+    A non-empty file already present at the destination is reused as-is; an
+    empty one is deleted and fetched again. Returns the destination path.
+    """
     outfile = directory / f"{pdb_id}.cif.gz"
-    if outfile.is_file() and outfile.stat().st_size > 0:
-        logging.warning(
-            f"Destination for {pdb_id=} already exists: {outfile}; will not overwrite"
-        )
-        return outfile
+    if outfile.is_file() and outfile.stat().st_size == 0:
+        # download_if_not_exists returns early for any existing path, so an empty
+        # leftover file would never be replaced; remove it to force a re-download.
+        outfile.unlink(missing_ok=True)
     source_url = f"https://files.rcsb.org/download/{pdb_id}.cif.gz"
-    logging.info(f"Fetching {source_url} -> {outfile}")
-    retrieved, _ = urllib.request.urlretrieve(url=source_url, filename=outfile)
-    retrieved_path = Path(retrieved)
-    assert retrieved_path == outfile
-    assert retrieved_path.exists() and retrieved_path.stat().st_size > 0
-    return retrieved_path
+    download_if_not_exists(source_url, outfile)
+    assert outfile.exists() and outfile.stat().st_size > 0
+    return outfile
diff --git a/chai_lab/utils/paths.py b/chai_lab/utils/paths.py
index 89f4438..00e972d 100644
--- a/chai_lab/utils/paths.py
+++ b/chai_lab/utils/paths.py
@@ -3,6 +3,7 @@
 # See the LICENSE file for details.
 
 import dataclasses
+import logging
 import os
 import random
 from pathlib import Path
@@ -32,7 +33,7 @@ def download_if_not_exists(http_url: str, path: Path):
     with FileLock(path.with_suffix(".download_lock")):
         if path.exists():
             return  # if-lock-if sandwich to download only once
-        print(f"downloading {http_url}")
+        logging.info(f"downloading {http_url}")
         tmp_path = path.with_suffix(f".download_tmp_{random.randint(10 ** 5, 10**6)}")
         with requests.get(http_url, stream=True) as response:
             response.raise_for_status()  # Check if the request was successful