Use centralized downloading logic for RCSB

chaidiscovery · Feb 24, 2025 · 13129d2
1 parent f1f012d
commit 13129d2
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 14 deletions.
diff --git a/chai_lab/data/io/rcsb.py b/chai_lab/data/io/rcsb.py
@@ -1,23 +1,15 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
-import logging
-import urllib.request
 from pathlib import Path
 
+from chai_lab.utils.paths import download_if_not_exists
+
 
 def download_cif_file(pdb_id: str, directory: Path) -> Path:
     """Download the cif file for the given PDB ID from RCSB into the directory."""
     outfile = directory / f"{pdb_id}.cif.gz"
-    if outfile.is_file() and outfile.stat().st_size > 0:
-        logging.warning(
-            f"Destination for {pdb_id=} already exists: {outfile}; will not overwrite"
-        )
-        return outfile
     source_url = f"https://files.rcsb.org/download/{pdb_id}.cif.gz"
-    logging.info(f"Fetching {source_url} -> {outfile}")
-    retrieved, _ = urllib.request.urlretrieve(url=source_url, filename=outfile)
-    retrieved_path = Path(retrieved)
-    assert retrieved_path == outfile
-    assert retrieved_path.exists() and retrieved_path.stat().st_size > 0
-    return retrieved_path
+    download_if_not_exists(source_url, outfile)
+    assert outfile.exists() and outfile.stat().st_size > 0
+    return outfile
diff --git a/chai_lab/utils/paths.py b/chai_lab/utils/paths.py
@@ -3,6 +3,7 @@
 # See the LICENSE file for details.
 
 import dataclasses
+import logging
 import os
 import random
 from pathlib import Path
@@ -32,7 +33,7 @@ def download_if_not_exists(http_url: str, path: Path):
     with FileLock(path.with_suffix(".download_lock")):
         if path.exists():
             return  # if-lock-if sandwich to download only once
-        print(f"downloading {http_url}")
+        logging.info(f"downloading {http_url}")
         tmp_path = path.with_suffix(f".download_tmp_{random.randint(10 ** 5, 10**6)}")
         with requests.get(http_url, stream=True) as response:
             response.raise_for_status()  # Check if the request was successful