Skip to content

Commit

Permalink
Use centralized downloading logic for RCSB
Browse files: browse the repository at this point in the history
  • Loading branch information
wukevin committed Feb 24, 2025
1 parent f1f012d commit 13129d2
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 14 deletions.
18 changes: 5 additions & 13 deletions chai_lab/data/io/rcsb.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,15 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
import logging
import urllib.request
from pathlib import Path

from chai_lab.utils.paths import download_if_not_exists


def download_cif_file(pdb_id: str, directory: Path) -> Path:
"""Download the cif file for the given PDB ID from RCSB into the directory."""
outfile = directory / f"{pdb_id}.cif.gz"
if outfile.is_file() and outfile.stat().st_size > 0:
logging.warning(
f"Destination for {pdb_id=} already exists: {outfile}; will not overwrite"
)
return outfile
source_url = f"https://files.rcsb.org/download/{pdb_id}.cif.gz"
logging.info(f"Fetching {source_url} -> {outfile}")
retrieved, _ = urllib.request.urlretrieve(url=source_url, filename=outfile)
retrieved_path = Path(retrieved)
assert retrieved_path == outfile
assert retrieved_path.exists() and retrieved_path.stat().st_size > 0
return retrieved_path
download_if_not_exists(source_url, outfile)
assert outfile.exists() and outfile.stat().st_size > 0
return outfile
3 changes: 2 additions & 1 deletion chai_lab/utils/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# See the LICENSE file for details.

import dataclasses
import logging
import os
import random
from pathlib import Path
Expand Down Expand Up @@ -32,7 +33,7 @@ def download_if_not_exists(http_url: str, path: Path):
with FileLock(path.with_suffix(".download_lock")):
if path.exists():
return # if-lock-if sandwich to download only once
print(f"downloading {http_url}")
logging.info(f"downloading {http_url}")
tmp_path = path.with_suffix(f".download_tmp_{random.randint(10 ** 5, 10**6)}")
with requests.get(http_url, stream=True) as response:
response.raise_for_status() # Check if the request was successful
Expand Down

0 comments on commit 13129d2

Please sign in to comment.