-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfetch_initial_release_date.py
48 lines (42 loc) · 1.51 KB
/
fetch_initial_release_date.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""
Python script for fetching the initial release date for all PDB entries (via RCSB PDB APIs).
The results are saved into a CSV file, `rcsb_release_dates.csv`.
This script requires the following packages, which can be installed with:
pip install python-dateutil
pip install requests
pip install rcsb-api
"""
import requests
import csv
from dateutil import parser
from rcsbapi.data import DataQuery as Query
# Step 1: Retrieve all PDB IDs from Data API
url = 'https://data.rcsb.org/rest/v1/holdings/current/entry_ids'
# A timeout keeps the script from blocking forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
# Decode the body as JSON instead of passing it to eval(): eval() would run
# whatever text came back over the network as Python code.
# The rest of the script slices `ids` and passes the slices as `input_ids`,
# so the payload is expected to be a JSON array of entry IDs.
ids = response.json()
# Step 2: Split the full list of IDs into consecutive batches
# Each batch holds at most `batchSize` IDs; the last one may be shorter.
batchSize = 5_000
idBatches = [
    ids[start:start + batchSize]
    for start in range(0, len(ids), batchSize)
]
# Step 3: Query release date
# One query is issued per batch of IDs; every returned entry contributes one
# row (a dict with "pdb_id" and "release_date") to `release_dates`.
release_dates = []
for id_batch in idBatches:
    batch_query = Query(
        input_type="entries",
        input_ids=id_batch,
        return_data_list=["rcsb_accession_info.initial_release_date"],
    )
    result = batch_query.exec()
    release_dates.extend(
        {
            "pdb_id": entry['rcsb_id'],
            # Re-format the returned date string as a plain YYYY-MM-DD date.
            "release_date": parser.parse(
                entry["rcsb_accession_info"]["initial_release_date"]
            ).strftime('%Y-%m-%d'),
        }
        for entry in result['data']['entries']
    )
# Step 4: Save the collected rows to rcsb_release_dates.csv
# The column names are fixed up front (they are the keys every row dict is
# built with) instead of being read from the last row, so an empty result
# produces a header-only CSV rather than an IndexError.
headers = ["pdb_id", "release_date"]
# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" to "\r\n" end up with blank rows between records.
with open("rcsb_release_dates.csv", "w", newline="", encoding="utf-8") as handle:
    writer = csv.DictWriter(handle, fieldnames=headers)
    writer.writeheader()
    writer.writerows(release_dates)
print("Wrote release dates to rcsb_release_dates.csv")