forked from VesperChen01/Discovery_of_LCK_Inhibitors
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_chembl_pdb_download.py
110 lines (90 loc) · 4.01 KB
/
1_chembl_pdb_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# pip install pandas chembl_webresource_client matplotlib rdkit tqdm numpy
import os
import requests
import numpy as np
from tqdm.auto import tqdm
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import SaltRemover
import pandas as pd
import matplotlib.pyplot as plt
cwd = os.getcwd()
ids_df = pd.read_csv('ids.csv')
def get_active_compounds_from_chembl(uniprot_id):
if pd.isnull(uniprot_id):
print(f"Uniprot ID is nan, skipping.")
return None
target = new_client.target
target_query = target.search(uniprot_id)
if target_query:
target_id = target_query[0]['target_chembl_id']
else:
print(f"No target found for Uniprot ID: {uniprot_id}")
return None
activity = new_client.activity
activity_query = activity.filter(target_chembl_id=target_id).filter(standard_type__in=["IC50"])
compounds = []
for act in tqdm(activity_query):
compound = {
'chembl_id': act['molecule_chembl_id'],
'smiles': act['canonical_smiles'],
}
compound[act['standard_type'].lower()] = act['standard_value']
compound[act['standard_type'].lower() + '_unit'] = act['standard_units']
compounds.append(compound)
compounds_df = pd.DataFrame(compounds)
compounds_df = compounds_df.fillna(0)
return compounds_df
def clean_smiles(smiles_list):
clean_smiles = []
remover = SaltRemover.SaltRemover()
for smiles in tqdm(smiles_list):
mol = Chem.MolFromSmiles(smiles)
if mol is not None:
mol = remover.StripMol(mol)
clean_smiles.append(Chem.MolToSmiles(mol))
return clean_smiles
def download_pdb_file(pdb_id):
url = f'https://files.rcsb.org/download/{pdb_id}.pdb'
response = requests.get(url)
file_path = f'{pdb_id}.pdb'
with open(file_path, 'w') as f:
f.write(response.text)
print(f"The PDB file has been saved to: {os.path.join(os.getcwd(), file_path)}.")
for index, row in ids_df.iterrows():
uniprot_id = row['uniprot_id']
pdb_id = row['pdb_id']
name = row['name']
print(f"Processing:{uniprot_id} {pdb_id}")
if pd.isnull(pdb_id):
print(f"No PDB ID available, using only Uniprot ID: {uniprot_id} to obtain inhibitors.")
active_compounds = get_active_compounds_from_chembl(uniprot_id)
else:
download_pdb_file(pdb_id)
active_compounds = get_active_compounds_from_chembl(uniprot_id)
if active_compounds is not None:
# wash data
smiles_list = active_compounds['smiles'].tolist()
clean_smiles_list = clean_smiles(smiles_list)
clean_compounds = active_compounds[active_compounds['smiles'].isin(clean_smiles_list)][['chembl_id', 'smiles', 'ic50', 'ic50_unit']]
clean_compounds = clean_compounds.drop_duplicates(subset=['smiles'])
clean_compounds['ic50'] = pd.to_numeric(clean_compounds['ic50'], errors='coerce')
clean_compounds = clean_compounds.dropna(subset=['ic50'])
clean_compounds = clean_compounds.sort_values(by='ic50')
# calculate pIC50
clean_compounds['ic50_M'] = clean_compounds['ic50'].astype(float) / 1_000_000_000
clean_compounds = clean_compounds[clean_compounds['ic50_M'] != 0]
clean_compounds['pIC50'] = -np.log10(clean_compounds['ic50_M'])
clean_compounds = clean_compounds.drop(columns=['ic50_M'])
# Save the DataFrame to a CSV file.
csv_file_path = os.path.join(cwd, f'{name}.csv')
clean_compounds.to_csv(csv_file_path, index=False)
print(f"CSV file has been saved to:{csv_file_path}")
# create pIC50 distribution chart
plt.figure(figsize=(10, 5))
plt.hist(clean_compounds['pIC50'], bins=50, color='blue', alpha=0.7)
plt.xlabel('pIC50')
plt.ylabel('number of molecules')
plt.title(f'The distribution of pIC50 values for {name}')
plt.savefig(f"{name}_pIC50_distribution.png")
print(f"The pIC50 distribution chart has been saved to:{name}_pIC50_distribution.png")