"""
Create a dictionary of the ontology terms.

The terms are defined here, each with its possible set of values. In the
current workbook the term names are in row 2 of each sheet, with samples in
the following rows. Actual data is added on top of this dictionary by a
separate function.
"""
import datetime
import re

import pandas as pd
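
# Rough sketch of the shape of the main dictionary returned below; the field
# name and term values here are illustrative placeholders, not real entries:
#
#   new_merged_ontology_dict = {
#       "example_field": {
#           "field_id": "example_field",
#           "terms": [
#               "plain term without an identifier",
#               {"Example term [PREFIX:0000001]": {"term": "Example term",
#                                                  "term_id": "PREFIX:0000001"}},
#           ],
#       },
#   }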

def create_ontology_dict(xls):
    def reading_file(sheet):
        # Read one sheet of the workbook; the term names sit in row 2 (header=1).
        fields = pd.read_excel(
            xls, sheet_name=sheet, keep_default_na=False, header=1,
            na_values=[datetime.time(0, 0), "1900-01-00", "Missing", "missing",
                       "Not Applicable [GENEPIO:0001619]"])
        # Drop rows without a sample/isolate identifier, then return the
        # remaining column names as the list of terms for this sheet.
        if "AMR" not in sheet:
            fields_noNA = fields.dropna(axis=0, subset=["sample_collector_sample_ID"])
        else:
            fields_noNA = fields.dropna(axis=0, subset=["isolate_ID"])
        array_terms = list(fields_noNA.columns.values)
        return array_terms
    # Column (term) lists for each sheet of the template.
    sampleT_terms = reading_file("Sample Collection & Processing")
    isolateT_terms = reading_file("Strain and Isolate Information")
    # Accession terms moved into the isolate sheet.
    isolateT_terms.extend(["biosample_accession", "bioproject_accession"])
    hostT_terms = reading_file("Host Information")
    sequenceT_terms = reading_file("Sequence Information")
    repositoryT_terms = reading_file("Public Repository Information")
    # Previously removed from the repository terms:
    # repositoryT_terms.remove('biosample_accession')
    # repositoryT_terms.remove('bioproject_accession')
    riskT_terms = reading_file("Risk Assessment")
    amrTotal_terms = reading_file("AMR Phenotypic Test Information")
    environmental_conditions_terms = reading_file("Environmental conditions")
    bioinformatics_terms = reading_file("Bioinformatics and QC")
    taxonomic_information_terms = reading_file("Taxonomic Information")
    # Build the extraction sheet terms: the sample ID column, isolate_ID, and
    # every sample term related to nucleic acid extraction and storage.
    extractionT_terms = [sampleT_terms[0], "isolate_ID"]
    # Prefixes of sample terms that belong with the extraction information.
    patterns = ["experimental", "nucleic_acid", "sample_volume",
                "residual_sample", "sample_storage"]
    # Move the matching terms from `sampleT_terms` to `extractionT_terms`.
    extractionT_terms.extend(
        term for term in sampleT_terms
        if any(term.startswith(pattern) for pattern in patterns))
    sampleT_terms = [term for term in sampleT_terms
                     if not any(term.startswith(pattern) for pattern in patterns)]
    # The first 9 AMR columns hold general test information; the remaining
    # columns hold the per-antibiotic measurements.
    antiT_terms = amrTotal_terms[9:]
    amrT_terms = amrTotal_terms[:9]
    print("done elements")
    # The Vocabulary sheet lists the allowed values for each field, one column
    # per field; strip stray whitespace from every cell in the first 20 columns.
    vocab_sheet = pd.read_excel(
        xls, keep_default_na=False, sheet_name="Vocabulary", header=1,
        converters={column: (lambda x: x.strip() if isinstance(x, str) else x)
                    for column in range(20)})
    ontology_dict = vocab_sheet.to_dict(orient="list")
    ontology_dict = {key.strip(): value for key, value in ontology_dict.items()}
    # The Reference Guide sheet maps each field to its ontology identifier.
    fields_sheet = pd.read_excel(xls, keep_default_na=False,
                                 sheet_name="Reference Guide", header=4)
    # Keep fields that have a GENEPIO identifier or are per-antibiotic fields,
    # and whose field name is not empty.
    fields_sheet_filtered = fields_sheet[
        (fields_sheet["Ontology Identifier"].str.contains("GENEPIO")
         | fields_sheet.iloc[:, 1].str.startswith("antimicrobial"))
        & (fields_sheet["Field"] != "")]
    # Collect the names of the fields that passed the filter.
    dict_fields = {}
    for index, row in fields_sheet_filtered.iterrows():
        sample_key = row["Field"]
        dict_fields[sample_key] = {}
    # Merge the filtered fields with the vocabulary: for every field, attach its
    # list of allowed terms, splitting "term [TERM_ID]" strings into parts.
    new_merged_ontology_dict = {}
    for key in dict_fields:
        # A couple of field names differ between the Reference Guide and the
        # Vocabulary sheet; map them onto the vocabulary column name.
        if key == "antimicrobial_resistance_phenotype":
            keypr = "antimicrobial_phenotype"
        elif key == "food_product_origin geo_loc_name (country)":
            keypr = "food_product_origin geo_loc (country)"
        else:
            keypr = key
        if keypr in ontology_dict.keys():
            # Drop empty cells from the vocabulary column.
            str_list = list(filter(None, ontology_dict[keypr]))
            temp_list = []
            for i in str_list:
                newstr = i.strip()
                if re.match(r".+\[\w", newstr):
                    # Split "term [TERM_ID]" into its name and identifier.
                    substrL = re.match(r"(.+)\s+\[(\S+)\]", newstr)
                    temp_list.append({newstr: {"term": substrL.groups()[0],
                                               "term_id": substrL.groups()[1]}})
                else:
                    temp_list.append(newstr)
            new_merged_ontology_dict[key] = {"field_id": key, "terms": temp_list}
        else:
            new_merged_ontology_dict[key] = {"field_id": key}
    # Map every antimicrobial agent name (lower-cased) to its ontology term id;
    # entries in this column are expected to carry a "[TERM_ID]" suffix.
    antimicrobian_agent_names_ids = {}
    for elements in new_merged_ontology_dict["antimicrobial_agent_name"]["terms"]:
        for keys in elements:
            antibiotics = elements[keys]["term"].lower()
            if antibiotics == "amoxicillin-clavulanic":
                antibiotics = "amoxicillin-clavulanic_acid"
            antimicrobian_agent_names_ids[antibiotics] = elements[keys]["term_id"]
    # Extra antibiotics that aren't in the vocabulary:
    # antimicrobian_agent_names_ids['amikacin'] = 'CHEBI:2637'
    # antimicrobian_agent_names_ids['kanamycin'] = 'CHEBI:6104'
    return (new_merged_ontology_dict, antimicrobian_agent_names_ids, sampleT_terms,
            isolateT_terms, hostT_terms, sequenceT_terms, repositoryT_terms,
            riskT_terms, amrT_terms, antiT_terms, environmental_conditions_terms,
            bioinformatics_terms, taxonomic_information_terms, extractionT_terms)
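
# Minimal usage sketch. The workbook file name below is a placeholder, not the
# repository's actual template path; the file is assumed to contain the sheets
# read above (Sample Collection & Processing, Vocabulary, Reference Guide, ...).
if __name__ == "__main__":
    xls = pd.ExcelFile("harmonized_ontology_template.xlsx")  # hypothetical path
    results = create_ontology_dict(xls)
    ontology_dict, antimicrobial_ids = results[0], results[1]
    print(len(ontology_dict), "ontology fields loaded")
    print(len(antimicrobial_ids), "antimicrobial agents mapped")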