-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrack_screen_exp_wf.py
291 lines (230 loc) · 8.76 KB
/
track_screen_exp_wf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#!/usr/bin/env python
"""
Track a screening experiment across the files produced by its multiple steps
Copyright (C)
2017 - ETH Zuerich, NEXUS Personalized Health Technologies
"""
## standard python libraries
import os
import sys
import csv
import pandas
from collections import defaultdict
def csv_data_loader(file_name):
    """
    read a complete csv file into a dataframe. The first row is taken as the
    header and every missing cell is replaced by 0.

    @args file_name: csv file generated in a screening experiment
    @type file_name: str
    """

    return pandas.read_csv(file_name, header=0).fillna(value=0)
def barcode_identifier(raw_df):
    """
    extract the source and destination barcodes from a dataframe and returns
    two lists.

    A column is inspected when at least one of its cells is exactly
    "SourceBarcode" or "DestinationBarcode". Inside such a column the cells
    that follow a keyword are collected as barcodes of that kind until a
    falsy cell (0, "", ...) closes the section.

    @args raw_df: experiment details from csv file
    @type raw_df: pandas dataframe
    @return: (source barcodes, destination barcodes)
    @rtype: tuple of two lists
    """

    ## general keywords describing the barcode
    src_keyword = "SourceBarcode"
    dst_keyword = "DestinationBarcode"
    keywords = (src_keyword, dst_keyword)

    ## filter the dataframe columns for barcode information
    col_names = []
    for cols in raw_df:
        try:
            ## keyword match is exact now and need to replaced with wild
            ## TODO something like x.str.contains("word")
            has_keyword = raw_df[cols].apply(lambda x: str(x) in keywords).any()
        except (TypeError, ValueError):
            ## best effort: a column that cannot be scanned is skipped
            continue
        if has_keyword:
            col_names.append(cols)

    ## now from the selected columns, get the barcodes
    src_barcodes = []
    dst_barcodes = []
    for ind_col in col_names:
        ## list that receives the cells of the section being read,
        ## None while outside of a section
        active = None
        for element in raw_df[ind_col]:
            if not element:
                ## empty cells are closing the current section
                active = None
                continue

            ## loading the barcode values
            if active is not None:
                active.append(element)
                continue

            ## checking the barcode types; equality (not substring test) so
            ## that cells like "Barcode" do not open a section and numeric
            ## cells do not raise a TypeError
            if element == dst_keyword:
                active = dst_barcodes
            elif element == src_keyword:
                active = src_barcodes

    return(src_barcodes, dst_barcodes)
def search_intermediate_files(base_path):
    """
    collect the intermediate files of a screening experiment: every file
    below base_path (sub folders included) whose extension is ".csv" is
    returned with its full path. An empty list is returned when base_path
    is not a directory.

    @args base_path: a location where experiment files are stored
    @type base_path: str
    """

    ## only csv files are selected at the moment (".tab" is switched off)
    wanted_exts = (".csv",)

    if not os.path.isdir(base_path):
        return []

    ## walk through the entire base path
    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(base_path)
        for entry in entries
        if os.path.splitext(entry)[1] in wanted_exts
    ]
def plain_csv_reader(file_name):
    """
    Pandas Error: tokenizing data when dealing with a CSV file that have
    variable number of columns and read_csv inferred the number of columns
    from the first few rows. To avoid this, use csv module.

    A row holding all of SourceWell, SourceBarcode, DestinationWell and
    DestinationBarcode opens a section; the rows after it are read as data
    until an empty row closes the section. Rows outside a section are
    ignored.

    @args file_name: csv file generated in a screening experiment
    @type file_name: str
    @return: (source, destination), each a list of (barcode, well) tuples
    @rtype: tuple of two lists
    """

    ## FIXME general keyword for searching barcodes
    header_cols = ("SourceWell", "SourceBarcode",
                   "DestinationWell", "DestinationBarcode")

    src_barcodes = []
    dst_barcodes = []
    ## column position of each header keyword, None while outside a section
    columns = None

    ## the csv module expects a binary handle on python 2 and a text handle
    ## with untranslated newlines on python 3
    if sys.version_info[0] < 3:
        csvfile = open(file_name, "rbU")
    else:
        csvfile = open(file_name, "r", newline="")

    with csvfile:
        for line in csv.reader(csvfile):
            if not line:
                ## an empty row is closing the section
                columns = None
                continue

            ## all four keywords are required, so the positions are never
            ## mixed from two different header rows
            if all(name in line for name in header_cols):
                columns = dict((name, line.index(name)) for name in header_cols)
                continue

            if columns is None:
                continue

            try:
                tmp_src = line[columns["SourceBarcode"]]
                tmp_src_well = line[columns["SourceWell"]]
                src_barcodes.append((tmp_src, tmp_src_well))
            except IndexError:
                print("error: not able to locate SourceBarcode")

            try:
                tmp_dst = line[columns["DestinationBarcode"]]
                tmp_dst_well = line[columns["DestinationWell"]]
                dst_barcodes.append((tmp_dst, tmp_dst_well))
            except IndexError:
                print("error: not able to locate DestinationBarcode")

    return(src_barcodes, dst_barcodes)
def dfs_search(graph, start, visited=None):
    """
    method to reduce the experiment direction from the barcodes extracted from
    different intermediate files.

    Walks depth-first from start, always stepping to the smallest neighbour
    that is not visited yet. After every step the walk is also continued from
    all graph keys sharing the barcode (first tuple element) of the node just
    reached, using the same visited list.

    @args graph: a dictionary with source and destination barcodes
    @type graph: defaultdict
    @args start: starting point to infer the path ('ACTITARG-K960PL-1', 'Q1')
    @type start: tuple
    @args visited: nodes seen so far; when missing or empty a new list
                   holding only start is used
    @type visited: list
    @return: the visited nodes in the order they were reached
    @rtype: list
    """

    ##FIXME depends on the well plate 96 or 384 the tracking whole and QX are
    ## concern to get the right combination
    ## NOTE(review): the continuation over partial_key_search is placed after
    ## each successful step (try/else reading of the flattened original) -
    ## confirm against the intended control flow
    stack = [start]
    if not visited:
        visited = [start]

    while stack:
        ## indexing instead of .get(): a defaultdict registers the node as a
        ## key here, which makes it visible to partial_key_search later on
        try:
            neighbours = graph[start]
        except KeyError:
            ## plain dict without an entry for this node, treat as dead end
            neighbours = ()

        ## visited may have grown inside the recursive calls, so the set is
        ## rebuilt for every step
        unvisited = set(neighbours) - set(visited)
        if not unvisited:
            ## dead end, go back to the previous node on the stack
            stack.pop()
            if stack:
                start = stack[-1]
            continue

        start = min(unvisited)
        stack.append(start)
        visited.append(start)

        ## continue from every key with the same barcode as the new node
        for ele in partial_key_search(graph, start[0]):
            dfs_search(graph, ele, visited)

    return visited
def partial_key_search(brcs, search_key):
    """
    partial match of a tuple key of a dictionary: returns every key whose
    first element equals search_key, whatever the second element is.

    @args brcs: a dictionary with source and destination barcodes
    @type brcs: defaultdict
    @args search_key: searching key word, one element of the tuple key
    @type search_key: str
    @return: the matching keys in the iteration order of brcs
    @rtype: list
    """

    ## making key in tuple form, None is the wildcard for the second element
    pattern = (search_key, None)

    ## iterate the keys directly: the values are not needed, and iteritems()
    ## does not exist on python 3 dictionaries
    return [
        src_wellp
        for src_wellp in brcs
        if all(xq is None or xp == xq for xp, xq in zip(src_wellp, pattern))
    ]
## TODO experiment run based on the YAML configuration file
## 1. it is better to have the YAML file for history of experiment search
## 2. minimal input requirement for executing the experiment

## files and folders associated with multiple screens
## (location used before: "/Users/vipin/Documents/tdu_screens/")
experiment_path = "/Users/vipin/tmp/track_files"
print('Experiment data imports %s' % experiment_path)

## search the python dict with similarity key search to identify the right key
## (stock library used before: "ACTITARG-K960PL-1")
stock_compd_name = "Drug08_A"

## plate reformatting direction
well_96_to_384 = True
well_384_to_96 = False

## getting all intermediate experiment files from provided path
exp_files = search_intermediate_files(experiment_path)
print('Total number of %d file(s) found' % len(exp_files))

## get the barcodes from all files
src_dst_maps = defaultdict(list)
for asc_file in exp_files:
    ## TODO parsing details about the experiment
    ## 1. mapping information about the barcode and experiment details
    src_bc, dst_bc = plain_csv_reader(asc_file)

    ## barcode mapping from source to destination, paired by position
    for barcode, destination in zip(src_bc, dst_bc):
        src_dst_maps[barcode].append(destination)

    ## sources without a partner are left out and reported
    if len(dst_bc) < len(src_bc):
        print("warning: file %s missing destination barcodes" % asc_file)

## do the key search to identify the experiment to search
start_node = partial_key_search(src_dst_maps, stock_compd_name)

## build the graph with extracted barcodes and resolve the experiment path
if start_node:
    for compound in start_node:
        root = dfs_search(src_dst_maps, compound)
        for wfsteps in root:
            sys.stdout.write("%s," % wfsteps[0])
        ## close the record of this compound; a bare print statement only
        ## emits the newline on python 2
        sys.stdout.write("\n")
else:
    print("error: no stock library %s found" % stock_compd_name)

## TODO the final representation of the experiment flow path
## 1. visual representation -
## 2. csv or txt files
## 3. information about the screen readout file ie, the last element of list root

## read the formatted csv files. This works with pandas
#csv_df = csv_data_loader(exp_files[0])
## get the barcodes
#src_bc, dst_bc = barcode_identifier(csv_df)

## NOTE(review): the script always ends with a non-zero exit status, also
## after a successful run - confirm that this is intended
sys.exit(-1)