-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMergeLocus.py
370 lines (362 loc) · 18.1 KB
/
MergeLocus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#!/usr/bin/env python
"""
Description : To merge multiple transcript annotations from a single loci
and define as an alternative spliced candidate for the gene.
Usage: MergeLocus.py in.gff3 out.gff3
"""
import re, sys, time
import collections
def display_content(final_dict):
"""displaying the summary from GFF file
"""
print "\tCount and unique combination of source(s), feature type(s):"
for sftype, cnt in sorted(final_dict['gff_source_type'].items()):
if sftype[1] == 'gene': # information on parent feature listed, TODO include the statistics of other elements.
print '\t' + str(cnt) + '\t' + str(sftype[0]) + ', '+ str(sftype[1])
def available_limits(gff_file):
    """Figure out the available feature types from the given GFF file.

    Returns a dict with four keys -- 'gff_id', 'gff_source_type',
    'gff_source', 'gff_type' -- each mapping a tuple of the relevant
    GFF column values to the number of lines carrying that combination.
    Raises AssertionError on a non-comment line that does not have 9
    tab-separated columns.
    """
    # which GFF columns make up the identifier for each summary key
    filter_info = dict(gff_id=[0], gff_source_type=[1, 2],
                       gff_source=[1], gff_type=[2])
    cur_limits = dict()
    for filter_key in filter_info.keys():
        cur_limits[filter_key] = collections.defaultdict(int)
    # 'with' guarantees the handle is closed; 'rU' mode was dropped
    # because it is removed in modern Python.
    with open(gff_file) as gff_handle:
        for line in gff_handle:
            line = line.strip('\n\r')
            # bug fix: a blank line used to raise IndexError on line[0];
            # skip empty and commented lines explicitly.
            if not line or line[0] == "#":
                continue
            parts = [p.strip() for p in line.split('\t')]
            if len(parts) == 1:  # e.g. embedded sequence data
                continue
            assert len(parts) == 9, line  # minimum criteria for a GFF line
            for filter_key, cur_indexes in filter_info.items():
                cur_id = tuple([parts[i] for i in cur_indexes])
                cur_limits[filter_key][cur_id] += 1
    # get rid of the default dicts so absent keys no longer auto-create entries
    final_dict = dict()
    for key, value_dict in cur_limits.items():
        final_dict[key] = dict(value_dict)
    return final_dict
def GFFWriter(merged_info, genes, transcripts, exons, utr5, cds, utr3, out_file):
    """Write a GFF3 file with the merged feature description.

    merged_info maps (chr, source, strand) -> {(start, stop): [gene ids]}
    (the structure produced by UniqLoci).  transcripts maps gene id to a
    list of transcript info dicts; exons/utr5/cds/utr3 map transcript id
    to lists of feature info dicts.  `genes` is accepted for interface
    compatibility but is not consulted.  Output order per transcript is
    fixed: five_prime_UTR, CDS, three_prime_UTR, exon.
    """
    # (feature dict, GFF type string) pairs, emitted in this order per transcript;
    # this replaces four copy-pasted writer loops.
    sub_features = [(utr5, 'five_prime_UTR'), (cds, 'CDS'),
                    (utr3, 'three_prime_UTR'), (exons, 'exon')]
    with open(out_file, 'w') as out_fh:
        for ginfo, regions in merged_info.items():
            gene_cnt = 1
            for interval, features in sorted(regions.items()):  # master gene feature
                # merged gene identifier, e.g. Gene_chr1+_00001
                gene_id = 'Gene_' + ginfo[0] + ginfo[2] + '_' + str(gene_cnt).zfill(5)
                out_fh.write('\t'.join([ginfo[0], ginfo[1], 'gene',
                                        str(interval[0]), str(interval[1]),
                                        '.', ginfo[2], '.',
                                        'ID=' + gene_id + ';Name=' + gene_id]) + '\n')
                for geneid in features:  # corresponding transcript info
                    for tinfo in transcripts.get(geneid, []):  # transcript feature line
                        out_fh.write('\t'.join([ginfo[0], ginfo[1], tinfo['type'],
                                                str(tinfo['start']), str(tinfo['stop']),
                                                '.', ginfo[2], '.',
                                                'ID=' + tinfo['ID'] + ';Parent=' + gene_id]) + '\n')
                        # child features of the transcript, if any were parsed
                        for fmap, ftype in sub_features:
                            for finfo in fmap.get(tinfo['ID'], []):
                                out_fh.write('\t'.join([ginfo[0], ginfo[1], ftype,
                                                        str(finfo['start']), str(finfo['stop']),
                                                        '.', ginfo[2], '.',
                                                        'Parent=' + tinfo['ID']]) + '\n')
                gene_cnt += 1
def UniqLoci(genes, transcripts, exons):
    """determine unique location where features annotated multiple times

    Clusters gene entries sharing (chr, source, strand) into merged
    intervals.  Returns uniq_loci mapping (chr, source, strand) ->
    {(start, stop): [gene ids]}.  A gene whose only transcript has a
    single exon is treated as a potential non-coding candidate and kept
    as its own interval rather than merged.
    """
    uniq_loci = dict()
    for gid, parts in genes.items():
        gene_info = (parts['chr'], parts['source'], parts['strand'])
        if gene_info in uniq_loci: # same contig, orientation, source: look for merging transcripts based on the nearby location
            if (int(parts['start']), int(parts['stop'])) in uniq_loci[gene_info].keys(): ## similar transcripts will catch here (start and stop are same may be exon, CDS or intron content may vary)
                uniq_loci[gene_info][(int(parts['start']), int(parts['stop']))].append(gid)
            else: # heuristic approach to include closely related region on a single master loci.
                got_a_range = 0
                # NOTE(review): entries are deleted from this dict while iterating
                # .keys(); the immediate `break` after every mutation is what keeps
                # this safe -- revisit carefully if restructuring or porting.
                for floc in uniq_loci[gene_info].keys():# look whether it lies closely to any intervel which is already defined
                    # NOTE(review): this skips intervals whose end is within 150 bp
                    # after the candidate start or whose start is within 150 bp before
                    # the candidate stop -- confirm the proximity logic is not inverted.
                    if (floc[1]-parts['start']) < 150 or (parts['stop']-floc[0]) < 150:continue ## TODO boundary spanning length in same orientation for genes of each species will be great.
                    if floc[0] <= parts['start'] and parts['start'] < floc[1]: # the start of the new candidate is inside of any of the already defined interval ?
                        non_coding = 0
                        try: # check for small transcript whether they belong to a existing one or a new non-coding candidate.
                            # one transcript with one exon -> possible non-coding candidate
                            if len(transcripts[gid]) == 1:
                                if len(exons[transcripts[gid][0]['ID']]) == 1:non_coding = 1
                            if non_coding == 0:
                                if parts['stop'] > floc[1]:# making global gene coordinate from individual transcript model
                                    entries = uniq_loci[gene_info][floc]
                                    del uniq_loci[gene_info][floc] # remove the existing interval, here we got a longer downstream position from the candidate
                                    entries.append(gid)
                                    uniq_loci[gene_info][(floc[0], parts['stop'])] = entries
                                else:
                                    uniq_loci[gene_info][floc].append(gid)
                            else:# create a new interval for non-coding type entry
                                uniq_loci[gene_info][(parts['start'], parts['stop'])] = [gid]
                            got_a_range = 1
                            break
                        except: # dont have any transcripts or exons defined.
                            # NOTE(review): bare except doubles as control flow here
                            # (missing gid in transcripts/exons) -- it will also hide
                            # genuine errors.
                            break
                    elif floc[0] < parts['stop'] and parts['stop'] <= floc[1]: # the stop of the new candidate is inside of any of the pre-defined interval ? the candidate seems to be from more upstream
                        non_coding = 0
                        try:
                            if len(transcripts[gid]) == 1:
                                if len(exons[transcripts[gid][0]['ID']]) == 1:non_coding = 1
                            if non_coding == 0:
                                entries = uniq_loci[gene_info][floc]
                                del uniq_loci[gene_info][floc] # remove the existing interval, here we got a upstream position from which the candidate transcribing
                                entries.append(gid)
                                uniq_loci[gene_info][(int(parts['start']), floc[1])] = entries
                            else: # create a new interval for non-coding type entry
                                uniq_loci[gene_info][(parts['start'], parts['stop'])] = [gid]
                            got_a_range = 1
                            break
                        except:
                            break
                    elif floc[0] > parts['start'] and floc[1] < parts['stop']: # whether the whole feature floc region (--) resides in the candidate location (----------) ?
                        non_coding = 0 # here the candidate seems to be longer than the pre-defined interval, check all entries from the pre-defined interval whether it is a small region, any chance as non-coding.
                        try:
                            for features in uniq_loci[gene_info][floc]:
                                if len(transcripts[features]) == 1:
                                    if len(exons[transcripts[features][0]['ID']]) == 1:non_coding = 1
                            if non_coding == 1: # create a new interval for non coding
                                uniq_loci[gene_info][(parts['start'], parts['stop'])] = [gid]
                            else: # append the existing transcript cluster, here change the interval position based on the candidate location
                                entries = uniq_loci[gene_info][floc]
                                del uniq_loci[gene_info][floc] # remove the existing interval, here we got a longer upstream and downstream region.
                                entries.append(gid)
                                uniq_loci[gene_info][(parts['start'], parts['stop'])] = entries
                            got_a_range = 1
                            break
                        except:
                            break
                ## or create a new interval ??
                if got_a_range == 0:
                    # no existing interval accepted the candidate -- start a fresh one
                    uniq_loci[gene_info][(parts['start'], parts['stop'])] = [gid]
        else:
            # first gene seen for this (chr, source, strand) combination
            uniq_loci[gene_info] = {(int(parts['start']), int(parts['stop'])): [gid]}
    return uniq_loci
def _parse_attributes(attr_column, id_key, info):
    """Merge `key=value` pairs of a GFF attribute column into `info`.

    attr_column: 9th GFF column, e.g. 'ID=g1;Name=foo'.
    id_key: the reserved key ('ID' or 'Parent') whose value is returned
    instead of being stored in `info`; returns '' when the key is absent.
    Attribute values overwrite any pre-set keys in `info`, matching the
    original per-feature parsing order (base fields first, attrs last).
    """
    feature_id = ''
    for token in attr_column.split(';'):
        if token == '':
            continue  # avoiding wrongly formatted attribute section
        token = token.split('=')  # key, value pairs are grouped by equal symbol
        if token[0] == id_key:
            feature_id = token[1]
        else:
            info[token[0]] = token[1]
    return feature_id

def ParseGFF(gff_file):
    """feature extraction from provided GFF file

    Returns six dicts:
      genes:       gene id -> gene info dict
      transcripts: gene id (Parent) -> [transcript info dicts]
      exons, utr5, cds, utr3: transcript id (Parent) -> [feature info dicts]
    Raises AssertionError on a non-comment line without 9 columns.
    """
    genes = dict()
    transcripts = dict()
    utr5, cds, utr3, exons = dict(), dict(), dict(), dict()
    # transcript-level feature types that hang directly off a gene
    transcript_types = ('mRNA', 'transcript', 'ncRNA', 'tRNA', 'snRNA', 'scRNA',
                        'snoRNA', 'snlRNA', 'rRNA', 'miRNA')
    # transcript-child feature types and the dict each one is collected into;
    # this replaces four copy-pasted parsing blocks.
    child_features = {'exon': exons, 'five_prime_UTR': utr5,
                      'CDS': cds, 'three_prime_UTR': utr3}
    # 'rU' mode dropped: removed in modern Python
    with open(gff_file) as gff_handle:
        for gff_line in gff_handle:
            if re.search(r'>|#', gff_line[0]): # take out the commented lines and fasta header if present
                continue
            parts = gff_line.strip('\n\r').split('\t')
            if len(parts) == 1: # if genome sequence present in the GFF file, here we don't need it
                continue
            # minimum criteria for a GFF line.
            # bug fix: the assert message referenced an undefined name `line`,
            # so a malformed line raised NameError instead of AssertionError.
            assert len(parts) == 9, gff_line
            ftype = parts[2]
            if ftype == 'gene':
                info = dict(start=int(parts[3]), stop=int(parts[4]),
                            chr=parts[0], source=parts[1], strand=parts[6])
                gid = _parse_attributes(parts[-1], 'ID', info)
                if gid != '': # an ID attribute is mandatory for a gene entry
                    genes[gid] = info
            elif ftype in transcript_types:
                info = dict(start=int(parts[3]), stop=int(parts[4]),
                            chr=parts[0], strand=parts[6], type=ftype)
                gid = _parse_attributes(parts[-1], 'Parent', info)
                transcripts.setdefault(gid, []).append(info)
            elif ftype in child_features:
                info = dict(start=int(parts[3]), stop=int(parts[4]),
                            chr=parts[0], strand=parts[6])
                tid = _parse_attributes(parts[-1], 'Parent', info)
                child_features[ftype].setdefault(tid, []).append(info)
    return genes, transcripts, exons, utr5, cds, utr3
if __name__=='__main__':
    # NOTE: this script is Python 2 (print statements throughout).
    # Expect exactly two positional arguments: input and output GFF3 paths.
    try:
        gff_file = sys.argv[1]
        out_file = sys.argv[2]
    except:
        # missing arguments: show the module usage string and quit
        print __doc__
        sys.exit(-1)
    stime = time.asctime( time.localtime(time.time()) )
    print '-------------------------------------------------------'
    print 'MergeLocus started on ' + stime
    print '-------------------------------------------------------'
    print '--------'
    print 'Level: 1- ' + 'Reading GFF file: ' + gff_file
    print '--------'
    print '--------'
    print 'Level: 2- ' + 'BEFORE processing, Merging feature distribution in GFF file'
    print '--------'
    # initial feature distribution in file
    final_dict = available_limits(gff_file)
    display_content(final_dict)
    # determine the whole content from GFF file
    genes, transcripts, exons, utr5, cds, utr3 = ParseGFF(gff_file)
    print '--------'
    print 'Level: 3- ' + 'Start merging feature(s) from similar locations...'
    print '--------'
    # determine the same gene loci on specific chromosome based on the same source
    merged_regions = UniqLoci(genes, transcripts, exons)
    print '\tDone.'
    print '--------'
    print 'Level: 4- ' + 'Writing merged feature annotation to GFF format...'
    print '--------'
    # write new GFF file with merged loci information for gene feature
    GFFWriter(merged_regions, genes, transcripts, exons, utr5, cds, utr3, out_file)
    print '\tDone.'
    # after processing display the feature distribution in the result file
    print '--------'
    print 'Level: 5- ' + 'Merged feature(s) summary from GFF file'
    print '--------'
    final_dict = available_limits(out_file)
    display_content(final_dict)
    etime = time.asctime( time.localtime(time.time()) )
    print '-------------------------------------------------------'
    print 'MergeLocus finished at ' + etime
    print '-------------------------------------------------------'