-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharc_parse_COG_files.py
54 lines (43 loc) · 1.61 KB
/
arc_parse_COG_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from ete3 import SeqGroup
import sys
import os
# Hard-coded input and output locations used by this script.
SPECIES_LIST_PATH = '/home/plaza/projects/eggnog6/ncbi_tree/arch_sp_tree.tsv'
TAXID_MAP_PATH = '/home/plaza/projects/eggnog6/raw_data/eggnogv6_2_progenomesv2.tsv'
OUTPUT_DIR = '/home/plaza/projects/eggnog6/sp_tree/COG_files/Arc/'
COG_TABLE_PATH = OUTPUT_DIR + 'COG_table.tsv'


def iter_data_lines(lines):
    """Yield the lines that are neither blank nor start with '#'.

    The '#' test is applied to the raw line, so a comment marker preceded
    by whitespace is NOT treated as a comment.
    """
    for line in lines:
        if line.strip() and not line.startswith("#"):
            yield line


def read_taxid_list(path):
    """Return the integer taxids listed one per line in the file at *path*.

    Raises ValueError if a data line is not a valid integer.
    """
    with open(path) as handle:
        return [int(line.strip()) for line in iter_data_lines(handle)]


def read_taxid_map(path):
    """Return a dict mapping int(column 1) -> int(column 2) for *path*.

    Columns are whitespace separated; any further columns are ignored.
    When a key occurs more than once the last occurrence wins.
    """
    mapping = {}
    with open(path) as handle:
        for line in iter_data_lines(handle):
            fields = line.split()
            mapping[int(fields[0])] = int(fields[1])
    return mapping


def clean_sequence_id(seq_id):
    """Convert 'taxid.sample.seqname' into (int taxid, 'taxid@sample-seqname').

    Only the first two dots are separators; later dots stay in the sequence
    name. Returns None when *seq_id* has fewer than three dot-separated
    fields or when the taxid field is not an integer.
    """
    parts = seq_id.split('.', 2)
    if len(parts) != 3:
        return None
    taxid, sample, seq_name = parts
    try:
        taxid_number = int(taxid)
    except ValueError:
        return None
    return taxid_number, taxid + '@' + sample + '-' + seq_name


def main():
    """Filter one sequence file down to a single sequence per wanted taxid.

    Reads the sequence file named by sys.argv[1], keeps the first sequence
    seen for every taxid in the wanted set, writes the kept sequences with
    rewritten names to OUTPUT_DIR/clean_<input basename>, and appends one
    tab-separated line holding the kept names to COG_TABLE_PATH.

    Raises KeyError if a taxid from SPECIES_LIST_PATH has no entry in
    TAXID_MAP_PATH.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s <sequence_file>' % sys.argv[0])
    cog_path = sys.argv[1]
    alignment = SeqGroup(cog_path)

    # Translate the species-list taxids through the mapping table; a set
    # keeps the per-sequence membership test constant time.
    listed_taxids = read_taxid_list(SPECIES_LIST_PATH)
    taxid_map = read_taxid_map(TAXID_MAP_PATH)
    wanted_taxids = {taxid_map[taxid] for taxid in listed_taxids}

    kept_names = []
    seen_taxids = set()
    clean_path = OUTPUT_DIR + 'clean_' + os.path.basename(cog_path)

    # Outputs are opened only after every input loaded successfully, and the
    # context manager closes them even if the loop below raises.
    with open(clean_path, 'w') as clean_out, open(COG_TABLE_PATH, 'a') as cog_table:
        for name, seq, _ in alignment:
            # Only the first whitespace-delimited token of the name is used.
            seq_id = name.split()[0]
            parsed = clean_sequence_id(seq_id)
            if parsed is None:
                # Report identifiers that do not follow taxid.sample.seqname.
                print(seq_id)
                continue
            taxid, final_name = parsed
            if taxid in wanted_taxids and taxid not in seen_taxids:
                seen_taxids.add(taxid)
                kept_names.append(final_name)
                clean_out.write('>%s\n%s\n' % (final_name, seq))
        # The table line is written even when no sequence was kept.
        cog_table.write('\t'.join(kept_names) + '\n')


if __name__ == "__main__":
    main()