-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplitting.py
executable file
·126 lines (112 loc) · 3.74 KB
/
splitting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
"""
Splitting the fasta file from FluPipe into seperate files by subtypes and segments
"""
import argparse
from typing import Tuple
from Bio import SeqIO
def get_substrings(
mode: str,
seq_id: str,
) -> Tuple[str, str]:
"""
Performs the alignment of query on reference using a pairwise alignment.
Args:
mode (str): Specify source of data for right splitting of ID
seq_id (str): ID for the Influenza sequence
Returns:
Tuple[str, str]: Tuple containing string with subtype, segment
"""
if mode == 'FluPipe' or mode == 'flupipe':
split_id = seq_id.split("|")
subtype = split_id[5].split("/")[1].strip()
segment = split_id[6].strip()
elif mode == 'GISAID' or mode == 'gisaid':
split_id = seq_id.split("|")
subtype = split_id[-1].split("/")[1].strip()
segment = split_id[1].strip()
elif mode == 'OpenFlu' or mode == 'openflu':
split_id = seq_id.split("|")
subtype = split_id[-1].strip()
segment = split_id[1].strip()
return subtype, segment
def main():
"""
Main function
Returns
---------
None
"""
parser = argparse.ArgumentParser(
description="Splitting: Into subtypes and segments",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"-i",
"--input",
required=True,
help="Set of Influenza genome sequences (fasta format)",
)
parser.add_argument(
"-m",
"--mode",
required=True,
help="The source of the sequences: FluPipe, GISAID or OpenFlu",
)
parser.add_argument(
"--h1n1",
default="h1n1_ha.fasta",
help="Output fasta file with H1N1 HA sequences",
)
parser.add_argument(
"--h3n2",
default="h3n2_ha.fasta",
help="Output fasta file with H3N2 HA sequences",
)
parser.add_argument(
"--vic",
default="vic_ha.fasta",
help="Output fasta file with Victoria HA sequences",
)
args = parser.parse_args()
fastain = args.input
mode = args.mode
h1n1 = args.h1n1
h3n2 = args.h3n2
vic = args.vic
for record in SeqIO.parse(fastain, "fasta"):
subtype, segment = get_substrings(mode, record.description)
if subtype == 'H1N1':
if segment == 'HA':
with open(h1n1, 'a') as output:
SeqIO.write(record, output, 'fasta')
else:
print(
f"Found H1N1 sequence of segment {segment}, the sequence will not be processed"
)
SeqIO.write(record, 'h1n1_other_seg.fasta', 'fasta')
elif subtype == 'H3N2':
if segment == 'HA':
with open(h3n2, 'a') as output:
SeqIO.write(record, output, 'fasta')
else:
print(
f"Found H3N2 sequence of segment {segment}, the sequence will not be processed"
)
SeqIO.write(record, 'h3n2_other_seg.fasta', 'fasta')
elif subtype == 'H0N0' or subtype == "HN" or subtype == "B":
if segment == 'HA':
with open(vic, 'a') as output:
SeqIO.write(record, output, 'fasta')
else:
print(
f"Found Victoria sequence of segment {segment}, the sequence will not be processed"
)
SeqIO.write(record, 'vic_other_seg.fasta', 'fasta')
else:
print(
f"Found sequence of unknown subtype {subtype}, the sequence will not be processed"
)
SeqIO.write(record, 'other_subtype.fasta', 'fasta')
if __name__ == "__main__":
main()