-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseq_extract2.py
33 lines (26 loc) · 1.22 KB
/
seq_extract2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env python3
import argparse
from Bio import SeqIO
import re
def extract_sequences(fasta_file, id_file, output_file):
    """Copy the FASTA records whose gene number is listed in ``id_file``.

    For every record in ``fasta_file`` the first ``g<digits>`` occurrence in
    the record identifier (``record.id``) is located, and the record is kept
    when those digits equal one of the lines of ``id_file``. The comparison
    is between strings, so ``007`` and ``7`` are different IDs. Kept records
    are written to ``output_file`` in FASTA format, in input order.

    Args:
        fasta_file: Path to the input FASTA file.
        id_file: Path to a text file with one ID per line; surrounding
            whitespace on each line is ignored.
        output_file: Path of the FASTA file to write.
    """
    # A set gives constant-time membership tests; with a list every record
    # would trigger a linear scan over all requested IDs.
    with open(id_file, 'r') as id_handle:
        wanted_ids = {line.strip() for line in id_handle}
    # Blank lines can never equal a ``\d+`` capture, so drop them.
    wanted_ids.discard('')

    # Compile once instead of looking the pattern up for every record.
    gene_number = re.compile(r'g(\d+)')

    selected = []
    with open(fasta_file, 'r') as fasta_handle:
        # SeqIO.parse reads lazily, so iterate while the handle is open.
        for record in SeqIO.parse(fasta_handle, 'fasta'):
            match = gene_number.search(record.id)
            if match and match.group(1) in wanted_ids:
                selected.append(record)

    # Write the extracted sequences to the output file
    SeqIO.write(selected, output_file, 'fasta')
if __name__ == '__main__':
    # Command-line entry point: three required positional paths, declared
    # in the order they must appear on the command line.
    cli = argparse.ArgumentParser(
        description='Extract sequences from a fasta file using a list of IDs.'
    )
    positional_help = (
        ('fasta_file', 'Path to the input fasta file.'),
        ('id_file', 'Path to the txt file containing the IDs.'),
        ('output_file', 'Path to the output fasta file.'),
    )
    for arg_name, arg_help in positional_help:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    extract_sequences(
        options.fasta_file,
        options.id_file,
        options.output_file,
    )