-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalignment_tools.py
107 lines (89 loc) · 5.11 KB
/
alignment_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#Import Modules and functions
import pandas as pd
import numpy as np
import os,sys
#Import biopython modules
#os.environ['CONDA_PREFIX']=/home/wish1832/anaconda3
#Code to append biopython location to path
os.environ['PATH'] = os.environ['PATH']+';'+os.environ['CONDA_PREFIX']+"/pkgs/biopython-1.78-py38h7b6447c_0/lib/python3.8/site-packages"
from Bio import AlignIO as msa_in
from Bio.Align import MultipleSeqAlignment as msa
import Bio.SeqRecord
from Bio import SeqIO
def read_seq(path):
"""
Reads a multiple sequence alignment in FASTA format and stores the sequences in a format readable by the sort_seqs() and manual_correction() functions.
"""
return list(SeqIO.parse(path,"fasta"))
def sort_seqs(seq_list):
"""
Takes a sequence alignment produced by read_seq() and sorts in ascending order by the cluster number (number appearing after "Uniq" in the cluster ID).
"""
return sorted(seq_list,key=lambda x:int(x.id.split(";")[0].split("Uniq")[1]))
def manual_correction(seq_list,msa_start,msa_end,erroneous_alignment,correct_alignment):
"""
Takes a sequence alignment produced by read_seq() and makes a specified correction to the alignment.
Arguments
-----------
seq_list: a list of BioPython SeqRecord objects created by read_seq()
msa_start: location of the beginning of the region to be corrected. Index is based on the residue postion, starting at 1 (i.e. to start at residue 70, enter 70 instead of 69).
msa_end: location of the end of the region to be corrected. Index is based on the residue postion, starting at 1 (i.e. to end at residue 80, enter 80 instead of 79).
erroneous_alignment: a string giving the motif of the erroneous alignment within the start and end positions specified. Use capital letters, and use blanks exactly as they appear in the alignment.
correct_alignment: enter the motif of the correct alignment within the start and end positions specified. Use capital letters, and use blanks exactly as they appear in the correct alignment.
Returns:
-----------
seq_list: a list of BioPython SeqRecord objects with the correction made.
Example
----------
To correct the alignment below, enter 72 and 76 as the msa_start and msa_end values, 'A-I--' for erroneous_alignment, and '---AI' for correct_alignment.
72 ---AI 76
A-I--
---AI
---AI
72 ---AI 76
"""
#Count sequences needing correction, and store information on sequences for display (if number of sequences to correct is below 25)
n_seq_correct=0
correction_info=[]
#Identity Erroneous sequences: print the sequence along with the cluster ID in which it appears
for i in range(len(seq_list)):
if seq_list[i].seq[(msa_start-1):msa_end]==erroneous_alignment:
#Increase count of sequences to correct by one
n_seq_correct+=1
cluster_ID=seq_list[i].id.split(";")[0]
correction_string=f"{cluster_ID} : {msa_start} {seq_list[i].seq[msa_start-1:msa_end]} {msa_end}"
correction_info.append(correction_string)
#Print information on the sequences to be corrected, depending on the number of erroneous sequences found
if n_seq_correct<=25:
#Print all sequences to be corrected if the number of sequences to be corrected is less than or equal to 25
print(f"Found {n_seq_correct} sequences with the specified motif {erroneous_alignment}.")
print(*(cluster for cluster in correction_info),sep="\n")
elif n_seq_correct==0:
print("No sequences found with the specified erroneous motif.")
else:
#If the number of sequences found is greater than 25, print the amount of sequences found and the first 25 elements
print(f"Found {n_seq_correct} sequences with the specified motif {erroneous_alignment}.")
print(*(cluster for cluster in correction_info[0:25]),"...",sep="\n")
#Edit the sequence alignment
for i in range(len(seq_list)):
if seq_list[i].seq[(msa_start-1):msa_end]==erroneous_alignment:
#Convert SeqRecord Object to MutableSeq object to make changes
seq_list[i].seq=seq_list[i].seq.tomutable()
#Modify the string within the region specified to be equal to the correct alingment
seq_list[i].seq[msa_start-1:msa_end]=correct_alignment
#Convert the sequence back into an immutable Seq object
seq_list[i].seq=seq_list[i].seq.toseq()
print(f"Complete. {n_seq_correct} sequences changed to {correct_alignment}.",end="\n\n")
return seq_list
def write_sequences(seq_list,path):
"""
Writes the list of SeqRecord objects to a FASTA file at the defined path.
"""
SeqIO.write(seq_list,path,"fasta")
def sort_FASTA(in_path,out_path):
"""
Loads a multiple sequence alignment at in_path, sorts sequences in ascending order by their cluster ID, and writes them as a FASTA file to out_path. The sequence headers must be in the format produced by USEARCH for this function to work.
"""
sequences=read_seq(in_path)
sequences=sort_seqs(sequences)
write_sequences(sequences,out_path)