-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprotein_alignment_script
241 lines (203 loc) · 10.5 KB
/
protein_alignment_script
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/python3
import os, subprocess, sys, re, glob
###USER INPUT
print("\nHello, welcome to B118056's Python 3 script\nPlease make sure you are in the directory you wish to output data before starting the analysis")
print("Program will write outputs to a folder called \'Assignment2\' in the current directory.")
#making a folder called Assignnment2 in the current directory
if not os.path.exists("Assignment2"):
os.mkdir("Assignment2")
#moving that directory
os.chdir("Assignment2")
#create a function that takes as inputs the taxon and protein family the user is interested in
def Personal(taxon, protein_family) :
import string
return print("\nYou have provided the following details for your search:\n\tTaxon: ",taxon,"\n\tProtein Family: ",protein_family)
#dictionnary with keys taxon and protein family and values are obtained from the user's query
details={}
details["taxon"] = input("What is your taxon of interest\n\t> ")
details["protein_family"]=input("What is the protein family\n\t> ")
#run function Personal using the inputs given by the user
Personal(*list(details.values()))
#allows user to check what input they have entered and let's them decide whether they want to continue with their search
def User_continue():
resp = input ("\nDo you wish to continue?\nContinue [1]\nRedefine Query [2]\nExit [3]\n")
if resp == "1":
print("Continuing analysis..\n")
elif resp == "2":
details["taxon"] = input("What is your taxon of interest\n\t> ")
details["protein_family"]=input("What is the protein family\n\t> ")
Personal(*list(details.values()))
User_continue()
elif resp == "3":
sys.exit()
else:
print ("Sorry, that was an invalid command. Please try again!")
User_continue()
User_continue()
#Couting number of sequences to be potentially downloaded
#by first downloading their accession IDs
subprocess.call("esearch -db protein -query '{0}[organism] AND {1}[Protein name] NOT PARTIAL' | efetch -db protein -format acc > {0}_acc.acc ".format(details["taxon"], details["protein_family"]), shell=True)
def Countsequences(file_name, mode='r+'):
count=0
with open(file_name) as f:
#counting the number of lines in file
for _ in f:
count += 1
print('\n\nTotal number of sequences to be downloaded: ', int(count))
resp = input("Do You Wish To Continue? [y/n]")
if resp == "y":
print("Proceeding to download sequences! \n")
elif resp == "n":
sys.exit(0)
else:
print ("Sorry, that was an invalid command. Please try again!")
Countsequences('{}_acc.acc'.format(details["taxon"]))
Countsequences('{}_acc.acc'.format(details["taxon"]))
#Esearch using Edirect
#Not partial used to download only complete sequences
esearch_command="esearch -db protein -query '{0}[organism] AND {1}[Protein name] NOT PARTIAL' | efetch -db protein -format fasta > {0}_esearch.fa ".format(details["taxon"], details["protein_family"])
os.system(esearch_command)
subprocess.call(esearch_command, shell=True)
print("\n\nSequences for {0} proteins in the {1} taxon are now downloaded!".format(details["protein_family"], details["taxon"]))
#Find the number of unique species downloaded
data = open("{}_esearch.fa".format(details["taxon"])).read()
print("\n\nWithin your dataset,")
nspec = set(re.findall('\[(.*?)\]', data))
print("The total number of unique species represented is:", len(nspec), "\n")
##SEQUENCE ALIGNMENT
#creates a database using the user's inputs
subprocess.call('makeblastdb -in {0}_esearch.fa -dbtype prot -out {0}_reference'.format(details["taxon"]), shell=True)
print("\n\nA reference database was successfully created\n\n")
#Clustalo
#used for sequences alignment of the query
subprocess.call('clustalo -i {0}_esearch.fa -o {0}_clustalo.fa --outfmt=fa -v'.format(details["taxon"]), shell=True)
print("\n\nData was successfully aligned \n\n")
#creating a consensus sequence from the multiple alignment
subprocess.call('cons -sequence {0}_clustalo.fa -outseq {0}_alignment.fa'.format(details["taxon"]), shell=True)
#blasting our sequence alignment file against the consensus sequence
#6 means tabular
#sseqid means Subject Seq-id
#sseq means Aligned part of subject sequence
subprocess.call('blastp -db {0}_reference -query {0}_alignment.fa -outfmt "6 sseqid sseq" > {0}_blastoutput.out'.format(details["taxon"]), shell=True)
#Converting the blastoutput into a fasta file
subprocess.call("sed 's/^/>/' {0}_blastoutput.out > temp".format(details["taxon"]), shell = True)
seq_blast = open("temp").read()
seq_blast = seq_blast.replace('\t', '\n')
f_blast= open("{0}_aligned.fa".format(details["taxon"]), "w")
f_blast.write(seq_blast)
f_blast.close()
###CONSERVATION ANALYSIS
#Function to determine and plot level of protein sequence conservation across the species within that taxonomic group
#Allows user to decide whether to save their plot as an output
#Allows user to decide the window size they want to use
def Download_plot():
print("\n\nPerforming A Conservation Analysis Of The Aligned Sequences.")
resp = input("Do You Wish To: \nDownload And Visualise Your Conservation Plot [1] \nOnly Visualise Your Plot [2] \nOnly Download Your Plot [3]\n")
if resp == "1":
print("\nPlease enter a window size for your plot. Default value is 4!")
print("Larger windows will result in smoother plots at loss of sensitivity\n")
subprocess.call("plotcon -sprotein1 -sequences {0}_aligned.fa -graph svg -goutfile {0}_conservation_plot".format(details["taxon"]), shell=True)
subprocess.call("plotcon -sprotein1 -sequences {0}_aligned.fa -graph x11 ".format(details["taxon"]), shell=True)
elif resp == "2":
print("\nPlease enter a window size for your plot. Default value is 4!")
print("Larger windows will result in smoother plots at loss of sensitivity\n")
subprocess.call("plotcon -sprotein1 -sequences {0}_aligned.fa -graph x11 ".format(details["taxon"]), shell=True)
elif resp == "3":
print("\nPlease enter a window size for your plot. Default value is 4!")
print("Larger windows will result in smoother plots at loss of sensitivity\n")
subprocess.call("plotcon -sprotein1 -sequences {0}_aligned.fa -graph svg -goutfile {0}_conservation_plot".format(details["taxon"]), shell=True)
else:
print ("Sorry, that was an invalid command. Please try again!")
Download_plot()
Download_plot()
###PROSITE
print("\n\nNow let's scan your sequences with motifs from the PROSITE database!")
if not os.path.exists("individual_sequences"):
os.mkdir("individual_sequences")
#patmatmotifs only works on individual sequences
#need to split query fasta files (containing multiple sequences)
#function used to create a list of individual files for each sequence in the query fasta file
fasta_file = open("{0}_esearch.fa".format(details["taxon"]), "r")
content = fasta_file.read()
#every accession IDs is separated by a > at the start of the line
all_sequences = content.split('>')
fasta_file.close()
for seq in all_sequences[1:]:
#need to retrieve the accession value to name the new file with the single sequence accordingly
acc = seq.split()[0]
#some fasta files contain sequences that start with sp or tr before the accession values
if acc.startswith('sp') or acc.startswith('tr'):
acc = acc.split('|')[1]
new_file = open('individual_sequences/' +acc + '.fa', 'w')
new_file.write('>' + seq)
new_file.close()
#PROSITE Analysis
os.listdir(path='.')
#function to get all the names and location of the files that need to be sent to patmatmotifs
def Getlistoffiles(dirName):
# create a list of file and sub directories names in the given directory
listOfFile = os.listdir(dirName)
allFiles = list()
# Iterate over all the entries
for entry in listOfFile:
# Create full path
fullPath = os.path.join(dirName, entry)
# If entry is a directory then get the list of files in this directory
if os.path.isdir(fullPath):
allFiles = allFiles + Getlistoffiles(fullPath)
else:
allFiles.append(fullPath)
return allFiles
filelist = Getlistoffiles("individual_sequences/")
if not os.path.exists("PROSITE_analysis"):
os.mkdir("PROSITE_analysis")
#do PROSITE analysis for each individual file/ sequence
count=0
for file in filelist:
count +=1
subprocess.call("patmatmotifs -sprotein1 -sequence {0} -outfile PROSITE_analysis/{1}.txt -auto Y ".format(file, count), shell=True)
#Creating summary file from PROSITE analysis
os.chdir("PROSITE_analysis/")
read_files = glob.glob("*.txt")
with open("all_prosite.txt", "wb") as outfile:
for f in read_files:
with open(f, "rb") as infile:
outfile.write(infile.read())
#now lets look for elements in the summary file
# want to find sequences that have motifs
all = open("all_prosite.txt").read()
print("\n\nIn this dataset:")
nmotifs = re.findall(r'Motif = .+', all)
lenmotifs=len(nmotifs)
print("A motif has been identified: ", lenmotifs, "times")
#list of all different types of motifs found
dmotifs=set(nmotifs)
print("The different motifs present in the sequences are:", dmotifs)
#list of the number of times each motif has been found over all the sequences
for i in dmotifs:
n=nmotifs.count(i)
print("\nThere are", n, "occurences of", i, "in all sequences")
##WILDCARD
os.chdir("..")
print("\n\nThis is the final (and optional) stage of your analysis!")
#allow user to decide whether to carry out this optional analysis - distmat from EMBOSS
def Wild_card():
resp = input("\nDo You Wish To Calculate The Evolutionary Distance Between Every Pair Of Sequences For Your Dataset?\n [y/n]")
if resp == "y":
subprocess.call("distmat -sequence {0}_aligned.fa -protmethod 1 -outfile {0}.distmat".format(details["taxon"]), shell=True)
print("Distance matrix saved as distmat.")
elif resp == "n":
print ("Skipping step...")
else:
print ("Sorry, that was an invalid command. Please try again!")
Wild_card()
Wild_card()
print("\nAnalysis finished. Enjoy your day!\n\n")
#creating a summary file from the PROSITE analsyis
#code doesn't work if it is put earlier on...
sys.stdout = open("PROSITE_summary.txt", "w")
print("From the input dataset:")
print("A motif has been identified: ", lenmotifs, "times")
print("The different motifs present in the sequences are:", dmotifs)
print("There are", n, "occurences of", i, "in all sequences")
sys.stdout.close()