forked from jordancheah/DNA-FASTA-Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdna3.py
105 lines (81 loc) · 3.01 KB
/
dna3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
Analyzing DNA sequences in Multi-Fasta Format using Python
A Python program that takes as input a file containing DNA sequences in multi-FASTA format, and computes
the answers to the following questions:
* How many records are in the file? A record in a FASTA file is defined as a single-line header, followed
by lines of sequence data. The header line is distinguished from the sequence data by a greater-than (">")
symbol in the first column. The word following the ">" symbol is the identifier of the sequence, and the
rest of the line is an optional description of the entry. There should be no space between the ">" and the
first letter of the identifier.
* What are the lengths of the sequences in the file? What is the longest sequence and what is the shortest
sequence? Is there more than one longest or shortest sequence? What are their identifiers?
"""
# Find ORF in Sequence
def find_orf(sequence):
# Find all ATG indexs
start_position = 1
start_indexs = []
stop_indexs = []
for i in range(1, len(sequence), 3):
if sequence[i:i+3] == "ATG":
start_indexs.append(i)
# Find all stop codon indexs
for i in range(1, len(sequence), 3):
stops =["TAA", "TAA", "TGA"]
if sequence[i:i+3] in stops:
stop_indexs.append(i)
orf = []
mark = 0
for i in range(0,len(start_indexs)):
for j in range(0, len(stop_indexs)):
if start_indexs[i] < stop_indexs[j] and start_indexs[i] > mark:
orf.append(sequence[start_indexs[i]:stop_indexs[j]+3])
mark = stop_indexs[j]+3
break
return orf
"""
Question = How many records are in the file?
Answer = 17
"""
f = open("dna3.fasta", "r")
file = f.read()
seqcount = file.count('>')
print("Number of sequence = " + str(seqcount))
"""
Question = What are the lengths of the sequences in the file?
What is the longest sequence and what is the shortest sequence?
Answer = Longest is 4773, Shortest is 37
"""
f = open("dna3.fasta", "r")
file = f.readlines()
sequences = []
seq = ""
for f in file:
if not f.startswith('>'):
f = f.replace(" ", "") # remove all spaces and newline from the text
f = f.replace("\n", "")
seq = seq + f # ... then form a long sequence
else:
sequences.append(seq)
seq = ""
sequences.append(seq)
sequences = sequences[1:] # discard the first sequence, since it is null
lengths = [len(s) for s in sequences]
print("\nMax sequence length = " + str(max(lengths)))
print("Min sequence length = " + str(min(lengths)))
print("\nSequence Length Report:")
for j in range(seqcount):
print ("Length of sequence " + str(j) + " is " + str(lengths[j]))
"""
Queston: what is the length of the longest ORF in the file?
"""
n = 1
lengths = []
for s in sequences:
orfs = find_orf(s)
print("ORF")
for j in orfs:
print(j)
print("================")
lengths.append(len(j))
print("\nLongest ORF is:" + str(max(lengths)))