-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexe1_genome.py
121 lines (112 loc) · 3.89 KB
/
exe1_genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
class Genome:
    """
    Compute expected composition statistics for a genome sequence.

    All statistics are derived from the relative frequencies of the four
    nucleotides 'A', 'T', 'G' and 'C' in the sequence. Counting is
    case-sensitive and any other character (e.g. 'N' or lower-case letters)
    contributes to the sequence length but to none of the four counts.
    """

    # Order of the nucleotides in the tree returned by get_codon_dist().
    _NUCLEOTIDES = 'ATGC'

    # Order of the keys in the dictionary returned by get_amino_acid_dist().
    _AMINO_ACIDS = (
        'K', 'N', 'T', 'R', 'S', 'I', 'M', 'Q', 'H', 'P',
        'L', 'E', 'D', 'A', 'G', 'V', 'Y', 'C', 'W', 'F',
    )

    # Codons that do not encode an amino acid.
    _STOP_CODONS = ('TAA', 'TAG', 'TGA')

    # Standard genetic code: the 61 coding codons mapped to one-letter
    # amino acid codes (stop codons are deliberately absent).
    _CODON_TABLE = {
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    }

    def __init__(self, genome):
        """
        Initialize the Genome class with the provided genome sequence.

        :param genome: String with the genome sequence.
        """
        self.genome = genome

    def _nucleotide_fractions(self):
        """
        Return the unrounded fraction of each nucleotide in the genome.

        :return: Dictionary mapping 'A', 'T', 'G' and 'C' to their fraction
                 of the sequence length. An empty genome yields 0.0 for
                 every nucleotide instead of dividing by zero.
        """
        total = len(self.genome)
        if not total:
            return {n: 0.0 for n in self._NUCLEOTIDES}
        return {n: self.genome.count(n) / total for n in self._NUCLEOTIDES}

    def get_at_content(self):
        """
        Return the AT content of the genome sequence, i.e. the combined
        fraction of 'A' and 'T' in the entire genome sequence.

        :return: AT content (float, rounded to 6 digits); 0.0 for an empty
                 genome.
        """
        total = len(self.genome)
        if not total:
            return 0.0
        at_count = self.genome.count('A') + self.genome.count('T')
        return round(at_count / total, 6)

    def get_codon_dist(self):
        """
        Return the expected codon distribution (fractions) based on the
        distribution (fractions) of the four different nucleotides (ATGC).

        The expected frequency of a codon is the product of the fractions
        of its three nucleotides.

        :return: Tree-like structure made out of nested dictionaries. The
                 nodes represent the different nucleotides and the path down
                 the tree forms the corresponding codons. The leaves contain
                 the expected codon frequencies (rounded to 6 digits).
        """
        fractions = self._nucleotide_fractions()
        return {
            k1: {
                k2: {
                    k3: round(v1 * v2 * v3, 6)
                    for k3, v3 in fractions.items()
                }
                for k2, v2 in fractions.items()
            }
            for k1, v1 in fractions.items()
        }

    def get_amino_acid_dist(self):
        """
        Return the expected amino acid distribution (fractions) based on the
        expected distribution (fractions) of the different codons.

        Stop codons encode no amino acid, so the summed codon frequencies
        are renormalised by the total probability of the coding codons.

        :return: Dictionary that contains the expected amino acid
                 distribution. The keys are the 20 different amino acids,
                 the values are the corresponding frequencies (rounded to
                 6 digits).
        """
        fractions = self._nucleotide_fractions()

        def codon_probability(codon):
            # Work with unrounded values and round only the final result,
            # so per-codon rounding errors do not accumulate in the sums.
            return (
                fractions[codon[0]] * fractions[codon[1]] * fractions[codon[2]]
            )

        stop_probability = sum(
            codon_probability(codon) for codon in self._STOP_CODONS
        )
        # For four fractions summing to at most 1 the stop codons can never
        # take the whole probability mass, so this is always > 0.
        coding_probability = 1 - stop_probability

        acid_dist = dict.fromkeys(self._AMINO_ACIDS, 0.0)
        for codon, acid in self._CODON_TABLE.items():
            acid_dist[acid] += codon_probability(codon)

        return {
            acid: round(p / coding_probability, 6)
            for acid, p in acid_dist.items()
        }