-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStatistics_Methylation_Arabido_DeepSignalPlant.py
303 lines (253 loc) · 11.4 KB
/
Statistics_Methylation_Arabido_DeepSignalPlant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
# Author: Fadwa EL KHADDAR
# Lab : DIADE - IRD
# University : Montpellier - France
import os
import subprocess
import numpy as np
import pandas as pd
import glob
import sys
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import argparse
import re
from tkinter import *
import tkinter as tk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')
# Arguments
parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
parser.add_argument('-fasta', type=str, help="Path to fasta file" )
parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
parser.add_argument('-o', type=str, help='Output directory ')
args = parser.parse_args()
# Function definition
def CpG_context():
global CpG
global nb_C
global nb_CpG
motif_counts = {}
motif_sum = {} # Dictionnary to store count all occurences
motif_occurrences = {} # Dictionnary to store count of each occurence
positions_par_chromosome = {}
for motif in CpG:
motif_counts[motif] = 0
motif_sum[motif] = 0
motif_occurrences[motif] = []
rows= []
# Loop through k_mer column of treated dataset
for i, row in treated_df.iterrows():
kmer = row['k_mer']
first_two_letters = kmer[:2]
if "CG" in first_two_letters:
motif_count = first_two_letters.count("CG")
motif_counts["CG"] += motif_count
if motif_count > 0:
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
re.finditer("CG", first_two_letters)] # Store the positions of the motif
motif_occurrences["CG"].extend(motif_positions)
rows.append(row)
rows_CG_df = pd.DataFrame(rows)
rows_CG = rows_CG_df.sort_values("chromosome")
output_file = os.path.join(output_dir, filebasename + "_CG.csv")
rows_CG.to_csv(output_file, index=False, sep='\t')
#if not rows_CG.empty:
# for chromosome in rows_CG['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(rows_CG.loc[rows_CG['chromosome'] == chromosome, ['strand', 'position']])
total_count_CpG = sum(motif_counts.values())
pourcentage_CpG = (total_count_CpG / nb_CpG) * 100
pourcentage_C = (total_count_CpG / nb_C) * 100
print(" --- CpG STATISTICS --- ")
print()
print(f" * The number of occurrences of CpG methylation : {total_count_CpG} ")
print(
f" * The percentage of CpG methylation among all CpGs in the genome : {pourcentage_CpG : .2f} %")
print(
f" * The percentage of methylated cytosines (CpG type) among all cytosines in the genome : {pourcentage_C : .2f} %")
print()
return total_count_CpG
def CHG_context():
global CHG
global nb_C
global nb_CpG
global output_dir
global filebasename
motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
motif_sum = {} #
motif_occurrences = {} # create a dictionary to store occurrences of each motif
positions_par_chromosome = {}
for motif in CHG:
motif_counts[motif] = 0
motif_sum[motif] = 0
motif_occurrences[motif] = [] # initialize an empty list for each motif
rows = []
for i, row in treated_df.iterrows():
kmer= row['k_mer']
for motif in CHG:
motif_count = kmer.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0:
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in re.finditer(motif, kmer)] # Store the positions of the motif
motif_occurrences[motif].extend(motif_positions)
rows.append(row)
rows_CHG_df = pd.DataFrame(rows)
rows_CHG = rows_CHG_df.sort_values("chromosome")
output_file = os.path.join(output_dir, filebasename + "_CHG.csv")
rows_CHG.to_csv(output_file, index=False, sep='\t')
#if not rows_CHG.empty:
# for chromosome in rows_CHG['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(rows_CHG.loc[rows_CHG['chromosome'] == chromosome, ['strand', 'position']])
total_count_CHG = sum(motif_counts.values())
pourcentage_C_total = (total_count_CHG / nb_C) * 100
print(" --- CHG STATISTICS --- ")
print()
print(
f"The percentage of methylated cytosines (CHG type) among all cytosines in the genome: {pourcentage_C_total: .2f} %")
print()
print(f" The number of occurrences of CHG methylation : {total_count_CHG}, which includes : ")
for motif in CHG:
pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
print(f" -> {motif} apprears {motif_counts[motif]} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
return total_count_CHG
def CHH_context():
global CHH
global nb_C
global nb_CpG
global output_dir
global filebasename
motif_counts = {} # create dictionnary to store the times the motif appears in the sequence
motif_sum = {} #
motif_occurrences = {} # create a dictionary to store occurrences of each motif
positions_par_chromosome = {}
for motif in CHH:
motif_counts[motif] = 0
motif_sum[motif] = 0
motif_occurrences[motif] = [] # initialize an empty list for each motif
rows = []
for i, row in treated_df.iterrows():
kmer = row['k_mer']
for motif in CHH:
motif_count = kmer.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0:
motif_positions = [(row['chromosome'], row['strand'], pos) for pos in
re.finditer(motif, kmer)] # Store the positions of the motif
motif_occurrences[motif].extend(motif_positions)
rows.append(row)
rows_CHH_df = pd.DataFrame(rows)
rows_CHH = rows_CHH_df.sort_values("chromosome")
output_file = os.path.join(output_dir, filebasename + "_CHH.csv")
rows_CHH.to_csv(output_file, index=False, sep='\t')
#if not rows_CHH.empty:
# for chromosome in rows_CHH['chromosome'].unique():
# positions_par_chromosome[chromosome] = list(
# rows_CHH.loc[rows_CHH['chromosome'] == chromosome, ['strand', 'position']])
total_count_CHH = sum(motif_counts.values())
pourcentage_C_total = (total_count_CHH / nb_C) * 100
print(" --- CHH STATISTICS --- ")
print()
print(
f"The percentage of methylated cytosines (CHH type) among all cytosines in the genome: {pourcentage_C_total: .2f} %")
print()
print(f" The number of occurrences of CHH methylation : {total_count_CHH}, which includes : ")
for motif in CHH:
pourcentage_C_motif = (motif_counts[motif] / nb_C) * 100
print(f" -> {motif} apprears {motif_counts[motif]} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
return total_count_CHH
def plotting():
global nb_C
CpG = CpG_context()
CHG = CHG_context()
CHH = CHH_context()
fig, ax = plt.subplots(figsize=(8,6), subplot_kw=dict(aspect="equal"))
nb_unmethylated = nb_C - CpG - CHG - CHH
counts = [CpG, CHG, CHH, nb_unmethylated]
labels = ["CpG", "CHG", "CHH", "unmethylated"]
fig = plt.figure()
ax = fig.add_subplot(111)
wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
bbox_props= dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")
for i, p in enumerate(wedges):
ang = (p.theta2 - p.theta1) / 2. + p.theta1
y = np.sin(np.deg2rad(ang))
x = np.cos(np.deg2rad(ang))
horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
connectionstyle = "angle,angleA=0,angleB={}".format(ang)
kw["arrowprops"].update({"connectionstyle": connectionstyle})
ax.annotate(f"{labels[i]}: {counts[i]}", xy=(x, y), xytext=(1.35 * np.sign(x), 1.4 * y),
horizontalalignment=horizontalalignment, **kw)
#ax.pie(counts, autopct='%1.1f%%', startangle=90, center = (-2,0))
#legend = ax.legend(labels, loc = 2)
#ax.axis('equal')
plt.title(f"Methylation context amoung all cytosines in the sample : {filebasename}")
root = tk.Tk()
root.title("Graphique")
# Créer une zone pour afficher le graphique
canvas = FigureCanvasTkAgg(fig, master=root)
canvas.draw()
canvas.get_tk_widget().pack()
# Bouton pour fermer la fenêtre
button = tk.Button(master=root, text="Fermer", command=root.quit)
button.pack()
# Démarrer la boucle principale Tkinter
tk.mainloop()
print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####")
# list of the three methylation contexts
CpG = ["CG"]
CHG = ["CCG", "CAG", "CTG"]
CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]
path = args.i
fasta = args.fasta
#nb_CpG = args.cpg
seuil = args.t
output_dir = args.o
# Running seqtk
result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True, text=True)
if result.stdout.strip():
nb_C = int(result.stdout.strip())
print(f"Total number of cytosine : {nb_C}")
else:
print("ERROR")
result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True, text=True)
if result.stdout.strip():
nb_CpG = int(result.stdout.strip())
print(f"Le nombre total de CpG : {nb_CpG}")
else:
print("ERROR")
# read files
files = path + "/*.tsv" # store the path to files
for filename in glob.glob(files): # to select all files present in that path
filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
print()
print(f" Statistics for the file : {filebasename} ")
print()
names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"] #Ajout des entêtes des colonnes
call_mods = pd.read_csv(filename, sep='\t', names=names) # read files
# DataFrame Creation
df = pd.DataFrame(call_mods)
# mean coverage :
couverture= df["coverage"].mean()
couverture_int =int(couverture)
print(f" The mean coverage : {couverture_int}")
print()
# filtring dataset
print(f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
print()
methyl= df[df["count_modified"] >= seuil]
methylated=pd.DataFrame(methyl) # save only the methylated reads.
#methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
treated_df = pd.DataFrame(treated_df)
treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
#CpG_context()
#CHG_context()
#CHH_context()
plotting()