-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStatistics_Methylation_Oryza_DeepSignalPlant.py.py
317 lines (251 loc) · 11.4 KB
/
Statistics_Methylation_Oryza_DeepSignalPlant.py.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
###-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
# Autor: Fadwa EL KHADDAR
# Lab : DIADE - IRD
# University : Montpellier - France
import os
import subprocess
import numpy as np
import sys
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import argparse
import re
from tkinter import *
import tkinter as tk
import glob
import pandas as pd
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')
# Arguments
parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
parser.add_argument('-fasta', type=str, help="Path to fasta file" )
parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
parser.add_argument('-c', type=int, help='Chunk size')
parser.add_argument('-o', type=str, help='Output directory ')
args = parser.parse_args()
path = args.i
fasta = args.fasta
seuil = args.t
chunk_size = args.c
output_dir = args.o
total_motif_counts_CpG = {}
total_motif_counts_CHG = {}
total_motif_counts_CHH = {}
total_CpG_occurrences = 0
total_CHG_occurrences = 0
total_CHH_occurrences = 0
all_rows_with_CpG = pd.DataFrame(columns=["chromosome","strand","position"])
all_rows_with_CHG = pd.DataFrame(columns=["chromosome","strand","position"])
all_rows_with_CHH = pd.DataFrame(columns=["chromosome","strand","position"])
# Function definition
def CpG_context():
global CpG
global nb_CpG
global treated_df
global total_motif_counts_CpG
global total_CpG_occurrences # Ajout de la variable globale pour le total des occurrences de CpG
global all_rows_with_CpG
motif_counts = {}
for motif in CpG:
motif_counts[motif] = 0
rows_with_CpG = []
for i, row in treated_df.iterrows():
kmer = row['k_mer']
first_two_letters = kmer[:2]
if "CG" in first_two_letters:
motif_count = first_two_letters.count("CG")
motif_counts["CG"] += motif_count
if motif_count > 0:
rows_with_CpG.append(row)
if rows_with_CpG:
all_rows_with_CpG = pd.concat([all_rows_with_CpG, pd.DataFrame(rows_with_CpG)])
# Mettre à jour le dictionnaire total_motif_counts_CpG avec les totaux de ce chunk
for motif, count in motif_counts.items():
if motif in total_motif_counts_CpG:
total_motif_counts_CpG[motif] += count
else:
total_motif_counts_CpG[motif] = count
total_CpG_occurrences += sum(motif_counts.values()) # Mettre à jour le total des occurrences de CpG
pourcentage_CpG = (total_CpG_occurrences / nb_CpG) * 100
pourcentage_C = (total_CpG_occurrences / nb_C) * 100
return total_CpG_occurrences
def CHG_context():
global CHG
global nb_C
global treated_df
global total_motif_counts_CHG
global total_CHG_occurrences
global all_rows_with_CHG
motif_counts = {} # Créer un dictionnaire pour stocker les occurrences de chaque motif
# Initialiser le dictionnaire de motifs
for motif in CHG:
motif_counts[motif] = 0
rows_with_CHG = []
for i, row in treated_df.iterrows():
kmer = row['k_mer']
for motif in CHG:
motif_count = kmer.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0:
rows_with_CHG.append(row)
if rows_with_CHG:
all_rows_with_CHG = pd.concat([all_rows_with_CHG, pd.DataFrame(rows_with_CHG)])
# Mettre à jour le dictionnaire total_motif_counts_CHG avec les totaux de ce chunk
for motif, count in motif_counts.items():
if motif in total_motif_counts_CHG:
total_motif_counts_CHG[motif] += count
else:
total_motif_counts_CHG[motif] = count
total_CHG_occurrences += sum(motif_counts.values())
return total_motif_counts_CHG
def CHH_context():
global CHH
global nb_C
global treated_df
global total_motif_counts_CHH
global total_CHH_occurrences
global all_rows_with_CHH
motif_counts = {} # Créer un dictionnaire pour stocker les occurrences de chaque motif
# Initialiser le dictionnaire de motifs
for motif in CHH:
motif_counts[motif] = 0
rows_with_CHH = []
for i, row in treated_df.iterrows():
kmer = row['k_mer']
for motif in CHH:
motif_count = kmer.count(motif)
motif_counts[motif] += motif_count
if motif_count > 0:
rows_with_CHH.append(row)
if rows_with_CHH:
all_rows_with_CHH = pd.concat([all_rows_with_CHH, pd.DataFrame(rows_with_CHH)])
# Mettre à jour le dictionnaire total_motif_counts_CHG avec les totaux de ce chunk
for motif, count in motif_counts.items():
if motif in total_motif_counts_CHH:
total_motif_counts_CHH[motif] += count
else:
total_motif_counts_CHH[motif] = count
total_CHH_occurrences += sum(motif_counts.values())
return total_motif_counts_CHH
def afficher_totaux():
global total_motif_counts_CHG
global total_motif_counts_CHH
global total_CHG_occurrences
global total_CHH_occurrences
global total_motif_counts_CpG
print(" --- CpG STATISTICS --- ")
print()
print(f" The number of occurrences of CpG methylation : {total_CpG_occurrences}, which includes : ")
for motif, total_count in total_motif_counts_CpG.items():
pourcentage_C_motif = (total_count / nb_CpG) * 100
print(f" -> {motif} appears {total_count} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif:.2f}%")
print()
print(" --- CHG STATISTICS --- ")
print()
print(f" The number of occurrences of CHG methylation : {total_CHG_occurrences}, which includes : ")
for motif, total_count in total_motif_counts_CHG.items():
pourcentage_C_motif = (total_count / nb_C) * 100
print(f" -> {motif} appears {total_count} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
print(" --- CHH STATISTICS --- ")
print()
print(f" The number of occurrences of CHH methylation : {total_CHH_occurrences}, which includes : ")
for motif, total_count in total_motif_counts_CHH.items():
pourcentage_C_motif = (total_count / nb_C) * 100
print(f" -> {motif} appears {total_count} times")
print(f" The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
print()
def plotting(total_motif_counts_CHH, total_motif_counts_CHG, total_CpG_occurrences):
global nb_C
CpG = total_CpG_occurrences
CHG = total_motif_counts_CHG
CHH = total_motif_counts_CHH
global filebasename
fig, ax = plt.subplots(figsize=(16, 9), subplot_kw=dict(aspect="equal"))
nb_unmethylated = nb_C - CpG - sum(CHG.values()) - sum(CHH.values())
counts = [CpG, sum(CHG.values()), sum(CHH.values()), nb_unmethylated]
labels = ["CpG", "CHG", "CHH", "unmethylated"]
wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
bbox_props = dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")
for i, p in enumerate(wedges):
ang = (p.theta2 - p.theta1) / 2. + p.theta1
y = np.sin(np.deg2rad(ang))
x = np.cos(np.deg2rad(ang))
horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
connectionstyle = "angle,angleA=0,angleB={}".format(ang)
kw["arrowprops"].update({"connectionstyle": connectionstyle})
ax.annotate(f"{labels[i]}: {counts[i]}", xy=(x, y), xytext=(1.35 * np.sign(x), 1.4 * y),
horizontalalignment=horizontalalignment, **kw)
# Ajouter une légende en bas à droite
legend_labels = [f"{label}: {count} ({count/sum(counts)*100:.1f}%)"
for label, count in zip(labels, counts)]
plt.legend(wedges, legend_labels, title="Motifs", bbox_to_anchor=(1, 0.2), loc="upper left")
plt.title(f"Methylation context among all cytosines in the sample: {filebasename}")
root = tk.Tk()
root.title("Methylation Statistics")
# Create a canvas to display the figure
canvas = FigureCanvasTkAgg(fig, master=root)
canvas.draw()
canvas.get_tk_widget().pack()
# Create a button to close the window
button = tk.Button(master=root, text="Fermer", command=root.quit)
button.pack()
# Start the tkinter event loop
tk.mainloop()
print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####")
# list of the three methylation contexts
CpG = ["CG"]
CHG = ["CCG", "CAG", "CTG"]
CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]
# Running seqtk
result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True, text=True)
nb_C = int(result.stdout.strip())
print(f"Le nombre total de cytosine : {nb_C}")
result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True, text=True)
nb_CpG = int(result.stdout.strip())
print(f"Le nombre total de CpG : {nb_CpG}")
# read files
chunk_coverage_list =[]
files = path + "/*.tsv" # store the path to files
print(
f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")
for filename in glob.glob(files): # to select all files present in that path
filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
print()
print(f" Statistics for the file : {filebasename} ")
print()
names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"] #Ajout des entêtes des colonnes
call_mods = pd.read_csv(filename, sep='\t', names=names, chunksize=chunk_size) # read files
chunk_number = 1
for chunk in call_mods:
# DataFrame Creation
df = pd.DataFrame(chunk)
chunk_coverage = df["coverage"].tolist()
chunk_coverage_list.extend(chunk_coverage)
print()
methyl= df[df["count_modified"] >= seuil]
print(f"Processing chunk {chunk_number}")
methylated=pd.DataFrame(methyl) # save only the methylated reads.
#methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
treated_df = pd.DataFrame(treated_df)
treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
CpG_context()
CHG_context()
CHH_context()
chunk_number += 1
mean_coverage= sum(chunk_coverage_list) / len(chunk_coverage_list)
couverture_int =int(mean_coverage)
print()
print(f"The mean coverage of the file is {couverture_int}")
print()
afficher_totaux()
all_rows_with_CpG.to_csv(f"{output_dir}/{filebasename}_CpG.csv", index=False)
all_rows_with_CHG.to_csv(f"{output_dir}/{filebasename}_CHG.csv", index=False)
all_rows_with_CHH.to_csv(f"{output_dir}/{filebasename}_CHH.csv", index=False)
plotting(total_motif_counts_CHH, total_motif_counts_CHG, total_CpG_occurrences)