Statistics_Methylation_Oryza_DeepSignalPlant.py.py

###-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant --------####
# Autor: Fadwa EL KHADDAR
# Lab : DIADE - IRD
# University : Montpellier - France

import os
import subprocess
import numpy as np
import sys
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import argparse
import re
from tkinter import *
import tkinter as tk
import glob
import pandas as pd
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
matplotlib.use('TkAgg')
# Arguments

parser = argparse.ArgumentParser(description=" Script to generate methylation statistics ")
parser.add_argument('-fasta', type=str, help="Path to fasta file" )
parser.add_argument('-i', type=str, help='Path to the directory containing the .tsv files ')
parser.add_argument('-t', type=int, help='Threshold of mapped methylated reads ')
parser.add_argument('-c', type=int, help='Chunk size')
parser.add_argument('-o', type=str, help='Output directory ')

args = parser.parse_args()

path = args.i
fasta = args.fasta
seuil = args.t
chunk_size = args.c
output_dir = args.o

total_motif_counts_CpG = {}
total_motif_counts_CHG = {}
total_motif_counts_CHH = {}
total_CpG_occurrences = 0
total_CHG_occurrences = 0
total_CHH_occurrences = 0


all_rows_with_CpG = pd.DataFrame(columns=["chromosome","strand","position"])
all_rows_with_CHG = pd.DataFrame(columns=["chromosome","strand","position"])
all_rows_with_CHH = pd.DataFrame(columns=["chromosome","strand","position"])


# Function definition

def CpG_context():
    global CpG
    global nb_CpG
    global treated_df
    global total_motif_counts_CpG
    global total_CpG_occurrences  # Ajout de la variable globale pour le total des occurrences de CpG
    global all_rows_with_CpG
    
    motif_counts = {}
    for motif in CpG:
        motif_counts[motif] = 0

    rows_with_CpG = []
    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
        first_two_letters = kmer[:2]
        if "CG" in first_two_letters:
            motif_count = first_two_letters.count("CG")
            motif_counts["CG"] += motif_count
            if motif_count > 0:
               rows_with_CpG.append(row)
    if rows_with_CpG:
       all_rows_with_CpG = pd.concat([all_rows_with_CpG, pd.DataFrame(rows_with_CpG)])
    # Mettre à jour le dictionnaire total_motif_counts_CpG avec les totaux de ce chunk
    for motif, count in motif_counts.items():
        if motif in total_motif_counts_CpG:
            total_motif_counts_CpG[motif] += count
        else:
            total_motif_counts_CpG[motif] = count

    total_CpG_occurrences += sum(motif_counts.values())  # Mettre à jour le total des occurrences de CpG
    pourcentage_CpG = (total_CpG_occurrences / nb_CpG) * 100
    pourcentage_C = (total_CpG_occurrences / nb_C) * 100

    return total_CpG_occurrences


def CHG_context():
    global CHG
    global nb_C
    global treated_df
    global total_motif_counts_CHG
    global total_CHG_occurrences
    global all_rows_with_CHG
    motif_counts = {}  # Créer un dictionnaire pour stocker les occurrences de chaque motif

    # Initialiser le dictionnaire de motifs
    for motif in CHG:
        motif_counts[motif] = 0
    
    rows_with_CHG = []
    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
        for motif in CHG:
            motif_count = kmer.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0:
               rows_with_CHG.append(row)
    if rows_with_CHG:
       all_rows_with_CHG = pd.concat([all_rows_with_CHG, pd.DataFrame(rows_with_CHG)])

    # Mettre à jour le dictionnaire total_motif_counts_CHG avec les totaux de ce chunk
    for motif, count in motif_counts.items():
        if motif in total_motif_counts_CHG:
            total_motif_counts_CHG[motif] += count
        else:
            total_motif_counts_CHG[motif] = count
    total_CHG_occurrences += sum(motif_counts.values())
    return total_motif_counts_CHG

def CHH_context():
    global CHH
    global nb_C
    global treated_df
    global total_motif_counts_CHH
    global total_CHH_occurrences
    global all_rows_with_CHH
    
    motif_counts = {}  # Créer un dictionnaire pour stocker les occurrences de chaque motif

    # Initialiser le dictionnaire de motifs
    for motif in CHH:
        motif_counts[motif] = 0

    rows_with_CHH = []
    for i, row in treated_df.iterrows():
        kmer = row['k_mer']
        for motif in CHH:
            motif_count = kmer.count(motif)
            motif_counts[motif] += motif_count
            if motif_count > 0:
               rows_with_CHH.append(row)
    if rows_with_CHH:
       all_rows_with_CHH = pd.concat([all_rows_with_CHH, pd.DataFrame(rows_with_CHH)])


    # Mettre à jour le dictionnaire total_motif_counts_CHG avec les totaux de ce chunk
    for motif, count in motif_counts.items():
        if motif in total_motif_counts_CHH:
            total_motif_counts_CHH[motif] += count
        else:
            total_motif_counts_CHH[motif] = count
    total_CHH_occurrences += sum(motif_counts.values())

    return total_motif_counts_CHH


def afficher_totaux():
    global total_motif_counts_CHG
    global total_motif_counts_CHH
    global total_CHG_occurrences
    global total_CHH_occurrences
    global total_motif_counts_CpG
    
  
    print(" --- CpG STATISTICS --- ")
    print()
    print(f" The number of occurrences of CpG methylation : {total_CpG_occurrences}, which includes : ")
    for motif, total_count in total_motif_counts_CpG.items():
        pourcentage_C_motif = (total_count / nb_CpG) * 100
        print(f"    -> {motif} appears {total_count} times")
        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif:.2f}%")
        print()

    print(" --- CHG STATISTICS --- ")
    print()
    print(f" The number of occurrences of CHG methylation : {total_CHG_occurrences}, which includes : ")
    for motif, total_count in total_motif_counts_CHG.items():
        pourcentage_C_motif = (total_count / nb_C) * 100
        print(f"    -> {motif} appears {total_count} times")
        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()

    print(" --- CHH STATISTICS --- ")
    print()
    print(f" The number of occurrences of CHH methylation : {total_CHH_occurrences}, which includes : ")
    for motif, total_count in total_motif_counts_CHH.items():
        pourcentage_C_motif = (total_count / nb_C) * 100
        print(f"    -> {motif} appears {total_count} times")
        print(f"       The pourcentage of methylated cytosines (type {motif}) is : {pourcentage_C_motif: .2f}%")
        print()


def plotting(total_motif_counts_CHH, total_motif_counts_CHG, total_CpG_occurrences):
    global nb_C
    CpG = total_CpG_occurrences
    CHG = total_motif_counts_CHG
    CHH = total_motif_counts_CHH
    global filebasename

    fig, ax = plt.subplots(figsize=(16, 9), subplot_kw=dict(aspect="equal"))
    nb_unmethylated = nb_C - CpG - sum(CHG.values()) - sum(CHH.values())
    counts = [CpG, sum(CHG.values()), sum(CHH.values()), nb_unmethylated]

    labels = ["CpG", "CHG", "CHH", "unmethylated"]

    wedges, texts, autotexts = ax.pie(counts, autopct='%1.1f%%')
    bbox_props = dict(boxstyle="square, pad=0.3", fc="w", ec="k", lw=0.72)
    kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")

    for i, p in enumerate(wedges):
        ang = (p.theta2 - p.theta1) / 2. + p.theta1
        y = np.sin(np.deg2rad(ang))
        x = np.cos(np.deg2rad(ang))
        horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
        connectionstyle = "angle,angleA=0,angleB={}".format(ang)
        kw["arrowprops"].update({"connectionstyle": connectionstyle})
        ax.annotate(f"{labels[i]}: {counts[i]}", xy=(x, y), xytext=(1.35 * np.sign(x), 1.4 * y),
                    horizontalalignment=horizontalalignment, **kw)

    # Ajouter une légende en bas à droite
    legend_labels = [f"{label}: {count} ({count/sum(counts)*100:.1f}%)"
                     for label, count in zip(labels, counts)]
    plt.legend(wedges, legend_labels, title="Motifs", bbox_to_anchor=(1, 0.2), loc="upper left")

    plt.title(f"Methylation context among all cytosines in the sample: {filebasename}")

    root = tk.Tk()
    root.title("Methylation Statistics")

    # Create a canvas to display the figure
    canvas = FigureCanvasTkAgg(fig, master=root)
    canvas.draw()
    canvas.get_tk_widget().pack()

    # Create a button to close the window
    button = tk.Button(master=root, text="Fermer", command=root.quit)
    button.pack()

    # Start the tkinter event loop
    tk.mainloop()

print("####-------- CALCULATION OF METHYLATION STATISTICS FROM TSV FILE GENERATED BY DeepSignal-Plant  --------####")

# list of the three methylation contexts

CpG = ["CG"]
CHG = ["CCG", "CAG", "CTG"]
CHH = ["CCC", "CAA", "CTT", "CCA", "CCT", "CAT", "CAC", "CTA", "CTC"]


# Running seqtk
result = subprocess.run(f"seqtk comp {fasta} | awk '{{C+=$4}}END{{print C}}'", shell=True, capture_output=True, text=True)
nb_C = int(result.stdout.strip())
print(f"Le nombre total de cytosine : {nb_C}")

result = subprocess.run(f"seqtk comp {fasta} | awk '{{CpG+=$10}}END{{print CpG}}'", shell=True, capture_output=True, text=True)
nb_CpG = int(result.stdout.strip())
print(f"Le nombre total de CpG : {nb_CpG}")


# read files

chunk_coverage_list =[]

files = path + "/*.tsv"      # store the path to files
print(
    f" %%% -- The following statistics are performed on a dataset whose number of reads \n in which the targeted base is considered modified is greater than or equal to {seuil} -- %%%")

for filename in glob.glob(files): # to select all files present in that path
    filebasename = os.path.splitext(os.path.basename(filename))[0] # store the name files
    print()
    print(f" Statistics for the file : {filebasename} ")
    print()
    names = ["chromosome", "position","strand", "pos_in_strand", "prob_0_sum", "prob_1_sum", "count_modified", "count_unmodified", "coverage", "modification_frequency", "k_mer"]    #Ajout des entêtes des colonnes
    call_mods = pd.read_csv(filename, sep='\t', names=names, chunksize=chunk_size) # read files
    chunk_number = 1
    for chunk in call_mods:
        # DataFrame Creation
        df = pd.DataFrame(chunk)

        chunk_coverage = df["coverage"].tolist()
        chunk_coverage_list.extend(chunk_coverage)
	
        print()
        methyl= df[df["count_modified"] >= seuil]
        print(f"Processing chunk {chunk_number}")
        methylated=pd.DataFrame(methyl) # save only the methylated reads.

        #methylated.to_csv(f"/home/fadwa/Téléchargements/test/methylated_seuil.tsv", sep ="\t")
        treated_df = methylated[['chromosome', 'strand', 'position', 'k_mer']] #filter dataset
        treated_df = pd.DataFrame(treated_df)
        treated_df['k_mer'] = treated_df['k_mer'].str[2:] # filter dataset "treated_df" which contains only from the 3th element from k_mer
        CpG_context()
        CHG_context()
        CHH_context()

        chunk_number += 1

mean_coverage= sum(chunk_coverage_list) / len(chunk_coverage_list)
couverture_int =int(mean_coverage)
print()
print(f"The mean coverage of the file is {couverture_int}")
print()


afficher_totaux()
all_rows_with_CpG.to_csv(f"{output_dir}/{filebasename}_CpG.csv", index=False)
all_rows_with_CHG.to_csv(f"{output_dir}/{filebasename}_CHG.csv", index=False)
all_rows_with_CHH.to_csv(f"{output_dir}/{filebasename}_CHH.csv", index=False)
plotting(total_motif_counts_CHH, total_motif_counts_CHG, total_CpG_occurrences)