-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_bed_gff_intersection.py
55 lines (47 loc) · 2.46 KB
/
feature_bed_gff_intersection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Author : Fadwa EL KHADDAR
# University : Université de Montpellier
# Lab Institut : IRD -Montpellier
import os
import subprocess
import pandas as pd
import matplotlib
import argparse
matplotlib.use('TkAgg')
# Définir les arguments
parser = argparse.ArgumentParser(description="Histogramme of methylation counts using bed files")
parser.add_argument('-gff', type=str, help="Path to GFF file")
parser.add_argument('-bed', type=str, nargs='*', help="Path to bed files. PLEASE PUT files in this order: CG.bed CHG.bed CHH.bed")
parser.add_argument('-feature', type=str, help="Feature to look for")
parser.add_argument('-o', type=str, help="Path to output folder")
args = parser.parse_args()
gff = args.gff
bed = args.bed
feature = args.feature
output = args.o
filebasename1 = os.path.splitext(os.path.basename(gff))[0]
filename_liste = []
for file_name in bed:
filebasename_bed = os.path.splitext(os.path.basename(file_name))[0]
filename_liste.append(filebasename_bed)
liste = filename_liste
print(f"You are working on these files:\n - GFF file: {filebasename1}\n - BED file: {liste}")
# Filtre du fichier d'annotation gff
command1 = subprocess.Popen(f'awk -v OFS="\t" \'$3 == "{feature}" && $3 != "mRNA" {{split($9, id, ";"); print $1, $4, $5, id[1]}}\' {gff}', shell=True, stdout=subprocess.PIPE)
output1, errors1 = command1.communicate()
count_features = output1.decode('utf-8').strip().split('\n')
# Intersection de fichier d'annotation et fichier bed
counts_dict = {}
for file in bed:
filebasename_bed = os.path.splitext(os.path.basename(file))[0]
command2 = subprocess.Popen(f"bedtools intersect -wao -a stdin -b {file}", shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
output2, errors2 = command2.communicate(input=output1)
intersection = output2.decode('utf-8').strip()
columns = ["chromosome_f1", "start_f1", "end_f1", "Gene_ID", "chromosome_f2", "start_f2", "end_f2", "methylation"]
rows = [line.split('\t') for line in intersection.split('\n') if line]
dataframe = pd.DataFrame(rows, columns=columns)
dataframe = dataframe[dataframe['methylation'].astype(int) != 0]
exported_dataframe = f"{output}/{feature}_Intersection_gff_vs_{filebasename_bed}.csv"
dataframe.to_csv(exported_dataframe, index=False)
data = pd.read_csv(exported_dataframe)
grouped = data.groupby(['chromosome_f1', 'start_f1', 'end_f1','Gene_ID']).agg({'methylation': 'sum'}).reset_index()
grouped.to_csv(f"{output}/methylations_in_{feature}_{filebasename_bed}.csv")