-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMetModels_calc_core_exchanges.py
132 lines (95 loc) · 5.21 KB
/
MetModels_calc_core_exchanges.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script to identify the core and accessory metabolic exchanges across samples
(at the metabolite level)
outputs a summary of core vs. accessory metabolic exchanges across phenotypes
Created on 13 / Sep / 2021
Modified 22 / Sep / 2021
@author: V.R.Marcelino
"""
import pandas as pd
from argparse import ArgumentParser
import os, glob, re
parser = ArgumentParser()
parser.add_argument('-f', '--folder_w_exchange_files', help="""Path to the folder containing exchange files produced by MICOM""", required=True)
parser.add_argument('-c', '--core', help="""Definition of core - min percentage of samples where the edge should be present to be considered core.
Default = 95""", required=False, default=95)
parser.add_argument('-m', '--metadata', help="""Path to the metadata file""",required=True )
parser.add_argument('-g', '--grouping_header', help="""category to analyse core exchanges, must be a header in the metadata file (Diagnosis or HD)""",required=True)
parser.add_argument('-o', '--output_core', help="""file to store the summary of core met exchanges. Default = summary_met_exchanges_core.csv""", required=False, default="summary_met_exchanges_core.csv")
args = parser.parse_args()
in_path = args.folder_w_exchange_files
core_def = int(args.core)
metad_fp = args.metadata
grouping_header = args.grouping_header
out_file_core = args.output_core
#in_path = "2_exchanges"
#core_def = 95
#out_file_core = "summary_met_exchanges_core.csv"
#metad_fp = "metadata_rewiring_microbiome.csv"
#grouping_header = "HD"
def core_stats(in_df, core_def):
""" Returns a list with:
Total_[production|consumption], Core_[production|consumption], Proportion_core_[production|consumption]"""
n_samples = len(in_df)
# number of individuals that must have the bin_met interaction for it to be considered core:
threshold = core_def * n_samples / 100
total_n_rxn = len(in_df.columns)
print("Total number of exchanged metabolites: %i" % (total_n_rxn))
# count non-zeros for each column
count_non_zeros = in_df.astype(bool).sum(axis=0)
wanted_edges = count_non_zeros > threshold
core_X = in_df[wanted_edges.index[wanted_edges]]
number_core_rxn = len(core_X.columns)
print("Number of metabolites being exchanged in %i%% of individuals: %i" % (core_def, number_core_rxn))
# proportion:
prop_core = round(number_core_rxn * 100 / total_n_rxn, 2)
results = [total_n_rxn, number_core_rxn, prop_core]
return (results)
## read metadata and identify groups
all_files = glob.glob(os.path.join(in_path, "exchanges_grow_*.csv"))
replace_string = in_path + "/|exchanges_grow_|_cat|.csv"
names_2_path_dict = {}
for i in (all_files):
sample_id = re.sub(replace_string, '', i)
names_2_path_dict[sample_id] = i
metad_all_samples = pd.read_csv(metad_fp)
metad_all_samples['file_path'] = metad_all_samples['Sample'].map(names_2_path_dict)
metad = metad_all_samples[metad_all_samples['file_path'].notna()] # keep only metadata samples for which we have samples
wanted_categories = list(set((metad[grouping_header])))
#process data in batches corresponding to phenotype (or healthy/disease)
summary_results = pd.DataFrame(["n_samples","Total_production","Core_production","Proportion_core_prod",
"Total_consumption","Core_consumption","Proportion_core_cons"], columns=['description'])
for cat in wanted_categories:
print ("\n\nProcessing %s" %(cat))
wanted_metad_df = metad[metad[grouping_header] == cat]
wanted_files = list(wanted_metad_df['file_path'])
## merge files for each phenotype
df_from_each_file = (pd.read_csv(f, sep=',') for f in wanted_files)
df_merged = pd.concat(df_from_each_file, ignore_index=True)
## remove media:
exch_df = df_merged.loc[df_merged['taxon'] != "medium"]
## separate imports and exports in different tables
exports_df = exch_df.loc[exch_df['direction'] == "export"]
imports_df = exch_df.loc[exch_df['direction'] == "import"]
## aggregate all fluxes for each sample
exports_df_agg = exports_df.groupby(["sample_id","metabolite"]).agg({"flux": "sum"}).reset_index()
imports_df_agg = imports_df.groupby(["sample_id","metabolite"]).agg({"flux": "sum"}).reset_index()
## reshape table to have sample as rows and metabolites as columns:
exports_df_samples_as_rows = exports_df_agg.pivot(index = "sample_id", columns="metabolite", values="flux")
exports_df_samples_as_rows = exports_df_samples_as_rows.fillna(0) # replace NaNs with zeros
imports_df_samples_as_rows = imports_df_agg.pivot(index = "sample_id", columns="metabolite", values="flux")
imports_df_samples_as_rows = imports_df_samples_as_rows.fillna(0) # replace NaNs with zeros
##################################
### stats on core interactions ###
print ("\nproduction:")
results_production = core_stats(exports_df_samples_as_rows,core_def)
print ("\nconsumption:")
results_consumption = core_stats(imports_df_samples_as_rows, core_def)
wanted_info = [len(imports_df_samples_as_rows)] + results_production + results_consumption
summary_results[cat] = wanted_info
# save it to file:
summary_results.to_csv(out_file_core, index=False)
print ("\n\nSummary saved to %s" %(out_file_core))
print ("\nDone!\n")