-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathdump_model_scores.py
71 lines (61 loc) · 2.62 KB
/
dump_model_scores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Imports and embedded-R start-up for the score-dumping script.
# NOTE(review): the statements interleaved with the imports below appear to be
# order-sensitive; do not regroup or sort this block without checking rpy2's
# documentation.
import os
import re
import warnings
from argparse import ArgumentParser
# Ignore FutureWarning messages attributed to rpy2's pandas converter module.
# NOTE(review): installed ahead of the rpy2 imports, presumably so the filter
# is already in place when that module is first loaded -- confirm.
warnings.filterwarnings('ignore', category=FutureWarning,
module='rpy2.robjects.pandas2ri')
import numpy as np
import pandas as pd
import rpy2.rinterface_lib.embedded as r_embedded
# Start-up options handed to the embedded R instance (R command-line style
# flags; --max-ppsize is presumably R's pointer-protection stack limit --
# confirm against the R start-up documentation).
# NOTE(review): this call is placed before `import rpy2.robjects`, presumably
# because init options only take effect if set before R is initialized --
# confirm against the rpy2 documentation before moving it.
r_embedded.set_initoptions(
('rpy2', '--quiet', '--no-save', '--max-ppsize=500000'))
import rpy2.robjects as robjects
from joblib import dump, load
from rpy2.robjects import numpy2ri, pandas2ri
from rpy2.robjects.packages import importr
# Activate rpy2's NumPy and pandas converters.
# NOTE(review): presumably this is what allows a pandas DataFrame to be passed
# straight to R's saveRDS later in this script -- confirm.
numpy2ri.activate()
pandas2ri.activate()
# Command-line interface: a single option naming the directory tree that is
# scanned for split-results files and that also receives the output files.
parser = ArgumentParser()
parser.add_argument('--results-dir', type=str, default='results/models',
help='results dir')
args = parser.parse_args()
# Analysis type -> key looked up under split_result['scores']['te'].
metric = {'surv': 'score', 'resp': 'roc_auc'}
# Model codes whose score tables are written under the 'surv' output
# directory; every other model code is written under 'resp'.
surv_model_codes = ['cnet', 'cox_clinical']
# Handle to R's 'base' package; its saveRDS is used to write the .rds output.
r_base = importr('base')
# model_code -> DataFrame holding one column of per-split scores per model name.
all_scores_dfs = {}
# Matches '<model_name>_split_results.pkl'; group 1 captures the model name.
split_results_regex = re.compile('^(.+?)_split_results\\.pkl$')
# Walk the results tree and, for every '<model_name>_split_results.pkl' file,
# build a one-column DataFrame of test-set scores (one row per split, NaN for
# a split whose result is None).  Columns are grouped by model code and merged
# into all_scores_dfs[model_code].
#
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only point --results-dir at result trees you trust.
score_columns = {}
for dirpath, _dirnames, filenames in sorted(os.walk(args.results_dir)):
    # os.walk reports file names in arbitrary order; sort them so the column
    # order of the output tables is reproducible across runs and filesystems.
    for filename in sorted(filenames):
        m = split_results_regex.search(filename)
        if m is None:
            continue
        model_name = m.group(1)
        split_results_file = os.path.join(dirpath, filename)
        # The model name is split on '_' into at least five fields; only the
        # third (analysis) and fifth (data type) are used by name here, and
        # everything after the fifth forms the model code.
        name_parts = model_name.split('_')
        if len(name_parts) < 5:
            raise ValueError(
                'Cannot parse model name {!r} from {}: expected at least 5 '
                "'_'-separated fields, got {}".format(
                    model_name, split_results_file, len(name_parts)))
        _, _, analysis, _, data_type, *rest = name_parts
        if data_type == 'htseq':
            # For 'htseq' the field right after the data type is not part of
            # the model code (presumably a data sub-type -- confirm).
            model_code = '_'.join(rest[1:])
        else:
            model_code = '_'.join(rest)
        print('Loading', split_results_file)
        split_results = load(split_results_file)
        scores = [
            np.nan if split_result is None
            else split_result['scores']['te'][metric[analysis]]
            for split_result in split_results
        ]
        score_columns.setdefault(model_code, []).append(
            pd.DataFrame({model_name: scores}))
# Concatenate each model code's columns in a single call instead of growing a
# DataFrame file by file, which re-copies every earlier column each time.
for model_code, columns in score_columns.items():
    if model_code in all_scores_dfs:
        columns = [all_scores_dfs[model_code], *columns]
    all_scores_dfs[model_code] = pd.concat(columns, axis=1)
# Write every model code's score table three times -- as TSV, as a joblib
# pickle and as an R .rds file -- under '<results-dir>/<analysis>/', where the
# analysis directory is chosen by whether the model code is a survival model.
for model_code, all_scores_df in all_scores_dfs.items():
    if model_code in surv_model_codes:
        analysis = 'surv'
    else:
        analysis = 'resp'
    out_dir = '{}/{}'.format(args.results_dir, analysis)
    os.makedirs(out_dir, mode=0o755, exist_ok=True)
    tsv_file = '{}/{}_model_scores.tsv'.format(out_dir, model_code)
    pkl_file = '{}/{}_model_scores.pkl'.format(out_dir, model_code)
    rds_file = '{}/{}_model_scores.rds'.format(out_dir, model_code)
    all_scores_df.to_csv(tsv_file, sep='\t')
    dump(all_scores_df, pkl_file)
    r_base.saveRDS(all_scores_df, rds_file)