-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathMafAnnotator.py
140 lines (123 loc) · 6.65 KB
/
MafAnnotator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/python
import sys
import argparse
import logging
from AnnotatorCore import setsampleidsfileterfile
from AnnotatorCore import setcancerhotspotsbaseurl
from AnnotatorCore import setoncokbbaseurl
from AnnotatorCore import setoncokbapitoken
from AnnotatorCore import readCancerTypes
from AnnotatorCore import validate_oncokb_token
from AnnotatorCore import processalterationevents
from AnnotatorCore import QueryType
from AnnotatorCore import ReferenceGenome
# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named 'MafAnnotator'; main() uses it for help, progress and error output.
log = logging.getLogger('MafAnnotator')
# Usage/help text that main() logs when the -h flag is set.
_HELP_TEXT = (
    '\n'
    'MafAnnotator.py -i <input MAF file> -o <output MAF file> [-p previous results] [-c <input clinical file>] '
    '[-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb api bearer token] [-a] '
    '[-v cancer hotspots base url] [-q query type] [-r default reference genome] [-d include descriptions]\n'
    'For definitions of the MAF format, please see https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/\n\n'
    'Essential MAF columns for querying HGVSp_Short and HGVSp(case insensitive):\n'
    ' Hugo_Symbol: Hugo gene symbol\n'
    ' Tumor_Sample_Barcode: sample ID\n'
    ' HGVSp(query type: HGVSp): protein change in HGVSp format\n'
    ' HGVSp_Short(query type: HGVSp_Short): protein change in HGVSp format using 1-letter amino-acid codes\n'
    'Essential MAF columns for querying HGVSg(case insensitive):\n'
    ' Tumor_Sample_Barcode: sample ID\n'
    ' HGVSg: Genomic change in HGVSg format\n'
    'Essential MAF columns for querying genomic change(case insensitive):\n'
    ' Tumor_Sample_Barcode: sample ID\n'
    ' Chromosome: Chromosome number\n'
    ' Start_Position: Mutation start coordinate\n'
    ' End_Position: Mutation end coordinate\n'
    ' Reference_Allele: The plus strand reference allele at this position\n'
    ' Tumor_Seq_Allele1: Primary data genotype for tumor sequencing (discovery) allele\n'
    ' Tumor_Seq_Allele2: Tumor sequencing (discovery) allele 2\n'
    'Essential clinical columns:\n'
    ' SAMPLE_ID: sample ID\n'
    ' ONCOTREE_CODE: tumor type code from oncotree (http://oncotree.mskcc.org)\n'
    'Cancer type will be assigned based on the following priority:\n'
    ' 1) ONCOTREE_CODE in clinical data file\n'
    ' 2) ONCOTREE_CODE exist in MAF\n'
    ' 3) default tumor type (-t)\n'
    'Query type only allows the following values (case-insensitive):\n'
    ' - HGVSp_Short\n'
    ' It reads from column HGVSp_Short or Alteration\n'
    ' - HGVSp\n'
    ' It reads from column HGVSp or Alteration\n'
    ' - HGVSg\n'
    ' It reads from column HGVSg or Alteration\n'
    ' - Genomic_Change\n'
    ' It reads from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1 and Tumor_Seq_Allele2 \n'
    'Reference Genome only allows the following values(case-insensitive):\n'
    ' - GRCh37\n'
    ' - GRCh38\n'
    'Default OncoKB base url is https://www.oncokb.org.\n'
)


def _resolve_enum_member(enum_cls, raw_value, error_message):
    """Return the member of ``enum_cls`` whose name is ``raw_value`` upper-cased.

    Returns None when ``raw_value`` is None. When no member has that name,
    ``error_message`` is logged and the KeyError is re-raised.
    """
    if raw_value is None:
        return None
    try:
        return enum_cls[raw_value.upper()]
    except KeyError:
        log.error(error_message)
        raise


def main(argv):
    """Annotate a MAF file using the options carried by ``argv``.

    ``argv`` is the parsed command-line namespace. The attributes read here
    are: help, input_file, output_file, oncokb_api_bearer_token,
    sample_ids_filter, cancer_hotspots_base_url, oncokb_api_url,
    input_clinical_file, query_type, default_reference_genome,
    previous_result_file, default_cancer_type, annotate_hotspots and
    include_descriptions.

    Exits via ``sys.exit()`` after logging the help text when ``argv.help``
    is set, and via ``sys.exit(2)`` when -i, -o or -b is an empty string.

    Raises:
        KeyError: if ``argv.query_type`` or ``argv.default_reference_genome``
            does not name a member of QueryType / ReferenceGenome
            (compared after upper-casing).
    """
    if argv.help:
        log.info(_HELP_TEXT)
        sys.exit()

    # Collect the flag of every mandatory option that was left empty.
    mandatory_options = (
        ('-i', argv.input_file),
        ('-o', argv.output_file),
        ('-b', argv.oncokb_api_bearer_token),
    )
    missing_flags = [flag for flag, value in mandatory_options if value == '']
    if missing_flags:
        log.error('The parameter(s) %s can not be empty', ', '.join(missing_flags))
        log.info('For help: python MafAnnotator.py -h')
        sys.exit(2)

    # Optional overrides are only forwarded when the user supplied a value.
    if argv.sample_ids_filter:
        setsampleidsfileterfile(argv.sample_ids_filter)
    if argv.cancer_hotspots_base_url:
        setcancerhotspotsbaseurl(argv.cancer_hotspots_base_url)
    if argv.oncokb_api_url:
        setoncokbbaseurl(argv.oncokb_api_url)
    setoncokbapitoken(argv.oncokb_api_bearer_token)

    cancertypemap = {}
    if argv.input_clinical_file:
        # readCancerTypes receives the dict and is expected to fill it in place.
        readCancerTypes(argv.input_clinical_file, cancertypemap)

    log.info('annotating %s ...', argv.input_file)

    user_input_query_type = _resolve_enum_member(
        QueryType,
        argv.query_type,
        'Query type is not acceptable. Only the following allows(case insensitive): HGVSp_Short, HGVSp, HGVSg, Genomic_Change',
    )
    default_reference_genome = _resolve_enum_member(
        ReferenceGenome,
        argv.default_reference_genome,
        'Reference genome is not acceptable. Only the following allows(case insensitive): GRCh37, GRCh38',
    )

    validate_oncokb_token()

    processalterationevents(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
                            cancertypemap, argv.annotate_hotspots, user_input_query_type, default_reference_genome,
                            argv.include_descriptions)

    log.info('done!')
if __name__ == "__main__":
    # Script entry point: declare the command-line options from a table,
    # parse sys.argv and hand the resulting namespace to main().
    # argparse's built-in help is disabled because -h is handled by main().
    text_option = {'default': '', 'type': str}
    optional_text_option = {'default': None, 'type': str}
    switch_option = {'action': "store_true", 'default': False}

    # (flag, destination attribute, keyword settings) in registration order.
    option_table = (
        ('-h', 'help', switch_option),
        ('-i', 'input_file', text_option),
        ('-o', 'output_file', text_option),
        ('-p', 'previous_result_file', text_option),
        ('-c', 'input_clinical_file', text_option),
        ('-s', 'sample_ids_filter', text_option),
        ('-t', 'default_cancer_type', text_option),
        ('-u', 'oncokb_api_url', text_option),
        ('-a', 'annotate_hotspots', switch_option),
        ('-v', 'cancer_hotspots_base_url', text_option),
        ('-b', 'oncokb_api_bearer_token', text_option),
        ('-q', 'query_type', optional_text_option),
        ('-r', 'default_reference_genome', optional_text_option),
        ('-d', 'include_descriptions', switch_option),
    )

    parser = argparse.ArgumentParser(add_help=False)
    for option_flag, option_dest, option_settings in option_table:
        parser.add_argument(option_flag, dest=option_dest, **option_settings)
    parser.set_defaults(func=main)

    args = parser.parse_args()
    args.func(args)