Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
wwysoc2 committed May 3, 2019
1 parent f1fb512 commit 2217755
Showing 1 changed file with 202 additions and 0 deletions.
202 changes: 202 additions & 0 deletions gdc-maf-tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import os
import re
import sys
import json
import csv
import requests
import hashlib
import gzip
import argparse
import datetime

def arg_parse(argv=None):
    '''
    Builds the command-line parser and parses the arguments.

    argv: optional list of argument strings. When None (the default)
          argparse falls back to sys.argv[1:], which is what the
          original zero-argument call did.

    Returns the argparse.Namespace with the attributes `manifest`,
    `project` and `o` (output file name, default "outfile.maf").
    '''
    parser = argparse.ArgumentParser(
        description='----GDC MAF Concatenation Tool v1.0----',
        # %(prog)s is filled in by argparse with the name the script was
        # invoked as, so the usage line cannot drift from the file name.
        usage='python %(prog)s <-m MANIFEST or -p PROJECT_ID>')
    parser.add_argument('-m', '--manifest', action="store",
                        help='Specify MAF files with GDC Manifest')
    parser.add_argument('-p', '--project', action="store",
                        help='Specify MAF files by project')
    parser.add_argument('-o', '--output', metavar='FILE_PREFIX',
                        action="store", dest='o', type=str,
                        default="outfile.maf",
                        help='Designates a name for the output file')
    return parser.parse_args(argv)

def main(args):
    '''
    Copies the parsed options into the module-level settings
    (use_manifest, manifest_path, project_string, output_file).

    Exits through error_parse when -m and -p are both supplied or both
    missing; the settings are assigned before that check runs.
    '''
    global use_manifest, output_file, manifest_path, project_string
    has_manifest = bool(args.manifest)
    has_project = bool(args.project)

    if has_manifest:
        use_manifest, manifest_path = True, args.manifest
    if has_project:
        use_manifest, project_string = False, args.project
    if args.o:
        output_file = args.o

    if has_manifest and has_project:
        error_parse("both_argue")
    if not (has_manifest or has_project):
        error_parse("no_argue")


def error_parse(code):
    '''
    Prints the message registered for `code`, prefixed with "ERROR: ",
    and terminates the program with exit status 2.

    An unregistered code raises KeyError from the lookup.
    '''
    messages = {
        "bad_manifest": (
            "Input must be valid GDC Manifest. "
            "\n\tGo to https://portal.gdc.cancer.gov/ to download a manifest"
        ),
        "no_result": "Query produced no results",
        "no_argue": "No argument detected, please use the -p or -m flags",
        "both_argue": "Must choose either -p OR -m, not both.",
        "md5sum_mis": "Expected md5sum does not match file's md5sum value",
        "max_retry": "Maximum retries exceeded",
    }
    text = messages[code]
    print("ERROR: {}".format(text))
    sys.exit(2)

def strip_maf_header(maf_file):
    '''
    Removes the MAF header

    maf_file: any iterable of text lines (an open file or a list).

    Returns a list of every line that does not begin with "#", in the
    original order; the kept lines are not modified.
    '''
    # startswith() is safe on an empty string, where indexing line[0]
    # would raise IndexError.
    return [line for line in maf_file if not line.startswith("#")]

def jsonify_maf(maf_file):
    '''
    Converts MAF TSV to dict, requires header is stripped.

    maf_file: list of tab-separated text lines whose first entry is the
              column-name row.

    Returns (rows, keys): `rows` is a list with one dict per data line
    mapping column name to value, `keys` is the list of column names.
    Empty input returns ([], []).
    '''
    if not maf_file:
        return [], []

    def split_fields(line):
        # Only the line terminator is removed. A plain strip() would also
        # eat leading/trailing tabs, so an empty first column would shift
        # every value one column to the left and empty trailing columns
        # would be dropped.
        return line.rstrip("\r\n").split("\t")

    keys = split_fields(maf_file[0])
    rows = [dict(zip(keys, split_fields(line))) for line in maf_file[1:]]
    return rows, keys

def back_to_tsv(full_dict, col_order, prefix):
    '''
    Converts full concatenated dict to TSV for writing out

    full_dict: list of row dicts to write.
    col_order: column names; written as the header row and used as the
               field order of every data row.
    prefix:    path of the output file, opened in "w" mode (an existing
               file is overwritten).
    '''
    # The `with` block guarantees the file is flushed and closed, even if
    # writing a row raises; the original left the handle open.
    with open(prefix, "w") as out_handle:
        dict_writer = csv.DictWriter(out_handle, col_order, delimiter='\t')
        dict_writer.writeheader()
        dict_writer.writerows(full_dict)

def read_in_manifest(manifest_path):
    '''
    Reads in a GDC Manifest to parse out UUIDs

    manifest_path: path of a tab-separated manifest whose header row
                   starts with the column name "id".

    Returns the list of first-column values of every non-blank data row.
    Exits through error_parse("bad_manifest") when the file is empty or
    its first column is not named "id".
    '''
    with open(manifest_path, "r") as manifest_handle:
        manifest_lines = manifest_handle.read().splitlines()

    # An empty file has no header row to inspect; report it as a bad
    # manifest instead of failing with IndexError.
    if not manifest_lines or manifest_lines[0].strip().split("\t")[0] != "id":
        error_parse("bad_manifest")

    id_list = []
    for line in manifest_lines[1:]:
        first_field = line.strip().split("\t")[0]
        # Blank lines (e.g. a trailing newline) would otherwise add an
        # empty-string ID to the list.
        if first_field:
            id_list.append(first_field)
    return id_list

def retrieve_ids_by_project(provided, project):
    '''
    Retrieves IDs when provided a project_id or list of UUIDs

    provided: when `project` is true, a comma-separated string of project
              ids; otherwise the value is used as-is for the
              "files.file_id" filter (execute() passes a list of UUIDs).
    project:  true to filter on "cases.project.project_id", false to
              filter on "files.file_id".

    Returns a list with one dict per hit, holding the keys "file_id",
    "md5sum" and "file_name". Exits through error_parse("no_result") when
    the query returns no hits; raises requests.HTTPError when the server
    answers with an error status.
    '''
    endpt = "https://api.gdc.cancer.gov/files"
    filters = [
        ("files.data_format", ["MAF"]),
        ("files.data_type", ["Masked Somatic Mutation"]),
    ]
    if project:
        filters.append(("cases.project.project_id", provided.split(",")))
    else:
        filters.append(("files.file_id", provided))

    # Every (field, value) pair becomes one "in" clause; all clauses are
    # combined with "and".
    filters_gdc = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": field, "value": value}}
            for field, value in filters
        ],
    }
    params = {
        "filters": json.dumps(filters_gdc),
        "fields": "file_id,md5sum,file_name",
        "format": "JSON",
        # A single page of at most 10000 hits is requested; no paging is
        # performed beyond it.
        "size": "10000",
    }
    response = requests.get(endpt, params=params)
    # Surface HTTP failures as such instead of as a KeyError/ValueError
    # while digging into an error body below.
    response.raise_for_status()
    out_hits = response.json()["data"]["hits"]
    if not out_hits:
        error_parse("no_result")
    return [
        {
            "file_id": file_entry["file_id"],
            "md5sum": file_entry["md5sum"],
            "file_name": file_entry["file_name"],
        }
        for file_entry in out_hits
    ]

def download_maf(single_maf_dict, tmpdir):
    '''
    Downloads each MAF file and stores in tmp directory

    single_maf_dict: dict with at least the keys "file_id" and "md5sum".
    tmpdir:          existing directory the file is written into.

    The data endpoint is requested up to three times until it answers
    with status 200. The body is saved under the file name taken from the
    Content-Disposition header and then verified with check_md5sum.
    Exits through error_parse("max_retry") when every attempt fails.
    '''
    file_id, exp_md5 = single_maf_dict["file_id"], single_maf_dict["md5sum"]
    data_endpt = "https://api.gdc.cancer.gov/data/{}".format(file_id)
    max_attempts = 3

    for _ in range(max_attempts):
        # print(...) with one argument behaves the same on Python 2 and 3
        # and matches the call style already used in error_parse.
        print("> {} | Downloading File | {} |".format(datetime.datetime.now(), file_id))
        response = requests.get(data_endpt, headers={"Content-Type": "application/json"})
        if response.status_code == 200:
            break
        print("> -- Retrying Download...")
    else:
        # Reached only when no attempt broke out of the loop with a 200.
        error_parse("max_retry")

    response_head_cd = response.headers["Content-Disposition"]
    # basename() drops any directory component, so a header value cannot
    # make the file land outside tmpdir.
    file_name = os.path.basename(re.findall("filename=(.+)", response_head_cd)[0])
    with open(os.path.join(tmpdir, file_name), "wb") as out_handle:
        out_handle.write(response.content)
    check_md5sum(file_name, exp_md5, tmpdir)

def download_run(id_list):
    '''
    Runs MAF download for multiple and performed per-session tasks

    id_list: list of file dicts, each handed to download_maf.

    Creates the directory "tmpMAF_<YYYY-MM-DD>" (today's date) in the
    current working directory when it does not exist yet, downloads every
    entry into it, and returns (id_list, tmpdir) with id_list unchanged.
    '''
    tmpdir = "tmpMAF_" + datetime.date.today().isoformat()
    if not os.path.exists(tmpdir):
        os.mkdir(tmpdir)
    for single_maf in id_list:
        download_maf(single_maf, tmpdir)
    # Function-call form of print works on Python 2 and 3; the statement
    # form is a SyntaxError on Python 3.
    print(">-- All MAF Downloads Complete")
    return id_list, tmpdir

def check_md5sum(file_name, exp_md5, tmpdir):
    '''
    Hashes the file tmpdir/file_name with MD5, reading it in 4096-byte
    pieces, and exits through error_parse("md5sum_mis") when the hex
    digest differs from exp_md5.
    '''
    digest = hashlib.md5()
    target = "/".join([tmpdir, file_name])
    with open(target, "rb") as handle:
        piece = handle.read(4096)
        while piece != b"":
            digest.update(piece)
            piece = handle.read(4096)
    if digest.hexdigest() != exp_md5:
        error_parse("md5sum_mis")


def execute():
    '''
    Runs the whole tool: parses the command line, resolves the list of
    MAF files (from a manifest or by project), downloads them, and writes
    their concatenated rows to the configured output file.
    '''
    main(arg_parse())
    cat_maf = []
    if use_manifest:
        maf_ids_only = read_in_manifest(manifest_path)
        maf_ids = retrieve_ids_by_project(maf_ids_only, False)
    else:
        maf_ids = retrieve_ids_by_project(project_string, True)

    id_list, tmpdir = download_run(maf_ids)

    for single_maf in id_list:
        maf_path = "/".join([tmpdir, single_maf["file_name"]])
        # Text mode so the lines are str on Python 3 (mode "r" yields
        # bytes there, which breaks the "#" test and the tab split).
        # The `with` block closes the gzip handle the original leaked.
        with gzip.open(maf_path, "rt") as maf_handle:
            maf_list = strip_maf_header(maf_handle)
        jsonified, keys = jsonify_maf(maf_list)
        cat_maf += jsonified
    # `keys` is the column list of the last file processed; it defines
    # the header and column order of the output.
    back_to_tsv(cat_maf, keys, output_file)

# Run the tool only when this file is executed as a script, so importing
# the module does not parse arguments or start downloads.
if __name__ == "__main__":
    execute()

0 comments on commit 2217755

Please sign in to comment.