-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcfparser.py
66 lines (64 loc) · 3.51 KB
/
vcfparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
## Use python 3.6
## Syntax: python3 vcfparser.py --vcf "Challenge_data (1).vcf"
import requests
import json
import argparse
import time
time_start = time.time()
parser = argparse.ArgumentParser(
prog='vcfparser',
usage='''python3 vcfparser.py --vcf [vcf file]''',
description='''This program analyzes a vcf file for various variants, their corresponding reads support and queries the ExAC database for allele frequency and dbSNP ID''',
epilog='''This program requires python3 with requests,json,argparse and time libraries.''')
parser.add_argument('--vcf', type=str, help='The vcf file', required=True)
args = parser.parse_args()
vcffile=args.vcf
with open("annotated."+vcffile.replace("vcf","")+"csv",'w') as xyz:
xyz.write('sep="\\t"\n')
xyz.write("Chromosome\tPosition\tReference allele\tAlternate allele\tType of substitution\tTotal depth of locus\tNumber of reads supporting variant\tPercentage of reads supporting variant\tAllele frequency\tdbSNP id\n")
for line in open(vcffile,'r'):
if line.startswith("#")==True: # Ignoring the comment lines of a vcf file
pass
else:
line2=line.split("\t")
chromosome=line2[0] # Chromosome
position=line2[1] # Position
reference=line2[3] # Reference allele
alternate=line2[4] # Alternate allele
alternate2=alternate.split(",")
num_alt_all=len(alternate2) # Number of alternate alleles
info=line2[7].split(";")
for i in range(len(info)):
# Looking for type of variation
if "TYPE=" in info[i]:
variation=info[i][5:]
variation2=variation.split(",")
# Looking for depth of coverage
elif "DP=" in info[i]:
depth=int(info[i][3:])
# Number of reads supporting the variant
elif "RPR=" in info[i]:
rightreads=info[i][4:] # Extracting the number of supporting reads lying right to the proposed variant
rightreads2=rightreads.split(",")
elif "RPL=" in info[i]:
leftreads=info[i][4:] # Extracting the number of supporting reads lying left to the proposed variant
leftreads2=leftreads.split(",")
# Iterating over each of the alternate alleles
for i in range(num_alt_all):
# Using REST API for getting allele frequency and dbSNP ID from ExAC database
r = requests.get('http://exac.hms.harvard.edu/rest/variant/variant/'+chromosome+'-'+position+'-'+reference+'-'+alternate2[i])
#print (chromosome+'-'+position+'-'+reference+'-'+alternate2[i])
r = json.dumps(r.json()) # Converting the ExAC output into json
loaded_r = json.loads(r)
try:
allele_freq=loaded_r['allele_freq'] # Extracting allele frequency
except KeyError:
allele_freq="NA" # Print "NA" when no allele frequency is found corresponding to the variant
try:
rsid=loaded_r['rsid'] # Extracting dbSNP ID
except KeyError:
rsid="." # Print "." when no dbSNP ID is found corresponding to the variant
xyz.write(chromosome+"\t"+position+"\t"+reference+"\t"+alternate2[i]+"\t"+variation2[i]+"\t"+str(depth)+"\t"+str(int(leftreads2[i])+int(rightreads2[i]))+"\t"+str(((int(leftreads2[i])+int(rightreads2[i]))*100)/depth)+"\t"+str(allele_freq)+"\t"+rsid+"\n") # Writing to table
xyz.close()
print ("Output written to "+"annotated."+vcffile.replace("vcf","")+"txt")
print ("Time taken to parse "+vcffile+": "+str(time.time()-time_start)+" seconds")