forked from jclev-uga/SWEEP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHomozygous.py
123 lines (88 loc) · 2.86 KB
/
Homozygous.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
import pysam
import sys
import re
def get_genotype(line, number):
genotype = []
for x in range(number):
prob = line[x].split(',')
if len(prob) > 3:
return 0
if int(prob[0]) == 0 and int(prob[1]) >= 0 and int(prob[2]) > 0:
genotype.append(1)
if int(prob[0]) > 0 and int(prob[1]) == 0 and int(prob[2]) > 0:
genotype.append(2)
if int(prob[0]) > 0 and int(prob[1]) >= 0 and int(prob[2]) == 0:
genotype.append(3)
if int(prob[0]) == 0 and int(prob[1]) >= 0 and int(prob[2]) == 0:
genotype.append(-1)
if int(prob[0]) > 0 and int(prob[1]) > 0 and int(prob[2]) > 0:
genotype.append(-1)
return genotype
def parseCIGAR(CIGAR):
for op, length in CIGAR:
if op == 4:
return 0
if op == 2:
return 2
return 1
def getDel(CIGAR):
for op, length in CIGAR:
if op == 2:
return length
return 0
def parseMD(MD):
#parse MD string (cf. SAM Format Specification v1.4-r985)
mismatches, location, pos = [], [], 0
for n in re.findall('(\d+|[a-zA-Z|^]+)', MD):
if n.isalpha():
mismatches.append(n)
location.append(pos)
pos += 1
#ignore deletions i.e. ^[A-Z] (isalpha() is False)
elif '^' not in n:
pos += int(n)
return mismatches,location
def getPile(samfile, chr, position, alt):
if position -2 >= 0:
for pileupcolumn in samfile.pileup(chr, position-2, position+2):
if (pileupcolumn.pos + 1) == position:
for pileupread in pileupcolumn.pileups:
if pileupread.alignment.query_sequence[pileupread.query_position] == alt:
return 1
else:
return 1
return 0
record_dict = SeqIO.to_dict(SeqIO.parse(sys.argv[1], "fasta"))
samfile = []
samfile = sys.argv[4].split()
bamnumber = len(samfile)
out = open(sys.argv[3], "w")
snps = open(sys.argv[2], 'r')
for snp in snps:
snp = snp.strip()
hold = snp.split()
if hold[0].startswith('#'):
out.write(snp)
out.write('\n')
continue
chr = hold[0]
position = int(hold[1])
ref = hold[3]
alt = hold[4]
PL = re.findall('\s(\d+\,\d+\,\d+)', snp)
genotype = get_genotype(PL,len(PL))
for x in range(bamnumber):
falsepositive = 0
if int(genotype[x]) == 1:
falsepositive = getPile(pysam.Samfile(samfile[x], "rb"),chr,position,alt)
if falsepositive == 1:
break
if falsepositive == 1:
continue
elif falsepositive == 0:
out.write(snp)
out.write('\n')
snps.close()
out.close()