wittyer removed for now
kubranarci committed Jun 18, 2024
1 parent ef29e34 commit 09cb221
Showing 17 changed files with 296 additions and 794 deletions.
18 changes: 16 additions & 2 deletions README.md
@@ -34,9 +34,23 @@
1. Standardization of SVs in test VCF files
2. Normalization of SVs in test VCF files
3. Normalization of SVs in truth VCF files
4. SV stats and histograms
4. SV stats and histograms (Survivor)

5. Germline benchmarking of small variants
   - Tools:
     Happy
     RTGtools
5. Germline benchmarking of SVs
6. Somatic benchmarking of SVs
   - Tools:
     Truvari
     Svbenchmark
     Wittyer: Only works with truth files annotated with SVTYPE and SVLEN (a minimal annotation sketch follows this list)

6. Somatic benchmarking of small variants
   - Tools:
     Happy
     RTGtools

7. Final report and comparisons
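
Wittyer's SVTYPE/SVLEN requirement can usually be met by annotating the truth VCF up front. The sketch below is a minimal, hypothetical pysam example of such an annotation step (file names and the type/length heuristic are illustrative assumptions, not the pipeline's own code); the repository's `bin/add_svtype.py` appears to serve a similar purpose for SVTYPE.

```python
import pysam

vcf_in = pysam.VariantFile("truth.vcf.gz")  # hypothetical input path
header = vcf_in.header
if "SVTYPE" not in header.info:
    header.info.add("SVTYPE", 1, "String", "Type of structural variant")
if "SVLEN" not in header.info:
    header.info.add("SVLEN", 1, "Integer", "Length of structural variant")

vcf_out = pysam.VariantFile("truth.annotated.vcf", "w", header=header)
for rec in vcf_in:
    # Crude heuristic for records that lack the tags: infer type and length from REF/ALT.
    if "SVTYPE" not in rec.info:
        rec.info["SVTYPE"] = "DEL" if len(rec.ref) > len(rec.alts[0]) else "INS"
    if "SVLEN" not in rec.info:
        rec.info["SVLEN"] = len(rec.alts[0]) - len(rec.ref)
    vcf_out.write(rec)
vcf_out.close()
```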

## Usage
4 changes: 2 additions & 2 deletions bin/add_svtype.py
@@ -18,8 +18,8 @@



in_vcf = pysam.VariantFile(args.graph)
out_name = os.path.basename(args.graph)
in_vcf = pysam.VariantFile(args.input)
out_name = os.path.basename(args.input)
if out_name.endswith('.gz'):
out_name = out_name[:-3]
if out_name.endswith('.vcf'):
102 changes: 102 additions & 0 deletions bin/reclassfy_svaba.py
@@ -0,0 +1,102 @@
#!/usr/bin/env python
import re
import sys
import os
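
# Reclassify SvABA breakend (BND) records: each mate-1 record is paired with its
# mate-2 record and the bracket notation of both ALT alleles decides the new SVTYPE
# (DEL, DUP/INS or INV); interchromosomal pairs keep SVTYPE=BND and mate-2 records
# are dropped from the output.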

#make mates dictionary given list input of non-comment lines
def makeMateDict(m):
    d = {}
    for index1, line1 in enumerate(m):
        id1 = line1.split('\t')[2]
        numMate = re.search(r':(\d)', id1).group(1)
        origId = re.search(r'(\d+):', id1).group(1)
        if int(numMate) == 1:
            for index2, line2 in enumerate(m):
                # never start from beginning of file
                if index2 <= index1:
                    continue
                # print str(index1) + " : " + str(index2)
                id2 = line2.split('\t')[2]
                duplicateId = re.search(r'(\d+):', id2).group(1)
                duplicateNumMate = re.search(r':(\d)', id2).group(1)
                if duplicateId == origId and int(duplicateNumMate) == 2:
                    d[line1] = line2
                    break
    return d

def classify(line, ALT_INDEX, mdict):
    # get alt, chrom1, chrom2, position (pos), id, old SVTYPE (should be BND if virgin svaba vcf) from line
    s = line.split("\t")
    alt = s[ALT_INDEX]
    chrom1 = s[0]
    pos = int(s[1])
    id = s[2]

    if int(re.search(r':(\d)', id).group(1)) != 1:
        return "NONE"

    mateLine = mdict[line].split('\t')
    mateChrom = mateLine[0]
    mateAlt = mateLine[ALT_INDEX]

    oldType = re.search(r'SVTYPE=(.+?)(\s+?|:)', line).group(1)

    # get new type
    if oldType == 'BND' and chrom1 == mateChrom:
        INV_PATTERN_1 = re.compile(r'\D\].+\]')
        INV_PATTERN_2 = re.compile(r'\[.+\[\D')
        if INV_PATTERN_1.match(alt) and INV_PATTERN_1.match(mateAlt):
            return "INV"
        if INV_PATTERN_2.match(alt) and INV_PATTERN_2.match(mateAlt):
            return "INV"

        # DEL
        DEL_PATTERN_THIS = re.compile(r'\D\[.+\[')
        DEL_PATTERN_MATE = re.compile(r'\].+\]\D')
        if DEL_PATTERN_THIS.match(alt) and DEL_PATTERN_MATE.match(mateAlt):
            return "DEL"

        # INS
        INS_PATTERN_THIS = re.compile(r'\D\].+\]')
        INS_PATTERN_MATE = re.compile(r'\[.+\[\D')
        if INS_PATTERN_THIS.match(alt) and INS_PATTERN_MATE.match(mateAlt):
            return "DUP/INS"

    return 'BND'

if __name__ == "__main__":
    file = sys.argv[1]
    if not os.path.exists(file):
        raise IOError(file)
    alt_index = -1
    # generate mate:mate dictionary
    # load file into ram
    vcf_file = []
    with open(file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            vcf_file.append(line)
    matesDict = makeMateDict(vcf_file)
    with open(file, "r") as f:
        for line in f:
            # print comments
            if line.startswith("##"):
                sys.stdout.write(line)
                continue
            # header contains indexes
            if line.startswith('#'):
                split = line.split("\t")
                for index, val in enumerate(split):
                    if val == "ALT":
                        alt_index = index
                        break
                sys.stdout.write(line)
                continue
            if alt_index == -1:
                sys.stderr.write("ERROR: NO ALT INDEX FOUND\n")
                sys.exit(1)
            newType = classify(line, alt_index, matesDict)
            if newType != "NONE":
                newLine = re.sub(r'SVTYPE=BND', "SVTYPE=" + newType, line)
                sys.stdout.write(newLine)
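
The script takes a single VCF path as its argument and writes the rewritten records to stdout. As a quick orientation, here is a small self-contained illustration (not part of the commit) of how the same bracket-notation patterns decide the new SVTYPE for a hypothetical intrachromosomal mate pair:

```python
import re

# Same patterns as classify(); the ALT strings are made-up SvABA-style breakends.
DEL_PATTERN_THIS = re.compile(r'\D\[.+\[')   # e.g. A[chr1:2000[  on the mate-1 record
DEL_PATTERN_MATE = re.compile(r'\].+\]\D')   # e.g. ]chr1:1000]T  on the mate-2 record

alt_mate1 = "A[chr1:2000["
alt_mate2 = "]chr1:1000]T"

if DEL_PATTERN_THIS.match(alt_mate1) and DEL_PATTERN_MATE.match(alt_mate2):
    print("pair would be rewritten to SVTYPE=DEL")  # this branch is taken
```
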
20 changes: 18 additions & 2 deletions conf/modules.config
@@ -27,12 +27,21 @@ process {
    }
    withName: "BCFTOOLS_NORM" {
        ext.prefix = { vcf.baseName - ".vcf" + ".norm"}
        ext.args = {"--output-type z -N -m-any -c s" }
        ext.args = {"--output-type z -m-any -c w" }
        publishDir = [
            path: { "${params.outdir}/test" },
            enabled: false
        ]
    }
    withName: "BCFTOOLS_FILL_FROM_FASTA" {
        ext.prefix = { vcf.baseName - ".vcf" + ".fill"}
        ext.args = {"--output-type z" }
        publishDir = [
            path: { "${params.outdir}/test" },
            enabled: false
        ]
    }

    withName: "BCFTOOLS_DEDUP" {
        ext.prefix = { vcf.baseName - ".vcf" + ".dedup"}
        ext.args = {"--output-type z --rm-du exact -c s" }
@@ -198,7 +207,7 @@
    }
    withName: WITTYER {
        ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
        ext.args = {"-em cts"}
        ext.args = {""}
        ext.when = { params.method.split(',').contains('wittyer') }
        publishDir = [
            path: {"${params.outdir}/${meta.id}/wittyer_bench"},
@@ -260,6 +269,13 @@ process {
        ext.prefix = {input.toString() - ".vcf.gz"}
    }

    withName: TABIX_BGZIP_TRUTH{
        ext.prefix = {input.toString() - ".vcf.gz"}
    }

    withName: TABIX_BGZIP_QUERY{
        ext.prefix = {input.toString() - ".vcf.gz"}
    }
    withName: SURVIVOR_MERGE {
        ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"}
        publishDir = [