Skip to content

Commit

Permalink
add fix_vcf_prefix script
Browse files Browse the repository at this point in the history
  • Loading branch information
kubranarci committed Dec 6, 2024
1 parent ca2236a commit 4a99966
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 14 deletions.
6 changes: 6 additions & 0 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,12 @@
"description": "Liftover option for test vcfs, to activate add params.liftover='test' ",
"meta": ["liftover"],
"default": false
},
"fix_prefix": {
"type": "boolean",
"description": "Fixing prefix is sometimes necessary for input files, activate it if your input is not adhering to GRCh38 with chr and GRCh37 without chr standards ",
"meta": ["fix_prefix"],
"default": false
}
},
"required": ["test_vcf", "caller", "id"]
Expand Down
88 changes: 88 additions & 0 deletions bin/fix_vcf_prefix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env python

# Copyright 2024 - GHGA
# Author: Kuebra Narci
'''
Generates a CSV file from a VCF
Expected usage:
$ python fix_vcf_prefix.py <vcf_file> <output>
Use --help for more information.
'''
import os
import subprocess
import argparse
import shutil


def determine_genome_version(vcf_file):
"""
Determine the genome version by inspecting chromosome naming in the VCF file.
"""
with subprocess.Popen(
f"bcftools view -h {vcf_file} | grep -m 1 '^##contig=<ID='",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
) as proc:
output, _ = proc.communicate()

if not output:
raise ValueError("Unable to determine genome version from contigs in the VCF header.")

if "chr" in output:
return "GRCh38"
else:
return "GRCh37"


def fix_vcf_prefix(input_vcf, output_vcf, rename_file, target_version):
"""
Check the prefix of chromosome names in the VCF and fix it if necessary using the provided rename file.
"""
# Determine genome version from input VCF
current_version = determine_genome_version(input_vcf)
print(f"Detected genome version: {current_version}")

# If the current genome version matches the target, simply copy the input VCF
if current_version == target_version:
print(f"Genome version matches the target ({target_version}). Copying input VCF to output.")
shutil.copy(input_vcf, output_vcf)
shutil.copy(input_vcf + ".tbi", output_vcf + ".tbi") # Copy index as well
return

# Verify the rename file exists
if not os.path.isfile(rename_file):
raise FileNotFoundError(f"Rename file '{rename_file}' not found.")

# Use bcftools to rename chromosomes without modifying the header
subprocess.check_call(
f"bcftools annotate --rename-chrs {rename_file} --no-version {input_vcf} -Oz -o {output_vcf}",
shell=True,
)
subprocess.check_call(f"bcftools index {output_vcf}", shell=True)
print(f"Chromosome names updated and output written to {output_vcf}")


def main():
parser = argparse.ArgumentParser(description="Check and fix VCF chromosome naming prefix.")
parser.add_argument("input_vcf", help="Input VCF file (compressed and indexed).")
parser.add_argument("output_vcf", help="Output VCF file.")
parser.add_argument(
"--rename-chr",
required=False,
help="Path to a file with chromosome rename mappings (for bcftools --rename-chrs).",
)
parser.add_argument(
"--target-version",
required=True,
choices=["GRCh37", "GRCh38"],
help="Target genome version (GRCh37 or GRCh38).",
)
args = parser.parse_args()

fix_vcf_prefix(args.input_vcf, args.output_vcf, args.rename_chr, args.target_version)


if __name__ == "__main__":
main()
8 changes: 8 additions & 0 deletions modules/local/custom/fix_vcf_prefix/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::bcftools
- pip
- pip:
- pip==24.3.1
51 changes: 51 additions & 0 deletions modules/local/custom/fix_vcf_prefix/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
process FIX_VCF_PREFIX {
tag "$meta.id"
label 'process_low'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5d/5d097f8fee4db1239705e5d3929e50547796ab91a1e0bdf4201030ced8cb272d/data':
'community.wave.seqera.io/library/bcftools_pip:8a460830487271c2' }"

input:
tuple val(meta), path(input)
tuple val(meta2), path(rename_chr)

output:
tuple val(meta), path("*.vcf.gz"), emit: vcf
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

if ("$input" == "${prefix}.vcf.gz") error "Input and output names are the same, set prefix in module configuration to disambiguate!"
"""
fix_vcf_prefix.py \\
$input \\
${prefix}.vcf.gz \\
--rename-chr $rename_chr \\
--target-version $params.genome
cat <<-END_VERSIONS > versions.yml
"${task.process}":
bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' )
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
echo '' | gzip > ${prefix}.vcf.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' )
END_VERSIONS
"""
}
40 changes: 31 additions & 9 deletions subworkflows/local/prepare_vcfs_test.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgzip
include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix'
include { LIFTOVER_VCFS } from '../local/liftover_vcfs'
include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CONTIGS } from '../../modules/nf-core/bcftools/view'
include { FIX_VCF_PREFIX } from '../../modules/local/custom/fix_vcf_prefix'


workflow PREPARE_VCFS_TEST {
Expand All @@ -33,16 +34,37 @@ workflow PREPARE_VCFS_TEST {

vcf_ch = Channel.empty()

LIFTOVER_VCFS(
vcf.liftover,
Channel.empty(),
fasta,
chain,
rename_chr,
dictionary
if (params.liftover.contains("test")){
LIFTOVER_VCFS(
vcf.liftover,
Channel.empty(),
fasta,
chain,
rename_chr,
dictionary
)
versions = versions.mix(LIFTOVER_VCFS.out.versions.first())
vcf_ch = vcf_ch.mix(LIFTOVER_VCFS.out.vcf_ch,vcf.other)
}
vcf_ch = vcf_ch.mix(vcf.other)

// if prefix of chromosomes needs to be fixed
vcf_ch.branch{
def meta = it[0]
prefix: meta.fix_prefix
other: true}.set{fix}

vcf_ch = Channel.empty()

fix.prefix.view()

FIX_VCF_PREFIX(
fix.prefix,
rename_chr
)
versions = versions.mix(LIFTOVER_VCFS.out.versions.first())
vcf_ch = vcf_ch.mix(LIFTOVER_VCFS.out.vcf_ch,vcf.other)
versions = versions.mix(FIX_VCF_PREFIX.out.versions.first())
vcf_ch = vcf_ch.mix(FIX_VCF_PREFIX.out.vcf,fix.other)


// Add "query" to test sample
VCF_REHEADER_SAMPLENAME(
Expand Down
27 changes: 22 additions & 5 deletions workflows/variantbenchmarking.nf
Original file line number Diff line number Diff line change
Expand Up @@ -72,22 +72,39 @@ workflow VARIANTBENCHMARKING {
sdf = params.sdf ? Channel.fromPath(params.sdf, checkIfExists: true).map{ sdf -> tuple([id: sdf.getSimpleName()], sdf) }.collect()
: Channel.empty()

if (params.rename_chr){
rename_chr = Channel.fromPath(params.rename_chr, checkIfExists: true).map{ txt -> tuple([id: txt.getSimpleName()], txt) }.collect()
if (!params.genome){
log.error "Please specify params.genome to fix chromosome prefix"
exit 1
}

}else{
if (params.genome == "GRCh38"){
rename_chr = Channel.fromPath("assets/rename_contigs/grch37_grch38.txt", checkIfExists: true).map{ txt -> tuple([id: txt.getSimpleName()], txt) }.collect()
}
else if(params.genome == "GRCh37")
{
rename_chr = Channel.fromPath("assets/rename_contigs/grch38_grch37.txt", checkIfExists: true).map{ txt -> tuple([id: txt.getSimpleName()], txt) }.collect()
}
else{
rename_chr = Channel.empty()
}
}

// read chain file, liftover genome and rename chr files if liftover is true
if (params.liftover){

if (params.chain && params.rename_chr){
if (params.chain){
chain = Channel.fromPath(params.chain, checkIfExists: true).map{ bed -> tuple([id: bed.getSimpleName()], bed) }.collect()
rename_chr = Channel.fromPath(params.rename_chr, checkIfExists: true).map{ txt -> tuple([id: txt.getSimpleName()], txt) }.collect()
}else{
log.error "Please specify params.chain and params.rename_chr to process liftover of the files"
log.error "Please specify params.chain to process liftover of the files"
exit 1
}

// if dictinoary file is missing PICARD_CREATESEQUENCEDICTIONARY will create one
dictionary = params.dictionary ? Channel.fromPath(params.dictionary, checkIfExists: true).map{ dict -> tuple([id: dict.getSimpleName()], dict) }.collect() : Channel.empty()
}else{
chain = Channel.empty()
rename_chr = Channel.empty()
dictionary = Channel.empty()
}

Expand Down

0 comments on commit 4a99966

Please sign in to comment.