add fix_vcf_prefix script

EladH1 · Dec 6, 2024 · 4a99966 · 4a99966
1 parent ca2236a
commit 4a99966
Show file tree

Hide file tree

Showing 6 changed files with 206 additions and 14 deletions.
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -160,6 +160,12 @@
                 "description": "Liftover option for test vcfs, to activate add params.liftover='test' ",
                 "meta": ["liftover"],
                 "default": false
+            },
+            "fix_prefix": {
+                "type": "boolean",
+                "description": "Fixing prefix is sometimes necessary for input files, activate it if your input is not adhering to GRCh38 with chr and GRCh37 without chr standards ",
+                "meta": ["fix_prefix"],
+                "default": false
             }
         },
         "required": ["test_vcf", "caller", "id"]

diff --git a/bin/fix_vcf_prefix.py b/bin/fix_vcf_prefix.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+# Copyright 2024 - GHGA
+# Author: Kuebra Narci
+'''
+Generates a CSV file from a VCF
+Expected usage:
+    $ python fix_vcf_prefix.py <vcf_file> <output>
+Use --help for more information.
+'''
+import os
+import subprocess
+import argparse
+import shutil
+
+
+def determine_genome_version(vcf_file):
+    """
+    Determine the genome version by inspecting chromosome naming in the VCF file.
+    """
+    with subprocess.Popen(
+        f"bcftools view -h {vcf_file} | grep -m 1 '^##contig=<ID='",
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        universal_newlines=True,
+    ) as proc:
+        output, _ = proc.communicate()
+
+    if not output:
+        raise ValueError("Unable to determine genome version from contigs in the VCF header.")
+
+    if "chr" in output:
+        return "GRCh38"
+    else:
+        return "GRCh37"
+
+
+def fix_vcf_prefix(input_vcf, output_vcf, rename_file, target_version):
+    """
+    Check the prefix of chromosome names in the VCF and fix it if necessary using the provided rename file.
+    """
+    # Determine genome version from input VCF
+    current_version = determine_genome_version(input_vcf)
+    print(f"Detected genome version: {current_version}")
+
+    # If the current genome version matches the target, simply copy the input VCF
+    if current_version == target_version:
+        print(f"Genome version matches the target ({target_version}). Copying input VCF to output.")
+        shutil.copy(input_vcf, output_vcf)
+        shutil.copy(input_vcf + ".tbi", output_vcf + ".tbi")  # Copy index as well
+        return
+
+    # Verify the rename file exists
+    if not os.path.isfile(rename_file):
+        raise FileNotFoundError(f"Rename file '{rename_file}' not found.")
+
+    # Use bcftools to rename chromosomes without modifying the header
+    subprocess.check_call(
+        f"bcftools annotate --rename-chrs {rename_file} --no-version {input_vcf} -Oz -o {output_vcf}",
+        shell=True,
+    )
+    subprocess.check_call(f"bcftools index {output_vcf}", shell=True)
+    print(f"Chromosome names updated and output written to {output_vcf}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Check and fix VCF chromosome naming prefix.")
+    parser.add_argument("input_vcf", help="Input VCF file (compressed and indexed).")
+    parser.add_argument("output_vcf", help="Output VCF file.")
+    parser.add_argument(
+        "--rename-chr",
+        required=False,
+        help="Path to a file with chromosome rename mappings (for bcftools --rename-chrs).",
+    )
+    parser.add_argument(
+        "--target-version",
+        required=True,
+        choices=["GRCh37", "GRCh38"],
+        help="Target genome version (GRCh37 or GRCh38).",
+    )
+    args = parser.parse_args()
+
+    fix_vcf_prefix(args.input_vcf, args.output_vcf, args.rename_chr, args.target_version)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/local/custom/fix_vcf_prefix/environment.yml b/modules/local/custom/fix_vcf_prefix/environment.yml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::bcftools
+  - pip
+  - pip:
+      - pip==24.3.1
diff --git a/modules/local/custom/fix_vcf_prefix/main.nf b/modules/local/custom/fix_vcf_prefix/main.nf
@@ -0,0 +1,51 @@
+process FIX_VCF_PREFIX {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5d/5d097f8fee4db1239705e5d3929e50547796ab91a1e0bdf4201030ced8cb272d/data':
+        'community.wave.seqera.io/library/bcftools_pip:8a460830487271c2' }"
+
+    input:
+    tuple val(meta), path(input)
+    tuple val(meta2), path(rename_chr)
+
+    output:
+    tuple val(meta), path("*.vcf.gz"), emit: vcf
+    path "versions.yml"              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args    = task.ext.args ?: ''
+    def prefix  = task.ext.prefix ?: "${meta.id}"
+
+    if ("$input" == "${prefix}.vcf.gz") error "Input and output names are the same, set prefix in module configuration to disambiguate!"
+    """
+    fix_vcf_prefix.py \\
+        $input \\
+        ${prefix}.vcf.gz \\
+        --rename-chr $rename_chr \\
+        --target-version $params.genome
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' )
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    echo '' | gzip > ${prefix}.vcf.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' )
+    END_VERSIONS
+    """
+}
diff --git a/subworkflows/local/prepare_vcfs_test.nf b/subworkflows/local/prepare_vcfs_test.nf
@@ -11,6 +11,7 @@ include { TABIX_BGZIPTABIX             } from '../../modules/nf-core/tabix/bgzip
 include { TABIX_TABIX                  } from '../../modules/nf-core/tabix/tabix'
 include { LIFTOVER_VCFS                } from '../local/liftover_vcfs'
 include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CONTIGS } from '../../modules/nf-core/bcftools/view'
+include { FIX_VCF_PREFIX               } from '../../modules/local/custom/fix_vcf_prefix'
 
 
 workflow PREPARE_VCFS_TEST {
@@ -33,16 +34,37 @@ workflow PREPARE_VCFS_TEST {
 
     vcf_ch = Channel.empty()
 
-    LIFTOVER_VCFS(
-        vcf.liftover,
-        Channel.empty(),
-        fasta,
-        chain,
-        rename_chr,
-        dictionary
+    if (params.liftover.contains("test")){
+        LIFTOVER_VCFS(
+            vcf.liftover,
+            Channel.empty(),
+            fasta,
+            chain,
+            rename_chr,
+            dictionary
+        )
+        versions = versions.mix(LIFTOVER_VCFS.out.versions.first())
+        vcf_ch = vcf_ch.mix(LIFTOVER_VCFS.out.vcf_ch,vcf.other)
+    }
+    vcf_ch = vcf_ch.mix(vcf.other)
+
+    // if prefix of chromosomes needs to be fixed
+    vcf_ch.branch{
+        def meta = it[0]
+        prefix: meta.fix_prefix
+        other: true}.set{fix}
+
+    vcf_ch = Channel.empty()
+
+    fix.prefix.view()
+
+    FIX_VCF_PREFIX(
+        fix.prefix,
+        rename_chr
     )
-    versions = versions.mix(LIFTOVER_VCFS.out.versions.first())
-    vcf_ch = vcf_ch.mix(LIFTOVER_VCFS.out.vcf_ch,vcf.other)
+    versions = versions.mix(FIX_VCF_PREFIX.out.versions.first())
+    vcf_ch = vcf_ch.mix(FIX_VCF_PREFIX.out.vcf,fix.other)
+
 
     // Add "query" to test sample
     VCF_REHEADER_SAMPLENAME(

diff --git a/workflows/variantbenchmarking.nf b/workflows/variantbenchmarking.nf
@@ -72,22 +72,39 @@ workflow VARIANTBENCHMARKING {
     sdf             = params.sdf        ? Channel.fromPath(params.sdf, checkIfExists: true).map{ sdf -> tuple([id: sdf.getSimpleName()], sdf) }.collect()
                                                 : Channel.empty()
 
+    if (params.rename_chr){
+        rename_chr = Channel.fromPath(params.rename_chr, checkIfExists: true).map{ txt -> tuple([id: txt.getSimpleName()], txt) }.collect()
+        if (!params.genome){
+            log.error "Please specify params.genome to fix chromosome prefix"
+            exit 1
+        }
+
+    }else{
+        if (params.genome == "GRCh38"){
+            rename_chr = Channel.fromPath("assets/rename_contigs/grch37_grch38.txt", checkIfExists: true).map{ txt -> tuple([id: txt.getSimpleName()], txt) }.collect()
+        }
+        else if(params.genome == "GRCh37")
+        {
+            rename_chr = Channel.fromPath("assets/rename_contigs/grch38_grch37.txt", checkIfExists: true).map{ txt -> tuple([id: txt.getSimpleName()], txt) }.collect()
+        }
+        else{
+            rename_chr = Channel.empty()
+        }
+    }
+
     // read chain file, liftover genome and rename chr files if liftover is true
     if (params.liftover){
 
-        if (params.chain && params.rename_chr){
+        if (params.chain){
             chain           = Channel.fromPath(params.chain, checkIfExists: true).map{ bed -> tuple([id: bed.getSimpleName()], bed) }.collect()
-            rename_chr      = Channel.fromPath(params.rename_chr, checkIfExists: true).map{ txt -> tuple([id: txt.getSimpleName()], txt) }.collect()
         }else{
-            log.error "Please specify params.chain and params.rename_chr to process liftover of the files"
+            log.error "Please specify params.chain to process liftover of the files"
             exit 1
         }
-
         // if dictinoary file is missing PICARD_CREATESEQUENCEDICTIONARY will create one
         dictionary      = params.dictionary ? Channel.fromPath(params.dictionary, checkIfExists: true).map{ dict -> tuple([id: dict.getSimpleName()], dict) }.collect()                                           : Channel.empty()
     }else{
         chain           = Channel.empty()
-        rename_chr      = Channel.empty()
         dictionary      = Channel.empty()
     }