forked from nf-core/variantbenchmarking
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ca2236a
commit 4a99966
Showing
6 changed files
with
206 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#!/usr/bin/env python | ||
|
||
# Copyright 2024 - GHGA | ||
# Author: Kuebra Narci | ||
''' | ||
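Checks the chromosome naming prefix of a VCF and rewrites it to match the target genome version | ||
Expected usage: | ||
$ python fix_vcf_prefix.py <vcf_file> <output> --target-version {GRCh37,GRCh38} [--rename-chr <rename_file>] | ||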
Use --help for more information. | ||
''' | ||
import os | ||
import subprocess | ||
import argparse | ||
import shutil | ||
|
||
|
||
def determine_genome_version(vcf_file):
    """
    Determine the genome version by inspecting chromosome naming in the VCF file.

    The header is read with ``bcftools view -h`` and only the ID of the first
    ``##contig=<ID=...`` line is examined. This is a naming heuristic: an ID
    starting with ``chr`` is reported as GRCh38, anything else as GRCh37.

    Args:
        vcf_file: Path of the VCF file whose header should be inspected.

    Returns:
        The string "GRCh38" or "GRCh37".

    Raises:
        ValueError: If bcftools cannot be started, exits with an error, or the
            header contains no ``##contig`` line.
    """
    contig_tag = "##contig=<ID="
    failure = "Unable to determine genome version from contigs in the VCF header."

    # Pass an argument list (no shell) so paths with spaces or shell
    # metacharacters are handed to bcftools verbatim.
    try:
        proc = subprocess.run(
            ["bcftools", "view", "-h", vcf_file],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            check=False,
        )
    except OSError as err:
        # bcftools missing or not executable: keep the documented error type.
        raise ValueError(failure) from err

    if proc.returncode != 0:
        detail = proc.stderr.strip()
        raise ValueError(f"{failure} {detail}" if detail else failure)

    for line in proc.stdout.splitlines():
        if not line.startswith(contig_tag):
            continue
        # Isolate the ID value so that other attributes on the line (URL,
        # assembly, ...) that happen to contain "chr" cannot skew the result.
        contig_id = line[len(contig_tag):].split(",", 1)[0].split(">", 1)[0]
        return "GRCh38" if contig_id.startswith("chr") else "GRCh37"

    raise ValueError(failure)
|
||
|
||
def fix_vcf_prefix(input_vcf, output_vcf, rename_file, target_version):
    """
    Check the prefix of chromosome names in the VCF and fix it if necessary using the provided rename file.

    The genome version of ``input_vcf`` is detected first. If it already equals
    ``target_version`` the file is copied unchanged; otherwise the contigs are
    renamed with ``bcftools annotate --rename-chrs``. In both cases the output
    ends up with an index next to it.

    Args:
        input_vcf: Path of the input VCF.
        output_vcf: Path the resulting VCF is written to.
        rename_file: Path of the chromosome mapping file passed to
            ``bcftools annotate --rename-chrs``. May be ``None`` when no
            renaming turns out to be necessary.
        target_version: Genome version the output should use
            ("GRCh37" or "GRCh38").

    Raises:
        ValueError: If renaming is needed but no rename file was given.
        FileNotFoundError: If the given rename file does not exist.
        subprocess.CalledProcessError: If a bcftools command fails.
    """
    # Determine genome version from input VCF
    current_version = determine_genome_version(input_vcf)
    print(f"Detected genome version: {current_version}")

    # If the current genome version matches the target, simply copy the input VCF
    if current_version == target_version:
        print(f"Genome version matches the target ({target_version}). Copying input VCF to output.")
        shutil.copy(input_vcf, output_vcf)

        # Carry over whichever index accompanies the input (.tbi or .csi).
        index_copied = False
        for suffix in (".tbi", ".csi"):
            source_index = f"{input_vcf}{suffix}"
            if os.path.isfile(source_index):
                shutil.copy(source_index, f"{output_vcf}{suffix}")
                index_copied = True

        # No index next to the input: build one, as the rename branch does,
        # instead of failing on a missing .tbi.
        if not index_copied:
            subprocess.check_call(["bcftools", "index", output_vcf])
        return

    # Renaming is required from here on, so the mapping file is mandatory.
    if not rename_file:
        raise ValueError(
            f"Genome version {current_version} differs from the target ({target_version}) "
            "but no chromosome rename file was provided."
        )
    if not os.path.isfile(rename_file):
        raise FileNotFoundError(f"Rename file '{rename_file}' not found.")

    # Rename the chromosomes; --no-version keeps bcftools from appending its
    # own version/command lines to the header. Argument lists (no shell) keep
    # unusual file names from being reinterpreted.
    subprocess.check_call(
        [
            "bcftools",
            "annotate",
            "--rename-chrs",
            rename_file,
            "--no-version",
            input_vcf,
            "-Oz",
            "-o",
            output_vcf,
        ]
    )
    subprocess.check_call(["bcftools", "index", output_vcf])
    print(f"Chromosome names updated and output written to {output_vcf}")
|
||
|
||
def main():
    """Parse the command line and run the chromosome-prefix fix."""
    cli = argparse.ArgumentParser(description="Check and fix VCF chromosome naming prefix.")

    # Positional arguments, registered in the order they appear on the command line.
    positionals = (
        ("input_vcf", "Input VCF file (compressed and indexed)."),
        ("output_vcf", "Output VCF file."),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, help=help_text)

    # Named options and their argparse settings.
    options = (
        (
            "--rename-chr",
            {
                "required": False,
                "help": "Path to a file with chromosome rename mappings (for bcftools --rename-chrs).",
            },
        ),
        (
            "--target-version",
            {
                "required": True,
                "choices": ["GRCh37", "GRCh38"],
                "help": "Target genome version (GRCh37 or GRCh38).",
            },
        ),
    )
    for flag, settings in options:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()

    fix_vcf_prefix(opts.input_vcf, opts.output_vcf, opts.rename_chr, opts.target_version)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Conda environment: bcftools plus pip.
channels:
  - conda-forge
  - bioconda
dependencies:
  - bioconda::bcftools
  - pip
  # Packages installed through pip inside the environment.
  - pip:
      - pip==24.3.1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
// Runs fix_vcf_prefix.py on a VCF so that its chromosome naming matches
// params.genome, and records the bcftools version used.
process FIX_VCF_PREFIX {
    tag "$meta.id"
    label 'process_low'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5d/5d097f8fee4db1239705e5d3929e50547796ab91a1e0bdf4201030ced8cb272d/data':
        'community.wave.seqera.io/library/bcftools_pip:8a460830487271c2' }"

    input:
    tuple val(meta), path(input)        // VCF to check
    tuple val(meta2), path(rename_chr)  // chromosome mapping file; may be empty

    output:
    tuple val(meta), path("*.vcf.gz"), emit: vcf
    path "versions.yml"              , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Only pass --rename-chr when a mapping file was actually supplied;
    // the flag is optional in fix_vcf_prefix.py and must not be left without a value.
    def rename_arg = rename_chr ? "--rename-chr $rename_chr" : ''

    if ("$input" == "${prefix}.vcf.gz") error "Input and output names are the same, set prefix in module configuration to disambiguate!"
    """
    fix_vcf_prefix.py \\
        $input \\
        ${prefix}.vcf.gz \\
        $rename_arg \\
        --target-version $params.genome

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bcftools: \$( bcftools --version 2>&1 | sed '1!d; s/^.*bcftools //' )
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"

    """
    echo '' | gzip > ${prefix}.vcf.gz

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bcftools: \$( bcftools --version 2>&1 | sed '1!d; s/^.*bcftools //' )
    END_VERSIONS
    """
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters