From 958809ee21592a3e353a18e7d3159f1313950953 Mon Sep 17 00:00:00 2001 From: Cyriac Kandoth Date: Wed, 18 Nov 2020 09:25:28 -0800 Subject: [PATCH] Disable filter-vcf by default; update fasta path; better notes --- README.md | 15 ++++++++++----- vcf2maf.pl | 15 +++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e7bb9fa..c89917d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -vcfmaf +vcfmaf ======= To convert a [VCF](http://samtools.github.io/hts-specs/) into a [MAF](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format), each variant must be mapped to only one of all possible gene transcripts/isoforms that it might affect. But even within a single isoform, a `Missense_Mutation` close enough to a `Splice_Site`, can be labeled as either in MAF format, but not as both. **This selection of a single effect per variant, is often subjective. And that's what this project attempts to standardize.** The `vcf2maf` and `maf2maf` scripts leave most of that responsibility to [Ensembl's VEP](http://useast.ensembl.org/info/docs/tools/vep/index.html), but allows you to override their "canonical" isoforms, or use a custom ExAC VCF for annotation. Though the most useful feature is the **extensive support in parsing a wide range of crappy MAF-like or VCF-like formats** we've seen out in the wild. @@ -15,7 +15,7 @@ Find the [latest stable release](https://github.com/mskcc/vcf2maf/releases), dow perl vcf2maf.pl --man perl maf2maf.pl --man -If you don't have [VEP](http://useast.ensembl.org/info/docs/tools/vep/index.html) installed, then [follow this gist](https://gist.github.com/ckandoth/61c65ba96b011f286220fa4832ad2bc0). Of the many annotators out there, VEP is preferred for its large team of active coders, and its CLIA-compliant [HGVS formats](http://www.hgvs.org/mutnomen/recs.html). After installing VEP, you can test the script like so: +If you don't have [VEP](http://useast.ensembl.org/info/docs/tools/vep/index.html) installed, then [follow this gist](https://gist.github.com/ckandoth/61c65ba96b011f286220fa4832ad2bc0). Of the many annotators out there, VEP is preferred for its large team of active coders, and its CLIA-compliant [HGVS formats](http://www.hgvs.org/mutnomen/recs.html). After installing VEP, test out `vcf2maf` like this: perl vcf2maf.pl --input-vcf tests/test.vcf --output-maf tests/test.vep.maf @@ -23,15 +23,15 @@ To fill columns 16 and 17 of the output MAF with tumor/normal sample IDs, and to perl vcf2maf.pl --input-vcf tests/test.vcf --output-maf tests/test.vep.maf --tumor-id WD1309 --normal-id NB1308 -VCFs from variant callers like [VarScan](http://varscan.sourceforge.net/somatic-calling.html#somatic-output) use hardcoded sample IDs TUMOR/NORMAL in the genotype columns of the VCF. To have this script correctly parse the correct genotype columns, while still printing the proper IDs in the output MAF: +VCFs from variant callers like [VarScan](http://varscan.sourceforge.net/somatic-calling.html#somatic-output) use hardcoded sample IDs TUMOR/NORMAL to name genotype columns. To have `vcf2maf` correctly locate the columns to parse genotypes, while still printing proper sample IDs in the output MAF: perl vcf2maf.pl --input-vcf tests/test_varscan.vcf --output-maf tests/test_varscan.vep.maf --tumor-id WD1309 --normal-id NB1308 --vcf-tumor-id TUMOR --vcf-normal-id NORMAL -If you have the VEP script in a different folder like `/opt/vep`, and its cache in `/srv/vep`, there are options available to use those instead: +If VEP is installed under `/opt/vep` and the VEP cache is under `/srv/vep`, there are options available to tell `vcf2maf` where to find them: perl vcf2maf.pl --input-vcf tests/test.vcf --output-maf tests/test.vep.maf --vep-path /opt/vep --vep-data /srv/vep -If you want to skip running VEP and need a minimalist MAF format listing data from the input VCF only, then use the `--inhibit-vep` option. If your input VCF contains VEP annotation, then `vcf2maf` will try to extract it. But be warned that the accuracy of your resulting MAF depends on how VEP was operated upstream. `vcf2maf` operates VEP with very specific parameters to make sure everyone has comparable MAFs. +If you want to skip running VEP and need a minimalist MAF-like file listing data from the input VCF only, then use the `--inhibit-vep` option. If your input VCF contains VEP annotation, then `vcf2maf` will try to extract it. But be warned that the accuracy of your resulting MAF depends on how VEP was operated upstream. In standard operation, `vcf2maf` runs VEP with very specific parameters to make sure everyone produces comparable MAFs. So, it is strongly recommended to avoid `--inhibit-vep` unless you know what you're doing. maf2maf ------- @@ -53,3 +53,8 @@ License ------- Apache-2.0 | Apache License, Version 2.0 | https://www.apache.org/licenses/LICENSE-2.0 + +Citation +-------- + + Cyriac Kandoth. mskcc/vcf2maf: vcf2maf v1.6.19. (2020). doi:10.5281/zenodo.593251 diff --git a/vcf2maf.pl b/vcf2maf.pl index acee9db..116eac6 100644 --- a/vcf2maf.pl +++ b/vcf2maf.pl @@ -13,8 +13,8 @@ # Set any default paths and constants my ( $tumor_id, $normal_id ) = ( "TUMOR", "NORMAL" ); -my ( $vep_path, $vep_data, $vep_forks, $buffer_size, $any_allele, $inhibit_vep, $online ) = ( "$ENV{HOME}/vep", "$ENV{HOME}/.vep", 4, 5000, 0, 0, 0 ); -my ( $ref_fasta, $filter_vcf ) = ( "$ENV{HOME}/.vep/homo_sapiens/95_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz", "$ENV{HOME}/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" ); +my ( $vep_path, $vep_data, $vep_forks, $buffer_size, $any_allele, $inhibit_vep, $online ) = ( "$ENV{HOME}/miniconda3/bin", "$ENV{HOME}/.vep", 4, 5000, 0, 0, 0 ); +my ( $ref_fasta, $filter_vcf ) = ( "$ENV{HOME}/.vep/homo_sapiens/101_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz", "" ); my ( $species, $ncbi_build, $cache_version, $maf_center, $retain_info, $retain_fmt, $min_hom_vaf, $max_filter_ac ) = ( "homo_sapiens", "GRCh37", "", ".", "", "", 0.7, 10 ); my $perl_bin = $Config{perlpath}; @@ -392,8 +392,7 @@ sub GetBiotypePriority { # ::NOTE:: If input had no variants, don't break here, so we can continue to create an empty MAF ( !@regions_split or %flanking_bps ) or die "ERROR: You're either using an outdated samtools, or --ref-fasta is not the same genome build as your --input-vcf."; -# Skip filtering if not handling GRCh37, and filter-vcf is pointing to the default GRCh37 ExAC VCF -if(( $species eq "homo_sapiens" and $ncbi_build eq "GRCh37" and $filter_vcf ) or ( $filter_vcf and $filter_vcf ne "$ENV{HOME}/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" )) { +if( $filter_vcf ) { ( -s $filter_vcf ) or die "ERROR: Provided --filter-vcf is missing or empty: $filter_vcf\n"; # Query each variant locus on the filter VCF, using tabix, just like we used samtools earlier ( $lines, @regions_split ) = ( "", ()); @@ -436,7 +435,7 @@ sub GetBiotypePriority { warn "STATUS: Running VEP and writing to: $output_vcf\n"; # Make sure we can find the VEP script my $vep_script = ( -s "$vep_path/vep" ? "$vep_path/vep" : "$vep_path/variant_effect_predictor.pl" ); - ( -s $vep_script ) or die "ERROR: Cannot find VEP script in path: $vep_path\n"; + ( -s $vep_script ) or die "ERROR: Cannot find VEP script under: $vep_path\n"; # Contruct VEP command using some default options and run it my $vep_cmd = "$perl_bin $vep_script --species $species --assembly $ncbi_build --no_progress --no_stats --buffer_size $buffer_size --sift b --ccds --uniprot --hgvs --symbol --numbers --domains --gene_phenotype --canonical --protein --biotype --uniprot --tsl --variant_class --shift_hgvs 1 --check_existing --total_length --allele_number --no_escape --xref_refseq --failed 1 --vcf --flag_pick_allele --pick_order canonical,tsl,biotype,rank,ccds,length --dir $vep_data --fasta $ref_fasta --format vcf --input_file $input_vcf --output_file $output_vcf"; @@ -1136,12 +1135,12 @@ =head1 OPTIONS --any-allele When reporting co-located variants, allow mismatched variant alleles too --inhibit-vep Skip running VEP, but extract VEP annotation in VCF if found --online Use useastdb.ensembl.org instead of local cache (supports only GRCh38 VCFs listing <100 events) - --ref-fasta Reference FASTA file [~/.vep/homo_sapiens/95_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz] - --filter-vcf A VCF for FILTER tag common_variant. Set to 0 to disable [~/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz] + --ref-fasta Reference FASTA file [~/.vep/homo_sapiens/101_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz] + --filter-vcf A VCF for FILTER tag common_variant; Disabled by default [] --max-filter-ac Use tag common_variant if the filter-vcf reports a subpopulation AC higher than this [10] --species Ensembl-friendly name of species (e.g. mus_musculus for mouse) [homo_sapiens] --ncbi-build NCBI reference assembly of variants MAF (e.g. GRCm38 for mouse) [GRCh37] - --cache-version Version of offline cache to use with VEP (e.g. 75, 84, 91) [Default: Installed version] + --cache-version Version of offline cache to use with VEP (e.g. 75, 91, 101) [Default: Installed version] --maf-center Variant calling center to report in MAF [.] --retain-info Comma-delimited names of INFO fields to retain as extra columns in MAF [] --retain-fmt Comma-delimited names of FORMAT fields to retain as extra columns in MAF []