From 4d6fc8b86324ac713819768f44a3190a2003b5e3 Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Mon, 4 Mar 2024 16:48:41 +0000 Subject: [PATCH 1/5] Support NCBI GFF with CDS only --- .../VEP/AnnotationSource/File/BaseGXF.pm | 18 ++++++++++++++---- .../EnsEMBL/VEP/AnnotationSource/File/GFF.pm | 4 ++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm index d302ce1b3..f19733410 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm @@ -459,10 +459,10 @@ sub _create_transcript { my $tr_record = shift; my $gene_record = shift; - return unless $tr_record->{_children}; + return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS'; my $id = $tr_record->{attributes}->{transcript_id} || $self->_record_get_id($tr_record); - $id =~ s/^(gene|transcript)://i; + $id =~ s/^(gene|transcript|cds)[:-]//i; my $slice = $self->get_slice($tr_record->{chr}); @@ -518,8 +518,18 @@ sub _create_transcript { # check for exons for protein_coding biotype if ($biotype eq 'protein_coding' && scalar @exons == 0){ - $self->warning_msg("WARNING: No exons found for protein_coding transcript $id"); - return; + if ($tr_record->{type} eq 'CDS'){ + # create single-exon transcript based on a CDS without transcript parent + # example: microbial annotations from NCBI + push @exons, { + start => $tr_record->{start}, + end => $tr_record->{end}, + strand => $tr_record->{strand}, + }; + } else { + $self->warning_msg("WARNING: No exons found for protein_coding transcript $id"); + return; + } } # sort exons diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm index 34dd82bdd..2ec829e05 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm @@ -227,11 +227,11 @@ sub _record_get_biotype { my ($self, $record, $gene_record) = @_; if(!exists($record->{_biotype})) { - # Ensembl-y GFFs have biotype as an attribute my $biotype = $record->{attributes}->{biotype} || $record->{attributes}->{transcript_type} || - $record->{attributes}->{transcript_biotype}; + $record->{attributes}->{transcript_biotype} || + $gene_record->{attributes}->{gene_biotype}; # others we need to (guess) work it out if(!$biotype) { From d34421839078baa40df18c08ab2e0866bc84a34b Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Thu, 14 Mar 2024 16:02:56 +0000 Subject: [PATCH 2/5] Support NCBI GTF with CDS only --- modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm index 36fc11864..29988a39b 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm @@ -134,7 +134,12 @@ sub _record_get_parent_ids { my ($self, $record) = @_; if(!exists($record->{_parent_id})) { my $type = lc($record->{type}); - my @parent_ids = ($PARENTS{$type}) ? split(',',$record->{attributes}->{$PARENTS{$type}.'_id'}) : (); + my $parent_type = $PARENTS{$type}; + + # Avoid unassigned transcripts (fix for NCBI microbe GTFs) + $parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/; + + my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : (); $record->{_parent_id} = \@parent_ids; } return $record->{_parent_id}; @@ -178,6 +183,7 @@ sub _record_get_biotype { $_[1]->{attributes}->{transcript_biotype} || $_[1]->{attributes}->{transcript_type} || $_[1]->{attributes}->{biotype} || + $_[1]->{_gene_record}->{attributes}->{gene_biotype} || $_[1]->{source}; } From 6a868d290cdfb975f217b002b3a6df7c3f2aa532 Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Thu, 5 Sep 2024 13:47:49 +0100 Subject: [PATCH 3/5] Use argument to avoid converting NCBI CDS to transcripts by default --- .../VEP/AnnotationSource/File/BaseGXF.pm | 11 ++++++++--- .../EnsEMBL/VEP/AnnotationSource/File/GFF.pm | 2 +- .../EnsEMBL/VEP/AnnotationSource/File/GTF.pm | 18 ++++++++++-------- modules/Bio/EnsEMBL/VEP/Config.pm | 1 + modules/Bio/EnsEMBL/VEP/Stats.pm | 2 ++ 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm index f19733410..1056d660d 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm @@ -461,6 +461,11 @@ sub _create_transcript { return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS'; + if (!$self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS') { + $self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (required if using NCBI prokaryotic GFF/GTF annotations)"); + return; + } + my $id = $tr_record->{attributes}->{transcript_id} || $self->_record_get_id($tr_record); $id =~ s/^(gene|transcript|cds)[:-]//i; @@ -518,16 +523,16 @@ sub _create_transcript { # check for exons for protein_coding biotype if ($biotype eq 'protein_coding' && scalar @exons == 0){ - if ($tr_record->{type} eq 'CDS'){ + if ($self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS'){ # create single-exon transcript based on a CDS without transcript parent - # example: microbial annotations from NCBI + # example: prokaryotic GFF/GTF annotations from NCBI push @exons, { start => $tr_record->{start}, end => $tr_record->{end}, strand => $tr_record->{strand}, }; } else { - $self->warning_msg("WARNING: No exons found for protein_coding transcript $id"); + $self->warning_msg("WARNING: No exons found for protein-coding transcript $id"); return; } } diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm index 2ec829e05..376e1d001 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm @@ -231,7 +231,7 @@ sub _record_get_biotype { my $biotype = $record->{attributes}->{biotype} || $record->{attributes}->{transcript_type} || $record->{attributes}->{transcript_biotype} || - $gene_record->{attributes}->{gene_biotype}; + ($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef); # others we need to (guess) work it out if(!$biotype) { diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm index 29988a39b..d7fca2a52 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm @@ -136,7 +136,7 @@ sub _record_get_parent_ids { my $type = lc($record->{type}); my $parent_type = $PARENTS{$type}; - # Avoid unassigned transcripts (fix for NCBI microbe GTFs) + # Avoid unassigned transcripts (fix for NCBI prokaryotic GTFs) $parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/; my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : (); @@ -167,7 +167,8 @@ sub _record_get_id { =head2 _record_get_biotype Arg 1 : hashref $transcript_record_hash - Example : $biotype = $as->_record_get_biotype($tr_record); + Arg 2 : hashref $gene_record_hash + Example : $biotype = $as->_record_get_biotype($tr_record, $gene_record); Description: Get sequence ontology (SO) biotype of this record. Attempts to find it in the "biotype", "transcript_type" or "transcript_biotype" attribute fields, and if that fails default to the source field. @@ -179,12 +180,13 @@ sub _record_get_id { =cut sub _record_get_biotype { - return - $_[1]->{attributes}->{transcript_biotype} || - $_[1]->{attributes}->{transcript_type} || - $_[1]->{attributes}->{biotype} || - $_[1]->{_gene_record}->{attributes}->{gene_biotype} || - $_[1]->{source}; + my ($self, $record, $gene_record) = @_; + my $biotype = $record->{attributes}->{transcript_biotype} || + $record->{attributes}->{transcript_type} || + $record->{attributes}->{biotype} || + ($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef) || + $record->{source}; + return $biotype; } 1; diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm index b826d35bd..cd4182237 100755 --- a/modules/Bio/EnsEMBL/VEP/Config.pm +++ b/modules/Bio/EnsEMBL/VEP/Config.pm @@ -256,6 +256,7 @@ our @VEP_PARAMS = ( 'ucsc_assembly=s', # required for phyloP, phastCons, e.g. use hg19 for GRCh37, hg38 for GRCh38 'ucsc_data_root=s', # replace if you have the data locally, defaults to http://hgdownload.cse.ucsc.edu/goldenpath/ 'custom_multi_allelic', # prevents filtering of custom annotation data when comma separated lists are assumed to be allele specific + 'cds_as_transcript_gxf', # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI prokaryotic annotation) # plugins 'plugin=s@', # specify a method in a module in the plugins directory diff --git a/modules/Bio/EnsEMBL/VEP/Stats.pm b/modules/Bio/EnsEMBL/VEP/Stats.pm index 048bdb48d..5bedaf84e 100644 --- a/modules/Bio/EnsEMBL/VEP/Stats.pm +++ b/modules/Bio/EnsEMBL/VEP/Stats.pm @@ -684,6 +684,8 @@ sub generate_run_stats { sub generate_data_version { my $self = shift; + return [] unless defined $self->{info}->{version_data}; + my %version_data = %{ $self->{info}->{version_data} }; my @return = map { [ $_, $version_data{$_} ] } sort keys %version_data; return \@return; From e7646b0d8ea3eadaa52a0ee9c27693088483d28f Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Thu, 5 Sep 2024 14:05:58 +0100 Subject: [PATCH 4/5] Replace prokariotic with microbe (more precise) --- modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm | 4 ++-- modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm | 2 +- modules/Bio/EnsEMBL/VEP/Config.pm | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm index 1056d660d..4923ba0fa 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm @@ -462,7 +462,7 @@ sub _create_transcript { return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS'; if (!$self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS') { - $self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (required if using NCBI prokaryotic GFF/GTF annotations)"); + $self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (useful for NCBI microbe GFF/GTF annotations)"); return; } @@ -525,7 +525,7 @@ sub _create_transcript { if ($biotype eq 'protein_coding' && scalar @exons == 0){ if ($self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS'){ # create single-exon transcript based on a CDS without transcript parent - # example: prokaryotic GFF/GTF annotations from NCBI + # example: NCBI microbe GFF/GTF annotations push @exons, { start => $tr_record->{start}, end => $tr_record->{end}, diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm index d7fca2a52..515ac0ac9 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm @@ -136,7 +136,7 @@ sub _record_get_parent_ids { my $type = lc($record->{type}); my $parent_type = $PARENTS{$type}; - # Avoid unassigned transcripts (fix for NCBI prokaryotic GTFs) + # Avoid unassigned transcripts (fix for NCBI microbe GTFs) $parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/; my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : (); diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm index cd4182237..59dcf1a39 100755 --- a/modules/Bio/EnsEMBL/VEP/Config.pm +++ b/modules/Bio/EnsEMBL/VEP/Config.pm @@ -256,7 +256,7 @@ our @VEP_PARAMS = ( 'ucsc_assembly=s', # required for phyloP, phastCons, e.g. use hg19 for GRCh37, hg38 for GRCh38 'ucsc_data_root=s', # replace if you have the data locally, defaults to http://hgdownload.cse.ucsc.edu/goldenpath/ 'custom_multi_allelic', # prevents filtering of custom annotation data when comma separated lists are assumed to be allele specific - 'cds_as_transcript_gxf', # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI prokaryotic annotation) + 'cds_as_transcript_gxf', # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI microbe annotation) # plugins 'plugin=s@', # specify a method in a module in the plugins directory From d55caab72e563cc800460fea9308df0d0518ae9d Mon Sep 17 00:00:00 2001 From: Nuno Agostinho Date: Thu, 5 Sep 2024 15:07:35 +0100 Subject: [PATCH 5/5] Fix unit test --- t/AnnotationSource_File_GFF.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/AnnotationSource_File_GFF.t b/t/AnnotationSource_File_GFF.t index f5c7c44f5..5e10bb447 100644 --- a/t/AnnotationSource_File_GFF.t +++ b/t/AnnotationSource_File_GFF.t @@ -284,7 +284,7 @@ SKIP: { 'type' => 'mRNA', ); my $trans = $as->lazy_load_transcript(\%feature_record, $feature_record{_gene_record}); - ok($tmp =~ /No exons found for protein_coding transcript/, 'no exons warning message'); + ok($tmp =~ /No exons found for protein-coding transcript/, 'no exons warning message'); # restore STDERR open(STDERR, ">&SAVE") or die "Can't restore STDERR\n";