From 4d6fc8b86324ac713819768f44a3190a2003b5e3 Mon Sep 17 00:00:00 2001
From: Nuno Agostinho <nuno@ebi.ac.uk>
Date: Mon, 4 Mar 2024 16:48:41 +0000
Subject: [PATCH 1/5] Support NCBI GFF with CDS only

---
 .../VEP/AnnotationSource/File/BaseGXF.pm       | 18 ++++++++++++++----
 .../EnsEMBL/VEP/AnnotationSource/File/GFF.pm   |  4 ++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
index d302ce1b3..f19733410 100644
--- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
+++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
@@ -459,10 +459,10 @@ sub _create_transcript {
   my $tr_record = shift;
   my $gene_record = shift;
 
-  return unless $tr_record->{_children};
+  return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS';
 
   my $id = $tr_record->{attributes}->{transcript_id} || $self->_record_get_id($tr_record);
-  $id =~ s/^(gene|transcript)://i;
+  $id =~ s/^(gene|transcript|cds)[:-]//i;
 
   my $slice = $self->get_slice($tr_record->{chr});
 
@@ -518,8 +518,18 @@ sub _create_transcript {
 
   # check for exons for protein_coding biotype
   if ($biotype eq 'protein_coding' && scalar @exons == 0){
-    $self->warning_msg("WARNING: No exons found for protein_coding transcript $id");
-    return;
+    if ($tr_record->{type} eq 'CDS'){
+      # create single-exon transcript based on a CDS without transcript parent
+      # example: microbial annotations from NCBI
+      push @exons, {
+        start  => $tr_record->{start},
+        end    => $tr_record->{end},
+        strand => $tr_record->{strand},
+      };
+    } else {
+      $self->warning_msg("WARNING: No exons found for protein_coding transcript $id");
+      return;
+    }
   }
 
   # sort exons
diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm
index 34dd82bdd..2ec829e05 100644
--- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm
+++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm
@@ -227,11 +227,11 @@ sub _record_get_biotype {
   my ($self, $record, $gene_record) = @_;
 
   if(!exists($record->{_biotype})) {
-
     # Ensembl-y GFFs have biotype as an attribute
     my $biotype = $record->{attributes}->{biotype} ||
                   $record->{attributes}->{transcript_type} ||
-                  $record->{attributes}->{transcript_biotype};
+                  $record->{attributes}->{transcript_biotype} ||
+                  $gene_record->{attributes}->{gene_biotype};
 
     # others we need to (guess) work it out
     if(!$biotype) {

From d34421839078baa40df18c08ab2e0866bc84a34b Mon Sep 17 00:00:00 2001
From: Nuno Agostinho <nuno@ebi.ac.uk>
Date: Thu, 14 Mar 2024 16:02:56 +0000
Subject: [PATCH 2/5] Support NCBI GTF with CDS only

---
 modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
index 36fc11864..29988a39b 100644
--- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
+++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
@@ -134,7 +134,12 @@ sub _record_get_parent_ids {
   my ($self, $record) = @_;
   if(!exists($record->{_parent_id})) {
     my $type = lc($record->{type});
-    my @parent_ids = ($PARENTS{$type}) ? split(',',$record->{attributes}->{$PARENTS{$type}.'_id'}) : ();
+    my $parent_type = $PARENTS{$type};
+
+    # Avoid unassigned transcripts (fix for NCBI microbe GTFs)
+    $parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/;
+
+    my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : ();
     $record->{_parent_id} = \@parent_ids;
   }
   return $record->{_parent_id};
@@ -178,6 +183,7 @@ sub _record_get_biotype {
     $_[1]->{attributes}->{transcript_biotype} ||
     $_[1]->{attributes}->{transcript_type} ||
     $_[1]->{attributes}->{biotype} ||
+    $_[1]->{_gene_record}->{attributes}->{gene_biotype} ||
     $_[1]->{source};
 }
 

From 6a868d290cdfb975f217b002b3a6df7c3f2aa532 Mon Sep 17 00:00:00 2001
From: Nuno Agostinho <nuno@ebi.ac.uk>
Date: Thu, 5 Sep 2024 13:47:49 +0100
Subject: [PATCH 3/5] Use argument to avoid converting NCBI CDS to transcripts
 by default

---
 .../VEP/AnnotationSource/File/BaseGXF.pm       | 11 ++++++++---
 .../EnsEMBL/VEP/AnnotationSource/File/GFF.pm   |  2 +-
 .../EnsEMBL/VEP/AnnotationSource/File/GTF.pm   | 18 ++++++++++--------
 modules/Bio/EnsEMBL/VEP/Config.pm              |  1 +
 modules/Bio/EnsEMBL/VEP/Stats.pm               |  2 ++
 5 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
index f19733410..1056d660d 100644
--- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
+++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
@@ -461,6 +461,11 @@ sub _create_transcript {
 
   return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS';
 
+  if (!$self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS') {
+    $self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (required if using NCBI prokaryotic GFF/GTF annotations)");
+    return;
+  }
+
   my $id = $tr_record->{attributes}->{transcript_id} || $self->_record_get_id($tr_record);
   $id =~ s/^(gene|transcript|cds)[:-]//i;
 
@@ -518,16 +523,16 @@ sub _create_transcript {
 
   # check for exons for protein_coding biotype
   if ($biotype eq 'protein_coding' && scalar @exons == 0){
-    if ($tr_record->{type} eq 'CDS'){
+    if ($self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS'){
       # create single-exon transcript based on a CDS without transcript parent
-      # example: microbial annotations from NCBI
+      # example: prokaryotic GFF/GTF annotations from NCBI
       push @exons, {
         start  => $tr_record->{start},
         end    => $tr_record->{end},
         strand => $tr_record->{strand},
       };
     } else {
-      $self->warning_msg("WARNING: No exons found for protein_coding transcript $id");
+      $self->warning_msg("WARNING: No exons found for protein-coding transcript $id");
       return;
     }
   }
diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm
index 2ec829e05..376e1d001 100644
--- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm
+++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm
@@ -231,7 +231,7 @@ sub _record_get_biotype {
     my $biotype = $record->{attributes}->{biotype} ||
                   $record->{attributes}->{transcript_type} ||
                   $record->{attributes}->{transcript_biotype} ||
-                  $gene_record->{attributes}->{gene_biotype};
+                  ($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef);
 
     # others we need to (guess) work it out
     if(!$biotype) {
diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
index 29988a39b..d7fca2a52 100644
--- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
+++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
@@ -136,7 +136,7 @@ sub _record_get_parent_ids {
     my $type = lc($record->{type});
     my $parent_type = $PARENTS{$type};
 
-    # Avoid unassigned transcripts (fix for NCBI microbe GTFs)
+    # Avoid unassigned transcripts (fix for NCBI prokaryotic GTFs)
     $parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/;
 
     my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : ();
@@ -167,7 +167,8 @@ sub _record_get_id {
 =head2 _record_get_biotype
 
   Arg 1      : hashref $transcript_record_hash
-  Example    : $biotype = $as->_record_get_biotype($tr_record);
+  Arg 2      : hashref $gene_record_hash
+  Example    : $biotype = $as->_record_get_biotype($tr_record, $gene_record);
   Description: Get sequence ontology (SO) biotype of this record. Attempts to
                find it in the "biotype", "transcript_type" or "transcript_biotype"
                attribute fields, and if that fails default to the source field.
@@ -179,12 +180,13 @@ sub _record_get_id {
 =cut
 
 sub _record_get_biotype {
-  return
-    $_[1]->{attributes}->{transcript_biotype} ||
-    $_[1]->{attributes}->{transcript_type} ||
-    $_[1]->{attributes}->{biotype} ||
-    $_[1]->{_gene_record}->{attributes}->{gene_biotype} ||
-    $_[1]->{source};
+  my ($self, $record, $gene_record) = @_;
+  my $biotype = $record->{attributes}->{transcript_biotype} ||
+                $record->{attributes}->{transcript_type} ||
+                $record->{attributes}->{biotype} ||
+                ($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef) ||
+                $record->{source};
+  return $biotype;
 }
 
 1;
diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm
index b826d35bd..cd4182237 100755
--- a/modules/Bio/EnsEMBL/VEP/Config.pm
+++ b/modules/Bio/EnsEMBL/VEP/Config.pm
@@ -256,6 +256,7 @@ our @VEP_PARAMS = (
   'ucsc_assembly=s',         # required for phyloP, phastCons, e.g. use hg19 for GRCh37, hg38 for GRCh38
   'ucsc_data_root=s',        # replace if you have the data locally, defaults to http://hgdownload.cse.ucsc.edu/goldenpath/
   'custom_multi_allelic',    # prevents filtering of custom annotation data when comma separated lists are assumed to be allele specific
+  'cds_as_transcript_gxf',   # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI prokaryotic annotation)
 
   # plugins
   'plugin=s@',               # specify a method in a module in the plugins directory
diff --git a/modules/Bio/EnsEMBL/VEP/Stats.pm b/modules/Bio/EnsEMBL/VEP/Stats.pm
index 048bdb48d..5bedaf84e 100644
--- a/modules/Bio/EnsEMBL/VEP/Stats.pm
+++ b/modules/Bio/EnsEMBL/VEP/Stats.pm
@@ -684,6 +684,8 @@ sub generate_run_stats {
 
 sub generate_data_version {
   my $self = shift;
+  return [] unless defined $self->{info}->{version_data};
+
   my %version_data = %{ $self->{info}->{version_data} };
   my @return = map { [ $_, $version_data{$_} ] } sort keys %version_data;
   return \@return;

From e7646b0d8ea3eadaa52a0ee9c27693088483d28f Mon Sep 17 00:00:00 2001
From: Nuno Agostinho <nuno@ebi.ac.uk>
Date: Thu, 5 Sep 2024 14:05:58 +0100
Subject: [PATCH 4/5] Replace prokariotic with microbe (more precise)

---
 modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm | 4 ++--
 modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm     | 2 +-
 modules/Bio/EnsEMBL/VEP/Config.pm                        | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
index 1056d660d..4923ba0fa 100644
--- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
+++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
@@ -462,7 +462,7 @@ sub _create_transcript {
   return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS';
 
   if (!$self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS') {
-    $self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (required if using NCBI prokaryotic GFF/GTF annotations)");
+    $self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (useful for NCBI microbe GFF/GTF annotations)");
     return;
   }
 
@@ -525,7 +525,7 @@ sub _create_transcript {
   if ($biotype eq 'protein_coding' && scalar @exons == 0){
     if ($self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS'){
       # create single-exon transcript based on a CDS without transcript parent
-      # example: prokaryotic GFF/GTF annotations from NCBI
+      # example: NCBI microbe GFF/GTF annotations
       push @exons, {
         start  => $tr_record->{start},
         end    => $tr_record->{end},
diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
index d7fca2a52..515ac0ac9 100644
--- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
+++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
@@ -136,7 +136,7 @@ sub _record_get_parent_ids {
     my $type = lc($record->{type});
     my $parent_type = $PARENTS{$type};
 
-    # Avoid unassigned transcripts (fix for NCBI prokaryotic GTFs)
+    # Avoid unassigned transcripts (fix for NCBI microbe GTFs)
     $parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/;
 
     my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : ();
diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm
index cd4182237..59dcf1a39 100755
--- a/modules/Bio/EnsEMBL/VEP/Config.pm
+++ b/modules/Bio/EnsEMBL/VEP/Config.pm
@@ -256,7 +256,7 @@ our @VEP_PARAMS = (
   'ucsc_assembly=s',         # required for phyloP, phastCons, e.g. use hg19 for GRCh37, hg38 for GRCh38
   'ucsc_data_root=s',        # replace if you have the data locally, defaults to http://hgdownload.cse.ucsc.edu/goldenpath/
   'custom_multi_allelic',    # prevents filtering of custom annotation data when comma separated lists are assumed to be allele specific
-  'cds_as_transcript_gxf',   # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI prokaryotic annotation)
+  'cds_as_transcript_gxf',   # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI microbe annotation)
 
   # plugins
   'plugin=s@',               # specify a method in a module in the plugins directory

From d55caab72e563cc800460fea9308df0d0518ae9d Mon Sep 17 00:00:00 2001
From: Nuno Agostinho <nuno@ebi.ac.uk>
Date: Thu, 5 Sep 2024 15:07:35 +0100
Subject: [PATCH 5/5] Fix unit test

---
 t/AnnotationSource_File_GFF.t | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/AnnotationSource_File_GFF.t b/t/AnnotationSource_File_GFF.t
index f5c7c44f5..5e10bb447 100644
--- a/t/AnnotationSource_File_GFF.t
+++ b/t/AnnotationSource_File_GFF.t
@@ -284,7 +284,7 @@ SKIP: {
      'type' => 'mRNA',
   );
   my $trans = $as->lazy_load_transcript(\%feature_record, $feature_record{_gene_record});
-  ok($tmp =~ /No exons found for protein_coding transcript/, 'no exons warning message');
+  ok($tmp =~ /No exons found for protein-coding transcript/, 'no exons warning message');
 
   # restore STDERR
   open(STDERR, ">&SAVE") or die "Can't restore STDERR\n";