diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm index d302ce1b3..4923ba0fa 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm @@ -459,10 +459,15 @@ sub _create_transcript { my $tr_record = shift; my $gene_record = shift; - return unless $tr_record->{_children}; + return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS'; + + if (!$self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS') { + $self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (useful for NCBI microbe GFF/GTF annotations)"); + return; + } my $id = $tr_record->{attributes}->{transcript_id} || $self->_record_get_id($tr_record); - $id =~ s/^(gene|transcript)://i; + $id =~ s/^(gene|transcript|cds)[:-]//i; my $slice = $self->get_slice($tr_record->{chr}); @@ -518,8 +523,18 @@ sub _create_transcript { # check for exons for protein_coding biotype if ($biotype eq 'protein_coding' && scalar @exons == 0){ - $self->warning_msg("WARNING: No exons found for protein_coding transcript $id"); - return; + if ($self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS'){ + # create single-exon transcript based on a CDS without transcript parent + # example: NCBI microbe GFF/GTF annotations + push @exons, { + start => $tr_record->{start}, + end => $tr_record->{end}, + strand => $tr_record->{strand}, + }; + } else { + $self->warning_msg("WARNING: No exons found for protein-coding transcript $id"); + return; + } } # sort exons diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm index 34dd82bdd..376e1d001 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm @@ -227,11 +227,11 @@ sub _record_get_biotype { my ($self, $record, $gene_record) = @_; if(!exists($record->{_biotype})) { - # Ensembl-y GFFs have biotype as an attribute my $biotype = $record->{attributes}->{biotype} || $record->{attributes}->{transcript_type} || - $record->{attributes}->{transcript_biotype}; + $record->{attributes}->{transcript_biotype} || + ($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef); # others we need to (guess) work it out if(!$biotype) { diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm index 36fc11864..515ac0ac9 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm @@ -134,7 +134,12 @@ sub _record_get_parent_ids { my ($self, $record) = @_; if(!exists($record->{_parent_id})) { my $type = lc($record->{type}); - my @parent_ids = ($PARENTS{$type}) ? split(',',$record->{attributes}->{$PARENTS{$type}.'_id'}) : (); + my $parent_type = $PARENTS{$type}; + + # Avoid unassigned transcripts (fix for NCBI microbe GTFs) + $parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/; + + my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : (); $record->{_parent_id} = \@parent_ids; } return $record->{_parent_id}; @@ -162,7 +167,8 @@ sub _record_get_id { =head2 _record_get_biotype Arg 1 : hashref $transcript_record_hash - Example : $biotype = $as->_record_get_biotype($tr_record); + Arg 2 : hashref $gene_record_hash + Example : $biotype = $as->_record_get_biotype($tr_record, $gene_record); Description: Get sequence ontology (SO) biotype of this record. Attempts to find it in the "biotype", "transcript_type" or "transcript_biotype" attribute fields, and if that fails default to the source field. @@ -174,11 +180,13 @@ sub _record_get_id { =cut sub _record_get_biotype { - return - $_[1]->{attributes}->{transcript_biotype} || - $_[1]->{attributes}->{transcript_type} || - $_[1]->{attributes}->{biotype} || - $_[1]->{source}; + my ($self, $record, $gene_record) = @_; + my $biotype = $record->{attributes}->{transcript_biotype} || + $record->{attributes}->{transcript_type} || + $record->{attributes}->{biotype} || + ($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef) || + $record->{source}; + return $biotype; } 1; diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm index b826d35bd..59dcf1a39 100755 --- a/modules/Bio/EnsEMBL/VEP/Config.pm +++ b/modules/Bio/EnsEMBL/VEP/Config.pm @@ -256,6 +256,7 @@ our @VEP_PARAMS = ( 'ucsc_assembly=s', # required for phyloP, phastCons, e.g. use hg19 for GRCh37, hg38 for GRCh38 'ucsc_data_root=s', # replace if you have the data locally, defaults to http://hgdownload.cse.ucsc.edu/goldenpath/ 'custom_multi_allelic', # prevents filtering of custom annotation data when comma separated lists are assumed to be allele specific + 'cds_as_transcript_gxf', # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI microbe annotation) # plugins 'plugin=s@', # specify a method in a module in the plugins directory diff --git a/t/AnnotationSource_File_GFF.t b/t/AnnotationSource_File_GFF.t index f5c7c44f5..5e10bb447 100644 --- a/t/AnnotationSource_File_GFF.t +++ b/t/AnnotationSource_File_GFF.t @@ -284,7 +284,7 @@ SKIP: { 'type' => 'mRNA', ); my $trans = $as->lazy_load_transcript(\%feature_record, $feature_record{_gene_record}); - ok($tmp =~ /No exons found for protein_coding transcript/, 'no exons warning message'); + ok($tmp =~ /No exons found for protein-coding transcript/, 'no exons warning message'); # restore STDERR open(STDERR, ">&SAVE") or die "Can't restore STDERR\n";