From ba54d328eff14204d36c5117cc8146314cadc306 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 7 Jul 2023 17:37:12 +0930
Subject: [PATCH] #50 - Biotype - missing in Ensembl transcripts. Bump version

---
 cdot/__init__.py                       |  2 +-
 generate_transcript_data/gff_parser.py | 41 +++++++++++---------------
 2 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/cdot/__init__.py b/cdot/__init__.py
index cba3bfc..eb1c1b0 100644
--- a/cdot/__init__.py
+++ b/cdot/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.19"
+__version__ = "0.2.20"
 
 
 def get_json_schema_version():
diff --git a/generate_transcript_data/gff_parser.py b/generate_transcript_data/gff_parser.py
index 875067b..1609975 100644
--- a/generate_transcript_data/gff_parser.py
+++ b/generate_transcript_data/gff_parser.py
@@ -11,6 +11,7 @@
 
 CONTIG = "contig"
 STRAND = "strand"
+EXCLUDE_BIOTYPES = {"transcript"}  # feature.type we won't put into biotype
 
 
 class GFFParser(abc.ABC):
@@ -65,14 +66,11 @@ def _parse(self):
                 raise e
 
     def _finish(self):
-        self._process_coding_features()
+        self._finish_process_features()
 
         for gene_data in self.gene_data_by_accession.values():
             # TODO: Can turn this back on if we want - just removing for diff
             gene_data.pop("id", None)
-            #############
-            # Turn set into comma sep string
-            gene_data["biotype"] = ",".join(sorted(gene_data["biotype"]))
             gene_data["url"] = self.url
 
         # At the moment the transcript dict is flat - need to move it into "genome_builds" dict
@@ -105,11 +103,11 @@ def _create_gene(feature, gene_accession):
         if feature.type in {"gene", "pseudogene"}:
             gene_name = feature.attr.get("Name")
             description = feature.attr.get("description")
-            biotype = feature.attr.get("biotype")
         else:
             gene_name = feature.attr.get("gene_name")
-            biotype = feature.attr.get("gene_biotype")
 
+        # RefSeq GRCh38 has gene.gene_biotype
+        biotype = feature.attr.get("gene_biotype") or feature.attr.get("biotype")
         if biotype:
             biotype_set.add(biotype)
 
@@ -141,17 +139,6 @@ def _store_other_chrom(data, feature):
         other_chroms.add(feature.iv.chrom)
         data["other_chroms"] = other_chroms
 
-    @staticmethod
-    def _get_biotype_from_transcript_accession(transcript_accession):
-        biotypes_by_transcript_id_start = {"NM_": "protein_coding", "NR_": "non_coding"}
-        for (start, biotype) in biotypes_by_transcript_id_start.items():
-            if transcript_accession.startswith(start):
-                return biotype
-
-        if "tRNA" in transcript_accession:
-            return "tRNA"
-        return None
-
     def _add_transcript_data(self, transcript_accession, transcript, feature):
         if feature.iv.chrom != transcript[CONTIG]:
             self._store_other_chrom(transcript, feature)
@@ -173,7 +160,7 @@ def _add_transcript_data(self, transcript_accession, transcript, feature):
             features_by_type["coding_starts"].append(feature.iv.start)
             features_by_type["coding_ends"].append(feature.iv.end)
 
-    def _process_coding_features(self):
+    def _finish_process_features(self):
         for transcript_accession, transcript_data in self.transcript_data_by_accession.items():
             features_by_type = self.transcript_features_by_type.get(transcript_accession)
 
@@ -222,6 +209,16 @@ def _process_coding_features(self):
                 exons_genomic_order.reverse()
             transcript_data["exons"] = exons_genomic_order
 
+            if "cds_start" in transcript_data:
+                biotype = "mRNA"
+            else:
+                biotype = "ncRNA"
+            transcript_data["biotype"].add(biotype)
+
+            if gene_accession := transcript_data.get("gene_version"):
+                gene_data = self.gene_data_by_accession[gene_accession]
+                gene_data["biotype"].update(transcript_data["biotype"])
+
     @staticmethod
     def _create_perfect_exons(raw_exon_stranded_order):
         """ Perfectly matched exons are basically a no-gap case of cDNA match """
@@ -374,8 +371,6 @@ def handle_feature(self, feature):
             if biotype is None:
                 # Ensembl GTFs store biotype info under gene_type or transcript_type
                 biotype = feature.attr.get("gene_type")
-                if biotype is None:
-                    biotype = self._get_biotype_from_transcript_accession(transcript_accession)
 
             if biotype:
                 gene_data["biotype"].add(biotype)
@@ -472,6 +467,9 @@ def handle_feature(self, feature):
                         raise ValueError("Don't know how to handle feature type %s (not child of gene)" % feature.type)
                     gene_data = self.gene_data_by_accession[gene_accession]
                     self._handle_transcript(gene_data, transcript_accession, feature)
+                    transcript = self.transcript_data_by_accession[transcript_accession]
+                    if feature.type not in EXCLUDE_BIOTYPES:
+                        transcript["biotype"].add(feature.type)
 
     @staticmethod
     def _get_dbxref(feature):
@@ -487,9 +485,6 @@ def _handle_transcript(self, gene_data, transcript_accession, feature):
         if transcript_accession not in self.transcript_data_by_accession:
             # print("_handle_transcript(%s, %s)" % (gene, feature))
             transcript_data = self._create_transcript(feature, transcript_accession, gene_data)
-            if biotype := self._get_biotype_from_transcript_accession(transcript_accession):
-                gene_data["biotype"].add(biotype)
-                transcript_data["biotype"].add(biotype)
 
             if feature.attr.get("partial"):
                 transcript_data["partial"] = 1