feat(IPVC-2390): translation exceptions (#28)

biocommons · Apr 29, 2024 · c007662 · c007662
1 parent b915f31
commit c007662
Show file tree

Hide file tree

Showing 14 changed files with 341 additions and 33 deletions.
diff --git a/sbin/generate-loading-data b/sbin/generate-loading-data
@@ -96,7 +96,9 @@ def parse_splign(fn, txdata):
         gene_id=gene_id,
         gene_symbol=gene_symbol,
         cds_se_i=cds_se_i,
-        exons_se_i=tx_exons_str)
+        exons_se_i=tx_exons_str,
+        transl_except=None,
+    )
     exonset = uta.formats.exonset.ExonSet(
         tx_ac=tx_ac,
         alt_ac=alt_ac,
@@ -116,7 +118,7 @@ if __name__ == "__main__":
 
     txinfo_out = uta.formats.txinfo.TxInfoWriter(gzip.open(os.path.join(opts.output_dir, txinfo_fn), "wt"))
     exonset_out = uta.formats.exonset.ExonSetWriter(gzip.open(os.path.join(opts.output_dir, exonset_fn), "wt"))
-    
+
     for fn in opts.FILES:
         _logger.info("# " + fn)
         txinfo, exonset = parse_splign(fn, txdata)

diff --git a/sbin/ncbi-parse-gbff b/sbin/ncbi-parse-gbff
@@ -113,15 +113,15 @@ if __name__ == "__main__":
                 skipped_ids.add(srf.id)
                 continue
             cds_se_i = srf.cds_se_i
-            ti = TxInfo(ac=srf.id,
-                        origin=opts.origin,
-                        gene_id=srf.gene_id,
-                        gene_symbol=srf.gene_symbol,
-                        cds_se_i=None if cds_se_i is None else "{},{}".format(
-                            *cds_se_i),
-                        exons_se_i=";".join(
-                            ["{},{}".format(*ese) for ese in srf.exons_se_i])
-                        )
+            ti = TxInfo(
+                ac=srf.id,
+                origin=opts.origin,
+                gene_id=srf.gene_id,
+                gene_symbol=srf.gene_symbol,
+                cds_se_i=None if cds_se_i is None else "{},{}".format(*cds_se_i),
+                exons_se_i=";".join(["{},{}".format(*ese) for ese in srf.exons_se_i]),
+                transl_except=TxInfo.serialize_transl_except(srf.transl_except),
+            )
             tiw.write(ti)
             genes.add(srf.gene_symbol)
         logger.info("{ng} genes in {fn} ({c})".format(ng=len(genes), fn=fn, c=prefixes))

diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py
@@ -58,14 +58,14 @@
 import importlib_resources
 import logging
 import logging.config
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 from Bio.Seq import Seq
 import Bio.SeqIO
 from Bio.SeqFeature import SeqFeature
 from Bio.SeqRecord import SeqRecord
 from bioutils.digests import seq_md5
-from more_itertools import first, one
+from more_itertools import one
 
 from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter
 from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
@@ -90,7 +90,7 @@ class MitoGeneData:
     origin: str = "NCBI"
     alignment_method: str = "splign"
     transl_table: Optional[str] = None
-    transl_except: Optional[str] = None
+    transl_except: Optional[List[str]] = None
     pro_ac: Optional[str] = None
     pro_seq: Optional[str] = None
 
@@ -195,11 +195,6 @@ def get_mito_genes(gbff_filepath: str):
                     assert int(xrefs["GeneID"]) == gene_id
                     assert feature_start == feature.location.start
                     assert feature_end == feature.location.end
-                    # if feature type not CDS, set defaults
-                    pro_ac = None
-                    pro_seq = None
-                    transl_table = None
-                    transl_except = None
 
                     # retrieve sequence, and reverse compliment if on reverse strand
                     ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}"
@@ -212,8 +207,12 @@ def get_mito_genes(gbff_filepath: str):
                         pro_ac = one(feature.qualifiers["protein_id"])
                         pro_seq = str(one(feature.qualifiers["translation"]))
                         transl_table = one(feature.qualifiers["transl_table"])
-                        if "transl_except" in feature.qualifiers:
-                            transl_except = one(feature.qualifiers["transl_except"])
+                        transl_except = feature.qualifiers.get("transl_except")
+                    else:
+                        pro_ac = None
+                        pro_seq = None
+                        transl_table = None
+                        transl_except = None
 
                     # yield gene data
                     yield MitoGeneData(
@@ -313,6 +312,7 @@ def main(ncbi_accession: str, output_dir: str):
                     mg.gene_symbol,
                     mg.cds_se_i(),
                     mg.exons_se_i(),
+                    TxInfo.serialize_transl_except(mg.transl_except),
                 )
             )
 

diff --git a/src/alembic/versions/14eed54ff90d_create_translation_exception_table.py b/src/alembic/versions/14eed54ff90d_create_translation_exception_table.py
@@ -0,0 +1,37 @@
+"""create translation_exception table
+
+Revision ID: 14eed54ff90d
+Revises: f85dd97bd9f5
+Create Date: 2024-04-25 23:57:12.455316
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '14eed54ff90d'
+down_revision: Union[str, None] = 'f85dd97bd9f5'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        'translation_exception',
+        sa.Column('translation_exception_id', sa.Integer(), autoincrement=True, nullable=False),
+        sa.Column('tx_ac', sa.Text(), nullable=False),
+        sa.Column('start_position', sa.Integer(), nullable=False),
+        sa.Column('end_position', sa.Integer(), nullable=False),
+        sa.Column('amino_acid', sa.Text(), nullable=False),
+        sa.CheckConstraint('start_position <= end_position', name='start_less_than_or_equal_to_end'),
+        sa.ForeignKeyConstraint(['tx_ac'], ['uta.transcript.ac'], onupdate='CASCADE', ondelete='CASCADE'),
+        sa.PrimaryKeyConstraint('translation_exception_id'),
+        schema='uta',
+    )
+
+
+def downgrade() -> None:
+    op.drop_table('translation_exception', schema='uta')
diff --git a/src/uta/formats/txinfo.py b/src/uta/formats/txinfo.py
@@ -1,10 +1,23 @@
 import csv
 import recordtype
-
-
-class TxInfo(recordtype.recordtype('TxInfo',
-                                   ['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i'])):
-    pass
+from typing import List, Optional
+
+
+# transl_except should be a semicolon-separated list:
+# (pos:333..335,aa:Sec);(pos:1017,aa:TERM)
+class TxInfo(
+    recordtype.recordtype(
+        'TxInfo',
+        ['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i', 'transl_except'],
+)):
+
+    @staticmethod
+    def serialize_transl_except(transl_except_list: Optional[List[str]]) -> Optional[str]:
+        """Helper for formatting transl_except list as a string."""
+        if transl_except_list is None:
+            return None
+        else:
+            return ";".join(transl_except_list)
 
 
 class TxInfoWriter(csv.DictWriter):

diff --git a/src/uta/loading.py b/src/uta/loading.py
@@ -7,7 +7,7 @@
 import itertools
 import logging
 import time
-from typing import Any
+from typing import Any, Dict, List
 
 from biocommons.seqrepo import SeqRepo
 from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND
@@ -681,9 +681,17 @@ def _fetch_origin_by_name(name):
                 cds_md5=cds_md5,
             )
             session.add(u_tx)
+
+            if ti.transl_except:
+                # if transl_except exists, it looks like this:
+                # (pos:333..335,aa:Sec);(pos:1017,aa:TERM)
+                transl_except_list = ti.transl_except.split(';')
+                te_list = _create_translation_exceptions(transcript=ti.ac, transl_except_list=transl_except_list)
+                for te in te_list:
+                    session.add(usam.TranslationException(**te))
+
         if u_tx.gene_id != ti.gene_id:
-            raise Exception("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format(
-                u_tx=u_tx, ti=ti))
+            raise Exception("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format(u_tx=u_tx, ti=ti))
 
         # state: transcript now exists, either existing or freshly-created
 
@@ -709,6 +717,40 @@ def _fetch_origin_by_name(name):
                 p=(i_ti + 1) / n_rows * 100))
 
 
+def _create_translation_exceptions(transcript: str, transl_except_list: List[str]) -> List[Dict]:
+    """
+    Create TranslationException object data where start and end positions are 0-based, from transl_except data that is 1-based.
+    For example, [(pos:333..335,aa:Sec), (pos:1017,aa:TERM)] should result in start and end positions [(332, 335), (1016, 1017)]
+    """
+    result = []
+
+    for te in transl_except_list:
+        # remove parens
+        te = te.replace('(','').replace(')','')
+
+        # extract positions
+        pos_str, aa_str = te.split(',')
+        pos_str = pos_str.removeprefix('pos:')
+        if '..' in pos_str:
+            start_position, _, end_position = pos_str.partition('..')
+        else:
+            start_position = end_position = pos_str
+
+        # extract amino acid
+        amino_acid = aa_str.removeprefix('aa:')
+
+        result.append(
+            {
+                'tx_ac': transcript,
+                'start_position': int(start_position) - 1,
+                'end_position': int(end_position),
+                'amino_acid': amino_acid,
+            }
+        )
+
+    return result
+
+
 def refresh_matviews(session, opts, cf):
     session.execute(text("set role {admin_role};".format(
         admin_role=cf.get("uta", "admin_role"))))

diff --git a/src/uta/models.py b/src/uta/models.py
@@ -140,6 +140,30 @@ class Transcript(Base):
     origin = sao.relationship("Origin", backref="transcripts")
 
 
+class TranslationException(Base):
+    """
+    Represents `transl_except` annotations on CDS features in transcript records from NCBI.
+
+    Examples:
+    /transl_except=(pos:333..335,aa:Sec)
+    /transl_except=(pos:1017,aa:TERM)
+    """
+
+    __tablename__ = "translation_exception"
+    __table_args__ = (
+        sa.CheckConstraint("start_position <= end_position", "start_less_than_or_equal_to_end"),
+    )
+
+    translation_exception_id = sa.Column(sa.Integer, autoincrement=True, primary_key=True)
+    tx_ac = sa.Column(sa.Text, sa.ForeignKey("transcript.ac", onupdate="CASCADE", ondelete="CASCADE"), nullable=False)
+    start_position = sa.Column(sa.Integer, nullable=False)
+    end_position = sa.Column(sa.Integer, nullable=False)
+    amino_acid = sa.Column(sa.Text, nullable=False)
+
+    # relationships:
+    transcript = sao.relationship("Transcript", backref="translation_exceptions")
+
+
 class ExonSet(Base):
     __tablename__ = "exon_set"
     __table_args__ = (

diff --git a/src/uta/parsers/seqrecord.py b/src/uta/parsers/seqrecord.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from functools import cached_property
-from typing import Union
+from typing import List, Optional
 
 import Bio.SeqRecord
 from Bio.SeqFeature import SeqFeature
@@ -31,9 +31,20 @@ def validate_features_by_type(features: dict[str, list]) -> None:
         if "gene" not in features or len(features["gene"]) != 1:
             raise SeqRecordFeatureError("Expected exactly one `gene` feature")
 
+    @cached_property
+    def cds_feature(self) -> Optional[SeqFeature]:
+        """
+        Returns the CDS feature for any coding transcript, None for any non-coding transcript.
+        """
+        cds_features = self.features_by_type.get("CDS")
+        if cds_features is None:
+            return None
+        else:
+            return cds_features[0]
+
     @cached_property
     def gene_feature(self) -> SeqFeature:
-        """Returns the gene feature, which is assumed to exist for all transcripts. """
+        """Returns the gene feature, which should exist for all transcripts."""
         return self.features_by_type.get("gene")[0]
 
     @property
@@ -67,3 +78,10 @@ def exons_se_i(self):
         else:
             exons = [self.gene_feature]
         return [(f.location.start.real, f.location.end.real) for f in exons]
+
+    @property
+    def transl_except(self) -> Optional[List[str]]:
+        if self.cds_feature is None:
+            return None
+        else:
+            return self.cds_feature.qualifiers.get("transl_except")
diff --git a/tests/data/txinfo.gz b/tests/data/txinfo.gz
diff --git a/tests/test_ncbi_process_mito.py b/tests/test_ncbi_process_mito.py
@@ -269,7 +269,7 @@ def test_get_mito_genes(self):
             "alt_end": 4262,
             "strand": 1,
             "transl_table": "2",
-            "transl_except": "(pos:4261..4262,aa:TERM)",
+            "transl_except": ["(pos:4261..4262,aa:TERM)"],
             "pro_ac": "YP_003024026.1",
             "pro_seq": "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPN"
             "PLVNLNLGLLFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYEVTLAIILLSTLLMSGSFNLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTP"

diff --git a/tests/test_uta_formats_txinfo.py b/tests/test_uta_formats_txinfo.py
@@ -0,0 +1,10 @@
+import unittest
+from uta.formats.txinfo import TxInfo
+
+
+class TestUtaFormats(unittest.TestCase):
+
+    def test_txinfo_serialize_transl_except(self):
+        self.assertIsNone(TxInfo.serialize_transl_except(None))
+        self.assertEqual(TxInfo.serialize_transl_except([]), '')
+        self.assertEqual(TxInfo.serialize_transl_except(['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)']), '(pos:333..335,aa:Sec);(pos:1017,aa:TERM)')