Skip to content

Commit

Permalink
feat(IPVC-2390): translation exceptions (#28)
Browse files Browse the repository at this point in the history
  • Loading branch information
nvta1209 authored Apr 29, 2024
1 parent b915f31 commit c007662
Show file tree
Hide file tree
Showing 14 changed files with 341 additions and 33 deletions.
6 changes: 4 additions & 2 deletions sbin/generate-loading-data
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ def parse_splign(fn, txdata):
gene_id=gene_id,
gene_symbol=gene_symbol,
cds_se_i=cds_se_i,
exons_se_i=tx_exons_str)
exons_se_i=tx_exons_str,
transl_except=None,
)
exonset = uta.formats.exonset.ExonSet(
tx_ac=tx_ac,
alt_ac=alt_ac,
Expand All @@ -116,7 +118,7 @@ if __name__ == "__main__":

txinfo_out = uta.formats.txinfo.TxInfoWriter(gzip.open(os.path.join(opts.output_dir, txinfo_fn), "wt"))
exonset_out = uta.formats.exonset.ExonSetWriter(gzip.open(os.path.join(opts.output_dir, exonset_fn), "wt"))

for fn in opts.FILES:
_logger.info("# " + fn)
txinfo, exonset = parse_splign(fn, txdata)
Expand Down
18 changes: 9 additions & 9 deletions sbin/ncbi-parse-gbff
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ if __name__ == "__main__":
skipped_ids.add(srf.id)
continue
cds_se_i = srf.cds_se_i
ti = TxInfo(ac=srf.id,
origin=opts.origin,
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
cds_se_i=None if cds_se_i is None else "{},{}".format(
*cds_se_i),
exons_se_i=";".join(
["{},{}".format(*ese) for ese in srf.exons_se_i])
)
ti = TxInfo(
ac=srf.id,
origin=opts.origin,
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
cds_se_i=None if cds_se_i is None else "{},{}".format(*cds_se_i),
exons_se_i=";".join(["{},{}".format(*ese) for ese in srf.exons_se_i]),
transl_except=TxInfo.serialize_transl_except(srf.transl_except),
)
tiw.write(ti)
genes.add(srf.gene_symbol)
logger.info("{ng} genes in {fn} ({c})".format(ng=len(genes), fn=fn, c=prefixes))
Expand Down
20 changes: 10 additions & 10 deletions sbin/ncbi_process_mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,14 @@
import importlib_resources
import logging
import logging.config
from typing import Dict, Optional
from typing import Dict, List, Optional

from Bio.Seq import Seq
import Bio.SeqIO
from Bio.SeqFeature import SeqFeature
from Bio.SeqRecord import SeqRecord
from bioutils.digests import seq_md5
from more_itertools import first, one
from more_itertools import one

from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter
from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
Expand All @@ -90,7 +90,7 @@ class MitoGeneData:
origin: str = "NCBI"
alignment_method: str = "splign"
transl_table: Optional[str] = None
transl_except: Optional[str] = None
transl_except: Optional[List[str]] = None
pro_ac: Optional[str] = None
pro_seq: Optional[str] = None

Expand Down Expand Up @@ -195,11 +195,6 @@ def get_mito_genes(gbff_filepath: str):
assert int(xrefs["GeneID"]) == gene_id
assert feature_start == feature.location.start
assert feature_end == feature.location.end
# if feature type not CDS, set defaults
pro_ac = None
pro_seq = None
transl_table = None
transl_except = None

# retrieve sequence, and reverse compliment if on reverse strand
ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}"
Expand All @@ -212,8 +207,12 @@ def get_mito_genes(gbff_filepath: str):
pro_ac = one(feature.qualifiers["protein_id"])
pro_seq = str(one(feature.qualifiers["translation"]))
transl_table = one(feature.qualifiers["transl_table"])
if "transl_except" in feature.qualifiers:
transl_except = one(feature.qualifiers["transl_except"])
transl_except = feature.qualifiers.get("transl_except")
else:
pro_ac = None
pro_seq = None
transl_table = None
transl_except = None

# yield gene data
yield MitoGeneData(
Expand Down Expand Up @@ -313,6 +312,7 @@ def main(ncbi_accession: str, output_dir: str):
mg.gene_symbol,
mg.cds_se_i(),
mg.exons_se_i(),
TxInfo.serialize_transl_except(mg.transl_except),
)
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""create translation_exception table
Revision ID: 14eed54ff90d
Revises: f85dd97bd9f5
Create Date: 2024-04-25 23:57:12.455316
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '14eed54ff90d'
down_revision: Union[str, None] = 'f85dd97bd9f5'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
op.create_table(
'translation_exception',
sa.Column('translation_exception_id', sa.Integer(), autoincrement=True, nullable=False),
sa.Column('tx_ac', sa.Text(), nullable=False),
sa.Column('start_position', sa.Integer(), nullable=False),
sa.Column('end_position', sa.Integer(), nullable=False),
sa.Column('amino_acid', sa.Text(), nullable=False),
sa.CheckConstraint('start_position <= end_position', name='start_less_than_or_equal_to_end'),
sa.ForeignKeyConstraint(['tx_ac'], ['uta.transcript.ac'], onupdate='CASCADE', ondelete='CASCADE'),
sa.PrimaryKeyConstraint('translation_exception_id'),
schema='uta',
)


def downgrade() -> None:
op.drop_table('translation_exception', schema='uta')
23 changes: 18 additions & 5 deletions src/uta/formats/txinfo.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,23 @@
import csv
import recordtype


class TxInfo(recordtype.recordtype('TxInfo',
['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i'])):
pass
from typing import List, Optional


# transl_except should be a semicolon-separated list:
# (pos:333..335,aa:Sec);(pos:1017,aa:TERM)
class TxInfo(
recordtype.recordtype(
'TxInfo',
['origin', 'ac', 'gene_id', 'gene_symbol', 'cds_se_i', 'exons_se_i', 'transl_except'],
)):

@staticmethod
def serialize_transl_except(transl_except_list: Optional[List[str]]) -> Optional[str]:
"""Helper for formatting transl_except list as a string."""
if transl_except_list is None:
return None
else:
return ";".join(transl_except_list)


class TxInfoWriter(csv.DictWriter):
Expand Down
48 changes: 45 additions & 3 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import itertools
import logging
import time
from typing import Any
from typing import Any, Dict, List

from biocommons.seqrepo import SeqRepo
from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND
Expand Down Expand Up @@ -681,9 +681,17 @@ def _fetch_origin_by_name(name):
cds_md5=cds_md5,
)
session.add(u_tx)

if ti.transl_except:
# if transl_except exists, it looks like this:
# (pos:333..335,aa:Sec);(pos:1017,aa:TERM)
transl_except_list = ti.transl_except.split(';')
te_list = _create_translation_exceptions(transcript=ti.ac, transl_except_list=transl_except_list)
for te in te_list:
session.add(usam.TranslationException(**te))

if u_tx.gene_id != ti.gene_id:
raise Exception("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format(
u_tx=u_tx, ti=ti))
raise Exception("{ti.ac}: GeneID changed from {u_tx.gene_id} to {ti.gene_id}".format(u_tx=u_tx, ti=ti))

# state: transcript now exists, either existing or freshly-created

Expand All @@ -709,6 +717,40 @@ def _fetch_origin_by_name(name):
p=(i_ti + 1) / n_rows * 100))


def _create_translation_exceptions(transcript: str, transl_except_list: List[str]) -> List[Dict]:
"""
Create TranslationException object data where start and end positions are 0-based, from transl_except data that is 1-based.
For example, [(pos:333..335,aa:Sec), (pos:1017,aa:TERM)] should result in start and end positions [(332, 335), (1016, 1017)]
"""
result = []

for te in transl_except_list:
# remove parens
te = te.replace('(','').replace(')','')

# extract positions
pos_str, aa_str = te.split(',')
pos_str = pos_str.removeprefix('pos:')
if '..' in pos_str:
start_position, _, end_position = pos_str.partition('..')
else:
start_position = end_position = pos_str

# extract amino acid
amino_acid = aa_str.removeprefix('aa:')

result.append(
{
'tx_ac': transcript,
'start_position': int(start_position) - 1,
'end_position': int(end_position),
'amino_acid': amino_acid,
}
)

return result


def refresh_matviews(session, opts, cf):
session.execute(text("set role {admin_role};".format(
admin_role=cf.get("uta", "admin_role"))))
Expand Down
24 changes: 24 additions & 0 deletions src/uta/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,30 @@ class Transcript(Base):
origin = sao.relationship("Origin", backref="transcripts")


class TranslationException(Base):
"""
Represents `transl_except` annotations on CDS features in transcript records from NCBI.
Examples:
/transl_except=(pos:333..335,aa:Sec)
/transl_except=(pos:1017,aa:TERM)
"""

__tablename__ = "translation_exception"
__table_args__ = (
sa.CheckConstraint("start_position <= end_position", "start_less_than_or_equal_to_end"),
)

translation_exception_id = sa.Column(sa.Integer, autoincrement=True, primary_key=True)
tx_ac = sa.Column(sa.Text, sa.ForeignKey("transcript.ac", onupdate="CASCADE", ondelete="CASCADE"), nullable=False)
start_position = sa.Column(sa.Integer, nullable=False)
end_position = sa.Column(sa.Integer, nullable=False)
amino_acid = sa.Column(sa.Text, nullable=False)

# relationships:
transcript = sao.relationship("Transcript", backref="translation_exceptions")


class ExonSet(Base):
__tablename__ = "exon_set"
__table_args__ = (
Expand Down
22 changes: 20 additions & 2 deletions src/uta/parsers/seqrecord.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import defaultdict
from functools import cached_property
from typing import Union
from typing import List, Optional

import Bio.SeqRecord
from Bio.SeqFeature import SeqFeature
Expand Down Expand Up @@ -31,9 +31,20 @@ def validate_features_by_type(features: dict[str, list]) -> None:
if "gene" not in features or len(features["gene"]) != 1:
raise SeqRecordFeatureError("Expected exactly one `gene` feature")

@cached_property
def cds_feature(self) -> Optional[SeqFeature]:
"""
Returns the CDS feature for any coding transcript, None for any non-coding transcript.
"""
cds_features = self.features_by_type.get("CDS")
if cds_features is None:
return None
else:
return cds_features[0]

@cached_property
def gene_feature(self) -> SeqFeature:
"""Returns the gene feature, which is assumed to exist for all transcripts. """
"""Returns the gene feature, which should exist for all transcripts."""
return self.features_by_type.get("gene")[0]

@property
Expand Down Expand Up @@ -67,3 +78,10 @@ def exons_se_i(self):
else:
exons = [self.gene_feature]
return [(f.location.start.real, f.location.end.real) for f in exons]

@property
def transl_except(self) -> Optional[List[str]]:
if self.cds_feature is None:
return None
else:
return self.cds_feature.qualifiers.get("transl_except")
Binary file added tests/data/txinfo.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_ncbi_process_mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def test_get_mito_genes(self):
"alt_end": 4262,
"strand": 1,
"transl_table": "2",
"transl_except": "(pos:4261..4262,aa:TERM)",
"transl_except": ["(pos:4261..4262,aa:TERM)"],
"pro_ac": "YP_003024026.1",
"pro_seq": "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPN"
"PLVNLNLGLLFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYEVTLAIILLSTLLMSGSFNLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTP"
Expand Down
10 changes: 10 additions & 0 deletions tests/test_uta_formats_txinfo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import unittest
from uta.formats.txinfo import TxInfo


class TestUtaFormats(unittest.TestCase):

def test_txinfo_serialize_transl_except(self):
self.assertIsNone(TxInfo.serialize_transl_except(None))
self.assertEqual(TxInfo.serialize_transl_except([]), '')
self.assertEqual(TxInfo.serialize_transl_except(['(pos:333..335,aa:Sec)', '(pos:1017,aa:TERM)']), '(pos:333..335,aa:Sec);(pos:1017,aa:TERM)')
Loading

0 comments on commit c007662

Please sign in to comment.