Skip to content

Commit

Permalink
#50 - Biotype - missing in Ensembl transcripts. Bump version
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Jul 7, 2023
1 parent d5161f9 commit ba54d32
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 24 deletions.
2 changes: 1 addition & 1 deletion cdot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.2.19"
__version__ = "0.2.20"


def get_json_schema_version():
Expand Down
41 changes: 18 additions & 23 deletions generate_transcript_data/gff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

CONTIG = "contig"
STRAND = "strand"
EXCLUDE_BIOTYPES = {"transcript"} # feature.type we won't put into biotype


class GFFParser(abc.ABC):
Expand Down Expand Up @@ -65,14 +66,11 @@ def _parse(self):
raise e

def _finish(self):
self._process_coding_features()
self._finish_process_features()

for gene_data in self.gene_data_by_accession.values():
# TODO: Can turn this back on if we want - just removing for diff
gene_data.pop("id", None)
#############
# Turn set into comma sep string
gene_data["biotype"] = ",".join(sorted(gene_data["biotype"]))
gene_data["url"] = self.url

# At the moment the transcript dict is flat - need to move it into "genome_builds" dict
Expand Down Expand Up @@ -105,11 +103,11 @@ def _create_gene(feature, gene_accession):
if feature.type in {"gene", "pseudogene"}:
gene_name = feature.attr.get("Name")
description = feature.attr.get("description")
biotype = feature.attr.get("biotype")
else:
gene_name = feature.attr.get("gene_name")
biotype = feature.attr.get("gene_biotype")

# RefSeq GRCh38 has gene.gene_biotype
biotype = feature.attr.get("gene_biotype") or feature.attr.get("biotype")
if biotype:
biotype_set.add(biotype)

Expand Down Expand Up @@ -141,17 +139,6 @@ def _store_other_chrom(data, feature):
other_chroms.add(feature.iv.chrom)
data["other_chroms"] = other_chroms

@staticmethod
def _get_biotype_from_transcript_accession(transcript_accession):
biotypes_by_transcript_id_start = {"NM_": "protein_coding", "NR_": "non_coding"}
for (start, biotype) in biotypes_by_transcript_id_start.items():
if transcript_accession.startswith(start):
return biotype

if "tRNA" in transcript_accession:
return "tRNA"
return None

def _add_transcript_data(self, transcript_accession, transcript, feature):
if feature.iv.chrom != transcript[CONTIG]:
self._store_other_chrom(transcript, feature)
Expand All @@ -173,7 +160,7 @@ def _add_transcript_data(self, transcript_accession, transcript, feature):
features_by_type["coding_starts"].append(feature.iv.start)
features_by_type["coding_ends"].append(feature.iv.end)

def _process_coding_features(self):
def _finish_process_features(self):
for transcript_accession, transcript_data in self.transcript_data_by_accession.items():
features_by_type = self.transcript_features_by_type.get(transcript_accession)

Expand Down Expand Up @@ -222,6 +209,16 @@ def _process_coding_features(self):
exons_genomic_order.reverse()
transcript_data["exons"] = exons_genomic_order

if "cds_start" in transcript_data:
biotype = "mRNA"
else:
biotype = "ncRNA"
transcript_data["biotype"].add(biotype)

if gene_accession := transcript_data.get("gene_version"):
gene_data = self.gene_data_by_accession[gene_accession]
gene_data["biotype"].update(transcript_data["biotype"])

@staticmethod
def _create_perfect_exons(raw_exon_stranded_order):
""" Perfectly matched exons are basically a no-gap case of cDNA match """
Expand Down Expand Up @@ -374,8 +371,6 @@ def handle_feature(self, feature):
if biotype is None:
# Ensembl GTFs store biotype info under gene_type or transcript_type
biotype = feature.attr.get("gene_type")
if biotype is None:
biotype = self._get_biotype_from_transcript_accession(transcript_accession)

if biotype:
gene_data["biotype"].add(biotype)
Expand Down Expand Up @@ -472,6 +467,9 @@ def handle_feature(self, feature):
raise ValueError("Don't know how to handle feature type %s (not child of gene)" % feature.type)
gene_data = self.gene_data_by_accession[gene_accession]
self._handle_transcript(gene_data, transcript_accession, feature)
transcript = self.transcript_data_by_accession[transcript_accession]
if feature.type not in EXCLUDE_BIOTYPES:
transcript["biotype"].add(feature.type)

@staticmethod
def _get_dbxref(feature):
Expand All @@ -487,9 +485,6 @@ def _handle_transcript(self, gene_data, transcript_accession, feature):
if transcript_accession not in self.transcript_data_by_accession:
# print("_handle_transcript(%s, %s)" % (gene, feature))
transcript_data = self._create_transcript(feature, transcript_accession, gene_data)
if biotype := self._get_biotype_from_transcript_accession(transcript_accession):
gene_data["biotype"].add(biotype)
transcript_data["biotype"].add(biotype)

if feature.attr.get("partial"):
transcript_data["partial"] = 1
Expand Down

0 comments on commit ba54d32

Please sign in to comment.