From ba54d328eff14204d36c5117cc8146314cadc306 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 7 Jul 2023 17:37:12 +0930 Subject: [PATCH] #50 - Biotype - missing in Ensembl transcripts. Bump version --- cdot/__init__.py | 2 +- generate_transcript_data/gff_parser.py | 41 +++++++++++--------------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/cdot/__init__.py b/cdot/__init__.py index cba3bfc..eb1c1b0 100644 --- a/cdot/__init__.py +++ b/cdot/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.2.19" +__version__ = "0.2.20" def get_json_schema_version(): diff --git a/generate_transcript_data/gff_parser.py b/generate_transcript_data/gff_parser.py index 875067b..1609975 100644 --- a/generate_transcript_data/gff_parser.py +++ b/generate_transcript_data/gff_parser.py @@ -11,6 +11,7 @@ CONTIG = "contig" STRAND = "strand" +EXCLUDE_BIOTYPES = {"transcript"} # feature.type we won't put into biotype class GFFParser(abc.ABC): @@ -65,14 +66,11 @@ def _parse(self): raise e def _finish(self): - self._process_coding_features() + self._finish_process_features() for gene_data in self.gene_data_by_accession.values(): # TODO: Can turn this back on if we want - just removing for diff gene_data.pop("id", None) - ############# - # Turn set into comma sep string - gene_data["biotype"] = ",".join(sorted(gene_data["biotype"])) gene_data["url"] = self.url # At the moment the transcript dict is flat - need to move it into "genome_builds" dict @@ -105,11 +103,11 @@ def _create_gene(feature, gene_accession): if feature.type in {"gene", "pseudogene"}: gene_name = feature.attr.get("Name") description = feature.attr.get("description") - biotype = feature.attr.get("biotype") else: gene_name = feature.attr.get("gene_name") - biotype = feature.attr.get("gene_biotype") + # RefSeq GRCh38 has gene.gene_biotype + biotype = feature.attr.get("gene_biotype") or feature.attr.get("biotype") if biotype: biotype_set.add(biotype) @@ -141,17 +139,6 @@ def _store_other_chrom(data, feature): other_chroms.add(feature.iv.chrom) data["other_chroms"] = other_chroms - @staticmethod - def _get_biotype_from_transcript_accession(transcript_accession): - biotypes_by_transcript_id_start = {"NM_": "protein_coding", "NR_": "non_coding"} - for (start, biotype) in biotypes_by_transcript_id_start.items(): - if transcript_accession.startswith(start): - return biotype - - if "tRNA" in transcript_accession: - return "tRNA" - return None - def _add_transcript_data(self, transcript_accession, transcript, feature): if feature.iv.chrom != transcript[CONTIG]: self._store_other_chrom(transcript, feature) @@ -173,7 +160,7 @@ def _add_transcript_data(self, transcript_accession, transcript, feature): features_by_type["coding_starts"].append(feature.iv.start) features_by_type["coding_ends"].append(feature.iv.end) - def _process_coding_features(self): + def _finish_process_features(self): for transcript_accession, transcript_data in self.transcript_data_by_accession.items(): features_by_type = self.transcript_features_by_type.get(transcript_accession) @@ -222,6 +209,16 @@ def _process_coding_features(self): exons_genomic_order.reverse() transcript_data["exons"] = exons_genomic_order + if "cds_start" in transcript_data: + biotype = "mRNA" + else: + biotype = "ncRNA" + transcript_data["biotype"].add(biotype) + + if gene_accession := transcript_data.get("gene_version"): + gene_data = self.gene_data_by_accession[gene_accession] + gene_data["biotype"].update(transcript_data["biotype"]) + @staticmethod def _create_perfect_exons(raw_exon_stranded_order): """ Perfectly matched exons are basically a no-gap case of cDNA match """ @@ -374,8 +371,6 @@ def handle_feature(self, feature): if biotype is None: # Ensembl GTFs store biotype info under gene_type or transcript_type biotype = feature.attr.get("gene_type") - if biotype is None: - biotype = self._get_biotype_from_transcript_accession(transcript_accession) if biotype: gene_data["biotype"].add(biotype) @@ -472,6 +467,9 @@ def handle_feature(self, feature): raise ValueError("Don't know how to handle feature type %s (not child of gene)" % feature.type) gene_data = self.gene_data_by_accession[gene_accession] self._handle_transcript(gene_data, transcript_accession, feature) + transcript = self.transcript_data_by_accession[transcript_accession] + if feature.type not in EXCLUDE_BIOTYPES: + transcript["biotype"].add(feature.type) @staticmethod def _get_dbxref(feature): @@ -487,9 +485,6 @@ def _handle_transcript(self, gene_data, transcript_accession, feature): if transcript_accession not in self.transcript_data_by_accession: # print("_handle_transcript(%s, %s)" % (gene, feature)) transcript_data = self._create_transcript(feature, transcript_accession, gene_data) - if biotype := self._get_biotype_from_transcript_accession(transcript_accession): - gene_data["biotype"].add(biotype) - transcript_data["biotype"].add(biotype) if feature.attr.get("partial"): transcript_data["partial"] = 1