Skip to content

Commit

Permalink
#710 - Store "Note" from RefSeq GTFs
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Nov 13, 2023
1 parent 76cd459 commit bd53cc2
Showing 1 changed file with 12 additions and 3 deletions.
15 changes: 12 additions & 3 deletions generate_transcript_data/gff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,23 @@
import logging
import operator
import re
from bioutils.assemblies import make_name_ac_map
from collections import Counter, defaultdict
from typing import Optional

import HTSeq

from cdot.assembly_helper import get_name_ac_map

# Module-level string constants. NOTE(review): presumably used as transcript dict keys -
# "contig" and "strand" are also listed in GENOME_BUILD_FIELDS - confirm against callers.
CONTIG = "contig"
STRAND = "strand"
# Set of feature types that are not recorded as a biotype
EXCLUDE_BIOTYPES = {"transcript"} # feature.type we won't put into biotype


def get_name_ac_map(assembly_name):
    """Build the name/accession map for *assembly_name* via bioutils' ``make_name_ac_map``.

    A request for plain "GRCh37" is looked up as 'GRCh37.p13' instead,
    because the original build didn't have MT. Any other assembly name
    is passed through unchanged.
    """
    # Original build didn't have MT, so substitute the p13 patch release
    lookup_name = 'GRCh37.p13' if assembly_name == "GRCh37" else assembly_name
    return make_name_ac_map(lookup_name)


class GFFParser(abc.ABC):
CODING_FEATURES = {"CDS", "start_codon", "stop_codon"} # Use these to work out cds_start/cds_end
FEATURE_ALLOW_LIST = {}
Expand Down Expand Up @@ -78,7 +83,7 @@ def _finish(self):
gene_data["url"] = self.url

# At the moment the transcript dict is flat - need to move it into "genome_builds" dict
GENOME_BUILD_FIELDS = ["cds_start", "cds_end", "strand", "contig", "exons", "other_chroms", "tag"]
GENOME_BUILD_FIELDS = ["cds_start", "cds_end", "strand", "contig", "exons", "other_chroms", "tag", "note"]
for transcript_accession, transcript_data in self.transcript_data_by_accession.items():
if protein := self.transcript_proteins.get(transcript_accession):
transcript_data["protein"] = protein
Expand Down Expand Up @@ -167,6 +172,10 @@ def _add_transcript_data(self, transcript_accession, transcript, feature):
features_by_type["coding_starts"].append(feature.iv.start)
features_by_type["coding_ends"].append(feature.iv.end)

if note := feature.attr.get("Note"):
transcript["note"] = note


def _finish_process_features(self):
for transcript_accession, transcript_data in self.transcript_data_by_accession.items():
features_by_type = self.transcript_features_by_type.get(transcript_accession)
Expand Down

0 comments on commit bd53cc2

Please sign in to comment.