Skip to content

Commit

Permalink
#72 - Correctly handle ncRNA_gene GFF data
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Mar 6, 2024
1 parent 280a9d6 commit 0b903f9
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- #60 - Fix for missing protein IDs due to Genbank / GenBank (thanks holtgrewe)
- #64 - Split code/data versions. json.gz are now labelled according to data schema version (thanks holtgrewe)
- Renamed 'CHM13v2.0' to 'T2T-CHM13v2.0' so it could work with biocommons bioutils
- #72 - Correctly handle ncRNA_gene genes (thanks holtgrewe for reporting)

## [0.2.21] - 2023-08-14

Expand Down
8 changes: 4 additions & 4 deletions generate_transcript_data/gff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def _create_gene(feature, gene_accession):
description = None

# Non mandatory - Ensembl doesn't have some stuff on some RNAs
if feature.type in {"gene", "pseudogene"}:
if feature.type in {"gene", "pseudogene", "ncRNA_gene"}:
gene_name = feature.attr.get("Name")
description = feature.attr.get("description")
else:
Expand Down Expand Up @@ -178,7 +178,7 @@ def _add_transcript_data(self, transcript_accession, transcript, feature):

def _finish_process_features(self):
for transcript_accession, transcript_data in self.transcript_data_by_accession.items():
features_by_type = self.transcript_features_by_type.get(transcript_accession)
features_by_type = self.transcript_features_by_type.get(transcript_accession, {})

# Store coding start/stop transcript positions
# For RefSeq, we need to deal with alignment gaps, so easiest is to convert exons w/o gaps
Expand All @@ -194,7 +194,7 @@ def _finish_process_features(self):
exons_stranded_order = self._create_cdna_exons(cdna_matches_stranded_order)

else:
raw_exon_stranded_order = features_by_type["exon"]
raw_exon_stranded_order = features_by_type.get("exon", [])
raw_exon_stranded_order.sort(key=operator.itemgetter(0))
if not forward_strand:
raw_exon_stranded_order.reverse()
Expand Down Expand Up @@ -412,7 +412,7 @@ class GFF3Parser(GFFParser):
"""

GFF3_GENES = {"gene", "pseudogene"}
GFF3_GENES = {"gene", "pseudogene", "ncRNA_gene"}
GFF3_TRANSCRIPTS_DATA = {"exon", "CDS", "cDNA_match", "five_prime_UTR", "three_prime_UTR"}

def __init__(self, *args, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion generate_transcript_data/json_schema_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# After 0.2.22 we split version into separate code (pip) and data schema versions
# The cdot client will use its own major/minor to determine whether it can read these data files
JSON_SCHEMA_VERSION = "0.2.23"
JSON_SCHEMA_VERSION = "0.2.24"
2 changes: 2 additions & 0 deletions tests/test_data/ensembl_test.GRCh38.mt_tg.110.gff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
MT insdc ncRNA_gene 8295 8364 . + . ID=gene:ENSG00000210156;Name=MT-TK;biotype=Mt_tRNA;description=mitochondrially encoded tRNA-Lys (AAA/G) [Source:HGNC Symbol%3BAcc:HGNC:7489];gene_id=ENSG00000210156;logic_name=mt_genbank_import_homo_sapiens;version=1
MT insdc tRNA 8295 8364 . + . ID=transcript:ENST00000387421;Parent=gene:ENSG00000210156;Name=MT-TK-201;biotype=Mt_tRNA;tag=basic,Ensembl_canonical;transcript_id=ENST00000387421;transcript_support_level=NA;version=1
10 changes: 9 additions & 1 deletion tests/test_gff_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class Test(unittest.TestCase):
test_data_dir = os.path.join(this_file_dir, "test_data")
ENSEMBL_104_GTF_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.104.gtf")
ENSEMBL_108_GFF3_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.108.gff3")
ENSEMBL_110_GFF3_MT_TG_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.mt_tg.110.gff3")
# Older RefSeq, before Genbank => GenBank changed
REFSEQ_GFF3_FILENAME_2021 = os.path.join(test_data_dir, "refseq_test.GRCh38.p13_genomic.109.20210514.gff")
# Newer RefSeq, before Genbank => GenBank changed
Expand Down Expand Up @@ -113,4 +114,11 @@ def test_chrom_contig_conversion(self):
contig = transcript["genome_builds"][genome_build].get("contig")
self.assertEqual(contig, "NC_000001.11")


def test_ncrna_gene(self):
""" We were incorrectly missing ncRNA gene info @see https://github.com/SACGF/cdot/issues/72 """
genome_build = "GRCh38"
parser = GFF3Parser(self.ENSEMBL_110_GFF3_MT_TG_FILENAME, genome_build, self.FAKE_URL)
genes, transcripts = parser.get_genes_and_transcripts()
gene = genes["ENSG00000210156"]
gene_symbol = gene["gene_symbol"]
self.assertEqual(gene_symbol, "MT-TK2")

0 comments on commit 0b903f9

Please sign in to comment.