From a981e33eb48f72056cd16038a1ce5512d5b59792 Mon Sep 17 00:00:00 2001
From: David Lawrence
Date: Thu, 3 Feb 2022 11:43:43 +1030
Subject: [PATCH] PyHGVS loaders, fix tests

---
 .idea/.gitignore                              |   2 +
 README.md                                     |  25 +-
 bin/cdot_json.py                              |   9 +-
 cdot/__init__.py                              |   2 +-
 cdot/hgvs/dataproviders/json_data_provider.py |  23 +-
 cdot/pyhgvs/pyhgvs_transcript.py              | 115 +++++-
 tests/genome.py                               | 179 +++++++++
 tests/test_data/cdot.refseq.grch37.json       | 367 +++++++++---------
 tests/test_data/grch37.genome                 |   1 +
 tests/test_json_data_provider.py              |   2 +-
 tests/test_pyhgvs.py                          |  35 ++
 11 files changed, 554 insertions(+), 206 deletions(-)
 create mode 100644 tests/genome.py
 create mode 100644 tests/test_data/grch37.genome
 create mode 100644 tests/test_pyhgvs.py

diff --git a/.idea/.gitignore b/.idea/.gitignore
index 98f4b2f..4678a24 100644
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -1 +1,3 @@
 /workspace.xml
+misc.xml
+
diff --git a/README.md b/README.md
index 5dcdd3a..4b9d5b2 100644
--- a/README.md
+++ b/README.md
@@ -2,14 +2,14 @@
 [![PyPi version](https://img.shields.io/pypi/v/cdot.svg)](https://pypi.org/project/cdot/)
 [![Python versions](https://img.shields.io/pypi/pyversions/cdot.svg)](https://pypi.org/project/cdot/)
 
-cdot is used to load transcripts for the 2 most popular Python [HGVS](http://varnomen.hgvs.org/) libraries.
+cdot provides transcripts for the two most popular Python [HGVS](http://varnomen.hgvs.org/) libraries.
 
 It works by:
 
-* Converting RefSeq and Ensembl GTFs into a JSON format
-* Providing loaders from that JSON format (or a REST service)
+* Converting RefSeq/Ensembl GTFs to JSON
+* Providing loaders for those JSON.gz files, or a REST API via [cdot_rest](https://github.com/SACGF/cdot_rest)
 
-We currently support 788k transcripts, 5.5x as many as [Universal Transcript Archive](https://github.com/biocommons/uta/)
+We currently support ~800k transcripts, with API responses returned in under 0.1 seconds
 
 ## Examples
 
@@ -18,10 +18,10 @@ We currently support 788k transcripts, 5.5x as many as [Universal Transcript Arc
 ```
 from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider
 
-hp = JSONDataProvider({"GRCh37": "./cdot_220119.grch38.json.gz") # Uses local JSON file
-# hp = RESTDataProvider() # Uses API server at cdot.cc
+hdp = RESTDataProvider() # Uses API server at cdot.cc
+# hdp = JSONDataProvider(["./cdot-0.2.1.refseq.grch38.json.gz"]) # Uses local JSON file
 
-am = AssemblyMapper(hp,
+am = AssemblyMapper(hdp,
                     assembly_name='GRCh37',
                     alt_aln_method='splign',
                     replace_reference=True)
@@ -33,9 +33,18 @@ am.c_to_g(var_c)
 [PyHGVS](https://github.com/counsyl/hgvs) example:
 
 ```
-# TODO
+from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory, RESTPyHGVSTranscriptFactory
+
+factory = RESTPyHGVSTranscriptFactory()
+# factory = JSONPyHGVSTranscriptFactory(["./cdot-0.2.1.refseq.grch38.json.gz"]) # Uses local JSON file
+pyhgvs.parse_hgvs_name(hgvs_c, genome, get_transcript=factory.get_transcript_grch37)
+
 ```
 
+## Download data
+
+TODO
+
 ## Philosophical differences from Universal Transcript Archive
 
 cdot aims to be as simple as possible: convert existing Ensembl/RefSeq GTFs into JSON format
diff --git a/bin/cdot_json.py b/bin/cdot_json.py
index 1ce176e..df27f7c 100755
--- a/bin/cdot_json.py
+++ b/bin/cdot_json.py
@@ -162,6 +162,9 @@ def _cigar_to_gap_and_length(cigar):
 
 
 def write_json(output_filename, data, url=None):
+    # These single GTF/GFF JSON files are used by PyReference, and are not meant to be used for cdot HGVS.
+    # If you only want one build for cdot, you can always run merge_historical on a single file
+
     if url:
         reference_gtf = data.get("reference_gtf", {})
         reference_gtf["url"] = url
@@ -176,7 +179,9 @@ def write_json(output_filename, data, url=None):
 
 
 def merge_historical(args):
-    print("merge_historical")
+    # The merged files are not the same as the individual GTF JSON files
+    # @see https://github.com/SACGF/cdot/wiki/Transcript-JSON-format
+
     TRANSCRIPT_FIELDS = ["biotype", "start_codon", "stop_codon"]
     GENOME_BUILD_FIELDS = ["cds_start", "cds_end", "strand", "contig", "exons"]
 
@@ -188,8 +193,6 @@ def merge_historical(args):
         with gzip.open(filename) as f:
             reference_gtf = next(ijson.items(f, "reference_gtf"))
             url = reference_gtf["url"]
-
-        # PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes)
         transcript_gene_version = {}
         f.seek(0)  # Reset for next ijson call
diff --git a/cdot/__init__.py b/cdot/__init__.py
index 06a9c81..f3266a8 100644
--- a/cdot/__init__.py
+++ b/cdot/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.1.1"
+__version__ = "0.2.1"
 
 
 def get_json_schema_version():
diff --git a/cdot/hgvs/dataproviders/json_data_provider.py b/cdot/hgvs/dataproviders/json_data_provider.py
index 1f2b524..411b63d 100644
--- a/cdot/hgvs/dataproviders/json_data_provider.py
+++ b/cdot/hgvs/dataproviders/json_data_provider.py
@@ -203,18 +203,21 @@ class JSONDataProvider(AbstractJSONDataProvider):
     def _get_transcript(self, tx_ac):
         return self.transcripts.get(tx_ac)
 
-    def __init__(self, file_or_filename, mode=None, cache=None):
-        if isinstance(file_or_filename, str):
-            if file_or_filename.endswith(".gz"):
-                f = gzip.open(file_or_filename)
+    def __init__(self, file_or_filename_list, mode=None, cache=None):
+        assemblies = set()
+        self.transcripts = {}
+        for file_or_filename in file_or_filename_list:
+            if isinstance(file_or_filename, str):
+                if file_or_filename.endswith(".gz"):
+                    f = gzip.open(file_or_filename)
+                else:
+                    f = open(file_or_filename)
             else:
-                f = open(file_or_filename)
-        else:
-            f = file_or_filename
-        data = json.load(f)
-        assemblies = data["genome_builds"]
+                f = file_or_filename
+            data = json.load(f)
+            assemblies.update(data["genome_builds"])
+            self.transcripts.update(data["transcripts"])
         super().__init__(assemblies=assemblies, mode=mode, cache=cache)
-        self.transcripts = data["transcripts"]
 
 
 class RESTDataProvider(AbstractJSONDataProvider):
diff --git a/cdot/pyhgvs/pyhgvs_transcript.py b/cdot/pyhgvs/pyhgvs_transcript.py
index bdeb79b..eff0aa4 100644
--- a/cdot/pyhgvs/pyhgvs_transcript.py
+++ b/cdot/pyhgvs/pyhgvs_transcript.py
@@ -1,9 +1,123 @@
+import abc
+import gzip
+import json
 
-# if PyHGVS below a certain version - do it the old way
+import requests
 
-# If after - can use different exons, and start/stop codons to make it faster
+from pyhgvs.models import Transcript, Position, Exon
 
+
+class AbstractPyHGVSTranscriptFactory(abc.ABC):
+    def __init__(self):
+        pass
+
+    @abc.abstractmethod
+    def _get_transcript(self, transcript_id):
+        pass
+
+    def get_transcript_grch37(self, transcript_id):
+        return self.get_transcript(transcript_id, "GRCh37")
+
+    def get_transcript_grch38(self, transcript_id):
+        return self.get_transcript(transcript_id, "GRCh38")
+
+    def get_transcript(self, transcript_id, genome_build):
+        transcript_json = self._get_transcript(transcript_id)
+        if transcript_json is None:  # Unknown transcript (or REST 404)
+            return None
+        build_coords = transcript_json["genome_builds"].get(genome_build)
+        if build_coords is None:
+            return None
+
+        transcript_name = transcript_json['id']
+        if '.'
in transcript_name: + name, version = transcript_name.split('.') + else: + name, version = transcript_name, None + + contig = build_coords['contig'] + start = build_coords['exons'][0][0] + end = build_coords['exons'][-1][1] + is_forward_strand = build_coords['strand'] == '+' + + transcript = Transcript( + name=name, + version=int(version) if version is not None else None, + gene=transcript_json['gene_name'], + tx_position=Position( + contig, + start, + end, + is_forward_strand), + cds_position=Position( + contig, + build_coords['cds_start'], + build_coords['cds_end'], + is_forward_strand), + ) + + stranded_exons = sorted([ex[:3] for ex in build_coords['exons']], key=lambda ex: ex[2]) + for (exon_start, exon_end, exon_number) in stranded_exons: + transcript.exons.append( + Exon(transcript=transcript, + tx_position=Position( + contig, + exon_start, + exon_end, + is_forward_strand), + exon_number=exon_number)) + + return transcript + + +class JSONPyHGVSTranscriptFactory(AbstractPyHGVSTranscriptFactory): + def _get_transcript(self, transcript_id): + return self.transcripts.get(transcript_id) + + def __init__(self, file_or_filename_list): + super().__init__() + self.transcripts = {} + for file_or_filename in file_or_filename_list: + if isinstance(file_or_filename, str): + if file_or_filename.endswith(".gz"): + f = gzip.open(file_or_filename) + else: + f = open(file_or_filename) + else: + f = file_or_filename + data = json.load(f) + self.transcripts.update(data["transcripts"]) + + +class RESTPyHGVSTranscriptFactory(AbstractPyHGVSTranscriptFactory): + + def _get_transcript(self, transcript_id): + # We store None for 404 on REST + if transcript_id in self.transcripts: + return self.transcripts[transcript_id] + + transcript_url = self.url + "/transcript/" + transcript_id + response = requests.get(transcript_url) + if response.ok: + if 'application/json' in response.headers.get('Content-Type'): + transcript = response.json() + else: + raise ValueError("Non-json response received for '%s' - are you behind a firewall?" % transcript_url) + else: + transcript = None + self.transcripts[transcript_id] = transcript + return transcript + + def __init__(self, url=None): + super().__init__() + if url is None: + url = "http://cdot.cc" + self.url = url + self.transcripts = {} + + +# TODO: Write code to load SACGF pyhgvs changes, ie gaps and start/stop codons etc + # Changes from old loading: # See dot has no cds_start/end if non-coding diff --git a/tests/genome.py b/tests/genome.py new file mode 100644 index 0000000..f7f62e4 --- /dev/null +++ b/tests/genome.py @@ -0,0 +1,179 @@ +""" +From https://github.com/counsyl/hgvs +""" + +from __future__ import absolute_import +from __future__ import unicode_literals + +import itertools +import os + +from pyhgvs.variants import revcomp + + +try: + from pyfaidx import Genome as SequenceFileDB + # Allow pyflakes to ignore redefinition in except clause. 
+ SequenceFileDB +except ImportError: + SequenceFileDB = None + + +class MockGenomeError(Exception): + pass + + +class MockSequence(object): + def __init__(self, sequence): + self.sequence = sequence + + def __neg__(self): + """Return reverse complement sequence.""" + return MockSequence(revcomp(self.sequence)) + + def __str__(self): + return self.sequence + + def __repr__(self): + return 'MockSequence("%s")' % self.sequence + + +class MockChromosome(object): + def __init__(self, name, genome=None): + self.name = name + self.genome = genome + + def __getitem__(self, n): + """Return sequence from region [start, end) + + Coordinates are 0-based, end-exclusive.""" + if isinstance(n, slice): + return self.genome.get_seq(self.name, n.start, n.stop) + else: + return self.genome.get_seq(self.name, n, n+1) + + def __repr__(self): + return 'MockChromosome("%s")' % (self.name) + + +class MockGenome(object): + def __init__(self, lookup=None, filename=None, db_filename=None, + default_seq=None): + """ + A mock genome object that provides a pygr compatible interface. + + lookup: a list of ((chrom, start, end), seq) values that define + a lookup table for genome sequence requests. + filename: a stream or filename containing a lookup table. + db_filename: a fasta file to use for genome sequence requests. All + requests are recorded and can be writen to a lookup table file + using the `write` method. + default_seq: if given, this base will always be returned if + region is unavailable. + """ + self._chroms = {} + self._lookup = lookup if lookup is not None else {} + self._genome = None + self._default_seq = default_seq + + if db_filename: + # Use a real genome database. + if SequenceFileDB is None: + raise ValueError('pygr is not available.') + self._genome = SequenceFileDB(db_filename) + self._source_filename = db_filename + elif filename: + # Read genome sequence from lookup table. + self.read(filename) + self._source_filename = filename + + def __contains__(self, chrom): + """Return True if genome contains chromosome.""" + return chrom in (self._genome or self._chroms) + + def __getitem__(self, chrom): + """Return a chromosome by its name.""" + if chrom not in self._chroms: + self._chroms[chrom] = MockChromosome(chrom, self) + return self._chroms[chrom] + + def get_seq(self, chrom, start, end): + """Return a sequence by chromosome name and region [start, end). + + Coordinates are 0-based, end-exclusive. + """ + if self._genome: + # Get sequence from real genome object and save result. + seq = self._genome[chrom][start:end] + self._lookup[(chrom, start, end)] = str(seq) + return seq + else: + # Use lookup table to fetch genome sequence. + try: + return MockSequence(self._lookup[(chrom, start, end)]) + except KeyError: + if self._default_seq: + # Generate default sequence. + return ''.join(itertools.islice( + itertools.cycle(self._default_seq), + None, end - start)) + else: + raise MockGenomeError( + 'Sequence not in test data: %s:%d-%d source: %s' % + (chrom, start, end, self._source_filename)) + + def read(self, filename): + """Read a sequence lookup table from a file. + + filename: a filename string or file stream. 
+ """ + if hasattr(filename, 'read'): + infile = filename + else: + with open(filename) as infile: + return self.read(infile) + + for line in infile: + tokens = line.rstrip().split('\t') + chrom, start, end, seq = tokens + self._lookup[(chrom, int(start), int(end))] = seq + if chrom not in self._lookup: + self._chroms[chrom] = MockChromosome(chrom, self) + + def write(self, filename): + """Write a sequence lookup table to file.""" + if hasattr(filename, 'write'): + out = filename + else: + with open(filename, 'w') as out: + return self.write(out) + + for (chrom, start, end), seq in self._lookup.items(): + out.write('\t'.join(map(str, [chrom, start, end, seq])) + '\n') + + +class MockGenomeTestFile(MockGenome): + def __init__(self, lookup=None, filename=None, db_filename=None, + default_seq=None, create_data=False): + if not create_data: + db_filename = None + super(MockGenomeTestFile, self).__init__( + lookup=lookup, db_filename=db_filename, + filename=filename, + default_seq=default_seq) + + self._filename = filename + self._create_data = (db_filename is not None) + + if self._create_data and os.path.exists(filename): + # Clear output file when creating data. + os.remove(filename) + + def get_seq(self, chrom, start, end): + seq = super(MockGenomeTestFile, self).get_seq(chrom, start, end) + + # Save each query in append mode. + if self._create_data: + with open(self._filename, 'a') as out: + out.write('\t'.join(map(str, [chrom, start, end, seq])) + '\n') + return seq diff --git a/tests/test_data/cdot.refseq.grch37.json b/tests/test_data/cdot.refseq.grch37.json index dce4749..a7a090a 100644 --- a/tests/test_data/cdot.refseq.grch37.json +++ b/tests/test_data/cdot.refseq.grch37.json @@ -1,192 +1,197 @@ { "transcripts": { "NM_001637.3": { - "biotype": [ - "protein_coding" - ], - "cds_end": 36763753, - "cds_start": 36552857, - "contig": "NC_000007.13", - "exons": [ - [ - 36552548, - 36552986, - 20, - 2001, - 2440, - "M196 I1 M61 I1 M181" - ], - [ - 36561644, - 36561721, - 19, - 1924, - 2000, - null - ], - [ - 36570023, - 36570120, - 18, - 1827, - 1923, - null - ], - [ - 36571752, - 36571812, - 17, - 1767, - 1826, - null - ], - [ - 36571891, - 36571950, - 16, - 1708, - 1766, - null - ], - [ - 36579924, - 36580097, - 15, - 1535, - 1707, - null - ], - [ - 36588217, - 36588292, - 14, - 1460, - 1534, - null - ], - [ - 36589044, - 36589081, - 13, - 1423, - 1459, - null - ], - [ - 36616179, - 36616262, - 12, - 1340, - 1422, - null - ], - [ - 36633944, - 36634036, - 11, - 1248, - 1339, - null - ], - [ - 36655985, - 36656080, - 10, - 1153, - 1247, - null - ], - [ - 36657902, - 36657951, - 9, - 1104, - 1152, - null - ], - [ - 36660386, - 36660435, - 8, - 1055, - 1103, - null - ], - [ - 36661315, - 36661386, - 7, - 984, - 1054, - null - ], - [ - 36662795, - 36662856, - 6, - 923, - 983, - null - ], - [ - 36671641, - 36671712, - 5, - 852, - 922, - null - ], - [ - 36677456, - 36677516, - 4, - 792, - 851, - null - ], - [ - 36698770, - 36698870, - 3, - 692, - 791, - null - ], - [ - 36713547, - 36713614, - 2, - 625, - 691, - null - ], - [ - 36726303, - 36726399, - 1, - 529, - 624, - null - ], - [ - 36763626, - 36764154, - 0, - 1, - 528, - null - ] - ], - "start": 36552548, "start_codon": 401, - "stop": 36764154, "stop_codon": 2129, - "strand": "-", "id": "NM_001637.3", "gene_version": "313", "gene_name": "AOAH", - "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" + "biotype": [ + "protein_coding" + ], + 
"genome_builds": { + "GRCh37": { + "cds_end": 36763753, + "cds_start": 36552857, + "contig": "NC_000007.13", + "exons": [ + [ + 36552548, + 36552986, + 20, + 2001, + 2440, + "M196 I1 M61 I1 M181" + ], + [ + 36561644, + 36561721, + 19, + 1924, + 2000, + null + ], + [ + 36570023, + 36570120, + 18, + 1827, + 1923, + null + ], + [ + 36571752, + 36571812, + 17, + 1767, + 1826, + null + ], + [ + 36571891, + 36571950, + 16, + 1708, + 1766, + null + ], + [ + 36579924, + 36580097, + 15, + 1535, + 1707, + null + ], + [ + 36588217, + 36588292, + 14, + 1460, + 1534, + null + ], + [ + 36589044, + 36589081, + 13, + 1423, + 1459, + null + ], + [ + 36616179, + 36616262, + 12, + 1340, + 1422, + null + ], + [ + 36633944, + 36634036, + 11, + 1248, + 1339, + null + ], + [ + 36655985, + 36656080, + 10, + 1153, + 1247, + null + ], + [ + 36657902, + 36657951, + 9, + 1104, + 1152, + null + ], + [ + 36660386, + 36660435, + 8, + 1055, + 1103, + null + ], + [ + 36661315, + 36661386, + 7, + 984, + 1054, + null + ], + [ + 36662795, + 36662856, + 6, + 923, + 983, + null + ], + [ + 36671641, + 36671712, + 5, + 852, + 922, + null + ], + [ + 36677456, + 36677516, + 4, + 792, + 851, + null + ], + [ + 36698770, + 36698870, + 3, + 692, + 791, + null + ], + [ + 36713547, + 36713614, + 2, + 625, + 691, + null + ], + [ + 36726303, + 36726399, + 1, + 529, + 624, + null + ], + [ + 36763626, + 36764154, + 0, + 1, + 528, + null + ] + ], + "start": 36552548, + "stop": 36764154, + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" + } + } } }, - "cdot_version": "0.1.1" + "cdot_version": "0.2.1", + "genome_builds": ["GRCh37"] } \ No newline at end of file diff --git a/tests/test_data/grch37.genome b/tests/test_data/grch37.genome new file mode 100644 index 0000000..0a1e899 --- /dev/null +++ b/tests/test_data/grch37.genome @@ -0,0 +1 @@ +NC_000007.13 36561661 36561662 C diff --git a/tests/test_json_data_provider.py b/tests/test_json_data_provider.py index a7cd9be..ef3d3fd 100644 --- a/tests/test_json_data_provider.py +++ b/tests/test_json_data_provider.py @@ -14,7 +14,7 @@ def test_transcript(self): this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0))) # parent_dir = os.path.dirname(this_file_dir) test_json_file = os.path.join(this_file_dir, "test_data/cdot.refseq.grch37.json") - json_data_provider = JSONDataProvider(assembly_json={"GRCh37": test_json_file}) + json_data_provider = JSONDataProvider([test_json_file]) am = AssemblyMapper(json_data_provider, assembly_name='GRCh37', alt_aln_method='splign', replace_reference=True) HGVS_C_TO_G = [ diff --git a/tests/test_pyhgvs.py b/tests/test_pyhgvs.py new file mode 100644 index 0000000..f432ee1 --- /dev/null +++ b/tests/test_pyhgvs.py @@ -0,0 +1,35 @@ +import os +import unittest +from inspect import getsourcefile +from os.path import abspath + +import pyhgvs + +from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory +from .genome import MockGenomeTestFile + + +class TestPyHGVS(unittest.TestCase): + def test_transcript(self): + this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0))) + test_json_file = os.path.join(this_file_dir, "test_data/cdot.refseq.grch37.json") + factory = JSONPyHGVSTranscriptFactory([test_json_file]) + + HGVS_C_TO_G = [ + ('NM_001637.3:c.1582G>A', 'NC_000007.13:g.36561662C>T'), + ] + + genome = MockGenomeTestFile( + db_filename='grch37.fa', + filename=os.path.join(this_file_dir, 'test_data/grch37.genome'), + 
create_data=False) + + for hgvs_c, expected_hgvs_g in HGVS_C_TO_G: + result = pyhgvs.parse_hgvs_name(hgvs_c, genome, get_transcript=factory.get_transcript_grch37) + name = pyhgvs.HGVSName(expected_hgvs_g) + expected = (name.chrom, name.start, name.ref_allele, name.alt_allele) + self.assertEqual(result, expected) + + +if __name__ == '__main__': + unittest.main()
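
A minimal end-to-end sketch of the PyHGVS loader path this patch adds, combining the README example with tests/test_pyhgvs.py. It is a sketch under assumptions, not part of the change: it assumes pyhgvs and pyfaidx are installed, a local cdot 0.2.1 RefSeq GRCh37 JSON file (the `cdot-0.2.1.refseq.grch37.json.gz` name is illustrative), and an indexed GRCh37 FASTA `grch37.fa` whose contig names match the RefSeq accessions used in the JSON (e.g. NC_000007.13); neither file ships with this patch.

```
import pyhgvs
from pyfaidx import Fasta

from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory

# Any genome object supporting genome[chrom][start:end] should work; pyfaidx.Fasta is one option.
genome = Fasta("grch37.fa")  # assumed local FASTA with RefSeq contig names
factory = JSONPyHGVSTranscriptFactory(["cdot-0.2.1.refseq.grch37.json.gz"])

# Resolve an HGVS c. name to (chrom, offset, ref, alt), as tests/test_pyhgvs.py does.
chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
    "NM_001637.3:c.1582G>A", genome,
    get_transcript=factory.get_transcript_grch37)
print(chrom, offset, ref, alt)  # per the test data: NC_000007.13 36561662 C T
```

Swapping in RESTPyHGVSTranscriptFactory() avoids the local transcript JSON entirely, as shown in the README.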