PyHGVS loaders, fix tests

SACGF · Feb 3, 2022 · a981e33 · a981e33
1 parent 48b6817
commit a981e33
Show file tree

Hide file tree

Showing 11 changed files with 554 additions and 206 deletions.
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/README.md b/README.md
@@ -2,14 +2,14 @@
 
 [![PyPi version](https://img.shields.io/pypi/v/cdot.svg)](https://pypi.org/project/cdot/) [![Python versions](https://img.shields.io/pypi/pyversions/cdot.svg)](https://pypi.org/project/cdot/)
 
-cdot is used to load transcripts for the 2 most popular Python [HGVS](http://varnomen.hgvs.org/) libraries.
+cdot provides transcripts for the 2 most popular Python [HGVS](http://varnomen.hgvs.org/) libraries.
 
 It works by:
 
-* Converting RefSeq and Ensembl GTFs into a JSON format
-* Providing loaders from that JSON format (or a REST service)
+* Converting RefSeq/Ensembl GTFs to JSON
+* Providing loaders from JSON.gz files, or REST API via [cdot_rest](https://github.com/SACGF/cdot_rest))
 
-We currently support 788k transcripts, 5.5x as many as [Universal Transcript Archive](https://github.com/biocommons/uta/)
+We currently support ~800k transcripts, with API responses under 0.1 second
 
 ## Examples
 
@@ -18,10 +18,10 @@ We currently support 788k transcripts, 5.5x as many as [Universal Transcript Arc
 ```
 from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider
 
-hp = JSONDataProvider({"GRCh37": "./cdot_220119.grch38.json.gz")  # Uses local JSON file
-# hp = RESTDataProvider()  # Uses API server at cdot.cc
+hdp = RESTDataProvider()  # Uses API server at cdot.cc
+# hdp = JSONDataProvider(["./cdot-0.2.1.refseq.grch38.json.gz"])  # Uses local JSON file
 
-am = AssemblyMapper(hp,
+am = AssemblyMapper(hdp,
                     assembly_name='GRCh37',
                     alt_aln_method='splign', replace_reference=True)
 
@@ -33,9 +33,18 @@ am.c_to_g(var_c)
 [PyHGVS](https://github.com/counsyl/hgvs) example:
 
 ```
-# TODO
+from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory, RESTPyHGVSTranscriptFactory
+
+factory = RESTPyHGVSTranscriptFactory()
+# factory = JSONPyHGVSTranscriptFactory(["./cdot-0.2.1.refseq.grch38.json.gz"])  # Uses local JSON file
+pyhgvs.parse_hgvs_name(hgvs_c, genome, get_transcript=factory.get_transcript_grch37)
+
 ```
 
+## Download data
+
+TODO
+
 ## Philosophical differences from Universal Transcript Archive
 
 cdot aims to be as simple as possible: convert existing Ensembl/RefSeq GTFs into JSON format

diff --git a/bin/cdot_json.py b/bin/cdot_json.py
@@ -162,6 +162,9 @@ def _cigar_to_gap_and_length(cigar):
 
 
 def write_json(output_filename, data, url=None):
+    # These single GTF/GFF JSON files are used by PyReference, and not meant to be used for cdot HGVS
+    # If you only want 1 build for cdot, you can always run merge_historical on a single file
+
     if url:
         reference_gtf = data.get("reference_gtf", {})
         reference_gtf["url"] = url
@@ -176,7 +179,9 @@ def write_json(output_filename, data, url=None):
 
 
 def merge_historical(args):
-    print("merge_historical")
+    # The merged files are not the same as individual GTF json files
+    # @see https://github.com/SACGF/cdot/wiki/Transcript-JSON-format
+
     TRANSCRIPT_FIELDS = ["biotype", "start_codon", "stop_codon"]
     GENOME_BUILD_FIELDS = ["cds_start", "cds_end", "strand", "contig", "exons"]
 
@@ -188,8 +193,6 @@ def merge_historical(args):
         with gzip.open(filename) as f:
             reference_gtf = next(ijson.items(f, "reference_gtf"))
             url = reference_gtf["url"]
-
-            # PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes)
             transcript_gene_version = {}
 
             f.seek(0)  # Reset for next ijson call

diff --git a/cdot/__init__.py b/cdot/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.1.1"
+__version__ = "0.2.1"
 
 
 def get_json_schema_version():

diff --git a/cdot/hgvs/dataproviders/json_data_provider.py b/cdot/hgvs/dataproviders/json_data_provider.py
@@ -203,18 +203,21 @@ class JSONDataProvider(AbstractJSONDataProvider):
     def _get_transcript(self, tx_ac):
         return self.transcripts.get(tx_ac)
 
-    def __init__(self, file_or_filename, mode=None, cache=None):
-        if isinstance(file_or_filename, str):
-            if file_or_filename.endswith(".gz"):
-                f = gzip.open(file_or_filename)
+    def __init__(self, file_or_filename_list, mode=None, cache=None):
+        assemblies = set()
+        self.transcripts = {}
+        for file_or_filename in file_or_filename_list:
+            if isinstance(file_or_filename, str):
+                if file_or_filename.endswith(".gz"):
+                    f = gzip.open(file_or_filename)
+                else:
+                    f = open(file_or_filename)
             else:
-                f = open(file_or_filename)
-        else:
-            f = file_or_filename
-        data = json.load(f)
-        assemblies = data["genome_builds"]
+                f = file_or_filename
+            data = json.load(f)
+            assemblies.update(data["genome_builds"])
+            self.transcripts.update(data["transcripts"])
         super().__init__(assemblies=assemblies, mode=mode, cache=cache)
-        self.transcripts = data["transcripts"]
 
 
 class RESTDataProvider(AbstractJSONDataProvider):

diff --git a/cdot/pyhgvs/pyhgvs_transcript.py b/cdot/pyhgvs/pyhgvs_transcript.py
@@ -1,9 +1,120 @@
+import abc
+import json
 
-# if PyHGVS below a certain version - do it the old way
+import requests
 
-# If after - can use different exons, and start/stop codons to make it faster
+from pyhgvs.models import Transcript, Position, Exon
 
 
+class AbstractPyHGVSTranscriptFactory(abc.ABC):
+    def __init__(self):
+        pass
+
+    @abc.abstractmethod
+    def _get_transcript(self, transcript_id):
+        pass
+
+    def get_transcript_grch37(self, transcript_id):
+        return self.get_transcript(transcript_id, "GRCh37")
+
+    def get_transcript_grch38(self, transcript_id):
+        return self.get_transcript(transcript_id, "GRCh38")
+
+    def get_transcript(self, transcript_id, genome_build):
+        transcript_json = self._get_transcript(transcript_id)
+        build_coords = transcript_json["genome_builds"].get(genome_build)
+        if build_coords is None:
+            return None
+
+        transcript_name = transcript_json['id']
+        if '.' in transcript_name:
+            name, version = transcript_name.split('.')
+        else:
+            name, version = transcript_name, None
+
+        contig = build_coords['contig']
+        start = build_coords['exons'][0][0]
+        end = build_coords['exons'][-1][1]
+        is_forward_strand = build_coords['strand'] == '+'
+
+        transcript = Transcript(
+            name=name,
+            version=int(version) if version is not None else None,
+            gene=transcript_json['gene_name'],
+            tx_position=Position(
+                contig,
+                start,
+                end,
+                is_forward_strand),
+            cds_position=Position(
+                contig,
+                build_coords['cds_start'],
+                build_coords['cds_end'],
+                is_forward_strand),
+        )
+
+        stranded_exons = sorted([ex[:3] for ex in build_coords['exons']], key=lambda ex: ex[2])
+        for (exon_start, exon_end, exon_number) in stranded_exons:
+            transcript.exons.append(
+                Exon(transcript=transcript,
+                     tx_position=Position(
+                         contig,
+                         exon_start,
+                         exon_end,
+                         is_forward_strand),
+                     exon_number=exon_number))
+
+        return transcript
+
+
+class JSONPyHGVSTranscriptFactory(AbstractPyHGVSTranscriptFactory):
+    def _get_transcript(self, transcript_id):
+        return self.transcripts.get(transcript_id)
+
+    def __init__(self, file_or_filename_list):
+        super().__init__()
+        self.transcripts = {}
+        for file_or_filename in file_or_filename_list:
+            if isinstance(file_or_filename, str):
+                if file_or_filename.endswith(".gz"):
+                    f = gzip.open(file_or_filename)
+                else:
+                    f = open(file_or_filename)
+            else:
+                f = file_or_filename
+            data = json.load(f)
+            self.transcripts.update(data["transcripts"])
+
+
+class RESTPyHGVSTranscriptFactory(AbstractPyHGVSTranscriptFactory):
+
+    def _get_transcript(self, transcript_id):
+        # We store None for 404 on REST
+        if transcript_id in self.transcripts:
+            return self.transcripts[transcript_id]
+
+        transcript_url = self.url + "/transcript/" + transcript_id
+        response = requests.get(transcript_url)
+        if response.ok:
+            if 'application/json' in response.headers.get('Content-Type'):
+                transcript = response.json()
+            else:
+                raise ValueError("Non-json response received for '%s' - are you behind a firewall?" % transcript_url)
+        else:
+            transcript = None
+        self.transcripts[transcript_id] = transcript
+        return transcript
+
+    def __init__(self, url=None):
+        super().__init__()
+        if url is None:
+            url = "http://cdot.cc"
+        self.url = url
+        self.transcripts = {}
+
+
+# TODO: Write code to load SACGF pyhgvs changes, ie gaps and start/stop codons etc
+
 # Changes from old loading:
 
 # See dot has no cds_start/end if non-coding