Skip to content

Commit

Permalink
PyHGVS loaders, fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Feb 3, 2022
1 parent 48b6817 commit a981e33
Show file tree
Hide file tree
Showing 11 changed files with 554 additions and 206 deletions.
2 changes: 2 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 17 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@

[![PyPi version](https://img.shields.io/pypi/v/cdot.svg)](https://pypi.org/project/cdot/) [![Python versions](https://img.shields.io/pypi/pyversions/cdot.svg)](https://pypi.org/project/cdot/)

cdot is used to load transcripts for the 2 most popular Python [HGVS](http://varnomen.hgvs.org/) libraries.
cdot provides transcripts for the 2 most popular Python [HGVS](http://varnomen.hgvs.org/) libraries.

It works by:

* Converting RefSeq and Ensembl GTFs into a JSON format
* Providing loaders from that JSON format (or a REST service)
* Converting RefSeq/Ensembl GTFs to JSON
* Providing loaders from JSON.gz files, or REST API via [cdot_rest](https://github.com/SACGF/cdot_rest))

We currently support 788k transcripts, 5.5x as many as [Universal Transcript Archive](https://github.com/biocommons/uta/)
We currently support ~800k transcripts, with API responses under 0.1 second

## Examples

Expand All @@ -18,10 +18,10 @@ We currently support 788k transcripts, 5.5x as many as [Universal Transcript Arc
```
from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider
hp = JSONDataProvider({"GRCh37": "./cdot_220119.grch38.json.gz") # Uses local JSON file
# hp = RESTDataProvider() # Uses API server at cdot.cc
hdp = RESTDataProvider() # Uses API server at cdot.cc
# hdp = JSONDataProvider(["./cdot-0.2.1.refseq.grch38.json.gz"]) # Uses local JSON file
am = AssemblyMapper(hp,
am = AssemblyMapper(hdp,
assembly_name='GRCh37',
alt_aln_method='splign', replace_reference=True)
Expand All @@ -33,9 +33,18 @@ am.c_to_g(var_c)
[PyHGVS](https://github.com/counsyl/hgvs) example:

```
# TODO
from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory, RESTPyHGVSTranscriptFactory
factory = RESTPyHGVSTranscriptFactory()
# factory = JSONPyHGVSTranscriptFactory(["./cdot-0.2.1.refseq.grch38.json.gz"]) # Uses local JSON file
pyhgvs.parse_hgvs_name(hgvs_c, genome, get_transcript=factory.get_transcript_grch37)
```

## Download data

TODO

## Philosophical differences from Universal Transcript Archive

cdot aims to be as simple as possible: convert existing Ensembl/RefSeq GTFs into JSON format
Expand Down
9 changes: 6 additions & 3 deletions bin/cdot_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ def _cigar_to_gap_and_length(cigar):


def write_json(output_filename, data, url=None):
# These single GTF/GFF JSON files are used by PyReference, and not meant to be used for cdot HGVS
# If you only want 1 build for cdot, you can always run merge_historical on a single file

if url:
reference_gtf = data.get("reference_gtf", {})
reference_gtf["url"] = url
Expand All @@ -176,7 +179,9 @@ def write_json(output_filename, data, url=None):


def merge_historical(args):
print("merge_historical")
# The merged files are not the same as individual GTF json files
# @see https://github.com/SACGF/cdot/wiki/Transcript-JSON-format

TRANSCRIPT_FIELDS = ["biotype", "start_codon", "stop_codon"]
GENOME_BUILD_FIELDS = ["cds_start", "cds_end", "strand", "contig", "exons"]

Expand All @@ -188,8 +193,6 @@ def merge_historical(args):
with gzip.open(filename) as f:
reference_gtf = next(ijson.items(f, "reference_gtf"))
url = reference_gtf["url"]

# PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes)
transcript_gene_version = {}

f.seek(0) # Reset for next ijson call
Expand Down
2 changes: 1 addition & 1 deletion cdot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.1.1"
__version__ = "0.2.1"


def get_json_schema_version():
Expand Down
23 changes: 13 additions & 10 deletions cdot/hgvs/dataproviders/json_data_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,18 +203,21 @@ class JSONDataProvider(AbstractJSONDataProvider):
def _get_transcript(self, tx_ac):
return self.transcripts.get(tx_ac)

def __init__(self, file_or_filename, mode=None, cache=None):
if isinstance(file_or_filename, str):
if file_or_filename.endswith(".gz"):
f = gzip.open(file_or_filename)
def __init__(self, file_or_filename_list, mode=None, cache=None):
assemblies = set()
self.transcripts = {}
for file_or_filename in file_or_filename_list:
if isinstance(file_or_filename, str):
if file_or_filename.endswith(".gz"):
f = gzip.open(file_or_filename)
else:
f = open(file_or_filename)
else:
f = open(file_or_filename)
else:
f = file_or_filename
data = json.load(f)
assemblies = data["genome_builds"]
f = file_or_filename
data = json.load(f)
assemblies.update(data["genome_builds"])
self.transcripts.update(data["transcripts"])
super().__init__(assemblies=assemblies, mode=mode, cache=cache)
self.transcripts = data["transcripts"]


class RESTDataProvider(AbstractJSONDataProvider):
Expand Down
115 changes: 113 additions & 2 deletions cdot/pyhgvs/pyhgvs_transcript.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,120 @@
import abc
import json

# if PyHGVS below a certain version - do it the old way
import requests

# If after - can use different exons, and start/stop codons to make it faster
from pyhgvs.models import Transcript, Position, Exon


class AbstractPyHGVSTranscriptFactory(abc.ABC):
def __init__(self):
pass

@abc.abstractmethod
def _get_transcript(self, transcript_id):
pass

def get_transcript_grch37(self, transcript_id):
return self.get_transcript(transcript_id, "GRCh37")

def get_transcript_grch38(self, transcript_id):
return self.get_transcript(transcript_id, "GRCh38")

def get_transcript(self, transcript_id, genome_build):
transcript_json = self._get_transcript(transcript_id)
build_coords = transcript_json["genome_builds"].get(genome_build)
if build_coords is None:
return None

transcript_name = transcript_json['id']
if '.' in transcript_name:
name, version = transcript_name.split('.')
else:
name, version = transcript_name, None

contig = build_coords['contig']
start = build_coords['exons'][0][0]
end = build_coords['exons'][-1][1]
is_forward_strand = build_coords['strand'] == '+'

transcript = Transcript(
name=name,
version=int(version) if version is not None else None,
gene=transcript_json['gene_name'],
tx_position=Position(
contig,
start,
end,
is_forward_strand),
cds_position=Position(
contig,
build_coords['cds_start'],
build_coords['cds_end'],
is_forward_strand),
)

stranded_exons = sorted([ex[:3] for ex in build_coords['exons']], key=lambda ex: ex[2])
for (exon_start, exon_end, exon_number) in stranded_exons:
transcript.exons.append(
Exon(transcript=transcript,
tx_position=Position(
contig,
exon_start,
exon_end,
is_forward_strand),
exon_number=exon_number))

return transcript


class JSONPyHGVSTranscriptFactory(AbstractPyHGVSTranscriptFactory):
def _get_transcript(self, transcript_id):
return self.transcripts.get(transcript_id)

def __init__(self, file_or_filename_list):
super().__init__()
self.transcripts = {}
for file_or_filename in file_or_filename_list:
if isinstance(file_or_filename, str):
if file_or_filename.endswith(".gz"):
f = gzip.open(file_or_filename)
else:
f = open(file_or_filename)
else:
f = file_or_filename
data = json.load(f)
self.transcripts.update(data["transcripts"])


class RESTPyHGVSTranscriptFactory(AbstractPyHGVSTranscriptFactory):

def _get_transcript(self, transcript_id):
# We store None for 404 on REST
if transcript_id in self.transcripts:
return self.transcripts[transcript_id]

transcript_url = self.url + "/transcript/" + transcript_id
response = requests.get(transcript_url)
if response.ok:
if 'application/json' in response.headers.get('Content-Type'):
transcript = response.json()
else:
raise ValueError("Non-json response received for '%s' - are you behind a firewall?" % transcript_url)
else:
transcript = None
self.transcripts[transcript_id] = transcript
return transcript

def __init__(self, url=None):
super().__init__()
if url is None:
url = "http://cdot.cc"
self.url = url
self.transcripts = {}


# TODO: Write code to load SACGF pyhgvs changes, ie gaps and start/stop codons etc

# Changes from old loading:

# See dot has no cds_start/end if non-coding
Expand Down
Loading

0 comments on commit a981e33

Please sign in to comment.