diff --git a/README.md b/README.md index 9e528cf..876b236 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,54 @@ # seedot -Transcript versions for HGVS libraries + +Seedot is used to load transcripts for the 2 most popular Python [HGVS](http://varnomen.hgvs.org/) libraries. + +It works by: + +* Converting RefSeq and Ensembl GTFs into a JSON format +* Providing loaders from that JSON format (or a REST service) + +We currently support 788k transcripts, 5.5x as many as [Universal Transcript Archive](https://github.com/biocommons/uta/) + +## Examples + +[Biocommons HGVS](https://github.com/biocommons/hgvs) example: + +``` +from seedot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider + +hp = JSONDataProvider({"GRCh37": "./seedot_220119.grch38.json.gz") # Uses local JSON file +# hp = RESTDataProvider() # Uses API server at seedot.cc + +am = AssemblyMapper(hp, + assembly_name='GRCh37', + alt_aln_method='splign', replace_reference=True) + +hp = hgvs.parser.Parser() +var_c = hp.parse_hgvs_variant('NM_001637.3:c.1582G>A') +am.c_to_g(var_c) +``` + +[PyHGVS](https://github.com/counsyl/hgvs) example: + +``` +# TODO +``` + +## Philosophical differences from Universal Transcript Archive + +Seedot aims to be as simple as possible: convert existing Ensembl/RefSeq GTFs into JSON format + +Universal transcript archive is an excellent and ambitious project that: + +* Performs its own mapping of transcript sequences to reference genomes +* Stores the transcript version data (exons etc) in a SQL database + +This has some advantages, namely that you can resolve a GRCh37 coordinate for a transcript which was never officially released for that build. + +However the complexity causes a few downsides: + +* Alignments may not exactly match those in official Ensembl/RefSeq releases +* Local install requires a PostgreSQL installation +* Internet hosted UTA is a PostgreSQL server, so requires client Postgres libraries, is inaccessible behind firewalls. They have been planning on building a [REST server since 2014](https://github.com/biocommons/uta/issues/164) +* High complexity manual process for releases means they [do not support Ensembl](https://github.com/biocommons/uta/issues/233) and take a while to make RefSeq releases. + diff --git a/generate_transcript_data/ensembl_gene_annotation_grch37.sh b/generate_transcript_data/ensembl_gene_annotation_grch37.sh index 97ee668..21fbe66 100755 --- a/generate_transcript_data/ensembl_gene_annotation_grch37.sh +++ b/generate_transcript_data/ensembl_gene_annotation_grch37.sh @@ -10,7 +10,7 @@ pyreference_args=() for release in 82 85 87; do filename=Homo_sapiens.GRCh37.${release}.gff3.gz url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename} - pyreference_file=${filename}.json.gz + pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} fi @@ -20,9 +20,9 @@ for release in 82 85 87; do pyreference_args+=(--pyreference-json ${pyreference_file}) done -merged_file="pyhgvs_transcripts_ensembl_grch37.json.gz" +merged_file="seedot-$(date --iso).ensembl.grch37.json.gz" if [[ ! -e ${merged_file} ]]; then BASE_DIR=$(dirname ${BASH_SOURCE[0]}) - python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} + python3 ${BASE_DIR}/pyreference_to_seedot_json.py ${pyreference_args[@]} --output ${merged_file} fi diff --git a/generate_transcript_data/ensembl_gene_annotation_grch38.sh b/generate_transcript_data/ensembl_gene_annotation_grch38.sh index 9f8c0a0..a0996f1 100755 --- a/generate_transcript_data/ensembl_gene_annotation_grch38.sh +++ b/generate_transcript_data/ensembl_gene_annotation_grch38.sh @@ -15,10 +15,10 @@ #81 is first GFF3 for GRCh38 pyreference_args=() -for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do +for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105; do filename=Homo_sapiens.GRCh38.${release}.gff3.gz url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename} - pyreference_file=${filename}.json.gz + pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} @@ -29,9 +29,9 @@ for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 pyreference_args+=(--pyreference-json ${pyreference_file}) done -merged_file="pyhgvs_transcripts_ensembl_grch38.json.gz" +merged_file="seedot-$(date --iso).ensembl.grch38.json.gz" if [[ ! -e ${merged_file} ]]; then BASE_DIR=$(dirname ${BASH_SOURCE[0]}) - python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} + python3 ${BASE_DIR}/pyreference_to_seedot_json.py ${pyreference_args[@]} --output ${merged_file} fi diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_seedot_json.py similarity index 52% rename from generate_transcript_data/pyreference_to_pyhgvs_json.py rename to generate_transcript_data/pyreference_to_seedot_json.py index 322ef98..df2219a 100644 --- a/generate_transcript_data/pyreference_to_pyhgvs_json.py +++ b/generate_transcript_data/pyreference_to_seedot_json.py @@ -1,20 +1,23 @@ """ - Converts PyReference JSON.gz files (created from RefSeq/Ensembl GTF or GFFs) into format easy to load w/PyHGVS + Converts PyReference JSON.gz files (created from RefSeq/Ensembl GTF or GFFs) into format easy to load for HGVS @see https://bitbucket.org/sacgf/pyreference/ - Dave Lawrence (davmlaw@gmail.com) on 22/10/2021 + Dave Lawrence (davmlaw@gmail.com) on 17/01/2022 """ import gzip import json from argparse import ArgumentParser from typing import Dict +import seedot + def handle_args(): - parser = ArgumentParser(description='Convert multiple PyReference json.gz files into one for PyHGVS') + parser = ArgumentParser(description='Convert multiple PyReference json.gz files into one for SeeDot') parser.add_argument('--pyreference-json', required=True, nargs="+", action="extend", help='PyReference JSON.gz - list OLDEST to NEWEST (newest is kept)') + parser.add_argument("--store-genes", action='store_true', help="Also store gene/version information") parser.add_argument('--output', required=True, help='Output filename') return parser.parse_args() @@ -37,50 +40,6 @@ def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict: return gene_version_data -def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: - start = transcript_data["start"] - end = transcript_data["stop"] - strand = transcript_data["strand"] - # PyHGVS has cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0) - cds_start = transcript_data.get("cds_start", end) - cds_end = transcript_data.get("cds_end", end) - # PyHGVS exons are in genomic order, PyReference are in stranded - features = transcript_data["features_by_type"] - exons = [[ed["start"], ed["stop"]] for ed in features["exon"]] - cdna_match = [] - for cdm in features.get("cDNA_match", []): - if "cdna_strand" in cdm: # None in Human RefSeq so haven't handled - raise ValueError("Haven't handled stranded Target alignment") - cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]]) - - pyhgvs_data = { - 'chrom': transcript_data["chr"], - 'start': start, - 'end': end, - 'strand': strand, - 'cds_start': cds_start, - 'cds_end': cds_end, - 'exons': exons, - } - - # Optional stuff - for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]: - if value := transcript_data.get(optional_key): - pyhgvs_data[optional_key] = value - - if cdna_match: - pyhgvs_data["cdna_match"] = cdna_match - if transcript_data.get("partial"): - pyhgvs_data["partial"] = 1 - other_chroms = transcript_data.get("other_chroms") - if other_chroms: - pyhgvs_data["other_chroms"] = other_chroms - - # Few extra fields (pyHGVS doesn't currently use) - pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"]) - return pyhgvs_data - - def main(): args = handle_args() @@ -110,22 +69,26 @@ def main(): for transcript_accession in gene["transcripts"]: transcript_gene_version[transcript_accession] = gene_accession - for transcript_accession, transcript in pyref_data["transcripts_by_id"].items(): - hgvs_data = convert_transcript_pyreference_to_pyhgvs(transcript) + for transcript_accession, transcript_version in pyref_data["transcripts_by_id"].items(): gene_accession = transcript_gene_version[transcript_accession] gene_version = gene_versions[gene_accession] - hgvs_data["id"] = transcript_accession - hgvs_data["gene_version"] = gene_accession - hgvs_data["gene_name"] = gene_version["gene_symbol"] - hgvs_data["url"] = url - transcript_versions[transcript_accession] = hgvs_data - - print("Writing PyHGVS data") + transcript_version["id"] = transcript_accession + transcript_version["gene_version"] = gene_accession + transcript_version["gene_name"] = gene_version["gene_symbol"] + transcript_version["url"] = url + transcript_versions[transcript_accession] = transcript_version + if hgnc := gene_version.get("hgnc"): + transcript_version["hgnc"] = hgnc + + print("Writing SeeDot data") with gzip.open(args.output, 'w') as outfile: data = { "transcripts": transcript_versions, - "genes": gene_versions, + "seedot_version": seedot.__version__, } + if args.store_genes: + data["genes"] = gene_versions + json_str = json.dumps(data) outfile.write(json_str.encode('ascii')) diff --git a/generate_transcript_data/refseq_gene_annotation_grch37.sh b/generate_transcript_data/refseq_gene_annotation_grch37.sh index c0846fc..03a9dfe 100755 --- a/generate_transcript_data/refseq_gene_annotation_grch37.sh +++ b/generate_transcript_data/refseq_gene_annotation_grch37.sh @@ -6,7 +6,7 @@ pyreference_args=() filename=ref_GRCh37.p5_top_level.gff3.gz url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} fi @@ -18,7 +18,7 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) filename=ref_GRCh37.p9_top_level.gff3.gz url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} fi @@ -30,7 +30,7 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) filename=ref_GRCh37.p10_top_level.gff3.gz url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} fi @@ -42,7 +42,7 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) filename=ref_GRCh37.p13_top_level.gff3.gz url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} fi @@ -56,7 +56,7 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) for release in 105.20190906 105.20201022; do filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz - pyreference_file=${filename}.json.gz + pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} --output-document=${filename} fi @@ -66,10 +66,10 @@ for release in 105.20190906 105.20201022; do pyreference_args+=(--pyreference-json ${pyreference_file}) done - -merged_file="pyhgvs_transcripts_refseq_grch37.json.gz" +merged_file="seedot-$(date --iso).refseq.grch37.json.gz" +echo "Creating ${merged_file}" if [[ ! -e ${merged_file} ]]; then BASE_DIR=$(dirname ${BASH_SOURCE[0]}) - python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} + python3 ${BASE_DIR}/pyreference_to_seedot_json.py ${pyreference_args[@]} --output ${merged_file} fi diff --git a/generate_transcript_data/refseq_gene_annotation_grch38.sh b/generate_transcript_data/refseq_gene_annotation_grch38.sh index a4682fe..1f33cc6 100755 --- a/generate_transcript_data/refseq_gene_annotation_grch38.sh +++ b/generate_transcript_data/refseq_gene_annotation_grch38.sh @@ -4,7 +4,7 @@ filename=ref_GRCh38_top_level.gff3.gz url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} @@ -17,7 +17,7 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) filename=ref_GRCh38.p2_top_level.gff3.gz url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} @@ -30,7 +30,7 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) filename=ref_GRCh38.p7_top_level.gff3.gz url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} @@ -43,7 +43,7 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) filename=ref_GRCh38.p12_top_level.gff3.gz url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} @@ -56,7 +56,7 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) filename=GCF_000001405.38_GRCh38.p12_genomic.gff.gz url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109/GCF_000001405.38_GRCh38.p12/${filename} -pyreference_file=${filename}.json.gz +pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} @@ -68,10 +68,10 @@ pyreference_args+=(--pyreference-json ${pyreference_file}) # These all have the same name, so rename them based on release ID -for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514; do +for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514 109.20211119; do filename=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz - pyreference_file=${filename}.json.gz + pyreference_file=$(basename $filename .gz).json.gz if [[ ! -e ${filename} ]]; then wget ${url} --output-document=${filename} fi @@ -81,10 +81,9 @@ for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 pyreference_args+=(--pyreference-json ${pyreference_file}) done - -merged_file="pyhgvs_transcripts_refseq_grch38.json.gz" +merged_file="seedot-$(date --iso).refseq.grch38.json.gz" if [[ ! -e ${merged_file} ]]; then BASE_DIR=$(dirname ${BASH_SOURCE[0]}) - python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} + python3 ${BASE_DIR}/pyreference_to_seedot_json.py ${pyreference_args[@]} --output ${merged_file} fi diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..374b58c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" diff --git a/seedot/__init__.py b/seedot/__init__.py index e69de29..06a9c81 100644 --- a/seedot/__init__.py +++ b/seedot/__init__.py @@ -0,0 +1,7 @@ +__version__ = "0.1.1" + + +def get_json_schema_version(): + """ Return an int which increments upon breaking changes - ie anything other than patch """ + major, minor, patch = __version__.split(".") + return 1000 * int(major) + int(minor) diff --git a/seedot/hgvs/dataproviders/__init__.py b/seedot/hgvs/dataproviders/__init__.py index e69de29..1a300fe 100644 --- a/seedot/hgvs/dataproviders/__init__.py +++ b/seedot/hgvs/dataproviders/__init__.py @@ -0,0 +1 @@ +from .json_data_provider import * diff --git a/seedot/hgvs/dataproviders/json_data_provider.py b/seedot/hgvs/dataproviders/json_data_provider.py index ad4ff4d..505126e 100644 --- a/seedot/hgvs/dataproviders/json_data_provider.py +++ b/seedot/hgvs/dataproviders/json_data_provider.py @@ -12,14 +12,27 @@ class AbstractJSONDataProvider(Interface): NCBI_ALN_METHOD = "splign" required_version = "1.1" - def __init__(self, mode=None, cache=None): + def __init__(self, assemblies, mode=None, cache=None): super().__init__(mode=mode, cache=cache) self.seqfetcher = SeqFetcher() + self.assembly_maps = {} + for assembly_name in assemblies: + self.assembly_maps[assembly_name] = make_ac_name_map(assembly_name) + self.assembly_by_contig = {} + for assembly_name, contig_map in self.assembly_maps.items(): + self.assembly_by_contig.update({contig: assembly_name for contig in contig_map.keys()}) @abc.abstractmethod - def _get_transcript(self, tx_ac): + def _get_transcript_for_assembly(self, tx_ac, assembly): pass + def _get_transcript(self, tx_ac, alt_ac): + assembly = self.assembly_by_contig.get(alt_ac) + if assembly is None: + supported_assemblies = ", ".join(self.assembly_maps.keys()) + raise ValueError(f"Contig '{alt_ac}' not supported. Supported assemblies: {supported_assemblies}") + return self._get_transcript_for_assembly(tx_ac, assembly) + def data_version(self): return self.required_version @@ -31,7 +44,10 @@ def get_acs_for_protein_seq(self, seq): def get_assembly_map(self, assembly_name): """return a list of accessions for the specified assembly name (e.g., GRCh38.p5) """ - return make_ac_name_map(assembly_name) + assembly_map = self.assembly_maps.get(assembly_name) + if assembly_map is None: + raise ValueError(f"Assembly '{assembly_name}' not supported.") + return assembly_map def get_gene_info(self, gene): raise NotImplementedError("JSON data provider doesn't support get_gene_info") @@ -80,15 +96,15 @@ def _convert_gap_to_cigar(gap): return "".join(cigar_ops) def get_tx_exons(self, tx_ac, alt_ac, alt_aln_method): - transcript = self.json_data.get(tx_ac) + transcript = self._get_transcript(tx_ac, alt_ac) if not transcript: return None tx_exons = [] # Genomic order - exons = transcript["cdna_match"] # PyHGVS Needs to change + exons = transcript["exons"] alt_strand = 1 if transcript["strand"] == "+" else -1 - for (exon_id, (alt_start_i, alt_end_i, cds_start_i, cds_end_i, gap)) in enumerate(exons): + for (alt_start_i, alt_end_i, exon_id, cds_start_i, cds_end_i, gap) in exons: tx_start_i = cds_start_i - 1 tx_end_i = cds_end_i if gap is not None: @@ -109,12 +125,8 @@ def get_tx_exons(self, tx_ac, alt_ac, alt_aln_method): 'alt_end_i': alt_end_i, 'cigar': cigar, } - # print(exon_data.values()) tx_exons.append(exon_data) - tx_exons.sort(key=lambda ex: ex["alt_start_i"]) # Sort by genomic order - -# print([d.values() for d in reversed(sorted(tx_exons, key=lambda e: e["ord"]))]) return tx_exons def get_tx_for_gene(self, gene): @@ -126,54 +138,70 @@ def get_tx_for_region(self, alt_ac, alt_aln_method, start_i, end_i): raise NotImplementedError("TODO") def get_tx_identity_info(self, tx_ac): - transcript = self.json_data.get(tx_ac) + # Get any transcript as it's assembly independent + transcript = None + for assembly in self.assembly_maps: + transcript = self._get_transcript_for_assembly(tx_ac, assembly) + if transcript: + break + if not transcript: return None - tx_info = self.get_tx_info(tx_ac, tx_ac, "transcript") - exons = transcript["cdna_match"] # TODO: won't exist in all current PyReference etc - tx_info["lengths"] = [end+1 - start for (_, _, start, end, _) in exons] # Stranded order + tx_info = self._get_transcript_info(transcript) + exons = transcript["exons"] + stranded_order_exons = sorted(exons, key=lambda e: e[2]) # sort by exon_id + tx_info["lengths"] = [ex[4] + 1 - ex[3] for ex in stranded_order_exons] + tx_info["tx_ac"] = tx_ac + tx_info["alt_ac"] = tx_ac # Same again + tx_info["alt_aln_method"] = "transcript" return tx_info - def get_tx_info(self, tx_ac, alt_ac, alt_aln_method): - transcript = self.json_data.get(tx_ac) - if not transcript: - return None - - hgnc = transcript["gene_name"] - cds_start_i = transcript["start_codon_transcript_pos"] - cds_end_i = transcript["stop_codon_transcript_pos"] + @staticmethod + def _get_transcript_info(transcript): + gene_name = transcript["gene_name"] + cds_start_i = transcript["start_codon"] + cds_end_i = transcript["stop_codon"] return { - "hgnc": hgnc, + "hgnc": gene_name, "cds_start_i": cds_start_i, "cds_end_i": cds_end_i, - "tx_ac": tx_ac, - "alt_ac": alt_ac, - "alt_aln_method": alt_aln_method, } + def get_tx_info(self, tx_ac, alt_ac, alt_aln_method): + transcript = self._get_transcript(tx_ac, alt_ac) + if not transcript: + return None + + tx_info = self._get_transcript_info(transcript) + tx_info["tx_ac"] = tx_ac + tx_info["alt_ac"] = alt_ac + tx_info["alt_aln_method"] = alt_aln_method + return tx_info + def get_tx_mapping_options(self, tx_ac): mapping_options = [] - transcript = self.json_data.get(tx_ac) - if transcript: - mo = { - "tx_ac": tx_ac, - "alt_ac": transcript["chrom"], - "alt_aln_method": self.NCBI_ALN_METHOD, - } - mapping_options.append(mo) + for assembly in self.assembly_maps: + if transcript := self._get_transcript_for_assembly(tx_ac, assembly): + mo = { + "tx_ac": tx_ac, + "alt_ac": transcript["contig"], + "alt_aln_method": self.NCBI_ALN_METHOD, + } + mapping_options.append(mo) return mapping_options class JSONDataProvider(AbstractJSONDataProvider): - def _get_transcript(self, tx_ac): - return self.json_data["transcripts"][tx_ac] + def _get_transcript_for_assembly(self, tx_ac, assembly): + return self._assembly_json[assembly].get(tx_ac) - def __init__(self, file_or_filename=None, json_data=None, mode=None, cache=None): - super().__init__(mode=mode, cache=cache) - if json_data: - self.json_data = json_data - elif file_or_filename: + def __init__(self, assembly_json, mode=None, cache=None): + super().__init__(assemblies=assembly_json.keys(), mode=mode, cache=cache) + + # No point being lazy as HGVS calls get_tx_mapping_options which requires us to check all builds + self._assembly_json = {} + for assembly, file_or_filename in assembly_json.items(): if isinstance(file_or_filename, str): if file_or_filename.endswith(".gz"): f = gzip.open(file_or_filename) @@ -181,6 +209,17 @@ def __init__(self, file_or_filename=None, json_data=None, mode=None, cache=None) f = open(file_or_filename) else: f = file_or_filename - self.json_data = json.load(f) - else: - raise ValueError("Must pass either 'file_or_filename' or 'json_data'") + data = json.load(f) + # TODO: Check version is ok? + self._assembly_json[assembly] = data["transcripts"] + + +class RESTDataProvider(AbstractJSONDataProvider): + def _get_transcript_for_assembly(self, tx_ac, assembly): + pass + + def __init__(self, url=None, mode=None, cache=None): + if url is None: + url = "http://seedot.cc" + assemblies = ["GRCh37", "GRCh38"] + super().__init__(assemblies=assemblies, mode=mode, cache=cache) diff --git a/seedot/pyhgvs/__init__.py b/seedot/pyhgvs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seedot/pyhgvs/pyhgvs_transcript.py b/seedot/pyhgvs/pyhgvs_transcript.py new file mode 100644 index 0000000..bdeb79b --- /dev/null +++ b/seedot/pyhgvs/pyhgvs_transcript.py @@ -0,0 +1,16 @@ + +# if PyHGVS below a certain version - do it the old way + +# If after - can use different exons, and start/stop codons to make it faster + + +# Changes from old loading: + +# See dot has no cds_start/end if non-coding +# PyHGVS expects cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0) +# cds_start = transcript_data.get("cds_start", end) +# cds_end = transcript_data.get("cds_end", end) + + + +# VG loader also expects biotype to be comma sep, now is list \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..70b8f39 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,23 @@ +[metadata] +name = seedot +version = attr: seedot.__version__ +author = David Lawrence +author_email = davmlaw@gmail.com +description = Transcripts for HGVS libraries +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/SACGF/seedot +project_urls = + Bug Tracker = https://github.com/SACGF/seedot/issues +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +package_dir = +packages = find: +python_requires = >=3.8 + +[options.packages.find] +where = diff --git a/tests/test_data/seedot.refseq.grch37.json b/tests/test_data/seedot.refseq.grch37.json new file mode 100644 index 0000000..b487759 --- /dev/null +++ b/tests/test_data/seedot.refseq.grch37.json @@ -0,0 +1,192 @@ +{ + "transcripts": { + "NM_001637.3": { + "biotype": [ + "protein_coding" + ], + "cds_end": 36763753, + "cds_start": 36552857, + "contig": "NC_000007.13", + "exons": [ + [ + 36552548, + 36552986, + 20, + 2001, + 2440, + "M196 I1 M61 I1 M181" + ], + [ + 36561644, + 36561721, + 19, + 1924, + 2000, + null + ], + [ + 36570023, + 36570120, + 18, + 1827, + 1923, + null + ], + [ + 36571752, + 36571812, + 17, + 1767, + 1826, + null + ], + [ + 36571891, + 36571950, + 16, + 1708, + 1766, + null + ], + [ + 36579924, + 36580097, + 15, + 1535, + 1707, + null + ], + [ + 36588217, + 36588292, + 14, + 1460, + 1534, + null + ], + [ + 36589044, + 36589081, + 13, + 1423, + 1459, + null + ], + [ + 36616179, + 36616262, + 12, + 1340, + 1422, + null + ], + [ + 36633944, + 36634036, + 11, + 1248, + 1339, + null + ], + [ + 36655985, + 36656080, + 10, + 1153, + 1247, + null + ], + [ + 36657902, + 36657951, + 9, + 1104, + 1152, + null + ], + [ + 36660386, + 36660435, + 8, + 1055, + 1103, + null + ], + [ + 36661315, + 36661386, + 7, + 984, + 1054, + null + ], + [ + 36662795, + 36662856, + 6, + 923, + 983, + null + ], + [ + 36671641, + 36671712, + 5, + 852, + 922, + null + ], + [ + 36677456, + 36677516, + 4, + 792, + 851, + null + ], + [ + 36698770, + 36698870, + 3, + 692, + 791, + null + ], + [ + 36713547, + 36713614, + 2, + 625, + 691, + null + ], + [ + 36726303, + 36726399, + 1, + 529, + 624, + null + ], + [ + 36763626, + 36764154, + 0, + 1, + 528, + null + ] + ], + "start": 36552548, + "start_codon": 401, + "stop": 36764154, + "stop_codon": 2129, + "strand": "-", + "id": "NM_001637.3", + "gene_version": "313", + "gene_name": "AOAH", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" + } + }, + "seedot_version": "0.1.1" +} \ No newline at end of file diff --git a/tests/test_json_data_provider.py b/tests/test_json_data_provider.py index d8b307d..eaf749e 100644 --- a/tests/test_json_data_provider.py +++ b/tests/test_json_data_provider.py @@ -1,4 +1,7 @@ +import os import unittest +from inspect import getsourcefile +from os.path import abspath import hgvs from hgvs.assemblymapper import AssemblyMapper @@ -7,71 +10,12 @@ class TestJSONDataProvider(unittest.TestCase): - # TODO: Test loading from file, filename - - def setUp(self): - GRCH37_TRANSCRIPTS = { - "NM_001637.3": - {'chrom': 'NC_000007.13', - 'start': 36552548, - 'end': 36764154, - 'strand': '-', - 'cds_start': 36552857, - 'cds_end': 36763753, - 'exons': [[36763626, 36764154], - [36726303, 36726399], - [36713547, 36713614], - [36698770, 36698870], - [36677456, 36677516], - [36671641, 36671712], - [36662795, 36662856], - [36661315, 36661386], - [36660386, 36660435], - [36657902, 36657951], - [36655985, 36656080], - [36633944, 36634036], - [36616179, 36616262], - [36589044, 36589081], - [36588217, 36588292], - [36579924, 36580097], - [36571891, 36571950], - [36571752, 36571812], - [36570023, 36570120], - [36561644, 36561721], - [36552548, 36552986]], - 'start_codon_transcript_pos': 401, - 'stop_codon_transcript_pos': 2129, - 'cdna_match': [[36763626, 36764154, 1, 528, None], - [36726303, 36726399, 529, 624, None], - [36713547, 36713614, 625, 691, None], - [36698770, 36698870, 692, 791, None], - [36677456, 36677516, 792, 851, None], - [36671641, 36671712, 852, 922, None], - [36662795, 36662856, 923, 983, None], - [36661315, 36661386, 984, 1054, None], - [36660386, 36660435, 1055, 1103, None], - [36657902, 36657951, 1104, 1152, None], - [36655985, 36656080, 1153, 1247, None], - [36633944, 36634036, 1248, 1339, None], - [36616179, 36616262, 1340, 1422, None], - [36589044, 36589081, 1423, 1459, None], - [36588217, 36588292, 1460, 1534, None], - [36579924, 36580097, 1535, 1707, None], - [36571891, 36571950, 1708, 1766, None], - [36571752, 36571812, 1767, 1826, None], - [36570023, 36570120, 1827, 1923, None], - [36561644, 36561721, 1924, 2000, None], - [36552548, 36552986, 2001, 2440, 'M196 I1 M61 I1 M181']], - 'biotype': 'protein_coding', - 'id': 'NM_001637.3', - 'gene_version': '313', - 'gene_name': 'AOAH', - 'url': 'http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz'} - } - self.json_data_provider = JSONDataProvider(json_data=GRCH37_TRANSCRIPTS) - - def test_hgvs_conversion(self): - am = AssemblyMapper(self.json_data_provider, + def test_transcript(self): + this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0))) +# parent_dir = os.path.dirname(this_file_dir) + test_json_file = os.path.join(this_file_dir, "test_data/seedot.refseq.grch37.json") + json_data_provider = JSONDataProvider(assembly_json={"GRCh37": test_json_file}) + am = AssemblyMapper(json_data_provider, assembly_name='GRCh37', alt_aln_method='splign', replace_reference=True) HGVS_C_TO_G = [ ('NM_001637.3:c.1582G>A', 'NC_000007.13:g.36561662C>T'),