Skip to content

Commit

Permalink
Initial commit - copy in code for generating PyHGVS JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Jan 17, 2022
1 parent 731827b commit aad2d68
Show file tree
Hide file tree
Showing 18 changed files with 697 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/seedot.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions generate_transcript_data/ensembl_gene_annotation_grch37.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash

# v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them

#82 is first GFF3 for GRCh37
#83 has no data
#84 is 82 again
#86 is 85 again
pyreference_args=()
for release in 82 85 87; do
filename=Homo_sapiens.GRCh37.${release}.gff3.gz
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename}
pyreference_file=${filename}.json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${pyreference_file} ]]; then
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
fi
pyreference_args+=(--pyreference-json ${pyreference_file})
done

merged_file="pyhgvs_transcripts_ensembl_grch37.json.gz"
if [[ ! -e ${merged_file} ]]; then
BASE_DIR=$(dirname ${BASH_SOURCE[0]})

python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
fi
37 changes: 37 additions & 0 deletions generate_transcript_data/ensembl_gene_annotation_grch38.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash

# Skip earlier GTFs as they don't have versions
#for release in 76 77 78 79 80; do
# filename=Homo_sapiens.GRCh38.${release}.gtf.gz
# url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename}
# if [[ ! -e ${filename} ]]; then
# wget ${url}
# fi

# if [[ ! -e ${filename}.json.gz ]]; then
# pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
# fi
#done

#81 is first GFF3 for GRCh38
pyreference_args=()
for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do
filename=Homo_sapiens.GRCh38.${release}.gff3.gz
url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename}
pyreference_file=${filename}.json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${filename}.json.gz ]]; then
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
fi
pyreference_args+=(--pyreference-json ${pyreference_file})
done

merged_file="pyhgvs_transcripts_ensembl_grch38.json.gz"
if [[ ! -e ${merged_file} ]]; then
BASE_DIR=$(dirname ${BASH_SOURCE[0]})

python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
fi
134 changes: 134 additions & 0 deletions generate_transcript_data/pyreference_to_pyhgvs_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""
Converts PyReference JSON.gz files (created from RefSeq/Ensembl GTF or GFFs) into format easy to load w/PyHGVS
@see https://bitbucket.org/sacgf/pyreference/
Dave Lawrence ([email protected]) on 22/10/2021
"""
import gzip
import json
from argparse import ArgumentParser
from typing import Dict


def handle_args():
parser = ArgumentParser(description='Convert multiple PyReference json.gz files into one for PyHGVS')
parser.add_argument('--pyreference-json', required=True, nargs="+", action="extend",
help='PyReference JSON.gz - list OLDEST to NEWEST (newest is kept)')
parser.add_argument('--output', required=True, help='Output filename')
return parser.parse_args()


def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict:
gene_version_data = {
'biotype': ",".join(gene_data["biotype"]),
'description': gene_data.get("description"),
'gene_symbol': gene_data["name"],
}

if hgnc_str := gene_data.get("HGNC"):
# Has HGNC: (5 characters) at start of it
gene_version_data["hgnc"] = hgnc_str[5:]

# Only Ensembl Genes have versions
if version := gene_data.get("version"):
gene_data["version"] = version

return gene_version_data


def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
start = transcript_data["start"]
end = transcript_data["stop"]
strand = transcript_data["strand"]
# PyHGVS has cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0)
cds_start = transcript_data.get("cds_start", end)
cds_end = transcript_data.get("cds_end", end)
# PyHGVS exons are in genomic order, PyReference are in stranded
features = transcript_data["features_by_type"]
exons = [[ed["start"], ed["stop"]] for ed in features["exon"]]
cdna_match = []
for cdm in features.get("cDNA_match", []):
if "cdna_strand" in cdm: # None in Human RefSeq so haven't handled
raise ValueError("Haven't handled stranded Target alignment")
cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]])

pyhgvs_data = {
'chrom': transcript_data["chr"],
'start': start,
'end': end,
'strand': strand,
'cds_start': cds_start,
'cds_end': cds_end,
'exons': exons,
}

# Optional stuff
for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]:
if value := transcript_data.get(optional_key):
pyhgvs_data[optional_key] = value

if cdna_match:
pyhgvs_data["cdna_match"] = cdna_match
if transcript_data.get("partial"):
pyhgvs_data["partial"] = 1
other_chroms = transcript_data.get("other_chroms")
if other_chroms:
pyhgvs_data["other_chroms"] = other_chroms

# Few extra fields (pyHGVS doesn't currently use)
pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"])
return pyhgvs_data


def main():
args = handle_args()

gene_versions = {} # We only keep those that are in the latest transcript version
transcript_versions = {}

for pyref_filename in args.pyreference_json:
print(f"Loading '{pyref_filename}'")
with gzip.open(pyref_filename) as f:
pyref_data = json.load(f)

url = pyref_data["reference_gtf"]["url"]

# PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes)
transcript_gene_version = {}

for gene_id, gene in pyref_data["genes_by_id"].items():
if version := gene.get("version"):
gene_accession = f"{gene_id}.{version}"
else:
gene_accession = gene_id

gene_version = convert_gene_pyreference_to_gene_version_data(gene)
gene_version["url"] = url
gene_versions[gene_accession] = gene_version

for transcript_accession in gene["transcripts"]:
transcript_gene_version[transcript_accession] = gene_accession

for transcript_accession, transcript in pyref_data["transcripts_by_id"].items():
hgvs_data = convert_transcript_pyreference_to_pyhgvs(transcript)
gene_accession = transcript_gene_version[transcript_accession]
gene_version = gene_versions[gene_accession]
hgvs_data["id"] = transcript_accession
hgvs_data["gene_version"] = gene_accession
hgvs_data["gene_name"] = gene_version["gene_symbol"]
hgvs_data["url"] = url
transcript_versions[transcript_accession] = hgvs_data

print("Writing PyHGVS data")
with gzip.open(args.output, 'w') as outfile:
data = {
"transcripts": transcript_versions,
"genes": gene_versions,
}
json_str = json.dumps(data)
outfile.write(json_str.encode('ascii'))


if __name__ == '__main__':
main()
75 changes: 75 additions & 0 deletions generate_transcript_data/refseq_gene_annotation_grch37.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash

pyreference_args=()

# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok

filename=ref_GRCh37.p5_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename}
pyreference_file=${filename}.json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${pyreference_file} ]]; then
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
fi
pyreference_args+=(--pyreference-json ${pyreference_file})


filename=ref_GRCh37.p9_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename}
pyreference_file=${filename}.json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${pyreference_file} ]]; then
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
fi
pyreference_args+=(--pyreference-json ${pyreference_file})


filename=ref_GRCh37.p10_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename}
pyreference_file=${filename}.json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${pyreference_file} ]]; then
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
fi
pyreference_args+=(--pyreference-json ${pyreference_file})


filename=ref_GRCh37.p13_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename}
pyreference_file=${filename}.json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${pyreference_file} ]]; then
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
fi
pyreference_args+=(--pyreference-json ${pyreference_file})


# These all have the same name, so rename them based on release ID
for release in 105.20190906 105.20201022; do
filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz
url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
pyreference_file=${filename}.json.gz
if [[ ! -e ${filename} ]]; then
wget ${url} --output-document=${filename}
fi
if [[ ! -e ${pyreference_file} ]]; then
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
fi
pyreference_args+=(--pyreference-json ${pyreference_file})
done


merged_file="pyhgvs_transcripts_refseq_grch37.json.gz"
if [[ ! -e ${merged_file} ]]; then
BASE_DIR=$(dirname ${BASH_SOURCE[0]})

python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
fi
Loading

0 comments on commit aad2d68

Please sign in to comment.