-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
- Loading branch information
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash | ||
|
||
# v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them | ||
|
||
#82 is first GFF3 for GRCh37 | ||
#83 has no data | ||
#84 is 82 again | ||
#86 is 85 again | ||
pyreference_args=() | ||
for release in 82 85 87; do | ||
filename=Homo_sapiens.GRCh37.${release}.gff3.gz | ||
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename} | ||
pyreference_file=${filename}.json.gz | ||
if [[ ! -e ${filename} ]]; then | ||
wget ${url} | ||
fi | ||
if [[ ! -e ${pyreference_file} ]]; then | ||
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" | ||
fi | ||
pyreference_args+=(--pyreference-json ${pyreference_file}) | ||
done | ||
|
||
merged_file="pyhgvs_transcripts_ensembl_grch37.json.gz" | ||
if [[ ! -e ${merged_file} ]]; then | ||
BASE_DIR=$(dirname ${BASH_SOURCE[0]}) | ||
|
||
python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} | ||
fi |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash | ||
|
||
# Skip earlier GTFs as they don't have versions | ||
#for release in 76 77 78 79 80; do | ||
# filename=Homo_sapiens.GRCh38.${release}.gtf.gz | ||
# url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename} | ||
# if [[ ! -e ${filename} ]]; then | ||
# wget ${url} | ||
# fi | ||
|
||
# if [[ ! -e ${filename}.json.gz ]]; then | ||
# pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" | ||
# fi | ||
#done | ||
|
||
#81 is first GFF3 for GRCh38 | ||
pyreference_args=() | ||
for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do | ||
filename=Homo_sapiens.GRCh38.${release}.gff3.gz | ||
url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename} | ||
pyreference_file=${filename}.json.gz | ||
|
||
if [[ ! -e ${filename} ]]; then | ||
wget ${url} | ||
fi | ||
if [[ ! -e ${filename}.json.gz ]]; then | ||
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" | ||
fi | ||
pyreference_args+=(--pyreference-json ${pyreference_file}) | ||
done | ||
|
||
merged_file="pyhgvs_transcripts_ensembl_grch38.json.gz" | ||
if [[ ! -e ${merged_file} ]]; then | ||
BASE_DIR=$(dirname ${BASH_SOURCE[0]}) | ||
|
||
python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} | ||
fi |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
""" | ||
Converts PyReference JSON.gz files (created from RefSeq/Ensembl GTF or GFFs) into format easy to load w/PyHGVS | ||
@see https://bitbucket.org/sacgf/pyreference/ | ||
Dave Lawrence ([email protected]) on 22/10/2021 | ||
""" | ||
import gzip | ||
import json | ||
from argparse import ArgumentParser | ||
from typing import Dict | ||
|
||
|
||
def handle_args(): | ||
parser = ArgumentParser(description='Convert multiple PyReference json.gz files into one for PyHGVS') | ||
parser.add_argument('--pyreference-json', required=True, nargs="+", action="extend", | ||
help='PyReference JSON.gz - list OLDEST to NEWEST (newest is kept)') | ||
parser.add_argument('--output', required=True, help='Output filename') | ||
return parser.parse_args() | ||
|
||
|
||
def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict: | ||
gene_version_data = { | ||
'biotype': ",".join(gene_data["biotype"]), | ||
'description': gene_data.get("description"), | ||
'gene_symbol': gene_data["name"], | ||
} | ||
|
||
if hgnc_str := gene_data.get("HGNC"): | ||
# Has HGNC: (5 characters) at start of it | ||
gene_version_data["hgnc"] = hgnc_str[5:] | ||
|
||
# Only Ensembl Genes have versions | ||
if version := gene_data.get("version"): | ||
gene_data["version"] = version | ||
|
||
return gene_version_data | ||
|
||
|
||
def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: | ||
start = transcript_data["start"] | ||
end = transcript_data["stop"] | ||
strand = transcript_data["strand"] | ||
# PyHGVS has cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0) | ||
cds_start = transcript_data.get("cds_start", end) | ||
cds_end = transcript_data.get("cds_end", end) | ||
# PyHGVS exons are in genomic order, PyReference are in stranded | ||
features = transcript_data["features_by_type"] | ||
exons = [[ed["start"], ed["stop"]] for ed in features["exon"]] | ||
cdna_match = [] | ||
for cdm in features.get("cDNA_match", []): | ||
if "cdna_strand" in cdm: # None in Human RefSeq so haven't handled | ||
raise ValueError("Haven't handled stranded Target alignment") | ||
cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]]) | ||
|
||
pyhgvs_data = { | ||
'chrom': transcript_data["chr"], | ||
'start': start, | ||
'end': end, | ||
'strand': strand, | ||
'cds_start': cds_start, | ||
'cds_end': cds_end, | ||
'exons': exons, | ||
} | ||
|
||
# Optional stuff | ||
for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]: | ||
if value := transcript_data.get(optional_key): | ||
pyhgvs_data[optional_key] = value | ||
|
||
if cdna_match: | ||
pyhgvs_data["cdna_match"] = cdna_match | ||
if transcript_data.get("partial"): | ||
pyhgvs_data["partial"] = 1 | ||
other_chroms = transcript_data.get("other_chroms") | ||
if other_chroms: | ||
pyhgvs_data["other_chroms"] = other_chroms | ||
|
||
# Few extra fields (pyHGVS doesn't currently use) | ||
pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"]) | ||
return pyhgvs_data | ||
|
||
|
||
def main(): | ||
args = handle_args() | ||
|
||
gene_versions = {} # We only keep those that are in the latest transcript version | ||
transcript_versions = {} | ||
|
||
for pyref_filename in args.pyreference_json: | ||
print(f"Loading '{pyref_filename}'") | ||
with gzip.open(pyref_filename) as f: | ||
pyref_data = json.load(f) | ||
|
||
url = pyref_data["reference_gtf"]["url"] | ||
|
||
# PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes) | ||
transcript_gene_version = {} | ||
|
||
for gene_id, gene in pyref_data["genes_by_id"].items(): | ||
if version := gene.get("version"): | ||
gene_accession = f"{gene_id}.{version}" | ||
else: | ||
gene_accession = gene_id | ||
|
||
gene_version = convert_gene_pyreference_to_gene_version_data(gene) | ||
gene_version["url"] = url | ||
gene_versions[gene_accession] = gene_version | ||
|
||
for transcript_accession in gene["transcripts"]: | ||
transcript_gene_version[transcript_accession] = gene_accession | ||
|
||
for transcript_accession, transcript in pyref_data["transcripts_by_id"].items(): | ||
hgvs_data = convert_transcript_pyreference_to_pyhgvs(transcript) | ||
gene_accession = transcript_gene_version[transcript_accession] | ||
gene_version = gene_versions[gene_accession] | ||
hgvs_data["id"] = transcript_accession | ||
hgvs_data["gene_version"] = gene_accession | ||
hgvs_data["gene_name"] = gene_version["gene_symbol"] | ||
hgvs_data["url"] = url | ||
transcript_versions[transcript_accession] = hgvs_data | ||
|
||
print("Writing PyHGVS data") | ||
with gzip.open(args.output, 'w') as outfile: | ||
data = { | ||
"transcripts": transcript_versions, | ||
"genes": gene_versions, | ||
} | ||
json_str = json.dumps(data) | ||
outfile.write(json_str.encode('ascii')) | ||
|
||
|
||
if __name__ == '__main__': | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#!/bin/bash | ||
|
||
pyreference_args=() | ||
|
||
# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok | ||
|
||
filename=ref_GRCh37.p5_top_level.gff3.gz | ||
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename} | ||
pyreference_file=${filename}.json.gz | ||
if [[ ! -e ${filename} ]]; then | ||
wget ${url} | ||
fi | ||
if [[ ! -e ${pyreference_file} ]]; then | ||
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" | ||
fi | ||
pyreference_args+=(--pyreference-json ${pyreference_file}) | ||
|
||
|
||
filename=ref_GRCh37.p9_top_level.gff3.gz | ||
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename} | ||
pyreference_file=${filename}.json.gz | ||
if [[ ! -e ${filename} ]]; then | ||
wget ${url} | ||
fi | ||
if [[ ! -e ${pyreference_file} ]]; then | ||
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" | ||
fi | ||
pyreference_args+=(--pyreference-json ${pyreference_file}) | ||
|
||
|
||
filename=ref_GRCh37.p10_top_level.gff3.gz | ||
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename} | ||
pyreference_file=${filename}.json.gz | ||
if [[ ! -e ${filename} ]]; then | ||
wget ${url} | ||
fi | ||
if [[ ! -e ${pyreference_file} ]]; then | ||
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" | ||
fi | ||
pyreference_args+=(--pyreference-json ${pyreference_file}) | ||
|
||
|
||
filename=ref_GRCh37.p13_top_level.gff3.gz | ||
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename} | ||
pyreference_file=${filename}.json.gz | ||
if [[ ! -e ${filename} ]]; then | ||
wget ${url} | ||
fi | ||
if [[ ! -e ${pyreference_file} ]]; then | ||
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" | ||
fi | ||
pyreference_args+=(--pyreference-json ${pyreference_file}) | ||
|
||
|
||
# These all have the same name, so rename them based on release ID | ||
for release in 105.20190906 105.20201022; do | ||
filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz | ||
url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz | ||
pyreference_file=${filename}.json.gz | ||
if [[ ! -e ${filename} ]]; then | ||
wget ${url} --output-document=${filename} | ||
fi | ||
if [[ ! -e ${pyreference_file} ]]; then | ||
pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" | ||
fi | ||
pyreference_args+=(--pyreference-json ${pyreference_file}) | ||
done | ||
|
||
|
||
merged_file="pyhgvs_transcripts_refseq_grch37.json.gz" | ||
if [[ ! -e ${merged_file} ]]; then | ||
BASE_DIR=$(dirname ${BASH_SOURCE[0]}) | ||
|
||
python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} | ||
fi |