Skip to content

Commit

Permalink
#26, #29, #30
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Dec 8, 2022
1 parent cd11c68 commit 0bf1a68
Show file tree
Hide file tree
Showing 10 changed files with 79 additions and 38 deletions.
11 changes: 7 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
## [unreleased]
## [0.2.12] - 2022-12-08

### Added

- Add Ensembl 108 GTF
- #30 - We now store "tag" attributes (eg "MANE Select", "RefSeq Select")
- Switch to using Ensembl GFF3 (so we can get tags out)
- Add Ensembl 108 GFF3

### Changed

- Fix for #25 - GeneInfo currently fails for some records
- Fix for #27 - Change URL for missing RefSeq GFFs

## [0.2.11] - 2022-09-27

Expand Down Expand Up @@ -112,8 +115,8 @@

- Initial commit

[unreleased]: https://github.com/SACGF/cdot/compare/v0.2.10...HEAD
[unreleased]: https://github.com/SACGF/cdot/compare/v0.2.10...HEAD
[unreleased]: https://github.com/SACGF/cdot/compare/v0.2.11...HEAD
[0.2.12]: https://github.com/SACGF/cdot/compare/v0.2.11...v0.2.12
[0.2.11]: https://github.com/SACGF/cdot/compare/v0.2.10...v0.2.11
[0.2.10]: https://github.com/SACGF/cdot/compare/v0.2.9...v0.2.10
[0.2.9]: https://github.com/SACGF/cdot/compare/v0.2.8...v0.2.9
Expand Down
2 changes: 1 addition & 1 deletion cdot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.2.11"
__version__ = "0.2.12"


def get_json_schema_version():
Expand Down
10 changes: 8 additions & 2 deletions cdot/gff/gff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ class GTFParser(GFFParser):
GFF2 only has 2 levels of feature hierarchy, so we have to build or 3 levels of gene/transcript/exons ourselves
"""
GTF_TRANSCRIPTS_DATA = GFFParser.CODING_FEATURES | {"exon"}
FEATURE_ALLOW_LIST = GTF_TRANSCRIPTS_DATA | {"gene"}
FEATURE_ALLOW_LIST = GTF_TRANSCRIPTS_DATA | {"gene", "transcript"}

def __init__(self, *args, **kwargs):
super(GTFParser, self).__init__(*args, **kwargs)
Expand Down Expand Up @@ -376,7 +376,9 @@ def handle_feature(self, feature):
if protein_version := feature.attr.get("protein_version"):
protein = f"{protein}.{protein_version}"
self.transcript_proteins[transcript_accession] = protein

elif feature.type == "transcript":
if tag := feature.attr.get("tag"):
transcript["tag"] = tag


class GFF3Parser(GFFParser):
Expand Down Expand Up @@ -481,6 +483,10 @@ def _handle_transcript(self, gene_data, transcript_accession, feature):

if feature.attr.get("partial"):
transcript_data["partial"] = 1

if tag := feature.attr.get("tag"):
transcript_data["tag"] = tag

self.transcript_data_by_accession[transcript_accession] = transcript_data
self.transcript_accession_by_feature_id[feature.attr["ID"]] = transcript_accession

Expand Down
8 changes: 4 additions & 4 deletions generate_transcript_data/ensembl_transcripts_grch37.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ fi
merge_args=()
for release in 82 85 87; do
# Switched to using GTFs as they contain protein version
filename=Homo_sapiens.GRCh37.${release}.gtf.gz
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gtf/homo_sapiens/${filename}
cdot_file=$(basename $filename .gz).json.gz
filename=Homo_sapiens.GRCh37.${release}.gff3.gz
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})
done
Expand Down
8 changes: 4 additions & 4 deletions generate_transcript_data/ensembl_transcripts_grch38.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ fi
merge_args=()
for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108; do
# Switched to using GTFs as they contain protein version
filename=Homo_sapiens.GRCh38.${release}.gtf.gz
url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename}
cdot_file=$(basename $filename .gz).json.gz
filename=Homo_sapiens.GRCh38.${release}.gff3.gz
url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})
done
Expand Down
14 changes: 7 additions & 7 deletions generate_transcript_data/refseq_transcripts_grch37.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ merge_args=()

filename=ref_GRCh37.p5_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename}
cdot_file=$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
Expand All @@ -36,7 +36,7 @@ merge_args+=(${cdot_file})

filename=ref_GRCh37.p9_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename}
cdot_file=$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
Expand All @@ -48,7 +48,7 @@ merge_args+=(${cdot_file})

filename=ref_GRCh37.p10_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename}
cdot_file=$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
Expand All @@ -61,14 +61,14 @@ merge_args+=(${cdot_file})
if [[ ! -z ${UTA_TRANSCRIPTS} ]]; then
# UTA transcripts have gaps, so they should overwrite the earlier refseq transcripts (without gaps)
# But will be overwritten by newer (post p13) official transcripts
cdot_file="cdot.uta_${UTA_VERSION}.GRCh37.json.gz"
cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.GRCh37.json.gz"
${BASE_DIR}/uta_transcripts.sh ${UTA_VERSION} GRCh37
merge_args+=(${cdot_file})
fi

filename=ref_GRCh37.p13_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename}
cdot_file=$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
Expand All @@ -81,8 +81,8 @@ merge_args+=(${cdot_file})
# These all have the same name, so rename them based on release ID
for release in 105.20190906 105.20201022 105.20220307; do
filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz
url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
cdot_file=$(basename $filename .gz).json.gz
url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url} --output-document=${filename}
fi
Expand Down
22 changes: 11 additions & 11 deletions generate_transcript_data/refseq_transcripts_grch38.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ merge_args=()

if [[ ! -z ${UTA_TRANSCRIPTS} ]]; then
# All GRCh38 transcripts have alignments gaps, so use UTA first (and override with official releases)
uta_cdot_file="cdot.uta_${UTA_VERSION}.GRCh38.json.gz"
uta_cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.GRCh38.json.gz"
${BASE_DIR}/uta_transcripts.sh ${UTA_VERSION} GRCh38
merge_args+=(${uta_cdot_file})
fi

filename=ref_GRCh38_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/${filename}
cdot_file=$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
Expand All @@ -44,7 +44,7 @@ merge_args+=(${cdot_file})

filename=ref_GRCh38.p2_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/${filename}
cdot_file=$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
Expand All @@ -57,7 +57,7 @@ merge_args+=(${cdot_file})

filename=ref_GRCh38.p7_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/${filename}
cdot_file=$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
Expand All @@ -70,7 +70,7 @@ merge_args+=(${cdot_file})

filename=ref_GRCh38.p12_top_level.gff3.gz
url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/${filename}
cdot_file=$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
Expand All @@ -82,8 +82,8 @@ merge_args+=(${cdot_file})


filename=GCF_000001405.38_GRCh38.p12_genomic.gff.gz
url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109/GCF_000001405.38_GRCh38.p12/${filename}
cdot_file=$(basename $filename .gz).json.gz
url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
Expand All @@ -98,8 +98,8 @@ merge_args+=(${cdot_file})
for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514 109.20211119; do
# These all have the same name, so rename them based on release ID
filename=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz
url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz
cdot_file=$(basename $filename .gz).json.gz
url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url} --output-document=${filename}
fi
Expand All @@ -111,8 +111,8 @@ done


filename=GCF_000001405.40_GRCh38.p14_genomic.gff.gz
url=https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/110/GCF_000001405.40_GRCh38.p14/${filename}
cdot_file=$(basename $filename .gz).json.gz
url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
Expand Down
5 changes: 3 additions & 2 deletions generate_transcript_data/uta_transcripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@ if [ "$#" -ne 2 ]; then
exit 1;
fi

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
UTA_BASE_URL=uta.biocommons.org # uta.invitae.com moved here
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
UTA_VERSION=${1}
GENOME_BUILD=${2}

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
export PGPASSWORD=anonymous

uta_csv_filename=uta_${UTA_VERSION}_${GENOME_BUILD,,}.csv
Expand All @@ -20,7 +21,7 @@ if [[ ! -e ${uta_csv_filename} ]]; then
cat ${SQL} | tr -s '\n' ' ' | psql -h ${UTA_BASE_URL} -U anonymous -d uta
fi

cdot_file="cdot.uta_${UTA_VERSION}.${GENOME_BUILD}.json.gz"
cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.${GENOME_BUILD}.json.gz"
if [[ ! -e ${cdot_file} ]]; then
POSTGRES_URL=postgresql://${UTA_BASE_URL}/uta_${UTA_VERSION}
${BASE_DIR}/cdot_json.py uta_to_json "${uta_csv_filename}" --url "${POSTGRES_URL}" --output "${cdot_file}" --genome-build=${GENOME_BUILD}
Expand Down
19 changes: 19 additions & 0 deletions tests/test_data/ensembl_test.GRCh38.108.gff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
##gff-version 3
##sequence-region 1 1 248956422
#!genome-build Genome Reference Consortium GRCh38.p13
#!genome-version GRCh38
#!genome-date 2013-12
#!genome-build-accession GCA_000001405.28
#!genebuild-last-updated 2022-07
1 GRCh38 chromosome 1 248956422 . . . ID=chromosome:1;Alias=CM000663.2,chr1,NC_000001.11
###
1 ensembl_havana gene 65419 71585 . + . ID=gene:ENSG00000186092;Name=OR4F5;biotype=protein_coding;description=olfactory receptor family 4 subfamily F member 5 [Source:HGNC Symbol%3BAcc:HGNC:14825];gene_id=ENSG00000186092;logic_name=ensembl_havana_gene_homo_sapiens;version=7
1 havana mRNA 65419 71585 . + . ID=transcript:ENST00000641515;Parent=gene:ENSG00000186092;Name=OR4F5-201;biotype=protein_coding;tag=basic,Ensembl_canonical,MANE_Select;transcript_id=ENST00000641515;version=2
1 havana exon 65419 65433 . + . Parent=transcript:ENST00000641515;Name=ENSE00003812156;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003812156;rank=1;version=1
1 havana five_prime_UTR 65419 65433 . + . Parent=transcript:ENST00000641515
1 havana five_prime_UTR 65520 65564 . + . Parent=transcript:ENST00000641515
1 havana exon 65520 65573 . + . Parent=transcript:ENST00000641515;Name=ENSE00003813641;constitutive=1;ensembl_end_phase=0;ensembl_phase=-1;exon_id=ENSE00003813641;rank=2;version=1
1 havana CDS 65565 65573 . + 0 ID=CDS:ENSP00000493376;Parent=transcript:ENST00000641515;protein_id=ENSP00000493376
1 havana CDS 69037 70008 . + 0 ID=CDS:ENSP00000493376;Parent=transcript:ENST00000641515;protein_id=ENSP00000493376
1 havana exon 69037 71585 . + . Parent=transcript:ENST00000641515;Name=ENSE00003813949;constitutive=1;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003813949;rank=3;version=1
1 havana three_prime_UTR 70009 71585 . + . Parent=transcript:ENST00000641515
18 changes: 15 additions & 3 deletions tests/test_gff_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
class Test(unittest.TestCase):
this_file_dir = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0)))
test_data_dir = os.path.join(this_file_dir, "test_data")
ENSEMBL_GTF_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.104.gtf")
ENSEMBL_104_GTF_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.104.gtf")
ENSEMBL_108_GFF3_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.108.gff3")
REFSEQ_GFF3_FILENAME = os.path.join(test_data_dir, "refseq_test.GRCh38.p13_genomic.109.20210514.gff")
UCSC_GTF_FILENAME = os.path.join(test_data_dir, "hg19_chrY_300kb_genes.gtf")
FAKE_URL = "http://fake.url"
Expand All @@ -27,7 +28,7 @@ def test_ucsc_gtf(self):

def test_ensembl_gtf(self):
genome_build = "GRCh38"
parser = GTFParser(self.ENSEMBL_GTF_FILENAME, genome_build, self.FAKE_URL)
parser = GTFParser(self.ENSEMBL_104_GTF_FILENAME, genome_build, self.FAKE_URL)
genes, transcripts = parser.get_genes_and_transcripts()
self._test_exon_length(transcripts, genome_build, "ENST00000357654.9", 7088)

Expand Down Expand Up @@ -55,7 +56,7 @@ def test_refseq_gff3(self):

def test_exons_in_genomic_order(self):
genome_build = "GRCh38"
parser = GTFParser(self.ENSEMBL_GTF_FILENAME, genome_build, self.FAKE_URL)
parser = GTFParser(self.ENSEMBL_104_GTF_FILENAME, genome_build, self.FAKE_URL)
_, transcripts = parser.get_genes_and_transcripts()
transcript = transcripts["ENST00000357654.9"]
exons = transcript["genome_builds"][genome_build]["exons"]
Expand All @@ -71,3 +72,14 @@ def test_exons_in_genomic_order(self):
first_exon = exons[0]
last_exon = exons[-1]
self.assertGreater(last_exon[0], first_exon[0])

def test_ensembl_gtf_tags(self):
genome_build = "GRCh38"
parser = GFF3Parser(self.ENSEMBL_108_GFF3_FILENAME, genome_build, self.FAKE_URL)
genes, transcripts = parser.get_genes_and_transcripts()
print(transcripts)
transcript = transcripts["ENST00000641515.2"]
tags = transcript.get("tag")
print(f"tags={tags}")
for tag in ["basic", "Ensembl_canonical", "MANE_Select"]:
self.assertIn(tag, tags)

0 comments on commit 0bf1a68

Please sign in to comment.