Skip to content

Commit

Permalink
Add a separate data version, separate client/data generation as much …
Browse files Browse the repository at this point in the history
…as possible. Add new GFFs
  • Loading branch information
davmlaw committed Nov 13, 2023
1 parent bd53cc2 commit 7b009b5
Show file tree
Hide file tree
Showing 15 changed files with 98 additions and 92 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
## [unreleased]

### Added

- New GFFs: RefSeq RS_2023_10, Ensembl VEP110
- #66 - We now store 'Note' field (thanks holtgrewe for suggestion)
- Added requirements.txt for 'generate_transcript_data' sections
- client / JSON data schema version compatibility check

### Changed

- #57 - Correctly handle retrieving genomic position and dealing w/indels in GFF (thanks ltnetcase for reporting)
- #60 - Fix for missing protein IDs due to Genbank / GenBank (thanks holtgrewe)
- #64 - Split code/data versions. json.gz are now labelled according to data schema version (thanks holtgrewe)
- Renamed 'CHM13v2.0' to 'T2T-CHM13v2.0' so it could work with biocommons bioutils

## [0.2.21] - 2023-08-14

Expand Down
4 changes: 2 additions & 2 deletions cdot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
__version__ = "0.2.21"


def get_json_schema_version():
def get_data_schema_int(version: str) -> int:
""" Return an int which increments upon breaking changes - ie anything other than patch """
major, minor, patch = __version__.split(".")
major, minor, patch = version.split(".")
return 1000 * int(major) + int(minor)
52 changes: 0 additions & 52 deletions cdot/assembly_helper.py

This file was deleted.

21 changes: 19 additions & 2 deletions cdot/hgvs/dataproviders/json_data_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@
from intervaltree import IntervalTree
from typing import List

from cdot.assembly_helper import get_ac_name_map
from bioutils.assemblies import make_ac_name_map, make_name_ac_map

from cdot import get_data_schema_int, __version__

def get_ac_name_map(assembly_name):
    """Return the map built by bioutils' ``make_ac_name_map`` for an assembly.

    A request for plain "GRCh37" is redirected to 'GRCh37.p13', because the
    original build didn't have MT. Any other name is passed through unchanged.
    """
    lookup_name = 'GRCh37.p13' if assembly_name == "GRCh37" else assembly_name
    return make_ac_name_map(lookup_name)


class AbstractJSONDataProvider(Interface):
# All cdot data is 'splign', it's the method used in NCBI/Ensembl GTFs, and we also only pull out 'splign' from UTA
Expand Down Expand Up @@ -42,6 +50,7 @@ def __init__(self, assemblies: List[str] = None, mode=None, cache=None, seqfetch
for assembly_name, contig_map in self.assembly_maps.items():
self.assembly_by_contig.update({contig: assembly_name for contig in contig_map.keys()})


@abc.abstractmethod
def _get_transcript(self, tx_ac):
pass
Expand Down Expand Up @@ -260,6 +269,12 @@ def get_tx_for_gene(self, gene):
def get_tx_for_region(self, alt_ac, alt_aln_method, start_i, end_i):
pass

def _validate_schema_compatability(self, json_schema_version: str):
    """Check that this client can read data written with ``json_schema_version``.

    Both the client's ``__version__`` and the data's version string are
    converted with ``get_data_schema_int`` and the resulting ints compared.

    Raises:
        ValueError: if the data's schema int is greater than the client's.
    """
    client_schema_int = get_data_schema_int(__version__)
    data_schema_int = get_data_schema_int(json_schema_version)
    # Data written by a newer schema than this client knows about is rejected.
    if data_schema_int > client_schema_int:
        raise ValueError(f"This cdot client ({__version__}) cannot read {json_schema_version=} - please upgrade.")

class LocalDataProvider(AbstractJSONDataProvider):
""" For JSON and Redis providers (implemented in cdot_rest)
Expand Down Expand Up @@ -359,7 +374,9 @@ def __init__(self, file_or_filename_list, mode=None, cache=None, seqfetcher=None
for g in genes.values():
if gene_symbol := g.get("gene_symbol"):
self.genes[gene_symbol] = g
self.cdot_data_version = tuple(int(v) for v in data["cdot_version"].split("."))
cdot_data_version_str = data["cdot_version"]
self._validate_schema_compatability(cdot_data_version_str)
self.cdot_data_version = tuple(int(v) for v in cdot_data_version_str.split("."))

super().__init__(assemblies=assemblies, mode=mode, cache=cache, seqfetcher=seqfetcher)

Expand Down
25 changes: 13 additions & 12 deletions generate_transcript_data/all_transcripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ BASE_DIR=$(dirname ${FULL_PATH_TO_SCRIPT})
# Python scripts will import via generate_transcript_data
export PYTHONPATH=${BASE_DIR}/..

CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
CDOT_DATA_VERSION=$(${BASE_DIR}/cdot_json.py --version)

echo "Generating all transcripts for cdot data version ${CDOT_DATA_VERSION}"

# This needs to be passed to called bash scripts, so they are invoked with "." to use these variables
export GENE_INFO_JSON=$(pwd)/Homo_sapiens.gene-info-${CDOT_VERSION}.json.gz
export GENE_INFO_JSON=$(pwd)/Homo_sapiens.gene-info-${CDOT_DATA_VERSION}.json.gz

if [[ ! -e ${GENE_INFO_JSON} ]]; then
${BASE_DIR}/gene_info.sh
Expand All @@ -34,17 +35,17 @@ cd GRCh38
${BASE_DIR}/refseq_transcripts_grch38.sh
cd ..

mkdir -p CHM13v2.0
cd CHM13v2.0
mkdir -p T2T-CHM13v2.0
cd T2T-CHM13v2.0
${BASE_DIR}/refseq_transcripts_chm13v2.sh
cd ..

# Combine genome builds (we're in refseq dir)
REFSEQ_COMBO=cdot-${CDOT_VERSION}.refseq.grch37_grch38.json.gz
REFSEQ_COMBO=cdot-${CDOT_DATA_VERSION}.refseq.grch37_grch38.json.gz
if [[ ! -e ${REFSEQ_COMBO} ]]; then
${BASE_DIR}/cdot_json.py combine_builds \
--grch37 GRCh37/cdot-${CDOT_VERSION}.refseq.grch37.json.gz \
--grch38 GRCh38/cdot-${CDOT_VERSION}.refseq.grch38.json.gz \
--grch37 GRCh37/cdot-${CDOT_DATA_VERSION}.refseq.grch37.json.gz \
--grch38 GRCh38/cdot-${CDOT_DATA_VERSION}.refseq.grch38.json.gz \
--output ${REFSEQ_COMBO}
fi

Expand All @@ -64,18 +65,18 @@ cd GRCh38
${BASE_DIR}/ensembl_transcripts_grch38.sh
cd ..

mkdir -p CHM13v2.0
cd CHM13v2.0
mkdir -p T2T-CHM13v2.0
cd T2T-CHM13v2.0
${BASE_DIR}/ensembl_transcripts_chm13v2.sh
cd ..


# Combine genome builds (we're in ensembl dir)
ENSEMBL_COMBO=cdot-${CDOT_VERSION}.ensembl.grch37_grch38.json.gz
ENSEMBL_COMBO=cdot-${CDOT_DATA_VERSION}.ensembl.grch37_grch38.json.gz
if [[ ! -e ${ENSEMBL_COMBO} ]]; then
${BASE_DIR}/cdot_json.py combine_builds \
--grch37 GRCh37/cdot-${CDOT_VERSION}.ensembl.grch37.json.gz \
--grch38 GRCh38/cdot-${CDOT_VERSION}.ensembl.grch38.json.gz \
--grch37 GRCh37/cdot-${CDOT_DATA_VERSION}.ensembl.grch37.json.gz \
--grch38 GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.grch38.json.gz \
--output ${ENSEMBL_COMBO}
fi

Expand Down
7 changes: 2 additions & 5 deletions generate_transcript_data/cdot_gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,12 @@
import json
import os
from argparse import ArgumentParser
from csv import DictReader
from datetime import datetime
from typing import Iterable, Iterator, List, TypeVar

import cdot
from Bio import Entrez
from cdot.json_encoders import SortedSetEncoder
from io import BytesIO
from lxml import etree
from json_encoders import SortedSetEncoder

T = TypeVar("T")

Expand Down Expand Up @@ -45,7 +42,7 @@ def _get_entrez_gene_summary(id_list):
web_env = result["WebEnv"]
query_key = result["QueryKey"]
data = Entrez.esummary(db="gene", webenv=web_env, query_key=query_key)
document = Entrez.read(data, ignore_errors=True) # Need recent BioPython
document = Entrez.read(data, ignore_errors=True, validate=False) # Need recent BioPython
return document["DocumentSummarySet"]["DocumentSummary"]


Expand Down
14 changes: 7 additions & 7 deletions generate_transcript_data/cdot_json.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
#!/usr/bin/env python3

import gzip
import ijson
import json
import logging
import re
import sys
from argparse import ArgumentParser
from collections import defaultdict, Counter
from csv import DictReader
from pyhgvs import CDNACoord

import cdot
import ijson
from cdot.pyhgvs.pyhgvs_transcript import PyHGVSTranscriptFactory
from generate_transcript_data.gff_parser import GTFParser, GFF3Parser
from cdot.json_encoders import SortedSetEncoder
from pyhgvs import CDNACoord
from generate_transcript_data.json_encoders import SortedSetEncoder
from generate_transcript_data.json_schema_version import JSON_SCHEMA_VERSION


def _setup_arg_parser():
Expand Down Expand Up @@ -270,7 +270,7 @@ def _cigar_to_gap_and_length(cigar):
def write_cdot_json(filename, genes, transcript_versions, genome_builds, refseq_gene_summary_api_retrieval_date=None):
print(f"Writing cdot file: '{filename}'")
data = {
"cdot_version": cdot.__version__,
"cdot_version": JSON_SCHEMA_VERSION,
"genome_builds": genome_builds,
"transcripts": transcript_versions,
}
Expand Down Expand Up @@ -382,7 +382,7 @@ def combine_builds(args):
with gzip.open(args.output, 'wt') as outfile:
data = {
"transcripts": transcripts,
"cdot_version": cdot.__version__,
"cdot_version": JSON_SCHEMA_VERSION,
"genome_builds": list(genome_build_file.keys()),
}
if genes:
Expand All @@ -399,7 +399,7 @@ def main():
parser = _setup_arg_parser()
args = parser.parse_args()
if args.version:
print(cdot.__version__)
print(JSON_SCHEMA_VERSION)
sys.exit(0)

if args.subcommand is None:
Expand Down
9 changes: 5 additions & 4 deletions generate_transcript_data/ensembl_transcripts_chm13v2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ set -e

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
GENOME_BUILD=T2T-CHM13v2.0

if [[ -z ${GENE_INFO_JSON} ]]; then
echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
Expand All @@ -14,18 +15,18 @@ merge_args=()
for release in 2022_06 2022_07; do
filename=Homo_sapiens-GCA_009914755.4-${release}-genes.gff3.gz
url=https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/${release}/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=CHM13v2.0 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})
done

merged_file="cdot-${CDOT_VERSION}.ensembl.CHM13v2.0.json.gz"
merged_file="cdot-${CDOT_VERSION}.ensembl.${GENOME_BUILD}.json.gz"
if [[ ! -e ${merged_file} ]]; then
${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=CHM13v2.0 --output "${merged_file}"
${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=${GENOME_BUILD} --output "${merged_file}"
fi
2 changes: 1 addition & 1 deletion generate_transcript_data/ensembl_transcripts_grch37.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ for release in 82 85 87; do
# Switched to using GTFs as they contain protein version
filename=Homo_sapiens.GRCh37.${release}.gff3.gz
url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz
if [[ ! -e ${filename} ]]; then
wget ${url}
fi
Expand Down
2 changes: 1 addition & 1 deletion generate_transcript_data/ensembl_transcripts_grch38.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
# Switched to using GTFs as they contain protein version
filename=Homo_sapiens.GRCh38.${release}.gff3.gz
url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url}
Expand Down
File renamed without changes.
3 changes: 3 additions & 0 deletions generate_transcript_data/json_schema_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# After 0.2.22 we split version into separate code (pip) and data schema versions
# The cdot client will use its own major/minor to determine whether it can read these data files
JSON_SCHEMA_VERSION = "0.2.22"
22 changes: 18 additions & 4 deletions generate_transcript_data/refseq_transcripts_chm13v2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ set -e

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
GENOME_BUILD=T2T-CHM13v2.0

if [[ -z ${GENE_INFO_JSON} ]]; then
echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
Expand All @@ -20,7 +21,7 @@ if [[ ! -e ${filename} ]]; then
wget ${url} --output-document=${filename}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=CHM13v2.0 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})

Expand All @@ -33,12 +34,25 @@ if [[ ! -e ${filename} ]]; then
wget ${url} --output-document=${filename}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=CHM13v2.0 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})

merged_file="cdot-${CDOT_VERSION}.refseq.CHM13v2.0.json.gz"
filename=GCF_009914755.1_T2T-CHM13v2.0_genomic.RS_2023_10.gff.gz
url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

if [[ ! -e ${filename} ]]; then
wget ${url} --output-document=${filename}
fi
if [[ ! -e ${cdot_file} ]]; then
${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
merge_args+=(${cdot_file})


merged_file="cdot-${CDOT_VERSION}.refseq.${GENOME_BUILD}.json.gz"
if [[ ! -e ${merged_file} ]]; then
echo "Creating ${merged_file}"
${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=CHM13v2.0 --output "${merged_file}"
${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=${GENOME_BUILD} --output "${merged_file}"
fi
Loading

0 comments on commit 7b009b5

Please sign in to comment.