Skip to content

Commit

Permalink
#42 - Ensembl T2T
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed May 8, 2023
1 parent 43cb050 commit cc10783
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 5 deletions.
16 changes: 11 additions & 5 deletions generate_transcript_data/all_transcripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ cd GRCh38
${BASE_DIR}/refseq_transcripts_grch38.sh
cd ..

mkdir -p CHM13v2.0
cd CHM13v2.0
${BASE_DIR}/refseq_transcripts_chm13v2.sh
cd ..

# Combine genome builds (we're in refseq dir)
REFSEQ_COMBO=cdot-${CDOT_VERSION}.refseq.grch37_grch38.json.gz
if [[ ! -e ${REFSEQ_COMBO} ]]; then
Expand All @@ -43,11 +48,6 @@ if [[ ! -e ${REFSEQ_COMBO} ]]; then
--output ${REFSEQ_COMBO}
fi

mkdir -p CHM13v2.0
cd CHM13v2.0
${BASE_DIR}/refseq_transcripts_chm13v2.sh
cd ..

cd ..

# Ensembl
Expand All @@ -64,6 +64,12 @@ cd GRCh38
${BASE_DIR}/ensembl_transcripts_grch38.sh
cd ..

mkdir -p CHM13v2.0
cd CHM13v2.0
${BASE_DIR}/ensembl_transcripts_chm13v2.sh
cd ..


# Combine genome builds (we're in ensembl dir)
ENSEMBL_COMBO=cdot-${CDOT_VERSION}.ensembl.grch37_grch38.json.gz
if [[ ! -e ${ENSEMBL_COMBO} ]]; then
Expand Down
31 changes: 31 additions & 0 deletions generate_transcript_data/ensembl_transcripts_chm13v2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
#
# Build cdot transcript JSON for the Ensembl rapid-release annotation of the
# T2T CHM13v2.0 assembly (GCA_009914755.4).
#
# For each release listed below this script:
#   1. downloads the GFF3 with wget (skipped if the file already exists),
#   2. converts it with `cdot_json.py gff3_to_json` (skipped if the output
#      already exists),
# and finally combines the per-release outputs with
# `cdot_json.py merge_historical` (skipped if the merged file already exists).
#
# All files are read from / written to the current working directory.
#
# Required environment:
#   GENE_INFO_JSON - filename produced by cdot_gene_info.py

set -euo pipefail

# Directory holding this script; cdot_json.py is invoked from the same place.
BASE_DIR=$(dirname -- "${BASH_SOURCE[0]}")
CDOT_VERSION=$("${BASE_DIR}/cdot_json.py" --version)

# ':-' keeps the check working under 'set -u' when the variable is unset.
if [[ -z "${GENE_INFO_JSON:-}" ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py" >&2
  exit 1
fi

# Per-release cdot files, collected in release order for merge_historical.
merge_args=()
for release in 2022_06 2022_07; do
  filename="Homo_sapiens-GCA_009914755.4-${release}-genes.gff3.gz"
  url="https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/${release}/${filename}"
  cdot_file="cdot-${CDOT_VERSION}.${filename%.gz}.json.gz"

  if [[ ! -e "${filename}" ]]; then
    # Download to a temporary name and rename only on success, so an
    # interrupted transfer is never mistaken for a complete file on re-run.
    wget --output-document="${filename}.part" "${url}"
    mv -- "${filename}.part" "${filename}"
  fi
  if [[ ! -e "${cdot_file}" ]]; then
    "${BASE_DIR}/cdot_json.py" gff3_to_json "${filename}" --url "${url}" --genome-build=CHM13v2.0 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
  fi
  merge_args+=("${cdot_file}")
done

merged_file="cdot-${CDOT_VERSION}.ensembl.CHM13v2.0.json.gz"
if [[ ! -e "${merged_file}" ]]; then
  "${BASE_DIR}/cdot_json.py" merge_historical "${merge_args[@]}" --genome-build=CHM13v2.0 --output "${merged_file}"
fi

0 comments on commit cc10783

Please sign in to comment.