Skip to content

Commit

Permalink
feat(IPVC-2386): Backfill transl_excepts (#32)
Browse files Browse the repository at this point in the history
  • Loading branch information
bsgiles73 authored May 8, 2024
1 parent 93e79f7 commit 4cb5f4f
Show file tree
Hide file tree
Showing 15 changed files with 539 additions and 67 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ RUN pip install pysam
WORKDIR /opt/repos/uta/
COPY pyproject.toml ./
COPY etc ./etc
COPY misc ./misc
COPY sbin ./sbin
COPY src ./src
RUN pip install -e .[dev]
Expand Down
2 changes: 0 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ services:
command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs
volumes:
- ${UTA_ETL_NCBI_DIR}:/ncbi-dir
- ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
- ${UTA_ETL_WORK_DIR}:/uta-extract/work
- ${UTA_ETL_LOG_DIR}:/uta-extract/logs
working_dir: /opt/repos/uta
Expand Down Expand Up @@ -47,7 +46,6 @@ services:
uta:
condition: service_healthy
volumes:
- ${UTA_ETL_NCBI_DIR}:/ncbi-dir
- ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
- ${UTA_ETL_WORK_DIR}:/uta-load/work
- ${UTA_ETL_LOG_DIR}:/uta-load/logs
Expand Down
9 changes: 0 additions & 9 deletions etc/ncbi-files.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@
# │ └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz
# └── refseq
# └── H_sapiens
# ├── historical
# │ └── GRCh38
# │ └── GCF_000001405.40-RS_2023_03_historical
# │ ├── GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
# │ └── GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz
# └── mRNA_Prot
# ├── human.1.protein.faa.gz
# ├── human.1.rna.fna.gz
Expand All @@ -36,10 +31,6 @@ refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz
refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz
refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz

## Historical RefSeq alignments
refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz

## Genome build and alignment data
# Build 37
genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt
Expand Down
14 changes: 14 additions & 0 deletions misc/refseq-historical-backfill/docker-compose-backfill.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# docker compose file for the RefSeq historical backfill procedure

version: '3'

services:
uta-extract-historical:
image: uta-update
command: misc/refseq-historical-backfill/uta-extract-historical /ncbi-dir /uta-extract/work /uta-extract/logs
volumes:
- ${UTA_ETL_NCBI_DIR}:/ncbi-dir
- ${UTA_ETL_WORK_DIR}:/uta-extract/work
- ${UTA_ETL_LOG_DIR}:/uta-extract/logs
working_dir: /opt/repos/uta
network_mode: host
196 changes: 196 additions & 0 deletions misc/refseq-historical-backfill/ncbi_extract_gbff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
"""
Extract and write all files needed by UTA load, except alt accession exonsets (aka, alignments). From a single
GBFF file we can create dna fasta, protein fasta, associated accessions, geneinfo, and txinfo files.
"""
import argparse
import gzip
import importlib_resources
import io
import logging
import logging.config
from collections import Counter
from contextlib import ExitStack
from typing import Iterable

from Bio.Seq import Seq
import Bio.SeqIO
from Bio.SeqRecord import SeqRecord

from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter
from uta.formats.geneinfo import GeneInfo, GeneInfoWriter
from uta.formats.txinfo import TxInfo, TxInfoWriter
from uta.parsers.seqrecord import SeqRecordFacade, SeqRecordFeatureError
from uta.tools.file_utils import open_file


logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
logging.config.fileConfig(logging_conf_fn)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)


def parse_args():
ap = argparse.ArgumentParser(
description=__doc__,
)
ap.add_argument("GBFF_FILES", nargs="+")
ap.add_argument("--origin", "-o", default="NCBI")
ap.add_argument("--prefix", "-p", default="")
ap.add_argument("--output_dir", "-d", default=".", type=str)
opts = ap.parse_args()
return opts


def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> None:
if prefix:
prefix = f"{prefix}."

# setup context managers for file writers
with ExitStack() as stack:
# DNA fasta file
dna_fasta_fh = stack.enter_context(
io.TextIOWrapper(
gzip.open(f"{output_dir}/{prefix}rna.fna.gz", "wb"), encoding="utf-8"
)
)

# Protein fasta file
protein_fasta_fh = stack.enter_context(
io.TextIOWrapper(
gzip.open(f"{output_dir}/{prefix}protein.faa.gz", "wb"),
encoding="utf-8",
)
)

geneinfo_fh = stack.enter_context(
io.TextIOWrapper(
gzip.open(f"{output_dir}/{prefix}geneinfo.gz", "wb"), encoding="utf-8"
)
)
geneinfo_writer = GeneInfoWriter(geneinfo_fh)

txinfo_fh = stack.enter_context(
io.TextIOWrapper(
gzip.open(f"{output_dir}/{prefix}txinfo.gz", "w"), encoding="utf-8"
)
)
txinfo_writer = TxInfoWriter(txinfo_fh)

assocacs_fh = stack.enter_context(
io.TextIOWrapper(
gzip.open(f"{output_dir}/{prefix}assocacs.gz", "w"), encoding="utf-8"
)
)
assocacs_writer = GeneAccessionsWriter(assocacs_fh)

total_genes = set()
total_skipped = set()
all_prefixes = Counter()
for gbff_fn in gbff_files:
logger.info(f"Processing {gbff_fn}")
gbff_file_handler = stack.enter_context(open_file(gbff_fn))
i = 0
genes = set()
skipped = set()
prefixes = Counter()
for r in Bio.SeqIO.parse(gbff_file_handler, "gb"):
srf = SeqRecordFacade(r)

# skip transcripts where the exon structure is unknown
if not srf.exons_se_i:
skipped.add(srf.id)
continue

prefixes.update([srf.id[:2]])
try:
fna_record = SeqRecord(
Seq(srf.feature_seq), id=srf.id, description=""
)
dna_fasta_fh.write(fna_record.format("fasta"))

if srf.gene_id not in genes:
geneinfo_writer.write(
GeneInfo(
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
tax_id="9606",
hgnc=srf.gene_symbol,
maploc="",
aliases=srf.gene_synonyms,
type=srf.gene_type,
summary="",
descr="",
xrefs=srf.db_xrefs,
)
)

txinfo_writer.write(
TxInfo(
origin=origin,
ac=srf.id,
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i),
exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i),
transl_except=TxInfo.serialize_transl_except(
srf.transl_except
),
)
)

# only write cds features for protein-coding transcripts
if srf.cds_feature is not None:
pro_record = SeqRecord(
Seq(srf.cds_translation),
id=srf.cds_protein_id,
description=srf.cds_product,
)
protein_fasta_fh.write(pro_record.format("fasta"))

assocacs_writer.write(
GeneAccessions(
origin=origin,
gene_id=srf.gene_id,
gene_symbol=srf.gene_symbol,
tx_ac=srf.id,
pro_ac=srf.cds_protein_id,
)
)

genes.add(srf.gene_id)
i += 1
if i % 5000 == 0:
logger.info(
" - {ng} genes in {fn} ({c}); {s} transcripts skipped".format(
ng=len(genes),
fn=gbff_fn,
c=prefixes,
s=len(skipped),
)
)
except SeqRecordFeatureError as e:
logger.error(f"SeqRecordFeatureError processing {r.id}: {e}")
raise
except ValueError as e:
logger.error(f"ValueError processing {r.id}: {e}")
raise


logger.info(
"{ng} genes in {fn} ({c}); {s} transcripts skipped".format(
ng=len(genes), fn=gbff_fn, c=prefixes, s=len(skipped)
)
)
total_genes ^= genes
total_skipped ^= skipped
all_prefixes += prefixes
logger.info(
"{ng} genes in {nf} ({c}); {s} transcripts skipped".format(
ng=len(total_genes), nf=len(gbff_files), c=all_prefixes, s=len(total_skipped)
)
)


if __name__ == "__main__":
args = parse_args()
main(args.GBFF_FILES, args.origin, args.prefix, args.output_dir)
52 changes: 52 additions & 0 deletions misc/refseq-historical-backfill/uta-extract-historical
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env bash

# Download, then extract intermediate files out of the NCBI historical alignment files.

set -e

ncbi_dir=$1
working_dir=$2
log_dir=$3

if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
then
echo 'Usage: sbin/uta-extract-historical <ncbi_dir> <working_dir> <log_dir>'
exit 1
fi

download_ncbi_file () {
download_path=$1
download_dir=$2

download_module="${download_path%%/*}"
download_source="ftp.ncbi.nlm.nih.gov::$download_path"
download_destination="$download_dir/$download_module"

mkdir -p $download_destination
echo "Downloading $download_source to $download_destination"
rsync --no-motd -DHPRprtv "$download_source" "$download_destination"
}

relative_path="refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical"

# download historical genbank file
file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz"
download_ncbi_file $file_path $ncbi_dir

# download historical gff file
file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz"
download_ncbi_file $file_path $ncbi_dir

# extract intermediate files from genbank file
python misc/refseq-historical-backfill/ncbi_extract_gbff.py \
"$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz" \
--output_dir "$working_dir" 2>&1 | tee "$log_dir/ncbi-parse-historical-ggbb.log"

# extract exonset intermediate file from gff file
python sbin/ncbi_parse_genomic_gff.py "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz" | \
gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | tee "$log_dir/ncbi-parse-historical-gff.log"

# filter exonset alignments by txinfo
sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
--missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
tee "$log_dir/filter_exonset_transcripts.log"
14 changes: 5 additions & 9 deletions sbin/exonset-to-seqinfo
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,15 @@
import argparse
import configparser as ConfigParser
import gzip
import importlib_resources
import itertools
import logging
import logging.config
import pkg_resources
import re
import sys

from bioutils.digests import seq_md5
from biocommons.seqrepo import SeqRepo
# from multifastadb import MultiFastaDB

from uta.formats.exonset import ExonSetReader
from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
Expand All @@ -32,16 +31,15 @@ def parse_args(argv):
required=True)
ap.add_argument("--conf",
default=[
pkg_resources.resource_filename("uta", "../../etc/global.conf")]
)
importlib_resources.files("uta").joinpath("../../etc/global.conf")
])

opts = ap.parse_args(argv)
return opts


if __name__ == "__main__":
logging_conf_fn = pkg_resources.resource_filename(
"uta", "etc/logging.conf")
logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
logging.config.fileConfig(logging_conf_fn)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand All @@ -53,15 +51,13 @@ if __name__ == "__main__":
cf = ConfigParser.ConfigParser()
for conf_fn in opts.conf:
cf.read_file(open(conf_fn))
logger.info("loaded " + conf_fn)
logger.info("loaded " + str(conf_fn))

in_fn = opts.FILES[0]
in_fh = gzip.open(in_fn, 'rt') if in_fn.endswith(".gz") else open(in_fn)
esr = ExonSetReader(in_fh)
logger.info("opened " + in_fn)

#fa_dirs = cf.get("sequences", "fasta_directories").strip().splitlines()
#mfdb = MultiFastaDB(fa_dirs, use_meta_index=True)
sr_dir = cf.get("sequences", "seqrepo")
sr = SeqRepo(root_dir=sr_dir)
logger.info("Opened sequence directories: " + sr_dir)
Expand Down
4 changes: 2 additions & 2 deletions sbin/filter_exonset_transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ def filter_exonset(exonset_file, transcript_ids, missing_ids_file):
if exonset.tx_ac in transcript_ids:
esw.write(exonset)
else:
logger.warning(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.")
logger.debug(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.")
writer_missing.writerow([exonset.tx_ac])
missing_acs.add(exonset.tx_ac)
logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s): {','.join(missing_acs)}")
logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s)")


def main():
Expand Down
Loading

0 comments on commit 4cb5f4f

Please sign in to comment.