feat(IPVC-2386): Backfill transl_excepts (#32)

biocommons · May 8, 2024 · 4cb5f4f · 4cb5f4f
1 parent 93e79f7
commit 4cb5f4f
Show file tree

Hide file tree

Showing 15 changed files with 539 additions and 67 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -17,6 +17,7 @@ RUN pip install pysam
 WORKDIR /opt/repos/uta/
 COPY pyproject.toml ./
 COPY etc ./etc
+COPY misc ./misc
 COPY sbin ./sbin
 COPY src ./src
 RUN pip install -e .[dev]

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -16,7 +16,6 @@ services:
     command: sbin/uta-extract /ncbi-dir /uta-extract/work /uta-extract/logs
     volumes:
       - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
-      - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
       - ${UTA_ETL_WORK_DIR}:/uta-extract/work
       - ${UTA_ETL_LOG_DIR}:/uta-extract/logs
     working_dir: /opt/repos/uta
@@ -47,7 +46,6 @@ services:
       uta:
         condition: service_healthy
     volumes:
-      - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
       - ${UTA_ETL_SEQREPO_DIR}:/usr/local/share/seqrepo
       - ${UTA_ETL_WORK_DIR}:/uta-load/work
       - ${UTA_ETL_LOG_DIR}:/uta-load/logs

diff --git a/etc/ncbi-files.txt b/etc/ncbi-files.txt
@@ -16,11 +16,6 @@
 #    │                       └── GCF_000001405.25_GRCh37.p13_genomic.gff.gz
 #    └── refseq
 #        └── H_sapiens
-#            ├── historical
-#            │   └── GRCh38
-#            │       └── GCF_000001405.40-RS_2023_03_historical
-#            │           ├── GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
-#            │           └── GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz
 #            └── mRNA_Prot
 #                ├── human.1.protein.faa.gz
 #                ├── human.1.rna.fna.gz
@@ -36,10 +31,6 @@ refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz
 refseq/H_sapiens/mRNA_Prot/human.*.rna.fna.gz
 refseq/H_sapiens/mRNA_Prot/human.*.protein.faa.gz
 
-## Historical RefSeq alignments
-refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
-refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz
-
 ## Genome build and alignment data
 # Build 37
 genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt

diff --git a/misc/refseq-historical-backfill/docker-compose-backfill.yml b/misc/refseq-historical-backfill/docker-compose-backfill.yml
@@ -0,0 +1,14 @@
+# docker compose file for the RefSeq historical backfill procedure
+
+version: '3'
+
+services:
+  uta-extract-historical:
+    image: uta-update
+    command: misc/refseq-historical-backfill/uta-extract-historical /ncbi-dir /uta-extract/work /uta-extract/logs
+    volumes:
+      - ${UTA_ETL_NCBI_DIR}:/ncbi-dir
+      - ${UTA_ETL_WORK_DIR}:/uta-extract/work
+      - ${UTA_ETL_LOG_DIR}:/uta-extract/logs
+    working_dir: /opt/repos/uta
+    network_mode: host
diff --git a/misc/refseq-historical-backfill/ncbi_extract_gbff.py b/misc/refseq-historical-backfill/ncbi_extract_gbff.py
@@ -0,0 +1,196 @@
+"""
+Extract and write all files needed by UTA load, except alt accession exonsets (aka, alignments). From a single
+GBFF file we can create dna fasta, protein fasta, associated accessions, geneinfo, and txinfo files.
+"""
+import argparse
+import gzip
+import importlib_resources
+import io
+import logging
+import logging.config
+from collections import Counter
+from contextlib import ExitStack
+from typing import Iterable
+
+from Bio.Seq import Seq
+import Bio.SeqIO
+from Bio.SeqRecord import SeqRecord
+
+from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter
+from uta.formats.geneinfo import GeneInfo, GeneInfoWriter
+from uta.formats.txinfo import TxInfo, TxInfoWriter
+from uta.parsers.seqrecord import SeqRecordFacade, SeqRecordFeatureError
+from uta.tools.file_utils import open_file
+
+
+logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
+logging.config.fileConfig(logging_conf_fn)
+logging.getLogger().setLevel(logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+    )
+    ap.add_argument("GBFF_FILES", nargs="+")
+    ap.add_argument("--origin", "-o", default="NCBI")
+    ap.add_argument("--prefix", "-p", default="")
+    ap.add_argument("--output_dir", "-d", default=".", type=str)
+    opts = ap.parse_args()
+    return opts
+
+
+def main(gbff_files: Iterable, origin: str, prefix: str, output_dir: str) -> None:
+    if prefix:
+        prefix = f"{prefix}."
+
+    # setup context managers for file writers
+    with ExitStack() as stack:
+        # DNA fasta file
+        dna_fasta_fh = stack.enter_context(
+            io.TextIOWrapper(
+                gzip.open(f"{output_dir}/{prefix}rna.fna.gz", "wb"), encoding="utf-8"
+            )
+        )
+
+        # Protein fasta file
+        protein_fasta_fh = stack.enter_context(
+            io.TextIOWrapper(
+                gzip.open(f"{output_dir}/{prefix}protein.faa.gz", "wb"),
+                encoding="utf-8",
+            )
+        )
+
+        geneinfo_fh = stack.enter_context(
+            io.TextIOWrapper(
+                gzip.open(f"{output_dir}/{prefix}geneinfo.gz", "wb"), encoding="utf-8"
+            )
+        )
+        geneinfo_writer = GeneInfoWriter(geneinfo_fh)
+
+        txinfo_fh = stack.enter_context(
+            io.TextIOWrapper(
+                gzip.open(f"{output_dir}/{prefix}txinfo.gz", "w"), encoding="utf-8"
+            )
+        )
+        txinfo_writer = TxInfoWriter(txinfo_fh)
+
+        assocacs_fh = stack.enter_context(
+            io.TextIOWrapper(
+                gzip.open(f"{output_dir}/{prefix}assocacs.gz", "w"), encoding="utf-8"
+            )
+        )
+        assocacs_writer = GeneAccessionsWriter(assocacs_fh)
+
+        total_genes = set()
+        total_skipped = set()
+        all_prefixes = Counter()
+        for gbff_fn in gbff_files:
+            logger.info(f"Processing {gbff_fn}")
+            gbff_file_handler = stack.enter_context(open_file(gbff_fn))
+            i = 0
+            genes = set()
+            skipped = set()
+            prefixes = Counter()
+            for r in Bio.SeqIO.parse(gbff_file_handler, "gb"):
+                srf = SeqRecordFacade(r)
+
+                # skip transcripts where the exon structure is unknown
+                if not srf.exons_se_i:
+                    skipped.add(srf.id)
+                    continue
+
+                prefixes.update([srf.id[:2]])
+                try:
+                    fna_record = SeqRecord(
+                        Seq(srf.feature_seq), id=srf.id, description=""
+                    )
+                    dna_fasta_fh.write(fna_record.format("fasta"))
+
+                    if srf.gene_id not in genes:
+                        geneinfo_writer.write(
+                            GeneInfo(
+                                gene_id=srf.gene_id,
+                                gene_symbol=srf.gene_symbol,
+                                tax_id="9606",
+                                hgnc=srf.gene_symbol,
+                                maploc="",
+                                aliases=srf.gene_synonyms,
+                                type=srf.gene_type,
+                                summary="",
+                                descr="",
+                                xrefs=srf.db_xrefs,
+                            )
+                        )
+
+                    txinfo_writer.write(
+                        TxInfo(
+                            origin=origin,
+                            ac=srf.id,
+                            gene_id=srf.gene_id,
+                            gene_symbol=srf.gene_symbol,
+                            cds_se_i=TxInfo.serialize_cds_se_i(srf.cds_se_i),
+                            exons_se_i=TxInfo.serialize_exons_se_i(srf.exons_se_i),
+                            transl_except=TxInfo.serialize_transl_except(
+                                srf.transl_except
+                            ),
+                        )
+                    )
+
+                    # only write cds features for protein-coding transcripts
+                    if srf.cds_feature is not None:
+                        pro_record = SeqRecord(
+                            Seq(srf.cds_translation),
+                            id=srf.cds_protein_id,
+                            description=srf.cds_product,
+                        )
+                        protein_fasta_fh.write(pro_record.format("fasta"))
+
+                        assocacs_writer.write(
+                            GeneAccessions(
+                                origin=origin,
+                                gene_id=srf.gene_id,
+                                gene_symbol=srf.gene_symbol,
+                                tx_ac=srf.id,
+                                pro_ac=srf.cds_protein_id,
+                            )
+                        )
+
+                    genes.add(srf.gene_id)
+                    i += 1
+                    if i % 5000 == 0:
+                        logger.info(
+                            "  - {ng} genes in {fn} ({c}); {s} transcripts skipped".format(
+                                ng=len(genes),
+                                fn=gbff_fn,
+                                c=prefixes,
+                                s=len(skipped),
+                            )
+                        )
+                except SeqRecordFeatureError as e:
+                    logger.error(f"SeqRecordFeatureError processing {r.id}: {e}")
+                    raise
+                except ValueError as e:
+                    logger.error(f"ValueError processing {r.id}: {e}")
+                    raise
+
+
+            logger.info(
+                "{ng} genes in {fn} ({c}); {s} transcripts skipped".format(
+                    ng=len(genes), fn=gbff_fn, c=prefixes, s=len(skipped)
+                )
+            )
+            total_genes ^= genes
+            total_skipped ^= skipped
+            all_prefixes += prefixes
+        logger.info(
+            "{ng} genes in {nf} ({c}); {s} transcripts skipped".format(
+                ng=len(total_genes), nf=len(gbff_files), c=all_prefixes, s=len(total_skipped)
+            )
+        )
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args.GBFF_FILES, args.origin, args.prefix, args.output_dir)
diff --git a/misc/refseq-historical-backfill/uta-extract-historical b/misc/refseq-historical-backfill/uta-extract-historical
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# Download, then extract intermediate files out of the NCBI historical alignment files.
+
+set -e
+
+ncbi_dir=$1
+working_dir=$2
+log_dir=$3
+
+if [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
+then
+    echo 'Usage: sbin/uta-extract-historical <ncbi_dir> <working_dir> <log_dir>'
+    exit 1
+fi
+
+download_ncbi_file () {
+    download_path=$1
+    download_dir=$2
+
+    download_module="${download_path%%/*}"
+    download_source="ftp.ncbi.nlm.nih.gov::$download_path"
+    download_destination="$download_dir/$download_module"
+
+    mkdir -p $download_destination
+    echo "Downloading $download_source to $download_destination"
+    rsync --no-motd -DHPRprtv "$download_source" "$download_destination"
+}
+
+relative_path="refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical"
+
+# download historical genbank file
+file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz"
+download_ncbi_file $file_path $ncbi_dir
+
+# download historical gff file
+file_path="$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz"
+download_ncbi_file $file_path $ncbi_dir
+
+# extract intermediate files from genbank file
+python misc/refseq-historical-backfill/ncbi_extract_gbff.py \
+  "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz" \
+  --output_dir "$working_dir" 2>&1 | tee "$log_dir/ncbi-parse-historical-ggbb.log"
+
+# extract exonset intermediate file from gff file
+python sbin/ncbi_parse_genomic_gff.py "$ncbi_dir/$relative_path/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz" | \
+  gzip -c > "$working_dir/unfiltered_exonsets.gz" 2>&1 | tee "$log_dir/ncbi-parse-historical-gff.log"
+
+# filter exonset alignments by txinfo
+sbin/filter_exonset_transcripts.py --tx-info "$working_dir/txinfo.gz" --exonsets "$working_dir/unfiltered_exonsets.gz" \
+    --missing-ids "$working_dir/filtered_tx_acs.txt" | gzip -c > "$working_dir/exonsets.gz" 2>&1 | \
+    tee "$log_dir/filter_exonset_transcripts.log"
diff --git a/sbin/exonset-to-seqinfo b/sbin/exonset-to-seqinfo
@@ -5,16 +5,15 @@
 import argparse
 import configparser as ConfigParser
 import gzip
+import importlib_resources
 import itertools
 import logging
 import logging.config
-import pkg_resources
 import re
 import sys
 
 from bioutils.digests import seq_md5
 from biocommons.seqrepo import SeqRepo
-# from multifastadb import MultiFastaDB
 
 from uta.formats.exonset import ExonSetReader
 from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
@@ -32,16 +31,15 @@ def parse_args(argv):
                     required=True)
     ap.add_argument("--conf",
                     default=[
-                        pkg_resources.resource_filename("uta", "../../etc/global.conf")]
-                    )
+                        importlib_resources.files("uta").joinpath("../../etc/global.conf")
+                    ])
 
     opts = ap.parse_args(argv)
     return opts
 
 
 if __name__ == "__main__":
-    logging_conf_fn = pkg_resources.resource_filename(
-        "uta", "etc/logging.conf")
+    logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
     logging.config.fileConfig(logging_conf_fn)
     logger = logging.getLogger(__name__)
     logger.setLevel(logging.INFO)
@@ -53,15 +51,13 @@ if __name__ == "__main__":
     cf = ConfigParser.ConfigParser()
     for conf_fn in opts.conf:
         cf.read_file(open(conf_fn))
-        logger.info("loaded " + conf_fn)
+        logger.info("loaded " + str(conf_fn))
 
     in_fn = opts.FILES[0]
     in_fh = gzip.open(in_fn, 'rt') if in_fn.endswith(".gz") else open(in_fn)
     esr = ExonSetReader(in_fh)
     logger.info("opened " + in_fn)
 
-    #fa_dirs = cf.get("sequences", "fasta_directories").strip().splitlines()
-    #mfdb = MultiFastaDB(fa_dirs, use_meta_index=True)
     sr_dir = cf.get("sequences", "seqrepo")
     sr = SeqRepo(root_dir=sr_dir)
     logger.info("Opened sequence directories: " + sr_dir)

diff --git a/sbin/filter_exonset_transcripts.py b/sbin/filter_exonset_transcripts.py
@@ -28,10 +28,10 @@ def filter_exonset(exonset_file, transcript_ids, missing_ids_file):
             if exonset.tx_ac in transcript_ids:
                 esw.write(exonset)
             else:
-                logger.warning(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.")
+                logger.debug(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.")
                 writer_missing.writerow([exonset.tx_ac])
                 missing_acs.add(exonset.tx_ac)
-    logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s): {','.join(missing_acs)}")
+    logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s)")
 
 
 def main():