From 0be5d93f24a7a7ec52a43a5e423a5e6c1597e4cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A4=D0=B8=D1=88=D0=BC=D0=B0=D0=BD=20=D0=92=D0=B5=D0=BD?=
 =?UTF-8?q?=D0=B8=D0=B0=D0=BC=D0=B8=D0=BD?=
Date: Tue, 18 Jul 2023 16:17:01 +0000
Subject: [PATCH] Variant-aware corpus generation

---
 pyproject.toml                                |   5 +-
 requirements.txt                              |   4 +-
 .../1KG+HGDP.selected_samples.tsv             |  38 ++
 src/gena_lm/genome_tools/README.md            |  63 +++-
 .../genome_tools/create_allele_corpus.py      | 334 ++++++++++++++++++
 src/gena_lm/genome_tools/create_corpus.py     |  18 +-
 .../genome_tools/create_multigenome_corpus.sh |   5 +-
 7 files changed, 448 insertions(+), 19 deletions(-)
 create mode 100644 src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv
 create mode 100755 src/gena_lm/genome_tools/create_allele_corpus.py
 mode change 100644 => 100755 src/gena_lm/genome_tools/create_corpus.py
 mode change 100644 => 100755 src/gena_lm/genome_tools/create_multigenome_corpus.sh

diff --git a/pyproject.toml b/pyproject.toml
index fa7093a..2837cad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,6 @@
 [build-system]
 requires = ["setuptools>=42"]
-build-backend = "setuptools.build_meta"
\ No newline at end of file
+build-backend = "setuptools.build_meta"
+
+[tool.mypy]
+ignore_missing_imports = true
diff --git a/requirements.txt b/requirements.txt
index 5c70927..f10881e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 numpy
-biopython
+cyvcf2
 tokenizers
 tqdm
 torch">=1.7.1,<=1.9.1"
 transformers==4.17.0
 pysam>=0.19.1
 biopython>=1.79
-pandas>=1.5.0
\ No newline at end of file
+pandas>=1.5.0
diff --git a/src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv b/src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv
new file mode 100644
index 0000000..d224c2d
--- /dev/null
+++ b/src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv
@@ -0,0 +1,38 @@
+NA12877	ceph	M
+NA12878	ceph	F
+HGDP00815	chinese	M
+HG00513	chinese	F
+HG00190	finnish	M
+HG00171	finnish	F
+HG00138	british	M
+HG00253	british	F
+HGDP00888	russian	M
+HGDP00898	russian	F
+HGDP00950	yakut	M
+HGDP00959	yakut	F
+HGDP01404	adygei	M
+HGDP01399	adygei	F
+HGDP01050	pima	M
+HGDP01041	pima	F
+HGDP01081	mbuti	M
+HGDP00471	mbuti	F
+HG01953	peruvian	M
+HG01573	peruvian	F
+HGDP00727	palestinian	M
+HGDP00699	palestinian	F
+HGDP00551	papuan	M
+HGDP00550	papuan	F
+NA21111	gujarati	M
+NA20896	gujarati	F
+HGDP00258	pathan	M
+HGDP00237	pathan	F
+HGDP00929	yoruba	M
+NA19129	yoruba	F
+HGDP00845	surui	M
+HGDP00838	surui	F
+HGDP01167	tuscan	M
+HGDP01169	tuscan	F
+HGDP01300	uygur	M
+HGDP01305	uygur	F
+HG03081	mende	M
+HG03055	mende	F
diff --git a/src/gena_lm/genome_tools/README.md b/src/gena_lm/genome_tools/README.md
index c3a4c44..d26e9f5 100644
--- a/src/gena_lm/genome_tools/README.md
+++ b/src/gena_lm/genome_tools/README.md
@@ -1,8 +1,61 @@
-A single genome file can be processed with create_corpus.py
+A single genome reference file can be processed with `create_corpus.py`.
+
+## Multi-species corpus
 
 To download and process multiple genome datasets, there are following steps:
+
 1. Download fasta files:
-'cat ensemble_genomes_metadata.tsv | cut -f2 | xargs -L 1 wget -P "fasta/" --no-clobber'
-2. Download and extract processed agp information into "contigs/" folder
-3. Run
-'bash create_multigenome_corpus.sh /path/to/full_ensembl_genomes_metadata /path/to/directory/with/fasta/ /path/to/directory/with/contigs/ /path/to/output/folder'
\ No newline at end of file
+```shell
+cut -f2 ensemble_genomes_metadata.tsv | xargs -L 1 wget -P fasta --no-clobber
+```
+
+2. Download and extract the processed AGP information into the `contigs/` folder.
+
+3. Run:
+```shell
+bash create_multigenome_corpus.sh \
+    /path/to/full_ensembl_genomes_metadata \
+    /path/to/directory/with/fasta/ \
+    /path/to/directory/with/contigs/ \
+    /path/to/output/folder
+```
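+
+To sanity-check a generated corpus, you can count documents and sentences in one shard. This is a minimal sketch: it assumes `--io-mode jsonl` output and the `{"text": [...]}` document schema used by the scripts in this directory (see `Document.to_jsonl` in `create_allele_corpus.py`); the shard path is illustrative.
+```python
+import json
+
+# count documents and sentences in one corpus shard
+with open("corpus/chr21.documents.jsonl") as fh:
+    docs = [json.loads(line) for line in fh]
+print(f"{len(docs)} documents, {sum(len(d['text']) for d in docs)} sentences")
+```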
+
+## Variation corpus
+
+1. Download the [1000 Genomes + HGDP whole genome callset from gnomAD](https://gnomad.broadinstitute.org/downloads#v3-hgdp-1kg):
+```shell
+    GOO=https://storage.googleapis.com/gcp-public-data--gnomad/
+    AWS=https://gnomad-public-us-east-1.s3.amazonaws.com/
+    AZU=https://datasetgnomad.blob.core.windows.net/dataset/
+    # choose whichever cloud
+    LINK=$GOO
+    LINK=${LINK}release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.hgdp_tgp
+
+    mkdir 1KG+HGDP
+    for CHR in chr{1..22} chr{X,Y} ; do
+        wget -P 1KG+HGDP -c $LINK.$CHR.vcf.{bgz,bgz.tbi}
+    done
+```
+
+2. Download the hg38 reference:
+```shell
+    mkdir hg38
+    wget -P hg38 https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz
+    gunzip hg38/hg38.analysisSet.fa.gz
+    samtools faidx hg38/hg38.analysisSet.fa
+```
+
+3. Apply variants:
+```shell
+    # select some samples from across the world;
+    # tr joins the IDs with commas and ${SAMPLES%?} below strips the trailing one
+    SAMPLES=$(cut -f1 1KG+HGDP.selected_samples.tsv | tr '\n' ',')
+
+    mkdir variation_corpus
+    for CHR in chr{1..22} chr{X,Y} ; do
+        python create_allele_corpus.py \
+            --reference hg38/hg38.analysisSet.fa \
+            --vcf 1KG+HGDP/gnomad.genomes.v3.1.2.hgdp_tgp.$CHR.vcf.bgz \
+            --samples ${SAMPLES%?} \
+            --output-dir variation_corpus
+    done
+```
diff --git a/src/gena_lm/genome_tools/create_allele_corpus.py b/src/gena_lm/genome_tools/create_allele_corpus.py
new file mode 100755
index 0000000..83bae35
--- /dev/null
+++ b/src/gena_lm/genome_tools/create_allele_corpus.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+import click
+import json
+import random
+
+from pathlib import Path
+from typing import Generator, List, Literal, Optional, Tuple
+
+from Bio import Seq, SeqIO
+from cyvcf2 import VCF, Variant
+from tqdm import tqdm
+
+# reproducible randomness
+random.seed(42)
+
+
+class Document:
+    def __init__(
+        self, sentences: Optional[List[str]] = None, metadata: Optional[dict] = None
+    ) -> None:
+        """
+        Data structure to store the subsequences (sentences) that form one corpus document.
+
+        Each document contains up to `max(sentences_bounds) * max(lengths_bounds)` nucleotides.
+        """
+        if sentences is None:
+            self.sentences = []
+        else:
+            self.sentences = sentences
+        self.metadata = metadata
+
+    def append(self, sentence: str) -> None:
+        self.sentences.append(sentence)
+
+    def set_metadata(self, metadata: dict) -> None:
+        self.metadata = metadata
+
+    def to_text(self) -> str:
+        return "\n".join(self.sentences)
+
+    def to_jsonl(self) -> str:
+        if self.metadata:
+            return json.dumps({"text": self.sentences, **self.metadata})
+        else:
+            return json.dumps({"text": self.sentences})
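+
+
+# Illustrative example of the Document JSONL schema (values are placeholders):
+#
+#     doc = Document(metadata={"sample": "NA12878", "len": 8, "snt": 2})
+#     doc.append("ACGT")
+#     doc.append("TTAA")
+#     doc.to_jsonl()
+#     -> '{"text": ["ACGT", "TTAA"], "sample": "NA12878", "len": 8, "snt": 2}'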
+
+
+def generate_consensus_documents(
+    chr_sequence: SeqIO.SeqRecord,
+    vcf_file: VCF,
+    sample_list: List[str],
+    sentences_bounds: Tuple[int, int] = (50, 100),
+    lengths_bounds: Tuple[int, int] = (500, 1000),
+) -> Generator[Document, None, None]:
+    """
+    Yield documents covering the chromosome, built from the reference sequence
+    with positions replaced by the alleles each sample carries in the VCF file.
+    """
+
+    def choose_alleles(
+        genotype: List,
+        ref_allele: str,
+        alt_alleles: List[str],
+        AF: float,
+    ) -> Tuple[str, str, int, int]:
+        """
+        Helper function to return two allele letters and two counters:
+        (allele1, allele2, variant_count, common_count).
+        cyvcf2 returns a genotype as [allele1: int, allele2: int, phase_status].
+        """
+        if genotype[0] == -1:
+            # not genotyped, will use the reference
+            return (ref_allele, ref_allele, 0, 0)
+        else:
+            alleles = [ref_allele] + alt_alleles
+
+            is_common = 1
+            if AF < 0.1:
+                is_common = 0
+
+            return (alleles[genotype[0]], alleles[genotype[1]], 1, is_common)
+
+    def make_sample_use_mask(
+        samples: List[str],
+        variants: List[Variant]
+    ) -> List[bool]:
+        """
+        Helper function to decide which samples can be skipped.
+        Returns a bitmask over all samples: False = skip, True = use.
+        """
+        sample_use_mask = [False for _ in samples]
+        for sample_id in range(len(samples)):
+            # use a sample only if it has a genotyped non-reference variant in this region
+            for variant in variants:
+                if variant.genotypes[sample_id][0] not in [0, -1]:
+                    sample_use_mask[sample_id] = True
+                    break
+        return sample_use_mask
+
+    def prepare_documents(
+        sequence: str,
+        snt_bounds: List[int],
+        metadata: dict
+    ) -> Tuple[Document, Document]:
+        """
+        Helper function for the final compilation of documents.
+        Given the full consensus sequence and its sentence bounds,
+        split it into sentences and do the same for the reverse-complement copy.
+        """
+        d = Document(metadata=metadata)
+        d_rc = Document(metadata=metadata)
+
+        snt_pos = 0
+        for snt_len in snt_bounds:
+            snt = sequence[snt_pos:snt_pos + snt_len]
+            d.append(snt)
+            snt_pos += snt_len
+
+        snt_pos = 0
+        sequence_rc = Seq.reverse_complement(sequence)
+        for snt_len in reversed(snt_bounds):
+            snt = sequence_rc[snt_pos:snt_pos + snt_len]
+            d_rc.append(snt)
+            snt_pos += snt_len
+        return (d, d_rc)
+
+    C = len(chr_sequence)
+
+    # a single pass over the chromosome, starting at a small random offset
+    doc_start = random.randint(0, 5000)
+    while doc_start + lengths_bounds[1] < C:
+        # prepare a document
+        current_pos = doc_start
+
+        snt_len = random.randint(*lengths_bounds)
+        snt_left = random.randint(*sentences_bounds)
+        snt_bounds = []
+        while (current_pos + snt_len < C) and (snt_left > 0):
+            # prepare a sentence in this document
+            snt_left -= 1
+            current_pos += snt_len
+            snt_bounds.append(snt_len)
+            snt_len = random.randint(*lengths_bounds)
+
+        doc_end = current_pos
+        ref_part = str(chr_sequence.seq[doc_start:doc_end].upper())
+        ref_len = len(ref_part)
+        NF = ref_part.count("N") / ref_len
+        if NF > 0.5:
+            # mostly unsequenced gap, skip this region
+            doc_start = doc_end
+            continue
+
+        # cyvcf2 region strings are 1-based inclusive, doc_start is 0-based
+        region = f"{chr_sequence.id}:{doc_start + 1}-{doc_end}"
+        variants = list(vcf_file(region))
+        if not variants:
+            doc_start = doc_end
+            continue
+
+        sample_use_mask = make_sample_use_mask(vcf_file.samples, variants)
+        for sample_id in range(len(vcf_file.samples)):
+            if not sample_use_mask[sample_id]:
+                continue
+
+            sample_cons1 = ""
+            sample_cons2 = ""
+            total_VCS = 0
+            total_CCS = 0
+            total_CC = 0
+            total_RC = 0
+            var_pos = 0
+            for v in variants:
+                # -1: v.POS is VCF (1-based) and doc_start is python (0-based)
+                local_pos = v.POS - doc_start - 1
+                if local_pos < var_pos:
+                    # tabix may return variants overlapping the region start
+                    # or a previous deletion; skip them
+                    continue
+                if local_pos > var_pos:
+                    # copy the reference parts prior to the variant
+                    sample_cons1 += ref_part[var_pos:local_pos]
+                    sample_cons2 += ref_part[var_pos:local_pos]
+
+#               AF = v.INFO.get("gnomad_AF")
+#               if not AF:
+#                   AF = v.INFO.get("AF")
+#               if not AF:
+#                   AF = v.aaf
+#               requesting AF slows it down 3-10x
+                AF = 0.5
+
+                # variant alleles
+                (allele1, allele2, variant_count, common_count) = choose_alleles(
+                    v.genotypes[sample_id],
+                    v.REF, v.ALT, AF
+                )
+                sample_cons1 += allele1
+                sample_cons2 += allele2
+
+                total_VCS += variant_count
+                total_CCS += common_count
+
+#               if AF < 0.01:
+#                   total_RC += 1
+#               if AF > 0.1:
+#                   total_CC += 1
+
+                # +1: we already selected var_pos+0 for the variant itself
+                # -1: if REF is long (in case of a deletion),
+                #     skip the reference letters after the first one,
+                #     as choose_alleles() already added the right amount of REF
+                #     if the sample was REF
+                var_pos = local_pos + 1 + len(v.REF) - 1
+
+            if (total_CCS / ref_len < 0.01):
+                # skip regions with low common variant content
+                continue
+
+            metadata = {
+                "VCS": total_VCS,  # applied variant count
+#               "CCS": total_CCS,  # applied common >10% count
+#               "RC": total_RC,  # pop rare <1% count
+#               "CC": total_CC,  # pop common count
+                "VF": f"{total_VCS/len(ref_part):.2f}",  # VCS normalized by doc len
+                "sample": vcf_file.samples[sample_id],
+                "len": len(ref_part),
+                "snt": len(snt_bounds)
+            }
+
+            cons_unequal = True
+            if sample_cons1 == sample_cons2:
+                cons_unequal = False
+
+            # copy the reference parts after the last variant
+            sample_cons1 += ref_part[var_pos:]
+
+            (d1, d1_rc) = prepare_documents(sample_cons1, snt_bounds, metadata)
+            yield d1
+            yield d1_rc
+
+            if cons_unequal:
+                # the two haplotypes differ, emit the second one as well
+                sample_cons2 += ref_part[var_pos:]
+                (d2, d2_rc) = prepare_documents(sample_cons2, snt_bounds, metadata)
+                yield d2
+                yield d2_rc
+
+        doc_start = doc_end
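+
+
+# Worked example of the var_pos arithmetic above (illustrative values):
+# with ref_part = "ACGTACGT" and a deletion at v.POS = doc_start + 3
+# (REF "GTA", ALT ["G"]), local_pos = 2, so ref_part[0:2] == "AC" is copied
+# first, choose_alleles() contributes one allele ("G" for ALT, "GTA" for REF),
+# and var_pos jumps to 2 + 1 + 3 - 1 = 5, skipping the deleted "TA":
+# the ALT consensus becomes "AC" + "G" + "CGT" == "ACGCGT".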
+
+
+def handle_chromosome_and_variants(
+    chr: SeqIO.SeqRecord,
+    vcf_file: VCF,
+    sample_list: List[str],
+    outdir: Path,
+    io_mode: Literal["single_txt", "jsonl", "multiple_txt"] = "single_txt",
+):
+    """
+    For a given chromosome and its variants, generate documents and write them to the corresponding files
+    """
+    samples = ",".join(sample_list)
+
+    if io_mode == "single_txt":
+        filename = outdir / f"{chr.id}.{samples}.documents.txt"
+        with filename.open(mode="w") as out:
+            for document in tqdm(generate_consensus_documents(chr, vcf_file, sample_list), desc=chr.description):
+                out.write(document.to_text())
+                out.write("\n")
+    elif io_mode == "jsonl":
+        filename = outdir / f"{chr.id}.{samples}.documents.jsonl"
+        with filename.open(mode="w") as out:
+            for document in tqdm(generate_consensus_documents(chr, vcf_file, sample_list), desc=chr.description):
+                out.write(document.to_jsonl())
+                out.write("\n")
+    elif io_mode == "multiple_txt":
+        for idx, document in enumerate(generate_consensus_documents(chr, vcf_file, sample_list)):
+            filename = outdir / f"{chr.id}.{samples}.document_{idx}.txt"
+            with filename.open(mode="w") as out:
+                out.write(document.to_text())
+
+
+def read_fasta_and_vcf(
+    fasta_file: Path,
+    vcf_file: Path,
+    samples: Optional[str],
+    output_dir: Optional[Path] = None,
+    io_mode: Literal["single_txt", "jsonl", "multiple_txt"] = "single_txt"
+) -> None:
+    if not output_dir:
+        output_dir = Path(".")
+
+    sample_list = samples.split(",") if samples else None
+    vcf = VCF(vcf_file, samples=sample_list, threads=2)
+    if sample_list is None:
+        # no --samples given: use every sample present in the VCF
+        sample_list = list(vcf.samples)
+
+    with open(fasta_file) as input_handle:
+        for record in SeqIO.parse(input_handle, "fasta"):
+            if record.id == "chrM":
+                continue
+            try:
+                # probe the VCF: skip contigs without any variant records
+                next(vcf(record.id))
+            except StopIteration:
+                continue
+            handle_chromosome_and_variants(
+                record,
+                vcf,
+                sample_list,
+                outdir=output_dir, io_mode=io_mode
+            )
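+
+
+# Illustrative programmatic use (paths and the sample ID are placeholders;
+# assumes an indexed VCF and a matching reference fasta):
+#
+#     vcf = VCF("1KG+HGDP/gnomad.genomes.v3.1.2.hgdp_tgp.chr21.vcf.bgz", samples=["NA12878"])
+#     record = next(SeqIO.parse("hg38/hg38.analysisSet.fa", "fasta"))
+#     for doc in generate_consensus_documents(record, vcf, ["NA12878"]):
+#         print(doc.to_text())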
+
+
+# example usage:
+# python create_allele_corpus.py --reference /path/to/hg38.fa \
+#                                --vcf /path/to/gnomad.vcf \
+#                                --samples NA12877,NA12878 \
+#                                --output-dir data/processed/alleles/ --io-mode jsonl
+
+@click.command()
+@click.option("--reference", required=True,
+              help="Path to the genome reference fasta",
+              type=click.Path(path_type=Path, dir_okay=False))
+@click.option("--vcf", required=True,
+              help="Path to the indexed vcf",
+              type=click.Path(path_type=Path, dir_okay=False))
+@click.option("--samples", required=False,
+              help="Comma-separated list of samples from the vcf to use. By default all will be used.")
+@click.option("--output-dir", required=False,
+              type=click.Path(path_type=Path, dir_okay=True))
+@click.option("--io-mode",
+              type=click.Choice(["single_txt", "jsonl", "multiple_txt"]),
+              default="jsonl")
+def cli(reference, vcf, samples, output_dir, io_mode):
+    if output_dir:
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    read_fasta_and_vcf(reference, vcf, samples, output_dir=output_dir, io_mode=io_mode)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/src/gena_lm/genome_tools/create_corpus.py b/src/gena_lm/genome_tools/create_corpus.py
old mode 100644
new mode 100755
index 2432272..2db4a09
--- a/src/gena_lm/genome_tools/create_corpus.py
+++ b/src/gena_lm/genome_tools/create_corpus.py
@@ -1,4 +1,4 @@
-# import numpy as np
+#!/usr/bin/env python
 import ast
 import gzip
 import json
@@ -149,15 +149,15 @@ def read_single_fasta(fna_file: Path, output_dir: Optional[Path] = None,
     handle_chromosome(record, outdir=output_dir, io_mode=io_mode)
 
 # example usage:
-# python create_corpus.py --input_file ./ncbi_dataset/data/GCA_009914755.4/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna \
-#                         --output_dir data/processed/human/ --io_mode jsonl --min_len 10000
+# python create_corpus.py --input-file ./ncbi_dataset/data/GCA_009914755.4/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna \
+#                         --output-dir data/processed/human/ --io-mode jsonl --min-len 10000
 
 @click.command()
-@click.option("--input_file", type=str)
-@click.option("--contigs_split_file", type=str)
-@click.option("--output_dir", type=click.Path(path_type=Path, dir_okay=True))
-@click.option("--io_mode", type=click.Choice(["single_txt", "jsonl", "multiple_txt"]), default="single_txt")
-@click.option("--min_len", type=click.INT, default=10000, help="Minimum contig length to be included")
+@click.option("--input-file", type=str)
+@click.option("--contigs-split-file", type=str)
+@click.option("--output-dir", type=click.Path(path_type=Path, dir_okay=True))
+@click.option("--io-mode", type=click.Choice(["single_txt", "jsonl", "multiple_txt"]), default="single_txt")
+@click.option("--min-len", type=click.INT, default=10000, help="Minimum contig length to be included")
 #@click.option("--rc", is_flag=True, show_default=True, default=False, help="Reverse-complement all sequences")
 
 def cli(input_file, contigs_split_file, output_dir, io_mode, min_len):
@@ -171,4 +171,4 @@ def cli(input_file, contigs_split_file, output_dir, io_mode, min_len):
 
 
 if __name__ == "__main__":
-    cli()
\ No newline at end of file
+    cli()
diff --git a/src/gena_lm/genome_tools/create_multigenome_corpus.sh b/src/gena_lm/genome_tools/create_multigenome_corpus.sh
old mode 100644
new mode 100755
index 5306c93..c7dcf81
--- a/src/gena_lm/genome_tools/create_multigenome_corpus.sh
+++ b/src/gena_lm/genome_tools/create_multigenome_corpus.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 # create_multigenome_corpus.sh metadata_table fasta_dir contigs_dir out_folder
 
 set -e
@@ -21,5 +22,5 @@
 do
     contigs_split_file=$contigs_dir"/"${spname::-1}".breaks.csv"
     echo "Processing $spname"
-    python create_corpus.py --input_file $fastafile --contigs_split_file $contigs_split_file --output_dir $spoutdir
-done
\ No newline at end of file
+    python create_corpus.py --input-file $fastafile --contigs-split-file $contigs_split_file --output-dir $spoutdir
+done