From 0be5d93f24a7a7ec52a43a5e423a5e6c1597e4cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A4=D0=B8=D1=88=D0=BC=D0=B0=D0=BD=20=D0=92=D0=B5=D0=BD?=
 =?UTF-8?q?=D0=B8=D0=B0=D0=BC=D0=B8=D0=BD?=
Date: Tue, 18 Jul 2023 16:17:01 +0000
Subject: [PATCH] Variant-aware corpus generation

---
 pyproject.toml                                |   5 +-
 requirements.txt                              |   4 +-
 .../1KG+HGDP.selected_samples.tsv             |  38 ++
 src/gena_lm/genome_tools/README.md            |  63 +++-
 .../genome_tools/create_allele_corpus.py      | 334 ++++++++++++++++++
 src/gena_lm/genome_tools/create_corpus.py     |  18 +-
 .../genome_tools/create_multigenome_corpus.sh |   5 +-
 7 files changed, 448 insertions(+), 19 deletions(-)
 create mode 100644 src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv
 create mode 100755 src/gena_lm/genome_tools/create_allele_corpus.py
 mode change 100644 => 100755 src/gena_lm/genome_tools/create_corpus.py
 mode change 100644 => 100755 src/gena_lm/genome_tools/create_multigenome_corpus.sh

diff --git a/pyproject.toml b/pyproject.toml
index fa7093a..2837cad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,6 @@
 [build-system]
 requires = ["setuptools>=42"]
-build-backend = "setuptools.build_meta"
\ No newline at end of file
+build-backend = "setuptools.build_meta"
+
+[tool.mypy]
+ignore_missing_imports = true
diff --git a/requirements.txt b/requirements.txt
index 5c70927..f10881e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 numpy
-biopython
+cyvcf2
 tokenizers
 tqdm
 torch">=1.7.1,<=1.9.1"
 transformers==4.17.0
 pysam>=0.19.1
 biopython>=1.79
-pandas>=1.5.0
\ No newline at end of file
+pandas>=1.5.0
diff --git a/src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv b/src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv
new file mode 100644
index 0000000..d224c2d
--- /dev/null
+++ b/src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv
@@ -0,0 +1,38 @@
+NA12877	ceph	M
+NA12878	ceph	F
+HGDP00815	chinese	M
+HG00513	chinese	F
+HG00190	finnish	M
+HG00171	finnish	F
+HG00138	british	M
+HG00253	british	F
+HGDP00888	russian	M
+HGDP00898	russian	F
+HGDP00950	yakut	M
+HGDP00959	yakut	F
+HGDP01404	adygei	M
+HGDP01399	adygei	F
+HGDP01050	pima	M
+HGDP01041	pima	F
+HGDP01081	mbuti	M
+HGDP00471	mbuti	F
+HG01953	peruvian	M
+HG01573	peruvian	F
+HGDP00727	palestinian	M
+HGDP00699	palestinian	F
+HGDP00551	papuan	M
+HGDP00550	papuan	F
+NA21111	gujarati	M
+NA20896	gujarati	F
+HGDP00258	pathan	M
+HGDP00237	pathan	F
+HGDP00929	yoruba	M
+NA19129	yoruba	F
+HGDP00845	surui	M
+HGDP00838	surui	F
+HGDP01167	tuscan	M
+HGDP01169	tuscan	F
+HGDP01300	uygur	M
+HGDP01305	uygur	F
+HG03081	mende	M
+HG03055	mende	F
diff --git a/src/gena_lm/genome_tools/README.md b/src/gena_lm/genome_tools/README.md
index c3a4c44..d26e9f5 100644
--- a/src/gena_lm/genome_tools/README.md
+++ b/src/gena_lm/genome_tools/README.md
@@ -1,8 +1,61 @@
-A single genome file can be processed with create_corpus.py
+A single genome reference file can be processed with `create_corpus.py`.
+
+## Multi-species corpus
 
 To download and process multiple genome datasets, there are following steps:
+
 1. Download fasta files:
-'cat ensemble_genomes_metadata.tsv | cut -f2 | xargs -L 1 wget -P "fasta/" --no-clobber'
-2. Download and extract processed agp information into "contigs/" folder
-3. Run
-'bash create_multigenome_corpus.sh /path/to/full_ensembl_genomes_metadata /path/to/directory/with/fasta/ /path/to/directory/with/contigs/ /path/to/output/folder'
\ No newline at end of file
+```shell
+cut -f2 ensemble_genomes_metadata.tsv | xargs -L 1 wget -P fasta --no-clobber
+```
+
+2. Download and extract the processed AGP information into the `contigs/` folder.
+
+3. Run:
+```shell
+bash create_multigenome_corpus.sh \
+    /path/to/full_ensembl_genomes_metadata \
+    /path/to/directory/with/fasta/ \
+    /path/to/directory/with/contigs/ \
+    /path/to/output/folder
+```
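+
+To sanity-check a generated corpus, you can count documents and sentences in one shard. This is a minimal sketch: it assumes `--io-mode jsonl` output and the `{"text": [...]}` document schema used by the scripts in this directory (see `Document.to_jsonl` in `create_allele_corpus.py`); the shard path is illustrative.
+```python
+import json
+
+# count documents and sentences in one corpus shard
+with open("corpus/chr21.documents.jsonl") as fh:
+    docs = [json.loads(line) for line in fh]
+print(f"{len(docs)} documents, {sum(len(d['text']) for d in docs)} sentences")
+```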
+
+## Variation corpus
+
+1. Download the [1000 Genomes + HGDP whole genome callset from gnomAD](https://gnomad.broadinstitute.org/downloads#v3-hgdp-1kg):
+```shell
+    GOO=https://storage.googleapis.com/gcp-public-data--gnomad/
+    AWS=https://gnomad-public-us-east-1.s3.amazonaws.com/
+    AZU=https://datasetgnomad.blob.core.windows.net/dataset/
+    # choose whichever cloud
+    LINK=$GOO
+    LINK=${LINK}release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.hgdp_tgp
+
+    mkdir 1KG+HGDP
+    for CHR in chr{1..22} chr{X,Y} ; do
+        wget -P 1KG+HGDP -c $LINK.$CHR.vcf.{bgz,bgz.tbi}
+    done
+```
+
+2. Download the hg38 reference:
+```shell
+    mkdir hg38
+    wget -P hg38 https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz
+    gunzip hg38/hg38.analysisSet.fa.gz
+    samtools faidx hg38/hg38.analysisSet.fa
+```
+
+3. Apply variants:
+```shell
+    # select some samples from across the world;
+    # tr joins the IDs with commas and ${SAMPLES%?} below strips the trailing one
+    SAMPLES=$(cut -f1 1KG+HGDP.selected_samples.tsv | tr '\n' ',')
+
+    mkdir variation_corpus
+    for CHR in chr{1..22} chr{X,Y} ; do
+        python create_allele_corpus.py \
+            --reference hg38/hg38.analysisSet.fa \
+            --vcf 1KG+HGDP/gnomad.genomes.v3.1.2.hgdp_tgp.$CHR.vcf.bgz \
+            --samples ${SAMPLES%?} \
+            --output-dir variation_corpus
+    done
+```
diff --git a/src/gena_lm/genome_tools/create_allele_corpus.py b/src/gena_lm/genome_tools/create_allele_corpus.py
new file mode 100755
index 0000000..83bae35
--- /dev/null
+++ b/src/gena_lm/genome_tools/create_allele_corpus.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+import click
+import json
+import random
+
+from pathlib import Path
+from typing import Generator, List, Literal, Optional, Tuple
+
+from Bio import Seq, SeqIO
+from cyvcf2 import VCF, Variant
+from tqdm import tqdm
+
+# reproducible randomness
+random.seed(42)
+
+
+class Document:
+    def __init__(
+        self, sentences: Optional[List[str]] = None, metadata: Optional[dict] = None
+    ) -> None:
+        """
+        Data structure to store the subsequences (sentences) that form one corpus document.
+
+        Each document contains up to `max(sentences_bounds) * max(lengths_bounds)` nucleotides.
+        """
+        if sentences is None:
+            self.sentences = []
+        else:
+            self.sentences = sentences
+        self.metadata = metadata
+
+    def append(self, sentence: str) -> None:
+        self.sentences.append(sentence)
+
+    def set_metadata(self, metadata: dict) -> None:
+        self.metadata = metadata
+
+    def to_text(self) -> str:
+        return "\n".join(self.sentences)
+
+    def to_jsonl(self) -> str:
+        if self.metadata:
+            return json.dumps({"text": self.sentences, **self.metadata})
+        else:
+            return json.dumps({"text": self.sentences})
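+
+
+# Illustrative example of the Document JSONL schema (values are placeholders):
+#
+#     doc = Document(metadata={"sample": "NA12878", "len": 8, "snt": 2})
+#     doc.append("ACGT")
+#     doc.append("TTAA")
+#     doc.to_jsonl()
+#     -> '{"text": ["ACGT", "TTAA"], "sample": "NA12878", "len": 8, "snt": 2}'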
+
+
+def generate_consensus_documents(
+    chr_sequence: SeqIO.SeqRecord,
+    vcf_file: VCF,
+    sample_list: List[str],
+    sentences_bounds: Tuple[int, int] = (50, 100),
+    lengths_bounds: Tuple[int, int] = (500, 1000),
+) -> Generator[Document, None, None]:
+    """
+    Yield documents covering the chromosome, built from the reference sequence
+    with positions replaced by the alleles each sample carries in the VCF file.
+    """
+
+    def choose_alleles(
+        genotype: List,
+        ref_allele: str,
+        alt_alleles: List[str],
+        AF: float,
+    ) -> Tuple[str, str, int, int]:
+        """
+        Helper function to return two allele letters and two counters:
+        (allele1, allele2, variant_count, common_count).
+        cyvcf2 returns a genotype as [allele1: int, allele2: int, phase_status].
+        """
+        if genotype[0] == -1:
+            # not genotyped, will use the reference
+            return (ref_allele, ref_allele, 0, 0)
+        else:
+            alleles = [ref_allele] + alt_alleles
+
+            is_common = 1
+            if AF < 0.1:
+                is_common = 0
+
+            return (alleles[genotype[0]], alleles[genotype[1]], 1, is_common)
+
+    def make_sample_use_mask(
+        samples: List[str],
+        variants: List[Variant]
+    ) -> List[bool]:
+        """
+        Helper function to decide which samples can be skipped.
+        Returns a bitmask over all samples: False = skip, True = use.
+        """
+        sample_use_mask = [False for _ in samples]
+        for sample_id in range(len(samples)):
+            # use a sample only if it has a genotyped non-reference variant in this region
+            for variant in variants:
+                if variant.genotypes[sample_id][0] not in [0, -1]:
+                    sample_use_mask[sample_id] = True
+                    break
+        return sample_use_mask
+
+    def prepare_documents(
+        sequence: str,
+        snt_bounds: List[int],
+        metadata: dict
+    ) -> Tuple[Document, Document]:
+        """
+        Helper function for the final compilation of documents.
+        Given the full consensus sequence and its sentence bounds,
+        split it into sentences and do the same for the reverse-complement copy.
+        """
+        d = Document(metadata=metadata)
+        d_rc = Document(metadata=metadata)
+
+        snt_pos = 0
+        for snt_len in snt_bounds:
+            snt = sequence[snt_pos:snt_pos + snt_len]
+            d.append(snt)
+            snt_pos += snt_len
+
+        snt_pos = 0
+        sequence_rc = Seq.reverse_complement(sequence)
+        for snt_len in reversed(snt_bounds):
+            snt = sequence_rc[snt_pos:snt_pos + snt_len]
+            d_rc.append(snt)
+            snt_pos += snt_len
+        return (d, d_rc)
+
+    C = len(chr_sequence)
+
+    # a single pass over the chromosome, starting at a small random offset
+    doc_start = random.randint(0, 5000)
+    while doc_start + lengths_bounds[1] < C:
+        # prepare a document
+        current_pos = doc_start
+
+        snt_len = random.randint(*lengths_bounds)
+        snt_left = random.randint(*sentences_bounds)
+        snt_bounds = []
+        while (current_pos + snt_len < C) and (snt_left > 0):
+            # prepare a sentence in this document
+            snt_left -= 1
+            current_pos += snt_len
+            snt_bounds.append(snt_len)
+            snt_len = random.randint(*lengths_bounds)
+
+        doc_end = current_pos
+        ref_part = str(chr_sequence.seq[doc_start:doc_end].upper())
+        ref_len = len(ref_part)
+        NF = ref_part.count("N") / ref_len
+        if NF > 0.5:
+            # mostly unsequenced gap, skip this region
+            doc_start = doc_end
+            continue
+
+        # cyvcf2 region strings are 1-based inclusive, doc_start is 0-based
+        region = f"{chr_sequence.id}:{doc_start + 1}-{doc_end}"
+        variants = list(vcf_file(region))
+        if not variants:
+            doc_start = doc_end
+            continue
+
+        sample_use_mask = make_sample_use_mask(vcf_file.samples, variants)
+        for sample_id in range(len(vcf_file.samples)):
+            if not sample_use_mask[sample_id]:
+                continue
+
+            sample_cons1 = ""
+            sample_cons2 = ""
+            total_VCS = 0
+            total_CCS = 0
+            total_CC = 0
+            total_RC = 0
+            var_pos = 0
+            for v in variants:
+                # -1: v.POS is VCF (1-based) and doc_start is python (0-based)
+                local_pos = v.POS - doc_start - 1
+                if local_pos < var_pos:
+                    # tabix may return variants overlapping the region start
+                    # or a previous deletion; skip them
+                    continue
+                if local_pos > var_pos:
+                    # copy the reference parts prior to the variant
+                    sample_cons1 += ref_part[var_pos:local_pos]
+                    sample_cons2 += ref_part[var_pos:local_pos]
+
+#               AF = v.INFO.get("gnomad_AF")
+#               if not AF:
+#                   AF = v.INFO.get("AF")
+#               if not AF:
+#                   AF = v.aaf
+#               requesting AF slows it down 3-10x
+                AF = 0.5
+
+                # variant alleles
+                (allele1, allele2, variant_count, common_count) = choose_alleles(
+                    v.genotypes[sample_id],
+                    v.REF, v.ALT, AF
+                )
+                sample_cons1 += allele1
+                sample_cons2 += allele2
+
+                total_VCS += variant_count
+                total_CCS += common_count
+
+#               if AF < 0.01:
+#                   total_RC += 1
+#               if AF > 0.1:
+#                   total_CC += 1
+
+                # +1: we already selected var_pos+0 for the variant itself
+                # -1: if REF is long (in case of a deletion),
+                #     skip the reference letters after the first one,
+                #     as choose_alleles() already added the right amount of REF
+                #     if the sample was REF
+                var_pos = local_pos + 1 + len(v.REF) - 1
+
+            if (total_CCS / ref_len < 0.01):
+                # skip regions with low common variant content
+                continue
+
+            metadata = {
+                "VCS": total_VCS,  # applied variant count
+#               "CCS": total_CCS,  # applied common >10% count
+#               "RC": total_RC,  # pop rare <1% count
+#               "CC": total_CC,  # pop common count
+                "VF": f"{total_VCS/len(ref_part):.2f}",  # VCS normalized by doc len
+                "sample": vcf_file.samples[sample_id],
+                "len": len(ref_part),
+                "snt": len(snt_bounds)
+            }
+
+            cons_unequal = True
+            if sample_cons1 == sample_cons2:
+                cons_unequal = False
+
+            # copy the reference parts after the last variant
+            sample_cons1 += ref_part[var_pos:]
+
+            (d1, d1_rc) = prepare_documents(sample_cons1, snt_bounds, metadata)
+            yield d1
+            yield d1_rc
+
+            if cons_unequal:
+                # the two haplotypes differ, emit the second one as well
+                sample_cons2 += ref_part[var_pos:]
+                (d2, d2_rc) = prepare_documents(sample_cons2, snt_bounds, metadata)
+                yield d2
+                yield d2_rc
+
+        doc_start = doc_end
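+
+
+# Worked example of the var_pos arithmetic above (illustrative values):
+# with ref_part = "ACGTACGT" and a deletion at v.POS = doc_start + 3
+# (REF "GTA", ALT ["G"]), local_pos = 2, so ref_part[0:2] == "AC" is copied
+# first, choose_alleles() contributes one allele ("G" for ALT, "GTA" for REF),
+# and var_pos jumps to 2 + 1 + 3 - 1 = 5, skipping the deleted "TA":
+# the ALT consensus becomes "AC" + "G" + "CGT" == "ACGCGT".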
+
+
+def handle_chromosome_and_variants(
+    chr: SeqIO.SeqRecord,
+    vcf_file: VCF,
+    sample_list: List[str],
+    outdir: Path,
+    io_mode: Literal["single_txt", "jsonl", "multiple_txt"] = "single_txt",
+):
+    """
+    For a given chromosome and its variants, generate documents and write them to the corresponding files
+    """
+    samples = ",".join(sample_list)
+
+    if io_mode == "single_txt":
+        filename = outdir / f"{chr.id}.{samples}.documents.txt"
+        with filename.open(mode="w") as out:
+            for document in tqdm(generate_consensus_documents(chr, vcf_file, sample_list), desc=chr.description):
+                out.write(document.to_text())
+                out.write("\n")
+    elif io_mode == "jsonl":
+        filename = outdir / f"{chr.id}.{samples}.documents.jsonl"
+        with filename.open(mode="w") as out:
+            for document in tqdm(generate_consensus_documents(chr, vcf_file, sample_list), desc=chr.description):
+                out.write(document.to_jsonl())
+                out.write("\n")
+    elif io_mode == "multiple_txt":
+        for idx, document in enumerate(generate_consensus_documents(chr, vcf_file, sample_list)):
+            filename = outdir / f"{chr.id}.{samples}.document_{idx}.txt"
+            with filename.open(mode="w") as out:
+                out.write(document.to_text())
+
+
+def read_fasta_and_vcf(
+    fasta_file: Path,
+    vcf_file: Path,
+    samples: Optional[str],
+    output_dir: Optional[Path] = None,
+    io_mode: Literal["single_txt", "jsonl", "multiple_txt"] = "single_txt"
+) -> None:
+    if not output_dir:
+        output_dir = Path(".")
+
+    sample_list = samples.split(",") if samples else None
+    vcf = VCF(vcf_file, samples=sample_list, threads=2)
+    if sample_list is None:
+        # no --samples given: use every sample present in the VCF
+        sample_list = list(vcf.samples)
+
+    with open(fasta_file) as input_handle:
+        for record in SeqIO.parse(input_handle, "fasta"):
+            if record.id == "chrM":
+                continue
+            try:
+                # probe the VCF: skip contigs without any variant records
+                next(vcf(record.id))
+            except StopIteration:
+                continue
+            handle_chromosome_and_variants(
+                record,
+                vcf,
+                sample_list,
+                outdir=output_dir, io_mode=io_mode
+            )
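+
+
+# Illustrative programmatic use (paths and the sample ID are placeholders;
+# assumes an indexed VCF and a matching reference fasta):
+#
+#     vcf = VCF("1KG+HGDP/gnomad.genomes.v3.1.2.hgdp_tgp.chr21.vcf.bgz", samples=["NA12878"])
+#     record = next(SeqIO.parse("hg38/hg38.analysisSet.fa", "fasta"))
+#     for doc in generate_consensus_documents(record, vcf, ["NA12878"]):
+#         print(doc.to_text())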
+
+
+# example usage:
+# python create_allele_corpus.py --reference /path/to/hg38.fa \
+#                                --vcf /path/to/gnomad.vcf \
+#                                --samples NA12877,NA12878 \
+#                                --output-dir data/processed/alleles/ --io-mode jsonl
+
+@click.command()
+@click.option("--reference", required=True,
+              help="Path to the genome reference fasta",
+              type=click.Path(path_type=Path, dir_okay=False))
+@click.option("--vcf", required=True,
+              help="Path to the indexed vcf",
+              type=click.Path(path_type=Path, dir_okay=False))
+@click.option("--samples", required=False,
+              help="Comma-separated list of samples from the vcf to use. By default all will be used.")
+@click.option("--output-dir", required=False,
+              type=click.Path(path_type=Path, dir_okay=True))
+@click.option("--io-mode",
+              type=click.Choice(["single_txt", "jsonl", "multiple_txt"]),
+              default="jsonl")
+def cli(reference, vcf, samples, output_dir, io_mode):
+    if output_dir:
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    read_fasta_and_vcf(reference, vcf, samples, output_dir=output_dir, io_mode=io_mode)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/src/gena_lm/genome_tools/create_corpus.py b/src/gena_lm/genome_tools/create_corpus.py
old mode 100644
new mode 100755
index 2432272..2db4a09
--- a/src/gena_lm/genome_tools/create_corpus.py
+++ b/src/gena_lm/genome_tools/create_corpus.py
@@ -1,4 +1,4 @@
-# import numpy as np
+#!/usr/bin/env python
 import ast
 import gzip
 import json
@@ -149,15 +149,15 @@ def read_single_fasta(fna_file: Path, output_dir: Optional[Path] = None,
     handle_chromosome(record, outdir=output_dir, io_mode=io_mode)
 
 # example usage:
-# python create_corpus.py --input_file ./ncbi_dataset/data/GCA_009914755.4/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna \
-#                         --output_dir data/processed/human/ --io_mode jsonl --min_len 10000
+# python create_corpus.py --input-file ./ncbi_dataset/data/GCA_009914755.4/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna \
+#                         --output-dir data/processed/human/ --io-mode jsonl --min-len 10000
 
 @click.command()
-@click.option("--input_file", type=str)
-@click.option("--contigs_split_file", type=str)
-@click.option("--output_dir", type=click.Path(path_type=Path, dir_okay=True))
-@click.option("--io_mode", type=click.Choice(["single_txt", "jsonl", "multiple_txt"]), default="single_txt")
-@click.option("--min_len", type=click.INT, default=10000, help="Minimum contig length to be included")
+@click.option("--input-file", type=str)
+@click.option("--contigs-split-file", type=str)
+@click.option("--output-dir", type=click.Path(path_type=Path, dir_okay=True))
+@click.option("--io-mode", type=click.Choice(["single_txt", "jsonl", "multiple_txt"]), default="single_txt")
+@click.option("--min-len", type=click.INT, default=10000, help="Minimum contig length to be included")
 #@click.option("--rc", is_flag=True, show_default=True, default=False, help="Reverse-complement all sequences")
 
 def cli(input_file, contigs_split_file, output_dir, io_mode, min_len):
@@ -171,4 +171,4 @@ def cli(input_file, contigs_split_file, output_dir, io_mode, min_len):
 
 
 if __name__ == "__main__":
-    cli()
\ No newline at end of file
+    cli()
diff --git a/src/gena_lm/genome_tools/create_multigenome_corpus.sh b/src/gena_lm/genome_tools/create_multigenome_corpus.sh
old mode 100644
new mode 100755
index 5306c93..c7dcf81
--- a/src/gena_lm/genome_tools/create_multigenome_corpus.sh
+++ b/src/gena_lm/genome_tools/create_multigenome_corpus.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 # create_multigenome_corpus.sh metadata_table fasta_dir contigs_dir out_folder
 
 set -e
@@ -21,5 +22,5 @@
 do
     contigs_split_file=$contigs_dir"/"${spname::-1}".breaks.csv"
     echo "Processing $spname"
-    python create_corpus.py --input_file $fastafile --contigs_split_file $contigs_split_file --output_dir $spoutdir
-done
\ No newline at end of file
+    python create_corpus.py --input-file $fastafile --contigs-split-file $contigs_split_file --output-dir $spoutdir
+done