Commit: Variant-aware corpus generation
Veniamin Fishman authored and yurakuratov committed Jul 25, 2023
1 parent 203841e commit 0be5d93
Showing 7 changed files with 448 additions and 19 deletions.
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -1,3 +1,6 @@
[build-system]
requires = ["setuptools>=42"]
build-backend = "setuptools.build_meta"

[tool.mypy]
ignore_missing_imports = true
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,9 +1,9 @@
numpy
cyvcf2
tokenizers
tqdm
torch">=1.7.1,<=1.9.1"
transformers==4.17.0
pysam>=0.19.1
biopython>=1.79
pandas>=1.5.0
38 changes: 38 additions & 0 deletions src/gena_lm/genome_tools/1KG+HGDP.selected_samples.tsv
@@ -0,0 +1,38 @@
NA12877 ceph M
NA12878 ceph F
HGDP00815 chinese M
HG00513 chinese F
HG00190 finnish M
HG00171 finnish F
HG00138 british M
HG00253 british F
HGDP00888 russian M
HGDP00898 russian F
HGDP00950 yakut M
HGDP00959 yakut F
HGDP01404 adygei M
HGDP01399 adygei F
HGDP01050 pima M
HGDP01041 pima F
HGDP01081 mbuti M
HGDP00471 mbuti F
HG01953 peruvian M
HG01573 peruvian F
HGDP00727 palestinian M
HGDP00699 palestinian F
HGDP00551 papuan M
HGDP00550 papuan F
NA21111 gujarati M
NA20896 gujarati F
HGDP00258 pathan M
HGDP00237 pathan F
HGDP00929 yoruba M
NA19129 yoruba F
HGDP00845 surui M
HGDP00838 surui F
HGDP01167 tuscan M
HGDP01169 tuscan F
HGDP01300 uygur M
HGDP01305 uygur F
HG03081 mende M
HG03055 mende F
63 changes: 58 additions & 5 deletions src/gena_lm/genome_tools/README.md
@@ -1,8 +1,61 @@
A single genome reference file can be processed with `create_corpus.py`.

## Multi-species corpus

To download and process multiple genome datasets, follow these steps:

1. Download fasta files:
```shell
cut -f2 ensemble_genomes_metadata.tsv | xargs -L 1 wget -P fasta --no-clobber
```

2. Download and extract processed agp information into `contigs/` folder

3. Run:
```shell
bash create_multigenome_corpus.sh \
/path/to/full_ensembl_genomes_metadata \
/path/to/directory/with/fasta/ \
/path/to/directory/with/contigs/ \
/path/to/output/folder
```
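Before running the corpus script, it can help to confirm that every fasta listed in the metadata actually arrived. A minimal sketch, assuming the download URL sits in column 2 of the TSV; the file names below are stand-ins, not real `ensemble_genomes_metadata.tsv` entries:

```shell
# Hypothetical post-download check: every URL in column 2 of the metadata
# TSV should have a matching file on disk. All names here are stand-ins.
mkdir -p demo_fasta
printf 'genome_a\thttp://example.org/a.fa.gz\ngenome_b\thttp://example.org/b.fa.gz\n' > demo_metadata.tsv
touch demo_fasta/a.fa.gz            # pretend only the first download succeeded
missing=0
for url in $(cut -f2 demo_metadata.tsv); do
  [ -e "demo_fasta/$(basename "$url")" ] || missing=$((missing+1))
done
echo "$missing file(s) missing"     # 1 file(s) missing
```

For the real corpus, point the loop at `ensemble_genomes_metadata.tsv` and `fasta/`; `wget --no-clobber` will then fetch only the files reported missing.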

## Variation corpus

1. Download [1000 Genomes + HGDP whole genome callset from gnomAD](https://gnomad.broadinstitute.org/downloads#v3-hgdp-1kg):
```shell
GOO=https://storage.googleapis.com/gcp-public-data--gnomad/
AWS=https://gnomad-public-us-east-1.s3.amazonaws.com/
AZU=https://datasetgnomad.blob.core.windows.net/dataset/
# choose whichever cloud
LINK=$GOO
LINK=${LINK}release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.hgdp_tgp

mkdir 1KG+HGDP
for CHR in chr{1..22} chr{X,Y} ; do
wget -P 1KG+HGDP -c $LINK.$CHR.vcf.{bgz,bgz.tbi}
done
```
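Interrupted transfers of these multi-gigabyte VCFs are common, so it is worth checking the downloads before going further. A minimal sketch: bgzip output is a valid gzip stream, so a truncated or garbled file can often be caught by checking the two-byte gzip magic `1f 8b` (`demo.vcf.bgz` below is a stand-in for a real gnomAD download):

```shell
# Quick integrity check (sketch): a bgzip-compressed VCF is a valid gzip
# stream, so the first two bytes must be the gzip magic 1f 8b.
# demo.vcf.bgz stands in for a real downloaded file.
printf 'hello' | gzip -c > demo.vcf.bgz
magic=$(head -c 2 demo.vcf.bgz | od -An -tx1 | tr -d ' \n')
if [ "$magic" = "1f8b" ]; then echo "gzip magic OK"; else echo "corrupt or truncated"; fi
```

A stronger check is `gzip -t` on each file, at the cost of decompressing it fully.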

2. Download hg38 reference:
```shell
mkdir hg38
wget -P hg38 https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/hg38.analysisSet.fa.gz
gunzip hg38/hg38.analysisSet.fa.gz
samtools faidx hg38/hg38.analysisSet.fa
```

3. Apply variants:
```shell
# select some samples from across the world
SAMPLES=$(cut -f1 1KG+HGDP.selected_samples.tsv | tr '\n' ,)

mkdir variation_corpus
for CHR in chr{1..22} chr{X,Y} ; do
python create_allele_corpus.py \
--reference hg38/hg38.analysisSet.fa \
--vcf 1KG+HGDP/gnomad.genomes.v3.1.2.hgdp_tgp.$CHR.vcf.bgz \
--samples ${SAMPLES%?} \
--output-dir variation_corpus
done
```
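The `--samples` argument above is built by flattening the first column of the TSV into one comma-separated string; `${SAMPLES%?}` then drops the trailing comma that `tr` leaves behind. A self-contained sketch (`demo_samples.txt` is a stand-in for the real TSV column):

```shell
# How the --samples argument is assembled (sketch): newline-separated IDs
# become one comma-separated string, and ${SAMPLES%?} strips the trailing
# comma left by tr. demo_samples.txt is a stand-in file.
printf 'NA12877\nNA12878\nHG00513\n' > demo_samples.txt
SAMPLES=$(tr '\n' , < demo_samples.txt)
echo "${SAMPLES%?}"   # NA12877,NA12878,HG00513
```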