Skip to content

Commit

Permalink
feat: add scripts for species data, APARENT task, RMT on spliceai and…
Browse files Browse the repository at this point in the history
… enformer tasks, clinvar analysis, splicing onemut and updates
  • Loading branch information
Kuratov committed Sep 3, 2024
1 parent c243205 commit d338b2e
Show file tree
Hide file tree
Showing 67 changed files with 1,097,039 additions and 16 deletions.
19 changes: 19 additions & 0 deletions data/athaliana/ReadMe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
0. Install [bedtools](https://anaconda.org/bioconda/bedtools)>2.30, [samtools](https://anaconda.org/bioconda/samtools), and [NCBI datasets](https://www.ncbi.nlm.nih.gov/datasets/).

1. Download and process data (run from dnalm/data/athaliana directory):
```bash
bash download_and_process_athaliana_data.sh
```

This will create fasta file `athaliana_fasta_for_corpus_generation.fa.gz`, which can be further used as an input for [create_corpus.py](../../src/gena_lm/genome_tools/create_corpus.py):


```bash
python3 src/gena_lm/genome_tools/create_corpus.py --input-file data/athaliana/athaliana_fasta_for_corpus_generation.fa.gz --output-dir data/athaliana/athaliana_corpus/ --io-mode jsonl --min-len 10000 --n_augmentations 6
```

Data amount w/o augmentations, nucleotides:

train 4,147,986,974

test 346,219,192
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
NC_003070.9 4674472 6177301
NC_003070.9 20056289 22301256
NC_003071.7 4023736 8011438
NC_003074.8 6008006 9455455
34 changes: 34 additions & 0 deletions data/athaliana/athaliana_holdouts/valid_species_stats.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
Genome test_size_before_merge test_after_merge n_test_intervals file_name
GCF_000001735_4 11182947 11182947 4 athaliana_holdouts/GCF_000001735_4.testholdout.bed
44_ket_10 9544805 10439654 26 athaliana_holdouts/44_ket_10.testholdout.merge10000.bed
43_elk_1 9679502 10484806 33 athaliana_holdouts/43_elk_1.testholdout.merge10000.bed
19_kz_9 9593433 10164732 27 athaliana_holdouts/19_kz_9.testholdout.merge10000.bed
23_sij_1 9680342 10359051 23 athaliana_holdouts/23_sij_1.testholdout.merge10000.bed
42_arb_0 9823637 10476292 28 athaliana_holdouts/42_arb_0.testholdout.merge10000.bed
38_dra_2 10059012 10666455 23 athaliana_holdouts/38_dra_2.testholdout.merge10000.bed
13_got_22 9915467 10469778 24 athaliana_holdouts/13_got_22.testholdout.merge10000.bed
37_pu_2_23 10121649 10678008 25 athaliana_holdouts/37_pu_2_23.testholdout.merge10000.bed
03_yilong 9804995 10507930 29 athaliana_holdouts/03_yilong.testholdout.merge10000.bed
24_hs_0 9863507 10503293 28 athaliana_holdouts/24_hs_0.testholdout.merge10000.bed
26_nz_1 10128423 10739190 22 athaliana_holdouts/26_nz_1.testholdout.merge10000.bed
12_li_of_095 9823153 10436382 29 athaliana_holdouts/12_li_of_095.testholdout.merge10000.bed
08_kondara 9595746 10245318 26 athaliana_holdouts/08_kondara.testholdout.merge10000.bed
21_ms_0 9855266 10503110 29 athaliana_holdouts/21_ms_0.testholdout.merge10000.bed
45_meh_0 9442887 10166947 26 athaliana_holdouts/45_meh_0.testholdout.merge10000.bed
20_ll_0 9894290 10443986 26 athaliana_holdouts/20_ll_0.testholdout.merge10000.bed
14_st_0 10059925 10635733 24 athaliana_holdouts/14_st_0.testholdout.merge10000.bed
02_tibet 9599299 10314513 29 athaliana_holdouts/02_tibet.testholdout.merge10000.bed
05_cdm_0 9813564 10518786 32 athaliana_holdouts/05_cdm_0.testholdout.merge10000.bed
15_kelsterbach_2 10041812 10546723 29 athaliana_holdouts/15_kelsterbach_2.testholdout.merge10000.bed
04_bor_1 9885046 10504624 26 athaliana_holdouts/04_bor_1.testholdout.merge10000.bed
39_ah_7 9646330 10462932 31 athaliana_holdouts/39_ah_7.testholdout.merge10000.bed
25_per_1 9467201 10106497 29 athaliana_holdouts/25_per_1.testholdout.merge10000.bed
27_belmonte_4_94 10077231 10613923 26 athaliana_holdouts/27_belmonte_4_94.testholdout.merge10000.bed
29_sij_2 9680345 10358921 23 athaliana_holdouts/29_sij_2.testholdout.merge10000.bed
30_tu_sb30_3 9988258 10641802 28 athaliana_holdouts/30_tu_sb30_3.testholdout.merge10000.bed
31_mammo_1 9718680 10279294 28 athaliana_holdouts/31_mammo_1.testholdout.merge10000.bed
01_col 11152172 11175087 5 athaliana_holdouts/01_col.testholdout.merge10000.bed
33_sha 9708039 10351950 27 athaliana_holdouts/33_sha.testholdout.merge10000.bed
36_pra_6 9954303 10542889 25 athaliana_holdouts/36_pra_6.testholdout.merge10000.bed
40_etna_2 9640936 10366946 32 athaliana_holdouts/40_etna_2.testholdout.merge10000.bed
41_sorbo 9913451 10598563 37 athaliana_holdouts/41_sorbo.testholdout.merge10000.bed
152 changes: 152 additions & 0 deletions data/athaliana/download_and_process_athaliana_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
export sp="athaliana"
export reference="GCF_000001735_4"
export d=10000
export hal=${sp}_hals/${sp}.full.hal
export fasta=${sp}_fasta
export splits=${sp}_traintest_splits
export fasta4corpus=${sp}_fasta_for_corpus_generation

#download data from https://doi.org/10.1038/s41467-023-42029-4

mkdir $fasta

wget 'https://figshare.com/ndownloader/files/41661786' -O $fasta/41661786.tar.gz
tar -xvf $fasta/41661786.tar.gz -C $fasta/
mv $fasta/ath_genome_and_annotation/genomes/* $fasta/
rm -r $fasta/ath_genome_and_annotation/
for i in $fasta/*.fasta; \
do \
filename=$(basename "$i"); \
filename="${filename%.*}"; \
filename=$(echo $filename | sed 's/\./_/g'); \
mv $i $fasta/${filename}.fa; \
done;
rm $fasta/41661786.tar.gz

# download reference assembly
datasets download genome accession GCF_000001735.4 --include genome --assembly-level chromosome --filename $fasta/$reference.zip
unzip $fasta/$reference.zip -d $fasta
mv $fasta/ncbi_dataset/data/GCF_000001735.4/GCF_000001735.4_TAIR10.1_genomic.fna \
$fasta/GCF_000001735_4.fa
rm $fasta/$reference.zip
rm $fasta/README.md
rm -r $fasta/ncbi_dataset/

# create cactus input file

rm cactus_minigraph_samples.tsv
for i in $fasta/*.fa; \
do \
filename=$(basename "$i"); \
filename="${filename%.*}"; \
sample=$(echo $filename | sed 's/\./_/g'); \
echo "$sample $i" >> cactus_minigraph_samples.tsv; \
done;

# get cactus and hal tools
docker pull quay.io/comparative-genomics-toolkit/cactus:v2.7.0

# create alignment using cactus (takes ~12h for arabidopsis)
docker run -v $(pwd):/data --rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \
cactus-pangenome ./tmp cactus_minigraph_samples.tsv \
--outDir ${sp}_hals/ --outName ${sp} --reference $reference

# # validate hal - note that it may take several hours
# run -v $(pwd):/data --rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 halValidate $hal

# get genome names
genomes=$(docker run -v $(pwd):/data \
--rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \
halStats $hal --genomes); echo $genomes | tr ' ' '\n' | grep -v "Anc" | grep -v "_MINIGRAPH_" > $sp.genomes.txt
echo "Found "$(wc -l $sp.genomes.txt)" genomes in file $sp.genomes.txt"

# generate test holdouts for each species
# this is done by liftovering of the manually selected reference test holdout set

echo "Genome test_size_before_merge test_after_merge n_test_intervals file_name" > ${sp}_holdouts/stats.txt; \
line="$reference"; \
before=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' ${sp}_holdouts/$line.testholdout.bed); \
after=$before; \
n_intervals=$(wc -l ${sp}_holdouts/$line.testholdout.bed); \
echo "$line $before $after $n_intervals" >> ${sp}_holdouts/stats.txt; \
cat ${sp}.genomes.txt | while read -r line; \
do \
if [[ "$line" == "$reference" ]]; \
then \
continue; \
fi; \
echo $(date -u) $line; \
docker run -v $(pwd):/data --rm \
quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \
halLiftover $hal \
$reference ${sp}_holdouts/$reference.testholdout.bed \
$line ${sp}_holdouts/$line.testholdout.bed; \
bedtools sort -i ${sp}_holdouts/$line.testholdout.bed > ${sp}_holdouts/$line.testholdout.sorted.bed; \
bedtools merge -d $d -i ${sp}_holdouts/$line.testholdout.sorted.bed > \
${sp}_holdouts/$line.testholdout.merge${d}.bed; \
before=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' ${sp}_holdouts/$line.testholdout.bed); \
after=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' ${sp}_holdouts/$line.testholdout.merge${d}.bed); \
n_intervals=$(wc -l ${sp}_holdouts/$line.testholdout.merge${d}.bed); \
echo "$line $before $after $n_intervals" >> ${sp}_holdouts/stats.txt; \
rm ${sp}_holdouts/$line.testholdout.bed ${sp}_holdouts/$line.testholdout.sorted.bed; \
done;

# get train and test fasta files
mkdir $splits

cat ${sp}_holdouts/valid_species_stats.tsv | while read -r line; \
do \
genome=$(echo $line | cut -d ' ' -f1); \
if [[ $genome == "Genome" ]]; then continue; fi; \
echo $(date -u) $genome; \
if [[ $genome == "$reference" ]]; \
then \
suffix=""; \
else \
suffix="merge${d}."; \
fi; \
samtools faidx $fasta/$genome.fa; \
sort -k1,1 $fasta/$genome.fa.fai > $fasta/$genome.fa.sorted.fai; \
sort -k1,1 -k2,2n ${sp}_holdouts/$genome.testholdout.${suffix}bed > \
${sp}_holdouts/$genome.testholdout.sorted.${suffix}bed; \
bedtools complement \
-i ${sp}_holdouts/$genome.testholdout.sorted.${suffix}bed \
-g $fasta/$genome.fa.sorted.fai > \
$splits/$genome.train.${suffix}bed; \
awk -v genome="$genome" \
'{print $1 "\t" $2 "\t" $3 "\t" genome "_" $1 "_" $2 "_" $3 "_test"}' \
${sp}_holdouts/$genome.testholdout.${suffix}bed > \
$splits/$genome.traintestsplit.${suffix}bed ;\
awk -v genome="$genome" \
'{print $1 "\t" $2 "\t" $3 "\t" genome "_" $1 "_" $2 "_" $3 "_train"}' \
$splits/$genome.train.${suffix}bed >> \
$splits/$genome.traintestsplit.${suffix}bed ;\
done;

mkdir $fasta4corpus
cat ${sp}_holdouts/valid_species_stats.tsv | while read -r line; \
do \
genome=$(echo $line | cut -d ' ' -f1); \
if [[ $genome == "Genome" ]]; then continue; fi; \
echo $(date -u) $genome; \
if [[ $genome == "$reference" ]]; \
then \
suffix=""; \
else \
suffix="merge${d}."; \
fi; \
bedtools getfasta \
-fi $fasta/$genome.fa \
-bed $splits/$genome.traintestsplit.${suffix}bed \
-nameOnly \
> $fasta4corpus/$genome.splits.fa
done;

# concatenate into final multifasta file
cat $fasta4corpus/*.fa | gzip > ${sp}_fasta_for_corpus_generation.fa.gz

# cleanup
rm -r $fasta4corpus
# # rm -r ${sp}_hals
# # rm -r $fasta
# # rm -r $splits
18 changes: 18 additions & 0 deletions data/fly/ReadMe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
0. Install [bedtools](https://anaconda.org/bioconda/bedtools)>2.30 and [samtools](https://anaconda.org/bioconda/samtools)

1. Download and process data (run from dnalm/data/fly directory):
```bash
bash download_and_process_fly_data.sh
```

This will create fasta file `fly_fasta_for_corpus_generation.fa.gz`, which can be further used as an input for [create_corpus.py](../../src/gena_lm/genome_tools/create_corpus.py). Note that we used `--n_augmentations 2` when generating corpus for _Drosophila_ data:

```
python3 src/gena_lm/genome_tools/create_corpus.py --input-file data/fly/fly_fasta_for_corpus_generation.fa.gz --output-dir data/fly/fly_corpus/ --io-mode jsonl --min-len 10000 --n_augmentations 2
```

Data amount w/o augmentations, nucleotides:

train 38,208,813,984

test 2,519,835,748
129 changes: 129 additions & 0 deletions data/fly/download_and_process_fly_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
set -e

# download Progressive Cactus alignment of 298 drosophilid species
# preprint: https://www.biorxiv.org/content/10.1101/2023.10.02.560517v1
# dataset source: https://doi.org/10.5061/dryad.x0k6djhrd

wget https://datadryad.org/stash/downloads/file_stream/2752661 -O hals/drosophila.hal.00
wget https://datadryad.org/stash/downloads/file_stream/2752662 -O hals/drosophila.hal.01
wget https://datadryad.org/stash/downloads/file_stream/2752663 -O hals/drosophila.hal.02
wget https://datadryad.org/stash/downloads/file_stream/2752664 -O hals/drosophila.hal.03
wget https://datadryad.org/stash/downloads/file_stream/2752665 -O hals/drosophila.hal.04

cat hals/drosophila.hal.* > hals/drosophila.hal
rm hals/drosophila.hal.0*

# get hal tools
docker pull quay.io/comparative-genomics-toolkit/cactus:v2.7.0

# validate hals - note that it may take several hours
# docker run -v $(pwd):/data --rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 halValidate hals/drosophila.hal

# get genome names
genomes=$(docker run -v $(pwd):/data --rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 halStats hals/drosophila.hal --genomes); echo $genomes | tr ' ' '\n' | grep -v "Anc" > fly.genomes.txt

# generate test holdouts for each species
# this is done by liftovering of the manually selected D_MELANOGASTER test holdout

echo "Genome test_size_before_merge test_after_merge n_test_intervals file_name" > fly_holdouts/stats.txt; \
line="D_MELANOGASTER"; \
before=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' fly_holdouts/$line.testholdout.bed); \
after=$before; \
n_intervals=$(wc -l fly_holdouts/$line.testholdout.bed); \
echo "$line $before $after $n_intervals" >> fly_holdouts/stats.txt; \
d=10000; cat fly.genomes.txt | while read -r line; \
do \
if [[ "$line" == "D_MELANOGASTER" ]]; \
then \
continue; \
fi; \
echo $(date -u) $line; \
docker run -v $(pwd):/data --rm \
quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \
halLiftover hals/drosophila.hal \
D_MELANOGASTER fly_holdouts/D_MELANOGASTER.testholdout.bed \
$line fly_holdouts/$line.testholdout.bed; \
bedtools sort -i fly_holdouts/$line.testholdout.bed > fly_holdouts/$line.testholdout.sorted.bed; \
bedtools merge -d $d -i fly_holdouts/$line.testholdout.sorted.bed > \
fly_holdouts/$line.testholdout.merge${d}.bed; \
before=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' fly_holdouts/$line.testholdout.bed); \
after=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' fly_holdouts/$line.testholdout.merge${d}.bed); \
n_intervals=$(wc -l fly_holdouts/$line.testholdout.merge${d}.bed); \
echo "$line $before $after $n_intervals" >> fly_holdouts/stats.txt; \
rm fly_holdouts/$line.testholdout.bed fly_holdouts/$line.testholdout.sorted.bed; \
done;

# get genomic sequences for selected species
mkdir fasta

cat fly_holdouts/valid_species_stats.tsv | while read -r line; \
do \
genome=$(echo $line | cut -d ' ' -f1); \
if [[ $genome == 'Genome' ]]; then continue; fi; \
echo $(date -u) $genome; \
docker run -v $(pwd):/data --rm \
quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \
hal2fasta hals/drosophila.hal $genome --outFaPath fasta/$genome.fa; \
done;

# get train and test fasta files
mkdir fly_traintest_splits

d=10000; cat fly_holdouts/valid_species_stats.tsv | while read -r line; \
do \
genome=$(echo $line | cut -d ' ' -f1); \
if [[ $genome == "Genome" ]]; then continue; fi; \
echo $(date -u) $genome; \
if [[ $genome == "D_MELANOGASTER" ]]; \
then \
suffix=""; \
else \
suffix="merge${d}."; \
fi; \
samtools faidx fasta/$genome.fa; \
sort -k1,1 fasta/$genome.fa.fai > fasta/$genome.fa.sorted.fai; \
sort -k1,1 -k2,2n fly_holdouts/$genome.testholdout.${suffix}bed > \
fly_holdouts/$genome.testholdout.sorted.${suffix}bed; \
bedtools complement \
-i fly_holdouts/$genome.testholdout.sorted.${suffix}bed \
-g fasta/$genome.fa.sorted.fai > \
fly_traintest_splits/$genome.train.${suffix}bed; \
awk -v genome="$genome" \
'{print $1 "\t" $2 "\t" $3 "\t" genome "_" $1 "_" $2 "_" $3 "_test"}' \
fly_holdouts/$genome.testholdout.${suffix}bed > \
fly_traintest_splits/$genome.traintestsplit.${suffix}bed ;\
awk -v genome="$genome" \
'{print $1 "\t" $2 "\t" $3 "\t" genome "_" $1 "_" $2 "_" $3 "_train"}' \
fly_traintest_splits/$genome.train.${suffix}bed >> \
fly_traintest_splits/$genome.traintestsplit.${suffix}bed ;\
done;


mkdir fly_fasta_for_corpus_generation

d=10000; cat fly_holdouts/valid_species_stats.tsv | while read -r line; \
do \
genome=$(echo $line | cut -d ' ' -f1); \
if [[ $genome == "Genome" ]]; then continue; fi; \
echo $(date -u) $genome; \
if [[ $genome == "D_MELANOGASTER" ]]; \
then \
suffix=""; \
else \
suffix="merge${d}."; \
fi; \
bedtools getfasta \
-fi fasta/$genome.fa \
-bed fly_traintest_splits/$genome.traintestsplit.${suffix}bed \
-nameOnly \
> fly_fasta_for_corpus_generation/$genome.splits.fa
done;

# concatenate into final multifasta file
cat fly_fasta_for_corpus_generation/*.fa | gzip > fly_fasta_for_corpus_generation.fa.gz

# cleanup
rm -r fly_fasta_for_corpus_generation
# rm -r hals
# rm -r fasta
# rm -r fly_traintest_splits
Loading

0 comments on commit d338b2e

Please sign in to comment.