-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add scripts for species data, APARENT task, RMT on spliceai and…
… enformer tasks, clinvar analysis, splicing onemut and updates
- Loading branch information
Kuratov
committed
Sep 3, 2024
1 parent
c243205
commit d338b2e
Showing
67 changed files
with
1,097,039 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
0. Install [bedtools](https://anaconda.org/bioconda/bedtools)>2.30, [samtools](https://anaconda.org/bioconda/samtools), and [NCBI datasets](https://www.ncbi.nlm.nih.gov/datasets/). | ||
|
||
1. Download and process data (run from dnalm/data/athaliana directory): | ||
```bash | ||
bash download_and_process_athaliana_data.sh | ||
``` | ||
|
||
This will create fasta file `athaliana_fasta_for_corpus_generation.fa.gz`, which can be further used as an input for [create_corpus.py](../../src/gena_lm/genome_tools/create_corpus.py): | ||
|
||
|
||
```bash | ||
python3 src/gena_lm/genome_tools/create_corpus.py --input-file data/athaliana/athaliana_fasta_for_corpus_generation.fa.gz --output-dir data/athaliana/athaliana_corpus/ --io-mode jsonl --min-len 10000 --n_augmentations 6 | ||
``` | ||
|
||
Data amount w/o augmentations, nucleotides: | ||
|
||
train 4,147,986,974 | ||
|
||
test 346,219,192 |
4 changes: 4 additions & 0 deletions
4
data/athaliana/athaliana_holdouts/GCF_000001735_4.testholdout.bed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
NC_003070.9 4674472 6177301 | ||
NC_003070.9 20056289 22301256 | ||
NC_003071.7 4023736 8011438 | ||
NC_003074.8 6008006 9455455 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
Genome test_size_before_merge test_after_merge n_test_intervals file_name | ||
GCF_000001735_4 11182947 11182947 4 athaliana_holdouts/GCF_000001735_4.testholdout.bed | ||
44_ket_10 9544805 10439654 26 athaliana_holdouts/44_ket_10.testholdout.merge10000.bed | ||
43_elk_1 9679502 10484806 33 athaliana_holdouts/43_elk_1.testholdout.merge10000.bed | ||
19_kz_9 9593433 10164732 27 athaliana_holdouts/19_kz_9.testholdout.merge10000.bed | ||
23_sij_1 9680342 10359051 23 athaliana_holdouts/23_sij_1.testholdout.merge10000.bed | ||
42_arb_0 9823637 10476292 28 athaliana_holdouts/42_arb_0.testholdout.merge10000.bed | ||
38_dra_2 10059012 10666455 23 athaliana_holdouts/38_dra_2.testholdout.merge10000.bed | ||
13_got_22 9915467 10469778 24 athaliana_holdouts/13_got_22.testholdout.merge10000.bed | ||
37_pu_2_23 10121649 10678008 25 athaliana_holdouts/37_pu_2_23.testholdout.merge10000.bed | ||
03_yilong 9804995 10507930 29 athaliana_holdouts/03_yilong.testholdout.merge10000.bed | ||
24_hs_0 9863507 10503293 28 athaliana_holdouts/24_hs_0.testholdout.merge10000.bed | ||
26_nz_1 10128423 10739190 22 athaliana_holdouts/26_nz_1.testholdout.merge10000.bed | ||
12_li_of_095 9823153 10436382 29 athaliana_holdouts/12_li_of_095.testholdout.merge10000.bed | ||
08_kondara 9595746 10245318 26 athaliana_holdouts/08_kondara.testholdout.merge10000.bed | ||
21_ms_0 9855266 10503110 29 athaliana_holdouts/21_ms_0.testholdout.merge10000.bed | ||
45_meh_0 9442887 10166947 26 athaliana_holdouts/45_meh_0.testholdout.merge10000.bed | ||
20_ll_0 9894290 10443986 26 athaliana_holdouts/20_ll_0.testholdout.merge10000.bed | ||
14_st_0 10059925 10635733 24 athaliana_holdouts/14_st_0.testholdout.merge10000.bed | ||
02_tibet 9599299 10314513 29 athaliana_holdouts/02_tibet.testholdout.merge10000.bed | ||
05_cdm_0 9813564 10518786 32 athaliana_holdouts/05_cdm_0.testholdout.merge10000.bed | ||
15_kelsterbach_2 10041812 10546723 29 athaliana_holdouts/15_kelsterbach_2.testholdout.merge10000.bed | ||
04_bor_1 9885046 10504624 26 athaliana_holdouts/04_bor_1.testholdout.merge10000.bed | ||
39_ah_7 9646330 10462932 31 athaliana_holdouts/39_ah_7.testholdout.merge10000.bed | ||
25_per_1 9467201 10106497 29 athaliana_holdouts/25_per_1.testholdout.merge10000.bed | ||
27_belmonte_4_94 10077231 10613923 26 athaliana_holdouts/27_belmonte_4_94.testholdout.merge10000.bed | ||
29_sij_2 9680345 10358921 23 athaliana_holdouts/29_sij_2.testholdout.merge10000.bed | ||
30_tu_sb30_3 9988258 10641802 28 athaliana_holdouts/30_tu_sb30_3.testholdout.merge10000.bed | ||
31_mammo_1 9718680 10279294 28 athaliana_holdouts/31_mammo_1.testholdout.merge10000.bed | ||
01_col 11152172 11175087 5 athaliana_holdouts/01_col.testholdout.merge10000.bed | ||
33_sha 9708039 10351950 27 athaliana_holdouts/33_sha.testholdout.merge10000.bed | ||
36_pra_6 9954303 10542889 25 athaliana_holdouts/36_pra_6.testholdout.merge10000.bed | ||
40_etna_2 9640936 10366946 32 athaliana_holdouts/40_etna_2.testholdout.merge10000.bed | ||
41_sorbo 9913451 10598563 37 athaliana_holdouts/41_sorbo.testholdout.merge10000.bed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
export sp="athaliana" | ||
export reference="GCF_000001735_4" | ||
export d=10000 | ||
export hal=${sp}_hals/${sp}.full.hal | ||
export fasta=${sp}_fasta | ||
export splits=${sp}_traintest_splits | ||
export fasta4corpus=${sp}_fasta_for_corpus_generation | ||
|
||
#download data from https://doi.org/10.1038/s41467-023-42029-4 | ||
|
||
mkdir $fasta | ||
|
||
wget 'https://figshare.com/ndownloader/files/41661786' -O $fasta/41661786.tar.gz | ||
tar -xvf $fasta/41661786.tar.gz -C $fasta/ | ||
mv $fasta/ath_genome_and_annotation/genomes/* $fasta/ | ||
rm -r $fasta/ath_genome_and_annotation/ | ||
for i in $fasta/*.fasta; \ | ||
do \ | ||
filename=$(basename "$i"); \ | ||
filename="${filename%.*}"; \ | ||
filename=$(echo $filename | sed 's/\./_/g'); \ | ||
mv $i $fasta/${filename}.fa; \ | ||
done; | ||
rm $fasta/41661786.tar.gz | ||
|
||
# download reference assembly | ||
datasets download genome accession GCF_000001735.4 --include genome --assembly-level chromosome --filename $fasta/$reference.zip | ||
unzip $fasta/$reference.zip -d $fasta | ||
mv $fasta/ncbi_dataset/data/GCF_000001735.4/GCF_000001735.4_TAIR10.1_genomic.fna \ | ||
$fasta/GCF_000001735_4.fa | ||
rm $fasta/$reference.zip | ||
rm $fasta/README.md | ||
rm -r $fasta/ncbi_dataset/ | ||
|
||
# create cactus input file | ||
|
||
rm cactus_minigraph_samples.tsv | ||
for i in $fasta/*.fa; \ | ||
do \ | ||
filename=$(basename "$i"); \ | ||
filename="${filename%.*}"; \ | ||
sample=$(echo $filename | sed 's/\./_/g'); \ | ||
echo "$sample $i" >> cactus_minigraph_samples.tsv; \ | ||
done; | ||
|
||
# get cactus and hal tools | ||
docker pull quay.io/comparative-genomics-toolkit/cactus:v2.7.0 | ||
|
||
# create alignment using cactus (takes ~12h for arabidopsis) | ||
docker run -v $(pwd):/data --rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \ | ||
cactus-pangenome ./tmp cactus_minigraph_samples.tsv \ | ||
--outDir ${sp}_hals/ --outName ${sp} --reference $reference | ||
|
||
# # validate hal - note that it may take several hours | ||
# run -v $(pwd):/data --rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 halValidate $hal | ||
|
||
# get genome names | ||
genomes=$(docker run -v $(pwd):/data \ | ||
--rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \ | ||
halStats $hal --genomes); echo $genomes | tr ' ' '\n' | grep -v "Anc" | grep -v "_MINIGRAPH_" > $sp.genomes.txt | ||
echo "Found "$(wc -l $sp.genomes.txt)" genomes in file $sp.genomes.txt" | ||
|
||
# generate test holdouts for each species | ||
# this is done by liftovering of the manually selected reference test holdout set | ||
|
||
echo "Genome test_size_before_merge test_after_merge n_test_intervals file_name" > ${sp}_holdouts/stats.txt; \ | ||
line="$reference"; \ | ||
before=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' ${sp}_holdouts/$line.testholdout.bed); \ | ||
after=$before; \ | ||
n_intervals=$(wc -l ${sp}_holdouts/$line.testholdout.bed); \ | ||
echo "$line $before $after $n_intervals" >> ${sp}_holdouts/stats.txt; \ | ||
cat ${sp}.genomes.txt | while read -r line; \ | ||
do \ | ||
if [[ "$line" == "$reference" ]]; \ | ||
then \ | ||
continue; \ | ||
fi; \ | ||
echo $(date -u) $line; \ | ||
docker run -v $(pwd):/data --rm \ | ||
quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \ | ||
halLiftover $hal \ | ||
$reference ${sp}_holdouts/$reference.testholdout.bed \ | ||
$line ${sp}_holdouts/$line.testholdout.bed; \ | ||
bedtools sort -i ${sp}_holdouts/$line.testholdout.bed > ${sp}_holdouts/$line.testholdout.sorted.bed; \ | ||
bedtools merge -d $d -i ${sp}_holdouts/$line.testholdout.sorted.bed > \ | ||
${sp}_holdouts/$line.testholdout.merge${d}.bed; \ | ||
before=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' ${sp}_holdouts/$line.testholdout.bed); \ | ||
after=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' ${sp}_holdouts/$line.testholdout.merge${d}.bed); \ | ||
n_intervals=$(wc -l ${sp}_holdouts/$line.testholdout.merge${d}.bed); \ | ||
echo "$line $before $after $n_intervals" >> ${sp}_holdouts/stats.txt; \ | ||
rm ${sp}_holdouts/$line.testholdout.bed ${sp}_holdouts/$line.testholdout.sorted.bed; \ | ||
done; | ||
|
||
# get train and test fasta files | ||
mkdir $splits | ||
|
||
cat ${sp}_holdouts/valid_species_stats.tsv | while read -r line; \ | ||
do \ | ||
genome=$(echo $line | cut -d ' ' -f1); \ | ||
if [[ $genome == "Genome" ]]; then continue; fi; \ | ||
echo $(date -u) $genome; \ | ||
if [[ $genome == "$reference" ]]; \ | ||
then \ | ||
suffix=""; \ | ||
else \ | ||
suffix="merge${d}."; \ | ||
fi; \ | ||
samtools faidx $fasta/$genome.fa; \ | ||
sort -k1,1 $fasta/$genome.fa.fai > $fasta/$genome.fa.sorted.fai; \ | ||
sort -k1,1 -k2,2n ${sp}_holdouts/$genome.testholdout.${suffix}bed > \ | ||
${sp}_holdouts/$genome.testholdout.sorted.${suffix}bed; \ | ||
bedtools complement \ | ||
-i ${sp}_holdouts/$genome.testholdout.sorted.${suffix}bed \ | ||
-g $fasta/$genome.fa.sorted.fai > \ | ||
$splits/$genome.train.${suffix}bed; \ | ||
awk -v genome="$genome" \ | ||
'{print $1 "\t" $2 "\t" $3 "\t" genome "_" $1 "_" $2 "_" $3 "_test"}' \ | ||
${sp}_holdouts/$genome.testholdout.${suffix}bed > \ | ||
$splits/$genome.traintestsplit.${suffix}bed ;\ | ||
awk -v genome="$genome" \ | ||
'{print $1 "\t" $2 "\t" $3 "\t" genome "_" $1 "_" $2 "_" $3 "_train"}' \ | ||
$splits/$genome.train.${suffix}bed >> \ | ||
$splits/$genome.traintestsplit.${suffix}bed ;\ | ||
done; | ||
|
||
mkdir $fasta4corpus | ||
cat ${sp}_holdouts/valid_species_stats.tsv | while read -r line; \ | ||
do \ | ||
genome=$(echo $line | cut -d ' ' -f1); \ | ||
if [[ $genome == "Genome" ]]; then continue; fi; \ | ||
echo $(date -u) $genome; \ | ||
if [[ $genome == "$reference" ]]; \ | ||
then \ | ||
suffix=""; \ | ||
else \ | ||
suffix="merge${d}."; \ | ||
fi; \ | ||
bedtools getfasta \ | ||
-fi $fasta/$genome.fa \ | ||
-bed $splits/$genome.traintestsplit.${suffix}bed \ | ||
-nameOnly \ | ||
> $fasta4corpus/$genome.splits.fa | ||
done; | ||
|
||
# concatenate into final multifasta file | ||
cat $fasta4corpus/*.fa | gzip > ${sp}_fasta_for_corpus_generation.fa.gz | ||
|
||
# cleanup | ||
rm -r $fasta4corpus | ||
# # rm -r ${sp}_hals | ||
# # rm -r $fasta | ||
# # rm -r $splits |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
0. Install [bedtools](https://anaconda.org/bioconda/bedtools)>2.30 and [samtools](https://anaconda.org/bioconda/samtools) | ||
|
||
1. Download and process data (run from dnalm/data/fly directory): | ||
```bash | ||
bash download_and_process_fly_data.sh | ||
``` | ||
|
||
This will create fasta file `fly_fasta_for_corpus_generation.fa.gz`, which can be further used as an input for [create_corpus.py](../../src/gena_lm/genome_tools/create_corpus.py). Note that we used `--n_augmentations 2` when generating corpus for _Drosophila_ data: | ||
|
||
``` | ||
python3 src/gena_lm/genome_tools/create_corpus.py --input-file data/fly/fly_fasta_for_corpus_generation.fa.gz --output-dir data/fly/fly_corpus/ --io-mode jsonl --min-len 10000 --n_augmentations 2 | ||
``` | ||
|
||
Data amount w/o augmentations, nucleotides: | ||
|
||
train 38,208,813,984 | ||
|
||
test 2,519,835,748 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
set -e | ||
|
||
# download Progressive Cactus alignment of 298 drosophilid species | ||
# preprint: https://www.biorxiv.org/content/10.1101/2023.10.02.560517v1 | ||
# dataset source: https://doi.org/10.5061/dryad.x0k6djhrd | ||
|
||
wget https://datadryad.org/stash/downloads/file_stream/2752661 -O hals/drosophila.hal.00 | ||
wget https://datadryad.org/stash/downloads/file_stream/2752662 -O hals/drosophila.hal.01 | ||
wget https://datadryad.org/stash/downloads/file_stream/2752663 -O hals/drosophila.hal.02 | ||
wget https://datadryad.org/stash/downloads/file_stream/2752664 -O hals/drosophila.hal.03 | ||
wget https://datadryad.org/stash/downloads/file_stream/2752665 -O hals/drosophila.hal.04 | ||
|
||
cat hals/drosophila.hal.* > hals/drosophila.hal | ||
rm hals/drosophila.hal.0* | ||
|
||
# get hal tools | ||
docker pull quay.io/comparative-genomics-toolkit/cactus:v2.7.0 | ||
|
||
# validate hals - note that it may take several hours | ||
# docker run -v $(pwd):/data --rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 halValidate hals/drosophila.hal | ||
|
||
# get genome names | ||
genomes=$(docker run -v $(pwd):/data --rm quay.io/comparative-genomics-toolkit/cactus:v2.7.0 halStats hals/drosophila.hal --genomes); echo $genomes | tr ' ' '\n' | grep -v "Anc" > fly.genomes.txt | ||
|
||
# generate test holdouts for each species | ||
# this is done by liftovering of the manually selected D_MELANOGASTER test holdout | ||
|
||
echo "Genome test_size_before_merge test_after_merge n_test_intervals file_name" > fly_holdouts/stats.txt; \ | ||
line="D_MELANOGASTER"; \ | ||
before=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' fly_holdouts/$line.testholdout.bed); \ | ||
after=$before; \ | ||
n_intervals=$(wc -l fly_holdouts/$line.testholdout.bed); \ | ||
echo "$line $before $after $n_intervals" >> fly_holdouts/stats.txt; \ | ||
d=10000; cat fly.genomes.txt | while read -r line; \ | ||
do \ | ||
if [[ "$line" == "D_MELANOGASTER" ]]; \ | ||
then \ | ||
continue; \ | ||
fi; \ | ||
echo $(date -u) $line; \ | ||
docker run -v $(pwd):/data --rm \ | ||
quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \ | ||
halLiftover hals/drosophila.hal \ | ||
D_MELANOGASTER fly_holdouts/D_MELANOGASTER.testholdout.bed \ | ||
$line fly_holdouts/$line.testholdout.bed; \ | ||
bedtools sort -i fly_holdouts/$line.testholdout.bed > fly_holdouts/$line.testholdout.sorted.bed; \ | ||
bedtools merge -d $d -i fly_holdouts/$line.testholdout.sorted.bed > \ | ||
fly_holdouts/$line.testholdout.merge${d}.bed; \ | ||
before=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' fly_holdouts/$line.testholdout.bed); \ | ||
after=$(awk 'BEGIN{s=0}; {s+=$3-$2}; END{print s}' fly_holdouts/$line.testholdout.merge${d}.bed); \ | ||
n_intervals=$(wc -l fly_holdouts/$line.testholdout.merge${d}.bed); \ | ||
echo "$line $before $after $n_intervals" >> fly_holdouts/stats.txt; \ | ||
rm fly_holdouts/$line.testholdout.bed fly_holdouts/$line.testholdout.sorted.bed; \ | ||
done; | ||
|
||
# get genomic sequences for selected species | ||
mkdir fasta | ||
|
||
cat fly_holdouts/valid_species_stats.tsv | while read -r line; \ | ||
do \ | ||
genome=$(echo $line | cut -d ' ' -f1); \ | ||
if [[ $genome == 'Genome' ]]; then continue; fi; \ | ||
echo $(date -u) $genome; \ | ||
docker run -v $(pwd):/data --rm \ | ||
quay.io/comparative-genomics-toolkit/cactus:v2.7.0 \ | ||
hal2fasta hals/drosophila.hal $genome --outFaPath fasta/$genome.fa; \ | ||
done; | ||
|
||
# get train and test fasta files | ||
mkdir fly_traintest_splits | ||
|
||
d=10000; cat fly_holdouts/valid_species_stats.tsv | while read -r line; \ | ||
do \ | ||
genome=$(echo $line | cut -d ' ' -f1); \ | ||
if [[ $genome == "Genome" ]]; then continue; fi; \ | ||
echo $(date -u) $genome; \ | ||
if [[ $genome == "D_MELANOGASTER" ]]; \ | ||
then \ | ||
suffix=""; \ | ||
else \ | ||
suffix="merge${d}."; \ | ||
fi; \ | ||
samtools faidx fasta/$genome.fa; \ | ||
sort -k1,1 fasta/$genome.fa.fai > fasta/$genome.fa.sorted.fai; \ | ||
sort -k1,1 -k2,2n fly_holdouts/$genome.testholdout.${suffix}bed > \ | ||
fly_holdouts/$genome.testholdout.sorted.${suffix}bed; \ | ||
bedtools complement \ | ||
-i fly_holdouts/$genome.testholdout.sorted.${suffix}bed \ | ||
-g fasta/$genome.fa.sorted.fai > \ | ||
fly_traintest_splits/$genome.train.${suffix}bed; \ | ||
awk -v genome="$genome" \ | ||
'{print $1 "\t" $2 "\t" $3 "\t" genome "_" $1 "_" $2 "_" $3 "_test"}' \ | ||
fly_holdouts/$genome.testholdout.${suffix}bed > \ | ||
fly_traintest_splits/$genome.traintestsplit.${suffix}bed ;\ | ||
awk -v genome="$genome" \ | ||
'{print $1 "\t" $2 "\t" $3 "\t" genome "_" $1 "_" $2 "_" $3 "_train"}' \ | ||
fly_traintest_splits/$genome.train.${suffix}bed >> \ | ||
fly_traintest_splits/$genome.traintestsplit.${suffix}bed ;\ | ||
done; | ||
|
||
|
||
mkdir fly_fasta_for_corpus_generation | ||
|
||
d=10000; cat fly_holdouts/valid_species_stats.tsv | while read -r line; \ | ||
do \ | ||
genome=$(echo $line | cut -d ' ' -f1); \ | ||
if [[ $genome == "Genome" ]]; then continue; fi; \ | ||
echo $(date -u) $genome; \ | ||
if [[ $genome == "D_MELANOGASTER" ]]; \ | ||
then \ | ||
suffix=""; \ | ||
else \ | ||
suffix="merge${d}."; \ | ||
fi; \ | ||
bedtools getfasta \ | ||
-fi fasta/$genome.fa \ | ||
-bed fly_traintest_splits/$genome.traintestsplit.${suffix}bed \ | ||
-nameOnly \ | ||
> fly_fasta_for_corpus_generation/$genome.splits.fa | ||
done; | ||
|
||
# concatenate into final multifasta file | ||
cat fly_fasta_for_corpus_generation/*.fa | gzip > fly_fasta_for_corpus_generation.fa.gz | ||
|
||
# cleanup | ||
rm -r fly_fasta_for_corpus_generation | ||
# rm -r hals | ||
# rm -r fasta | ||
# rm -r fly_traintest_splits |
Oops, something went wrong.