Skip to content

Commit

Permalink
upload scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
panushri25 committed Aug 27, 2024
1 parent e326b17 commit 188125c
Show file tree
Hide file tree
Showing 42 changed files with 1,763 additions and 2 deletions.
15 changes: 15 additions & 0 deletions afr_caqtls_enformer_gm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

#tail -n +2 /mnt/lab_data2/anusri/enformer/eu_caqtls/source2.tsv > /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/meta_data.tsv
#split -l 38833 /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/meta_data.tsv /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splits/split

# Score the "splitaa" chunk of the AFR caQTL variant table with the Enformer
# SNP-scoring script. The scorer is addressed by a relative path, so launch
# this from the directory that contains src/.
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splits/splitaa
genome=/mnt/lab_data2/anusri/chrombpnet/reference/hg38.genome.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splitaa/
# -p: succeed silently on re-runs and create any missing parent directories.
mkdir -p "$output_dirn"
gpu=0


# Expansions are quoted so a path containing whitespace stays one argument.
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0


15 changes: 15 additions & 0 deletions afr_caqtls_enformer_gm_1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

#tail -n +2 /mnt/lab_data2/anusri/enformer/eu_caqtls/source2.tsv > /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/meta_data.tsv
#split -l 38833 /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/meta_data.tsv /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splits/split

# Score the "splitab" chunk of the AFR caQTL variant table with the Enformer
# SNP-scoring script. The scorer is addressed by a relative path, so launch
# this from the directory that contains src/.
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splits/splitab
genome=/mnt/lab_data2/anusri/chrombpnet/reference/hg38.genome.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splitab/
# -p: succeed silently on re-runs and create any missing parent directories.
mkdir -p "$output_dirn"
gpu=1


# Expansions are quoted so a path containing whitespace stays one argument.
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0


15 changes: 15 additions & 0 deletions afr_caqtls_enformer_gm_2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

#tail -n +2 /mnt/lab_data2/anusri/enformer/eu_caqtls/source2.tsv > /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/meta_data.tsv
#split -l 38833 /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/meta_data.tsv /mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splits/split

# Score the "splitac" chunk of the AFR caQTL variant table with the Enformer
# SNP-scoring script. The scorer is addressed by a relative path, so launch
# this from the directory that contains src/.
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splits/splitac
genome=/mnt/lab_data2/anusri/chrombpnet/reference/hg38.genome.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window/splitac/
# -p: succeed silently on re-runs and create any missing parent directories.
mkdir -p "$output_dirn"
# The device is selected by a MIG instance identifier rather than an index.
gpu=MIG-166d7783-762d-5f61-b31c-549eb4e0fba0


# Expansions are quoted so a path containing whitespace stays one argument.
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0


14 changes: 14 additions & 0 deletions afr_caqtls_enformer_gm_3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

# Build the variant table for the "new filter" AFR caQTL set and score it with
# the Enformer SNP-scoring script. The scorer is addressed by a relative path,
# so launch this from the directory that contains src/.
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window_new_filter/
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/afr_caqtls_window_new_filter/meta_data.tsv
genome=/mnt/lab_data2/anusri/chrombpnet/reference/hg38.genome.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
gpu=MIG-166d7783-762d-5f61-b31c-549eb4e0fba0

# The table is written INTO the output directory, so the directory has to exist
# before the redirect below can succeed. -p also keeps re-runs quiet.
mkdir -p "$output_dirn" || exit 1

# tail -n +2 copies everything from the second line on (presumably dropping a
# header row -- confirm against the source table). Abort if the table cannot
# be written: the scorer has nothing to read otherwise.
tail -n +2 /mnt/lab_data2/anusri/enformer/eu_caqtls/source2_new_filter.tsv > "$dsqtl" || exit 1


CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0


56 changes: 56 additions & 0 deletions atlas_gm_fold_0retrain.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash

# ---- Run identity -----------------------------------------------------------
cell_line="GM12878"
data_type=DNASE_SE
# Today's date as MM.DD.YYYY; it becomes part of the output directory name.
date="$(date +'%m.%d.%Y')"
setting="${data_type}_${date}"
# NOTE(review): gm_fold_0retrain.sh derives the same cell_line/data_type/setting,
# so both scripts resolve to the same output_dir when run on the same day from
# the same directory -- confirm whether a distinguishing suffix is wanted here.
cur_file_name='gm_fold_0retrain_hep_transfer.sh'

### SIGNAL INPUT
fold="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/splits/fold_0.json"
# Bias model path points at a HEPG2 DNASE_PE run, not a GM12878 one.
bias_h5="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_PE/HEPG2/HEPG2_06.08.2022_bias_128_4_1234_0.8_fold_0/bias_model/bias.h5"

# ---- Reference data ---------------------------------------------------------
overlap_peak="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/ENCSR000EMT/preprocessing/downloads/peaks.bed.gz"
blacklist_region="/mnt/data/annotations/blacklist/GRch38/GRch38_unified_blacklist.bed.gz"
chrom_sizes="/mnt/data/annotations/by_release/hg38/hg38.chrom.sizes"
ref_fasta="/mnt/data/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"

# ---- Working directories (all rooted at the current directory) --------------
main_dir="${PWD}/results/chrombpnet/${data_type}/${cell_line}"
data_dir="${main_dir}/data"
output_dir="${main_dir}/${setting}"
neg_dir="${main_dir}/negatives_data/"
inputlen="2114"
# Value handed to CUDA_VISIBLE_DEVICES when training is launched.
gpu="0"


# Print the current time as YYYY-mm-dd_HH-MM-SS with no trailing newline.
timestamp() {
  # Command substitution drops date's trailing newline and printf '%s'
  # does not add one back.
  printf '%s' "$(date +"%Y-%m-%d_%H-%M-%S")"
}


## CREATE DIRS
# mkdir -p is used throughout so that a missing parent directory does not make
# directory creation fail and leave the training step without an output path.
if [[ -d "$main_dir" ]]; then
  echo "main director already exists"
else
  mkdir -p "$main_dir"
fi

if [[ -d "$output_dir" ]]; then
  echo "output director already exists"
else
  mkdir -p "$output_dir"
fi



### STEP 2 - TRAIN CHROMBPNET MODEL

# The chrombpnet_model directory doubles as the "already trained" marker:
# when it exists, training is skipped.
if [[ -d "$output_dir/chrombpnet_model" ]]; then
  echo "skipping chrombpnet model training - directory present "
else
  mkdir -p "$output_dir/chrombpnet_model"
  # Every argument is quoted so paths with whitespace stay single arguments.
  CUDA_VISIBLE_DEVICES="$gpu" bash step6_train_chrombpnet_model.sh \
    "$ref_fasta" "$data_dir/${cell_line}_unstranded.bw" "$overlap_peak" \
    "$neg_dir/negatives_with_summit.bed" "$fold" "$bias_h5" \
    "$output_dir/chrombpnet_model" "$data_type" || {
    status=$?
    # The marker directory was created before training started, so a failed
    # run would otherwise be silently skipped next time.
    echo "chrombpnet training failed (exit $status); remove $output_dir/chrombpnet_model before re-running" >&2
    exit "$status"
  }
fi
55 changes: 55 additions & 0 deletions atlas_imr90_fold_0retrain.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash

# ---- Run identity -----------------------------------------------------------
cell_line="IMR90"
data_type=DNASE_SE
# Today's date as MM.DD.YYYY; it becomes part of the output directory name.
date="$(date +'%m.%d.%Y')"
setting="${data_type}_${date}_fold_0_hep_bias_transfer"
cur_file_name='imr90_fold_0retrain.sh'

### SIGNAL INPUT
fold="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/splits/fold_0.json"
# Bias model path points at a HEPG2 DNASE_PE run, not an IMR90 one.
bias_h5="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_PE/HEPG2/HEPG2_06.08.2022_bias_128_4_1234_0.8_fold_0/bias_model/bias.h5"

# ---- Reference data (the last two are relative to the current directory) ----
overlap_peak="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/ENCSR477RTP/preprocessing/downloads/peaks.bed.gz"
chrom_sizes="reference/chrom.sizes"
ref_fasta="reference/hg38.genome.fa"

# ---- Working directories (all rooted at the current directory) --------------
main_dir="${PWD}/results/chrombpnet/${data_type}/${cell_line}"
neg_dir="${main_dir}/negatives_data/"
data_dir="${main_dir}/data"
output_dir="${main_dir}/${setting}"
inputlen="2114"
# Value handed to CUDA_VISIBLE_DEVICES when training is launched.
gpu="MIG-166d7783-762d-5f61-b31c-549eb4e0fba0"


# Print the current time as YYYY-mm-dd_HH-MM-SS with no trailing newline.
timestamp() {
  # Command substitution drops date's trailing newline and printf '%s'
  # does not add one back.
  printf '%s' "$(date +"%Y-%m-%d_%H-%M-%S")"
}


## CREATE DIRS
# mkdir -p is used throughout so that a missing parent directory does not make
# directory creation fail and leave the training step without an output path.
if [[ -d "$main_dir" ]]; then
  echo "main director already exists"
else
  mkdir -p "$main_dir"
fi

if [[ -d "$output_dir" ]]; then
  echo "output director already exists"
else
  mkdir -p "$output_dir"
fi



### STEP 2 - TRAIN CHROMBPNET MODEL

# The chrombpnet_model directory doubles as the "already trained" marker:
# when it exists, training is skipped.
if [[ -d "$output_dir/chrombpnet_model" ]]; then
  echo "skipping chrombpnet model training - directory present "
else
  mkdir -p "$output_dir/chrombpnet_model"
  # Every argument is quoted so paths with whitespace stay single arguments.
  CUDA_VISIBLE_DEVICES="$gpu" bash step6_train_chrombpnet_model.sh \
    "$ref_fasta" "$data_dir/${cell_line}_unstranded.bw" "$overlap_peak" \
    "$neg_dir/negatives_with_summit.bed" "$fold" "$bias_h5" \
    "$output_dir/chrombpnet_model" "$data_type" || {
    status=$?
    # The marker directory was created before training started, so a failed
    # run would otherwise be silently skipped next time.
    echo "chrombpnet training failed (exit $status); remove $output_dir/chrombpnet_model before re-running" >&2
    exit "$status"
  }
fi
9 changes: 9 additions & 0 deletions bqtls_enformer_gm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Score the PU1 bQTL variant table (pu1_sig.tsv) with the Enformer SNP-scoring
# script. The scorer is addressed by a relative path, so launch this from the
# directory that contains src/.
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/bqtls_lcl/pu1/pu1_sig.tsv
genome=/mnt/lab_data2/anusri/chrombpnet/reference/male.hg19.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/bqtls_lcl/pu1/enformer_preds_small_window/
# Create the output directory up front, as the sibling scoring scripts do;
# -p makes this a no-op when it already exists.
mkdir -p "$output_dirn"
gpu=MIG-166d7783-762d-5f61-b31c-549eb4e0fba0

# Expansions are quoted so a path containing whitespace stays one argument.
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0


28 changes: 28 additions & 0 deletions chr1wide_make_bigwigs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Usage: chr1wide_make_bigwigs.sh <chrombpnet_nb> <chrombpnet> <cellline> <gpu>
#
#   $1 chrombpnet_nb  value forwarded to the prediction script as -cmb
#   $2 chrombpnet     value forwarded to the prediction script as -cm
#   $3 cellline       names the output directory and the output file prefix
#   $4 gpu            value exported as CUDA_VISIBLE_DEVICES
#
# Stage 1 runs predict_to_bigwig_new.py with the two model arguments; stage 2
# runs make_only_bigwigs.py on the "<cellline>_w_bias_predictions.h5" and
# "<cellline>_wo_bias_predictions.h5" files under the output directory.
# Paths under results/ and src/ are relative, so run from the directory that
# contains them.
if [ "$#" -lt 4 ]; then
  echo "usage: $0 <chrombpnet_nb> <chrombpnet> <cellline> <gpu>" >&2
  exit 1
fi

chrombpnet_nb=$1
chrombpnet=$2
cellline=$3
gpu=$4

regions=results/chrombpnet/auprc_curves/narrowpeak_genomewide_chr1.bed
output_dir=results/chrombpnet/auprc_curves/$cellline
# -p: succeed silently on re-runs and create any missing parent directories.
mkdir -p "$output_dir"


chrom_sizes=$PWD/reference/chrom.sizes
ref_fasta=$PWD/reference/hg38.genome.fa
# Output prefix shared by both stages.
out_prefix=$output_dir/$cellline

# Each command is echoed first so the log records exactly what was launched.
echo "CUDA_VISIBLE_DEVICES=$gpu python src/evaluation/make_bigwigs/predict_to_bigwig_new.py -cm $chrombpnet -cmb $chrombpnet_nb --regions $regions -g $ref_fasta -c $chrom_sizes -o $out_prefix -t 1"
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/make_bigwigs/predict_to_bigwig_new.py -cm "$chrombpnet" -cmb "$chrombpnet_nb" --regions "$regions" \
  -g "$ref_fasta" -c "$chrom_sizes" -o "$out_prefix" -t 1


# From here on the two variables name prediction files instead of the models
# passed on the command line.
chrombpnet=results/chrombpnet/auprc_curves/$cellline/$cellline"_w_bias_predictions.h5"
chrombpnet_nb=results/chrombpnet/auprc_curves/$cellline/$cellline"_wo_bias_predictions.h5"

echo "CUDA_VISIBLE_DEVICES=$gpu python src/evaluation/make_bigwigs/make_only_bigwigs.py -cm $chrombpnet -cmb $chrombpnet_nb --regions $regions -g $ref_fasta -c $chrom_sizes -o $out_prefix -t 1"
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/make_bigwigs/make_only_bigwigs.py -cm "$chrombpnet" -cmb "$chrombpnet_nb" --regions "$regions" \
  -g "$ref_fasta" -c "$chrom_sizes" -o "$out_prefix" -t 1



9 changes: 9 additions & 0 deletions dsqtls_enformer_gm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Score the dsQTL variant table with the Enformer SNP-scoring script. The
# scorer is addressed by a relative path, so launch this from the directory
# that contains src/.
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/dsqtls_lcl/dsqtl_meta_data.tsv
genome=/mnt/lab_data2/anusri/chrombpnet/reference/male.hg19.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/dsqtls_lcl/enformer_preds_small_window/
# Create the output directory up front, as the sibling scoring scripts do;
# -p makes this a no-op when it already exists.
mkdir -p "$output_dirn"
gpu=MIG-166d7783-762d-5f61-b31c-549eb4e0fba0

# Expansions are quoted so a path containing whitespace stays one argument.
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0


10 changes: 10 additions & 0 deletions dsqtls_enformer_gm_1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Score the dsQTL variant table kept under enformer_preds_small_window_try1
# with the Enformer SNP-scoring script. The scorer is addressed by a relative
# path, so launch this from the directory that contains src/.
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/dsqtls_lcl/enformer_preds_small_window_try1/dsqtl_meta_data.tsv
genome=/mnt/lab_data2/anusri/chrombpnet/reference/male.hg19.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/dsqtls_lcl/enformer_preds_small_window_try1/
# The input table sits inside this same directory, so it normally exists
# already; -p turns that case into a silent no-op instead of an error.
mkdir -p "$output_dirn"
gpu=MIG-166d7783-762d-5f61-b31c-549eb4e0fba0

# Expansions are quoted so a path containing whitespace stays one argument.
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0


15 changes: 15 additions & 0 deletions eu_caqtls_enformer_gm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

#split -l 78366 /mnt/lab_data2/anusri/variant-scorer/src/output/caqtls_lcl_latest/enformer_preds_small_window/metad_data.tsv /mnt/lab_data2/anusri/variant-scorer/src/output/caqtls_lcl_latest/enformer_preds_small_window/split

# Score the "splitaa" chunk of the caqtls_lcl_latest variant table with the
# Enformer SNP-scoring script. The scorer is addressed by a relative path, so
# launch this from the directory that contains src/.
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/caqtls_lcl_latest/enformer_preds_small_window/splitaa
genome=/mnt/lab_data2/anusri/chrombpnet/reference/male.hg19.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
# The output directory is the chunk's path with a trailing slash; it can only
# be created if the chunk file itself has a different name on disk.
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/caqtls_lcl_latest/enformer_preds_small_window/splitaa/
# -p: succeed silently on re-runs and create any missing parent directories.
mkdir -p "$output_dirn"

gpu=MIG-40f43250-998e-586a-ac37-d6520e92590f

# Expansions are quoted so a path containing whitespace stays one argument.
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0



15 changes: 15 additions & 0 deletions eu_caqtls_enformer_gm_2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

#split -l 78366 /mnt/lab_data2/anusri/variant-scorer/src/output/caqtls_lcl_latest/enformer_preds_small_window/metad_data.tsv /mnt/lab_data2/anusri/variant-scorer/src/output/caqtls_lcl_latest/enformer_preds_small_window/split

# Score the "splitab" chunk of the caqtls_lcl_latest variant table with the
# Enformer SNP-scoring script. The scorer is addressed by a relative path, so
# launch this from the directory that contains src/.
dsqtl=/mnt/lab_data2/anusri/variant-scorer/src/output/caqtls_lcl_latest/enformer_preds_small_window/splitab
genome=/mnt/lab_data2/anusri/chrombpnet/reference/male.hg19.fa
#chrom_sizes=/mnt/data/annotations/by_release/hg19/hg19.chrom.sizes
# The output directory is the chunk's path with a trailing slash; it can only
# be created if the chunk file itself has a different name on disk.
output_dirn=/mnt/lab_data2/anusri/variant-scorer/src/output/caqtls_lcl_latest/enformer_preds_small_window/splitab/
# -p: succeed silently on re-runs and create any missing parent directories.
mkdir -p "$output_dirn"

gpu=2

# Expansions are quoted so a path containing whitespace stays one argument.
CUDA_VISIBLE_DEVICES="$gpu" python src/evaluation/variant_effect_prediction/snp_scoring_enformer_new_center.py \
  -i "$dsqtl" -g "$genome" -o "$output_dirn" -bs 1 --debug_mode_on 0



15 changes: 15 additions & 0 deletions get_bias_in_h5_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import tensorflow as tf
import pandas as pd

# Load each HEPG2 bias model listed in the checkpoint table and re-save it as
# a single "bias.h5" file under the dil_layer_expts/HEPG2 tree.
# The table has no header row, so columns are addressed by integer position.
data = pd.read_csv("logs/checkpoint/JAN_02_2023/model_dir_atac.csv", sep=',', header=None)
# Column 1 is matched against the cell-line name; keep only the HEPG2 rows.
data = data[data[1]=="HEPG2"].reset_index(drop=True)

print(data.head())

for i, r in data.iterrows():
    # Normalise the model directory (column 2) once and use the same value for
    # both paths. Previously only the output name was stripped, so a padded
    # field loaded from a different path than the one it was saved under, and
    # a trailing "/" produced an empty output directory name.
    model_dir = r[2].strip().rstrip("/")
    ppath = model_dir+"/bias_model/new_model_formats/bias/"
    print(ppath)
    modelf = tf.keras.models.load_model(ppath)
    # The output directory is named after the last component of model_dir.
    odir = "/oak/stanford/groups/akundaje/anusri/dil_layer_expts/HEPG2/"+model_dir.split("/")[-1]
    print(odir)
    modelf.save(odir+"/bias_model/bias.h5")
56 changes: 56 additions & 0 deletions gm_fold_0retrain.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash

# ---- Run identity -----------------------------------------------------------
cell_line="GM12878"
data_type=DNASE_SE
# Today's date as MM.DD.YYYY; it becomes part of the output directory name.
date="$(date +'%m.%d.%Y')"
setting="${data_type}_${date}"
cur_file_name='gm_fold_0retrain.sh'

### SIGNAL INPUT
fold="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/splits/fold_0.json"
bias_h5="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_SE/GM12878/nautilus_runs/GM12878_03.06.2022_bias_128_4_1234_0.8_fold_0/bias_model/bias.h5"

# ---- Reference data ---------------------------------------------------------
overlap_peak="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/ENCSR000EMT/preprocessing/downloads/peaks.bed.gz"
blacklist_region="/mnt/data/annotations/blacklist/GRch38/GRch38_unified_blacklist.bed.gz"
chrom_sizes="/mnt/data/annotations/by_release/hg38/hg38.chrom.sizes"
ref_fasta="/mnt/data/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"

# ---- Working directories (all rooted at the current directory) --------------
main_dir="${PWD}/results/chrombpnet/${data_type}/${cell_line}"
data_dir="${main_dir}/data"
output_dir="${main_dir}/${setting}"
neg_dir="${main_dir}/negatives_data/"
inputlen="2114"
# Value handed to CUDA_VISIBLE_DEVICES when training is launched.
gpu="0"


# Print the current time as YYYY-mm-dd_HH-MM-SS with no trailing newline.
timestamp() {
  # Command substitution drops date's trailing newline and printf '%s'
  # does not add one back.
  printf '%s' "$(date +"%Y-%m-%d_%H-%M-%S")"
}


## CREATE DIRS
# mkdir -p is used throughout so that a missing parent directory does not make
# directory creation fail and leave the training step without an output path.
if [[ -d "$main_dir" ]]; then
  echo "main director already exists"
else
  mkdir -p "$main_dir"
fi

if [[ -d "$output_dir" ]]; then
  echo "output director already exists"
else
  mkdir -p "$output_dir"
fi



### STEP 2 - TRAIN CHROMBPNET MODEL

# The chrombpnet_model directory doubles as the "already trained" marker:
# when it exists, training is skipped.
if [[ -d "$output_dir/chrombpnet_model" ]]; then
  echo "skipping chrombpnet model training - directory present "
else
  mkdir -p "$output_dir/chrombpnet_model"
  # Every argument is quoted so paths with whitespace stay single arguments.
  CUDA_VISIBLE_DEVICES="$gpu" bash step6_train_chrombpnet_model.sh \
    "$ref_fasta" "$data_dir/${cell_line}_unstranded.bw" "$overlap_peak" \
    "$neg_dir/negatives_with_summit.bed" "$fold" "$bias_h5" \
    "$output_dir/chrombpnet_model" "$data_type" || {
    status=$?
    # The marker directory was created before training started, so a failed
    # run would otherwise be silently skipped next time.
    echo "chrombpnet training failed (exit $status); remove $output_dir/chrombpnet_model before re-running" >&2
    exit "$status"
  }
fi
56 changes: 56 additions & 0 deletions gm_fold_1retrain.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash

# ---- Run identity -----------------------------------------------------------
cell_line="GM12878"
data_type=DNASE_SE
# Today's date as MM.DD.YYYY; it becomes part of the output directory name.
date="$(date +'%m.%d.%Y')"
setting="${data_type}_${date}_fold_1"
cur_file_name='gm_fold_1retrain.sh'

### SIGNAL INPUT
fold="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/splits/fold_1.json"
bias_h5="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/GM12878/GM12878_07.07.2022_bias_128_4_1234_0.8_fold_1_data_type_DNASE_SE/bias_model/bias.h5"

# ---- Reference data ---------------------------------------------------------
overlap_peak="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/ENCSR000EMT/preprocessing/downloads/peaks.bed.gz"
blacklist_region="/mnt/data/annotations/blacklist/GRch38/GRch38_unified_blacklist.bed.gz"
chrom_sizes="/mnt/data/annotations/by_release/hg38/hg38.chrom.sizes"
ref_fasta="/mnt/data/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"

# ---- Working directories (all rooted at the current directory) --------------
main_dir="${PWD}/results/chrombpnet/${data_type}/${cell_line}"
data_dir="${main_dir}/data"
output_dir="${main_dir}/${setting}"
# This fold reads its negatives from negatives_data_1/.
neg_dir="${main_dir}/negatives_data_1/"
inputlen="2114"
# Value handed to CUDA_VISIBLE_DEVICES when training is launched.
gpu="MIG-40f43250-998e-586a-ac37-d6520e92590f"


# Print the current time as YYYY-mm-dd_HH-MM-SS with no trailing newline.
timestamp() {
  # Command substitution drops date's trailing newline and printf '%s'
  # does not add one back.
  printf '%s' "$(date +"%Y-%m-%d_%H-%M-%S")"
}


## CREATE DIRS
# mkdir -p is used throughout so that a missing parent directory does not make
# directory creation fail and leave the training step without an output path.
if [[ -d "$main_dir" ]]; then
  echo "main director already exists"
else
  mkdir -p "$main_dir"
fi

if [[ -d "$output_dir" ]]; then
  echo "output director already exists"
else
  mkdir -p "$output_dir"
fi



### STEP 2 - TRAIN CHROMBPNET MODEL

# The chrombpnet_model directory doubles as the "already trained" marker:
# when it exists, training is skipped.
if [[ -d "$output_dir/chrombpnet_model" ]]; then
  echo "skipping chrombpnet model training - directory present "
else
  mkdir -p "$output_dir/chrombpnet_model"
  # Every argument is quoted so paths with whitespace stay single arguments.
  CUDA_VISIBLE_DEVICES="$gpu" bash step6_train_chrombpnet_model.sh \
    "$ref_fasta" "$data_dir/${cell_line}_unstranded.bw" "$overlap_peak" \
    "$neg_dir/negatives_with_summit.bed" "$fold" "$bias_h5" \
    "$output_dir/chrombpnet_model" "$data_type" || {
    status=$?
    # The marker directory was created before training started, so a failed
    # run would otherwise be silently skipped next time.
    echo "chrombpnet training failed (exit $status); remove $output_dir/chrombpnet_model before re-running" >&2
    exit "$status"
  }
fi
Loading

0 comments on commit 188125c

Please sign in to comment.