diff --git a/README.md b/README.md index eef5a21..12b3932 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ _MIRFLOWZ_ is a [Snakemake][snakemake] workflow for mapping miRNAs and isomiRs. 2. [Usage](#usage) - [Preparing inputs](#preparing-inputs) - [Running the workflow](#running-the-workflow) + - [Expected output files](#expected-output-files) - [Creating a Snakemake report](#creating-a-snakemake-report) 3. [Workflow description](#workflow-description) 4. [Contributing](#contributing) @@ -219,7 +220,7 @@ We recommend creating a copy of the ```bash cp config/config_template.yaml path/to/config.yaml -``` +``` So on that PR I could move this information in the section/file all of this will be written. Open the new copy in your editor of choice and adjust the configuration parameters to your liking. The template explains what each of the @@ -251,6 +252,49 @@ snakemake \ After successful execution of the workflow, results and logs will be found in the `results/` and `logs/` directories, respectively. +### Expected output files + +Upon successful execution of _MIRFLOWZ_, the tool automatically removes all +intermediate files generated during the process. The final outputs comprise: + +1. A SAM file containing alignments intersecting a pri-miR locus. These +alignments intersect with extended start and/or end positions specified in the +provided pri-miR annotations. Please note that they may not contribute to the +final counting and may not appear in the final table. Alignments are discarded +if their start and/or end positions differ from the ends of the provided +pri-miR annotations by more bases than the extension used. + +2. A SAM file containing alignments intersecting a mature miRNA locus. Similar +to the previous file, these alignments intersect with extended start and/or end +positions specified in the provided miRNA annotations. They may not contribute +to the final counting and might be absent from the final table. + +3. A BAM file containing the set of alignments contributing to the final +counting and its corresponding index file (`.bam.bai`). + +4. Table(s) containing the counting data from all libraries for (iso)miRs +and/or pri-miRs. Each row corresponds to a miRNA species, and each column +represents a sample library. Each read is counted towards all the annotated +miRNA species it aligns to, with 1/n, where n is the number of genomic and/or +transcriptomic loci that read aligns to. + +To retain all intermediate files, include `--no-hooks` in the workflow call. + +```bash +snakemake \ + --snakefile="path/to/Snakefile" \ + --cores 4 \ + --configfile="path/to/config.yaml" \ + --use-conda \ + --printshellcmds \ + --rerun-incomplete \ + --no-hooks \ + --verbose +``` + +After successful execution of the workflow, the intermediate files will be +found in the `results/intermediates` directory. + ### Creating a Snakemake report Snakemake provides the option to generate a detailed HTML report on runtime diff --git a/config/config_schema.json b/config/config_schema.json index 2d56367..842c175 100644 --- a/config/config_schema.json +++ b/config/config_schema.json @@ -30,6 +30,11 @@ "default": "results/", "description": "Path to the output directory." }, + "intermediates_dir":{ + "type": "string", + "default": "results/intermediates", + "description": "Path to the directory storing the intermediate files." + }, "local_log":{ "type": "string", "default": "logs/local/", diff --git a/config/config_template.yaml b/config/config_template.yaml index d6d01c9..06f89bd 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -32,6 +32,7 @@ map_chr_file: path/to/ucsc_ensembl_mappings.tsv #### DIRECTORIES #### output_dir: results/ +intermediates_dir: results/intermediates local_log: logs/local/ cluster_log: logs/cluster/ scripts_dir: ../scripts/ diff --git a/images/rule_graph.svg b/images/rule_graph.svg index 8926ad4..9836032 100644 --- a/images/rule_graph.svg +++ b/images/rule_graph.svg @@ -4,796 +4,784 @@ - + snakemake_dag - + 0 - -finish + +finish 1 - -filter_sam_by_intersecting_primir + +filter_sam_by_intersecting_primir - + 1->0 - - + + 42 - -filter_sam_by_intersecting_mirna + +filter_sam_by_intersecting_mirna - + 1->42 - - + + 45 - -convert_intersecting_primir_sam_to_bam + +convert_intersecting_primir_sam_to_bam - + 1->45 - - + + 2 - -filter_by_indels + +filter_by_indels - + 2->1 - - + + 37 - -convert_all_alns_sam_to_bam + +convert_all_alns_sam_to_bam - + 2->37 - - + + 3 - -remove_inferiors + +remove_inferiors 3->2 - - + + 4 - -sort_maps_by_id + +sort_maps_by_id 4->3 - - + + 5 - -add_header_all_maps + +add_header_all_maps 5->4 - - + + 6 - -create_genome_header + +create_genome_header - + 6->5 - - + + 7 - -trim_genome_seq_id + +trim_genome_seq_ids 7->6 - - + + 19 - -extract_transcriptome_seqs + +extract_transcriptome_seqs 7->19 - - + + 30 - -map_genome_segemehl + +map_genome_segemehl 7->30 - - + + 31 - -generate_segemehl_index_genome + +generate_segemehl_index_genome - + 7->31 - - + + - - -33 - -map_genome_oligomap + + +34 + +map_genome_oligomap - + -7->33 - - +7->34 + + 41 - -create_index_genome_fasta + +create_index_genome_fasta - + 7->41 - - + + 8 - -merge_all_maps + +merge_all_maps - + 8->5 - - + + 9 - -transcriptome_to_genome_maps + +transcriptome_to_genome_maps 9->8 - - + + 10 - -remove_header_transcriptome_mappings + +remove_header_transcriptome_mappings - + 10->9 - - + + 11 - -filter_transcriptome_by_nh + +filter_transcriptome_by_nh 11->10 - - + + 12 - -merge_transcriptome_maps + +merge_transcriptome_maps 12->11 - - + + 13 - -map_transcriptome_segemehl + +map_transcriptome_segemehl - + 13->12 - - + + 14 - -collapse_identical_reads + +collapse_identical_reads - + 14->13 - - + + - - -23 - -filter_fasta_for_oligomap + + +24 + +filter_fasta_for_oligomap - + -14->23 - - +14->24 + + 14->30 - - + + 15 - -remove_adapters + +remove_adapters 15->14 - - + + 16 - -format_fasta + +format_fasta 16->15 - - + + 17 - -start + +start 17->16 - - + + 18 - -trim_fasta_seq_ids + +trim_transcriptome_seq_ids 18->13 - - + + 20 - -generate_segemehl_index_transcriptome + +generate_segemehl_index_transcriptome 18->20 - - + + - - -22 - -map_transcriptome_oligomap + + +23 + +map_transcriptome_oligomap - + -18->22 - - +18->23 + + 19->18 - - + + - + 20->13 - - + + 21 - -oligomap_transcriptome_to_sam + +convert_transcriptome_to_sam_oligomap - + 21->12 - - + + + + + +22 + +sort_transcriptome_oligomap 22->21 - - - - - -24 - -sort_transcriptome_oligomap - - - -22->24 - - + + - + 23->22 - - + + - - -23->33 - - + + +24->23 + + - - -24->21 - - + + +24->34 + + 25 - -exons_gtf_to_bed + +convert_exons_gtf_to_bed - + 25->9 - - + + 26 - -get_exons_gtf + +get_exons_gtf - + 26->25 - - + + 27 - -remove_header_genome_mappings + +remove_header_genome_mappings 27->8 - - + + 28 - -filter_genome_by_nh + +filter_genome_by_nh - + 28->27 - - + + 29 - -merge_genome_maps + +merge_genome_maps - + 29->28 - - + + 30->29 - - + + - + 31->30 - - + + 32 - -oligomap_genome_to_sam + +convert_genome_to_sam_oligomap - + 32->29 - - + + + + + +33 + +sort_genome_oligomap - + 33->32 - - - - - -34 - -sort_genome_oligomap - - - -33->34 - - + + - - -34->32 - - + + +34->33 + + 35 - -intersect_extended_primir + +intersect_extended_primir - + 35->1 - - + + 50 - -quantify_primir + +quantify_primir - + 35->50 - - + + 36 - -sort_all_alns_bam_by_position + +sort_all_alns_bam_by_position - + 36->35 - - + + - + 37->36 - - + + 38 - -extend_mirs_annotations + +extend_mirs_annotations - + 38->35 - - + + 43 - -intersect_extended_mirna + +intersect_extended_mirna - + 38->43 - - + + 39 - -map_chr_names + +map_chr_names - + 39->38 - - + + 40 - -extract_chr_len + +extract_chr_len - + 40->38 - - + + - + 41->40 - - + + - + 42->0 - - + + - - -47 - -add_intersecting_mirna_tag + + +49 + +add_intersecting_mirna_tag - - -42->47 - - + + +42->49 + + - - -53 - -uncollapse_reads + + +51 + +uncollapse_reads - - -42->53 - - + + +42->51 + + - + 43->42 - - + + - - -43->47 - - + + +43->49 + + 44 - -sort_intersecting_primir_bam_by_position + +sort_intersecting_primir_bam_by_position - + 44->43 - - + + - + 45->44 - - + + 46 - -sort_intersecting_mirna_by_feat_tag + +merge_tables 46->0 - - - - - -49 - -quantify_mirna + + - - -46->49 - - + + +47 + +quantify_mirna - + 47->46 - - + + 48 - -merge_tables + +sort_intersecting_mirna_by_feat_tag - - -48->0 - - + + +48->47 + + - + 49->48 - - + + - - -50->48 - - - - - -51 - -sort_uncollpased_reads_bam_by_position + + +50->46 + + - + 51->0 - - + + - - -54 - -index_uncollapsed_reads_bam + + +53 + +convert_uncollpased_reads_sam_to_bam - - -51->54 - - + + +51->53 + + 52 - -convert_uncollpased_reads_sam_to_bam + +sort_uncollpased_reads_bam_by_position - - -52->51 - - + + +52->0 + + - + + +54 + +index_uncollapsed_reads_bam + + +52->54 + + + + + 53->52 - - + + - + 54->0 - - + + diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 0f32828..c3fc05e 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -1,58 +1,58 @@ 68f943f89b52d628851dd97fb1399d68 results/TABLES/all_mirna_counts.tab -eec9be6cda61d2728290c92c1209f455 results/TABLES/mirna_counts_test_lib 363ecee318c57ee7e2e45ca468007baa results/TABLES/all_pri-mir_counts.tab -a844e3a29159e36e2f17a0646d1e8c5f results/TABLES/pri-mir_counts_test_lib 0d76977b2e36046cc176112776c5fa4e results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam.bai -36f7d024fe6ddfd3e788aebf61c61061 results/test_lib/oligomap_genome_sorted.fasta -48e605df55bf2dd37ea5a5a74eb5872a results/test_lib/mappings_all.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_mappings.fasta -eea903fc0ab81054cf8e34193f80f4a7 results/test_lib/mappings_all_removed_inferiors.sam -98498ac521f451426a9dbabcbecb5f25 results/test_lib/alignments_intersecting_primir.bam -defdc8c46e1d73692edde0e0278f2d5e results/test_lib/oligomap_genome_mappings.fasta -1649738f226e8979d4d88a3ae47fa423 results/test_lib/segemehl_transcriptome_mappings.sam -9ecee9ab80daba0a53076b05c9f6ff53 results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam -1649738f226e8979d4d88a3ae47fa423 results/test_lib/transcriptome_mappings_filtered_nh.sam -8e22ddfa7c39ce7e4ec5945dff1576ef results/test_lib/alignments_all.bam -a124a5afdb5f7bfbcc5683260556c9c4 results/test_lib/mappings_all_no_header.sam -dd00dea3549dc1ad14f9e1505d397de5 results/test_lib/alignments_all.sam -8c24d619073f4c5ca1f439fe429d0ef4 results/test_lib/alignments_intersecting_mirna_tag.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_sorted.fasta -c218718d93f48e5987fc18b33dc488f0 results/test_lib/segemehl_genome_mappings.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/transcriptome_mappings_to_genome.sam -63a32839360a985b68e0685aafad5c54 results/test_lib/fa/reads.fa -5cc557ec2073144f47fe28ac145f4869 results/test_lib/alignments_intersecting_mirna_uncollapsed.sam -edcb854702519c0002d8ce89a21e54ef results/test_lib/reads_formatted.fasta -1a547487b8e92ad85bb26ff9b1db1f93 results/test_lib/intersected_extended_mirna.bed -721071f3ead528aa71978508db8d73f9 results/test_lib/alignments_all_sorted_test_lib.bam -ec0e9bcc8ea857da897035c8fca4078f results/test_lib/reads_trimmed_adapters.fasta -bbfc27c84b66ff41bfeee73f701b4b29 results/test_lib/alignments_intersecting_mirna_uncollapsed.bam -81bed7fc879f7a16c12d2ba912263c46 results/test_lib/alignments_intersecting_mirna.sam -dd560414078330bf3138f039da109093 results/test_lib/genome_mappings.sam -f5cb65466d328036a15b66cfbd4d8419 results/test_lib/oligomap_genome_report.txt -6cbdb9299e09b3e39b79a50db69226b5 results/test_lib/transcriptome_mappings_no_header.sam -1649738f226e8979d4d88a3ae47fa423 results/test_lib/transcriptome_mappings.sam -947607be69c16246f8dc9adbd9b971c8 results/test_lib/oligomap_genome_mappings.sam -9833208a79143eaf3f2a5fdeca0b2d94 results/test_lib/alignments_intersecting_mirna_sorted_tag.sam -02096523b293082629d5b895085468a3 results/test_lib/alignments_intersecting_primir_sorted.bam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_mappings.sam -a124a5afdb5f7bfbcc5683260556c9c4 results/test_lib/genome_mappings_no_header.sam -dd560414078330bf3138f039da109093 results/test_lib/genome_mappings_filtered_nh.sam -ae4c4963ca2cd206952b2ea2c58301dd results/test_lib/mappings_all_sorted_by_id.sam -2c77ffa021dda190d82f3f54a3312393 results/test_lib/reads_collapsed.fasta -f68693cfaa1e6ea78e1a5562ade6d9ed results/test_lib/intersected_extended_primir.bed -61f12595db9421926073d6675f7c3c42 results/test_lib/alignments_intersecting_primir.sam -c2a5770a755ada66ef63d96eec4afb00 results/test_lib/reads_filtered_for_oligomap.fasta -fe5388094985e9604a302d39d2abc82c results/test_lib/oligomap_transcriptome_report.txt -be7a0d92e57480190de57eb30baffa36 results/extended_mirna_annotation_6_nt.gff3 -8148cd880602255be166beb59bbed95a results/genome_header.sam -09e24a504bfec37fee3d5ff1b5c7738e results/exons.bed -4fb453846e88593d0cac13220ec2d685 results/segemehl_genome_index.idx -d34fc868b861b1bc46db07a397dc0f10 results/genome_processed.fa.fai -21e102e4ebd3508bb06f46366a3d578d results/exons.gtf -003b92b245ac336e3d70a513033e1cee results/transcriptome_trimmed_id.fa -44dbf7c3eae00d0bc8d5e1319123746c results/chr_size.txt -cc5c3512dab0e269d82bd625de74198e results/extended_primir_annotation_6_nt.gff3 -f28cc0143ab6659bef3de3a7afa1dccc results/mirna_annotations.gff3 -2d437f8681f4248d4f2075f86debb920 results/transcriptome.fa -7eb64c112830266bcf416ded60b4cf77 results/segemehl_transcriptome_index.idx -4fba145540a2c61f29bfddfd0f5a4d4e results/genome_processed.fa +25aca3f96e7ed644067d2050393bf7a4 results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam +cc01c7884838a597c587437cb0acf64e results/test_lib/alignments_intersecting_mirna.sam +b1eb81426f890d671bba8c8a815edc1e results/test_lib/alignments_intersecting_primir.sam +eec9be6cda61d2728290c92c1209f455 results/intermediates/TABLES/mirna_counts_test_lib +a844e3a29159e36e2f17a0646d1e8c5f results/intermediates/TABLES/pri-mir_counts_test_lib +36f7d024fe6ddfd3e788aebf61c61061 results/intermediates/test_lib/oligomap_genome_sorted.fasta +48e605df55bf2dd37ea5a5a74eb5872a results/intermediates/test_lib/mappings_all.sam +d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/oligomap_transcriptome_mappings.fasta +e9aac4afeb2053385d60f5e4b07a9774 results/intermediates/test_lib/mappings_all_removed_inferiors.sam +9ebcb4ac877f37921b88ceca3ff03b62 results/intermediates/test_lib/alignments_intersecting_primir.bam +defdc8c46e1d73692edde0e0278f2d5e results/intermediates/test_lib/oligomap_genome_mappings.fasta +e632f8984d423d46bbb377ec75468521 results/intermediates/test_lib/segemehl_transcriptome_mappings.sam +e632f8984d423d46bbb377ec75468521 results/intermediates/test_lib/transcriptome_mappings_filtered_nh.sam +3344bbeb9fe01f07c04831e5b4a795ba results/intermediates/test_lib/alignments_all.bam +a124a5afdb5f7bfbcc5683260556c9c4 results/intermediates/test_lib/mappings_all_no_header.sam +d62630102c33d43d593af14c2a642839 results/intermediates/test_lib/alignments_all.sam +81103749d61bc55ee2cfc84ca1527456 results/intermediates/test_lib/alignments_intersecting_mirna_tag.sam +d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/oligomap_transcriptome_sorted.fasta +76643f87bb2e2bff77d1b1223d7720b5 results/intermediates/test_lib/segemehl_genome_mappings.sam +d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/transcriptome_mappings_to_genome.sam +63a32839360a985b68e0685aafad5c54 results/intermediates/test_lib/fa/reads.fa +e9e9698d9350b64b64c1f6d96019fce8 results/intermediates/test_lib/alignments_intersecting_mirna_uncollapsed.sam +edcb854702519c0002d8ce89a21e54ef results/intermediates/test_lib/reads_formatted.fasta +1a547487b8e92ad85bb26ff9b1db1f93 results/intermediates/test_lib/intersected_extended_mirna.bed +a287ffc43b6afbdde3e9905bc27c28a5 results/intermediates/test_lib/alignments_all_sorted_test_lib.bam +ec0e9bcc8ea857da897035c8fca4078f results/intermediates/test_lib/reads_trimmed_adapters.fasta +d7a5ab720ff9c96f41f3755a05b8f9e0 results/intermediates/test_lib/alignments_intersecting_mirna_uncollapsed.bam +1f1b873d05ec14ef9b16376a1c98315b results/intermediates/test_lib/genome_mappings.sam +f5cb65466d328036a15b66cfbd4d8419 results/intermediates/test_lib/oligomap_genome_report.txt +6cbdb9299e09b3e39b79a50db69226b5 results/intermediates/test_lib/transcriptome_mappings_no_header.sam +e632f8984d423d46bbb377ec75468521 results/intermediates/test_lib/transcriptome_mappings.sam +947607be69c16246f8dc9adbd9b971c8 results/intermediates/test_lib/oligomap_genome_mappings.sam +ce3fcd037e0a6a0b1a7a3253219e7053 results/intermediates/test_lib/alignments_intersecting_mirna_sorted_tag.sam +53764354c520d9700f13761c2721d8aa results/intermediates/test_lib/alignments_intersecting_primir_sorted.bam +d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/oligomap_transcriptome_mappings.sam +a124a5afdb5f7bfbcc5683260556c9c4 results/intermediates/test_lib/genome_mappings_no_header.sam +1f1b873d05ec14ef9b16376a1c98315b results/intermediates/test_lib/genome_mappings_filtered_nh.sam +6cc6165e8942a08420552aa810e629f8 results/intermediates/test_lib/mappings_all_sorted_by_id.sam +2c77ffa021dda190d82f3f54a3312393 results/intermediates/test_lib/reads_collapsed.fasta +f68693cfaa1e6ea78e1a5562ade6d9ed results/intermediates/test_lib/intersected_extended_primir.bed +c2a5770a755ada66ef63d96eec4afb00 results/intermediates/test_lib/reads_filtered_for_oligomap.fasta +fe5388094985e9604a302d39d2abc82c results/intermediates/test_lib/oligomap_transcriptome_report.txt +be7a0d92e57480190de57eb30baffa36 results/intermediates/extended_mirna_annotation_6_nt.gff3 +8148cd880602255be166beb59bbed95a results/intermediates/genome_header.sam +09e24a504bfec37fee3d5ff1b5c7738e results/intermediates/exons.bed +4fb453846e88593d0cac13220ec2d685 results/intermediates/segemehl_genome_index.idx +d34fc868b861b1bc46db07a397dc0f10 results/intermediates/genome_processed.fa.fai +21e102e4ebd3508bb06f46366a3d578d results/intermediates/exons.gtf +003b92b245ac336e3d70a513033e1cee results/intermediates/transcriptome_trimmed_id.fa +44dbf7c3eae00d0bc8d5e1319123746c results/intermediates/chr_size.txt +cc5c3512dab0e269d82bd625de74198e results/intermediates/extended_primir_annotation_6_nt.gff3 +f28cc0143ab6659bef3de3a7afa1dccc results/intermediates/mirna_annotations.gff3 +2d437f8681f4248d4f2075f86debb920 results/intermediates/transcriptome.fa +7eb64c112830266bcf416ded60b4cf77 results/intermediates/segemehl_transcriptome_index.idx +4fba145540a2c61f29bfddfd0f5a4d4e results/intermediates/genome_processed.fa diff --git a/test/test_workflow_local_with_conda.sh b/test/test_workflow_local_with_conda.sh index 6b17996..20c0555 100755 --- a/test/test_workflow_local_with_conda.sh +++ b/test/test_workflow_local_with_conda.sh @@ -25,9 +25,9 @@ snakemake \ --use-conda \ --printshellcmds \ --rerun-incomplete \ + --no-hooks \ --verbose - # Snakemake report snakemake \ --snakefile="../workflow/Snakefile" \ diff --git a/test/test_workflow_local_with_singularity.sh b/test/test_workflow_local_with_singularity.sh index 306caac..b8f3dea 100755 --- a/test/test_workflow_local_with_singularity.sh +++ b/test/test_workflow_local_with_singularity.sh @@ -26,9 +26,9 @@ snakemake \ --singularity-args "--bind ${PWD}/../" \ --printshellcmds \ --rerun-incomplete \ + --no-hooks \ --verbose - # Snakemake report snakemake \ --snakefile="../workflow/Snakefile" \ diff --git a/test/test_workflow_slurm_with_conda.sh b/test/test_workflow_slurm_with_conda.sh index 190ffdc..2e6c715 100755 --- a/test/test_workflow_slurm_with_conda.sh +++ b/test/test_workflow_slurm_with_conda.sh @@ -41,6 +41,7 @@ snakemake \ --use-conda \ --printshellcmds \ --rerun-incomplete \ + --no-hooks \ --verbose # Snakemake report diff --git a/test/test_workflow_slurm_with_singularity.sh b/test/test_workflow_slurm_with_singularity.sh index 4ea8a9b..88781fd 100755 --- a/test/test_workflow_slurm_with_singularity.sh +++ b/test/test_workflow_slurm_with_singularity.sh @@ -42,6 +42,7 @@ snakemake \ --singularity-args="--bind ${PWD}/../" \ --printshellcmds \ --rerun-incomplete \ + --no-hooks \ --verbose # Snakemake report diff --git a/workflow/Snakefile b/workflow/Snakefile index 4dcd95c..e9648d6 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -31,6 +31,23 @@ validate(config, Path("../config/config_schema.json")) OUT_DIR = Path(config["output_dir"]) +INTERMEDIATES_DIR = Path(config["intermediates_dir"]) +LOG_DIR = Path(f"{config['local_log']}/../") + + +############################################################################### +### onSuccess/onError handlers configuration +############################################################################### + + +onsuccess: + print("\nWORKFLOW SUCCEED. Removing intermediate files.\n") + shell("rm -rf {INTERMEDIATES_DIR}") + + +onerror: + print("\nWORKFLOW FAILED. Check the log file in the log directory.\n") + shell("cat {log} > {LOG_DIR}/failed_workflow.log") ############################################################################### @@ -67,10 +84,6 @@ rule finish: OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", sample=pd.unique(samples_table.index.values), ), - intersect_sam=expand( - OUT_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", - sample=pd.unique(samples_table.index.values), - ), table=expand( OUT_DIR / "TABLES" / "all_{mir}_counts.tab", mir=[mir for mir in config["mir_list"] if mir != "isomir"], diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index effa9a6..0c1131f 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -24,7 +24,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") -OUT_DIR = Path(config["output_dir"]) +INTERMEDIATES_DIR = Path(config["intermediates_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -70,7 +70,7 @@ localrules: rule finish_map: input: maps=expand( - OUT_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam.bai", + INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam.bai", sample=pd.unique(samples_table.index.values), ), @@ -87,7 +87,7 @@ rule start: format=convert_lib_format(get_sample("format")), ), output: - reads=OUT_DIR / "{sample}" / "{format}" / "reads.{format}", + reads=INTERMEDIATES_DIR / "{sample}" / "{format}" / "reads.{format}", params: cluster_log=CLUSTER_LOG / "uncompress_zipped_files_{sample}_{format}.log", log: @@ -105,9 +105,9 @@ rule start: rule fastq_quality_filter: input: - reads=OUT_DIR / "{sample}" / "fastq" / "reads.fastq", + reads=INTERMEDIATES_DIR / "{sample}" / "fastq" / "reads.fastq", output: - reads=OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", + reads=INTERMEDIATES_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", params: cluster_log=CLUSTER_LOG / "fastq_quality_filter_{sample}.log", p=config["p_value"], @@ -135,9 +135,9 @@ rule fastq_quality_filter: rule fastq_to_fasta: input: - reads=OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", + reads=INTERMEDIATES_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", output: - reads=OUT_DIR / "{sample}" / "fastq" / "reads.fa", + reads=INTERMEDIATES_DIR / "{sample}" / "fastq" / "reads.fa", params: cluster_log=CLUSTER_LOG / "fastq_to_fasta_{sample}.log", log: @@ -157,12 +157,12 @@ rule fastq_to_fasta: rule format_fasta: input: - reads=lambda wildcards: OUT_DIR + reads=lambda wildcards: INTERMEDIATES_DIR / wildcards.sample / convert_lib_format(get_sample("format", wildcards.sample)) / "reads.fa", output: - reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_formatted.fasta", params: cluster_log=CLUSTER_LOG / "format_fasta_{sample}.log", log: @@ -182,9 +182,9 @@ rule format_fasta: rule remove_adapters: input: - reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_formatted.fasta", output: - reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_trimmed_adapters.fasta", params: adapter=lambda wildcards: get_sample("adapter", wildcards.sample).upper(), error_rate=config["error_rate"], @@ -219,9 +219,9 @@ rule remove_adapters: rule collapse_identical_reads: input: - reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_trimmed_adapters.fasta", output: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_collapsed.fasta", params: cluster_log=CLUSTER_LOG / "collapse_identical_reads_{sample}.log", log: @@ -241,11 +241,11 @@ rule collapse_identical_reads: rule map_genome_segemehl: input: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", - genome=OUT_DIR / "genome_processed.fa", - genome_index_segemehl=OUT_DIR / "segemehl_genome_index.idx", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_collapsed.fasta", + genome=INTERMEDIATES_DIR / "genome_processed.fa", + genome_index_segemehl=INTERMEDIATES_DIR / "segemehl_genome_index.idx", output: - gmap=OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam", + gmap=INTERMEDIATES_DIR / "{sample}" / "segemehl_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "map_genome_segemehl_{sample}.log", log: @@ -276,11 +276,12 @@ rule map_genome_segemehl: rule map_transcriptome_segemehl: input: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", - transcriptome=OUT_DIR / "transcriptome_trimmed_id.fa", - transcriptome_index_segemehl=OUT_DIR / "segemehl_transcriptome_index.idx", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_collapsed.fasta", + transcriptome=INTERMEDIATES_DIR / "transcriptome_trimmed_id.fa", + transcriptome_index_segemehl=INTERMEDIATES_DIR + / "segemehl_transcriptome_index.idx", output: - tmap=OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "map_transcriptome_segemehl_{sample}.log", log: @@ -311,10 +312,10 @@ rule map_transcriptome_segemehl: rule filter_fasta_for_oligomap: input: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_collapsed.fasta", script=SCRIPTS_DIR / "validation_fasta.py", output: - reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", params: cluster_log=CLUSTER_LOG / "filter_fasta_for_oligomap_{sample}.log", max_length_reads=config["max_length_reads"], @@ -339,11 +340,11 @@ rule filter_fasta_for_oligomap: rule map_genome_oligomap: input: - reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", - target=OUT_DIR / "genome_processed.fa", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + target=INTERMEDIATES_DIR / "genome_processed.fa", output: - gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", + gmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_mappings.fasta", + report=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_report.txt", params: cluster_log=CLUSTER_LOG / "map_genome_oligomap_{sample}.log", log: @@ -372,11 +373,11 @@ rule map_genome_oligomap: rule sort_genome_oligomap: input: - tmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", + tmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_mappings.fasta", + report=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", + sort=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_sorted.fasta", params: cluster_log=CLUSTER_LOG / "sort_genome_oligomap_{sample}.log", log: @@ -401,10 +402,10 @@ rule sort_genome_oligomap: rule convert_genome_to_sam_oligomap: input: - sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", + sort=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_sorted.fasta", script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: - gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", + gmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_genome_to_sam_{sample}.log", nh=config["nh"], @@ -431,11 +432,11 @@ rule convert_genome_to_sam_oligomap: rule map_transcriptome_oligomap: input: - reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", - target=OUT_DIR / "transcriptome_trimmed_id.fa", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + target=INTERMEDIATES_DIR / "transcriptome_trimmed_id.fa", output: - tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", + tmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", + report=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_report.txt", params: cluster_log=CLUSTER_LOG / "map_transcriptome_oligomap_{sample}.log", log: @@ -465,11 +466,11 @@ rule map_transcriptome_oligomap: rule sort_transcriptome_oligomap: input: - tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", + tmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", + report=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", + sort=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", params: cluster_log=CLUSTER_LOG / "sort_transcriptome_oligomap_{sample}.log", log: @@ -493,10 +494,10 @@ rule sort_transcriptome_oligomap: rule convert_transcriptome_to_sam_oligomap: input: - sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", + sort=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: - tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_transcriptome_to_sam_{sample}.log", nh=config["nh"], @@ -520,10 +521,10 @@ rule convert_transcriptome_to_sam_oligomap: rule merge_genome_maps: input: - gmap1=OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam", - gmap2=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", + gmap1=INTERMEDIATES_DIR / "{sample}" / "segemehl_genome_mappings.sam", + gmap2=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_mappings.sam", output: - gmaps=OUT_DIR / "{sample}" / "genome_mappings.sam", + gmaps=INTERMEDIATES_DIR / "{sample}" / "genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "merge_genome_maps_{sample}.log", log: @@ -541,10 +542,10 @@ rule merge_genome_maps: rule merge_transcriptome_maps: input: - tmap1=OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", - tmap2=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", + tmap1=INTERMEDIATES_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", + tmap2=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", output: - tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings.sam", + tmaps=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "merge_transcriptome_maps_{sample}.log", log: @@ -562,10 +563,10 @@ rule merge_transcriptome_maps: rule filter_genome_by_nh: input: - gmaps=OUT_DIR / "{sample}" / "genome_mappings.sam", + gmaps=INTERMEDIATES_DIR / "{sample}" / "genome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - gmaps=OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", + gmaps=INTERMEDIATES_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", params: cluster_log=CLUSTER_LOG / "filter_genome_by_nh_{sample}.log", nh=config["nh"], @@ -590,10 +591,10 @@ rule filter_genome_by_nh: rule filter_transcriptome_by_nh: input: - tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings.sam", + tmaps=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", + tmaps=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", params: cluster_log=CLUSTER_LOG / "filter_transcriptome_by_nh_{sample}.log", nh=config["nh"], @@ -618,9 +619,9 @@ rule filter_transcriptome_by_nh: rule remove_header_genome_mappings: input: - gmap=OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", + gmap=INTERMEDIATES_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", output: - gmap=OUT_DIR / "{sample}" / "genome_mappings_no_header.sam", + gmap=INTERMEDIATES_DIR / "{sample}" / "genome_mappings_no_header.sam", params: cluster_log=CLUSTER_LOG / "remove_header_genome_mappings_{sample}.log", log: @@ -640,9 +641,9 @@ rule remove_header_genome_mappings: rule remove_header_transcriptome_mappings: input: - tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", output: - tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", params: cluster_log=CLUSTER_LOG / "remove_header_transcriptome_mappings_{sample}.log", log: @@ -662,11 +663,11 @@ rule remove_header_transcriptome_mappings: rule transcriptome_to_genome_maps: input: - tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", script=SCRIPTS_DIR / "sam_trx_to_sam_gen.pl", - exons=OUT_DIR / "exons.bed", + exons=INTERMEDIATES_DIR / "exons.bed", output: - genout=OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", + genout=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", params: cluster_log=CLUSTER_LOG / "transcriptome_to_genome_maps_{sample}.log", log: @@ -690,10 +691,10 @@ rule transcriptome_to_genome_maps: rule merge_all_maps: input: - gmap1=OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", - gmap2=OUT_DIR / "{sample}" / "genome_mappings_no_header.sam", + gmap1=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", + gmap2=INTERMEDIATES_DIR / "{sample}" / "genome_mappings_no_header.sam", output: - catmaps=OUT_DIR / "{sample}" / "mappings_all_no_header.sam", + catmaps=INTERMEDIATES_DIR / "{sample}" / "mappings_all_no_header.sam", params: cluster_log=CLUSTER_LOG / "merge_all_mappings_{sample}.log", log: @@ -711,10 +712,10 @@ rule merge_all_maps: rule add_header_all_maps: input: - header=OUT_DIR / "genome_header.sam", - catmaps=OUT_DIR / "{sample}" / "mappings_all_no_header.sam", + header=INTERMEDIATES_DIR / "genome_header.sam", + catmaps=INTERMEDIATES_DIR / "{sample}" / "mappings_all_no_header.sam", output: - concatenate=OUT_DIR / "{sample}" / "mappings_all.sam", + concatenate=INTERMEDIATES_DIR / "{sample}" / "mappings_all.sam", params: cluster_log=CLUSTER_LOG / "add_header_{sample}.log", log: @@ -732,9 +733,9 @@ rule add_header_all_maps: rule sort_maps_by_id: input: - concatenate=OUT_DIR / "{sample}" / "mappings_all.sam", + concatenate=INTERMEDIATES_DIR / "{sample}" / "mappings_all.sam", output: - sort=OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", + sort=INTERMEDIATES_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", params: cluster_log=CLUSTER_LOG / "sort_maps_by_id_{sample}.log", log: @@ -754,10 +755,10 @@ rule sort_maps_by_id: rule remove_inferiors: input: - sort=OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", + sort=INTERMEDIATES_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", script=SCRIPTS_DIR / "sam_remove_duplicates_inferior_alignments_multimappers.pl", output: - remove_inf=OUT_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", + remove_inf=INTERMEDIATES_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", params: cluster_log=CLUSTER_LOG / "remove_inferiors_{sample}.log", log: @@ -785,10 +786,10 @@ rule remove_inferiors: rule filter_by_indels: input: - sam=OUT_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", + sam=INTERMEDIATES_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", script=SCRIPTS_DIR / "filter_multimappers.py", output: - sam=OUT_DIR / "{sample}" / "alignments_all.sam", + sam=INTERMEDIATES_DIR / "{sample}" / "alignments_all.sam", params: cluster_log=CLUSTER_LOG / "remove_multimappers_{sample}.log", log: @@ -815,9 +816,9 @@ rule filter_by_indels: rule convert_all_alns_sam_to_bam: input: - maps=OUT_DIR / "{sample}" / "alignments_all.sam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all.sam", output: - maps=OUT_DIR / "{sample}" / "alignments_all.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all.bam", params: cluster_log=CLUSTER_LOG / "convert_all_alns_sam_to_bam_{sample}.log", log: @@ -837,9 +838,9 @@ rule convert_all_alns_sam_to_bam: rule sort_all_alns_bam_by_position: input: - maps=OUT_DIR / "{sample}" / "alignments_all.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all.bam", output: - maps=OUT_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", params: cluster_log=CLUSTER_LOG / "sort_all_alns_bam_by_position_{sample}.log", log: @@ -859,9 +860,9 @@ rule sort_all_alns_bam_by_position: rule index_all_alns_bam: input: - maps=OUT_DIR / "{sample}" / "alignments_all_sorted.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted.bam", output: - maps=OUT_DIR / "{sample}" / "alignments_all_sorted.bam.bai", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted.bam.bai", params: cluster_log=CLUSTER_LOG / "index_all_alns_bam_{sample}.log", log: diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk index 0f98a4b..85d5625 100644 --- a/workflow/rules/prepare.smk +++ b/workflow/rules/prepare.smk @@ -26,7 +26,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") -OUT_DIR = Path(config["output_dir"]) +INTERMEDIATES_DIR = Path(config["intermediates_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -49,17 +49,17 @@ localrules: rule finish_prepare: input: - idx_transcriptome=OUT_DIR / "segemehl_transcriptome_index.idx", - idx_genome=OUT_DIR / "segemehl_genome_index.idx", - exons=OUT_DIR / "exons.bed", - header=OUT_DIR / "genome_header.sam", - chrsize=OUT_DIR / "chr_size.txt", + idx_transcriptome=INTERMEDIATES_DIR / "segemehl_transcriptome_index.idx", + idx_genome=INTERMEDIATES_DIR / "segemehl_genome_index.idx", + exons=INTERMEDIATES_DIR / "exons.bed", + header=INTERMEDIATES_DIR / "genome_header.sam", + chrsize=INTERMEDIATES_DIR / "chr_size.txt", extended_mir=expand( - OUT_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), extended_primir=expand( - OUT_DIR / "extended_primir_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], ), @@ -74,7 +74,7 @@ rule trim_genome_seq_ids: genome=config["genome_file"], script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - genome=OUT_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", params: cluster_log=CLUSTER_LOG / "genome_process.log", log: @@ -92,10 +92,10 @@ rule trim_genome_seq_ids: rule extract_transcriptome_seqs: input: - genome=OUT_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", gtf=config["gtf_file"], output: - fasta=OUT_DIR / "transcriptome.fa", + fasta=INTERMEDIATES_DIR / "transcriptome.fa", params: cluster_log=CLUSTER_LOG / "extract_transcriptome_seqs.log", log: @@ -115,10 +115,10 @@ rule extract_transcriptome_seqs: rule trim_transcriptome_seq_ids: input: - fasta=OUT_DIR / "transcriptome.fa", + fasta=INTERMEDIATES_DIR / "transcriptome.fa", script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - fasta=OUT_DIR / "transcriptome_trimmed_id.fa", + fasta=INTERMEDIATES_DIR / "transcriptome_trimmed_id.fa", params: cluster_log=CLUSTER_LOG / "trim_transcriptome.log", log: @@ -136,9 +136,9 @@ rule trim_transcriptome_seq_ids: rule generate_segemehl_index_transcriptome: input: - fasta=OUT_DIR / "transcriptome_trimmed_id.fa", + fasta=INTERMEDIATES_DIR / "transcriptome_trimmed_id.fa", output: - idx=OUT_DIR / "segemehl_transcriptome_index.idx", + idx=INTERMEDIATES_DIR / "segemehl_transcriptome_index.idx", params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_transcriptome.log", log: @@ -162,9 +162,9 @@ rule generate_segemehl_index_transcriptome: rule generate_segemehl_index_genome: input: - genome=OUT_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", output: - idx=OUT_DIR / "segemehl_genome_index.idx", + idx=INTERMEDIATES_DIR / "segemehl_genome_index.idx", params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_genome.log", log: @@ -191,7 +191,7 @@ rule get_exons_gtf: gtf=config["gtf_file"], script=SCRIPTS_DIR / "get_lines_w_pattern.sh", output: - exons=OUT_DIR / "exons.gtf", + exons=INTERMEDIATES_DIR / "exons.gtf", params: cluster_log=CLUSTER_LOG / "get_exons_gtf.log", log: @@ -215,10 +215,10 @@ rule get_exons_gtf: rule convert_exons_gtf_to_bed: input: - exons=OUT_DIR / "exons.gtf", + exons=INTERMEDIATES_DIR / "exons.gtf", script=SCRIPTS_DIR / "gtf_exons_bed.1.1.2.R", output: - exons=OUT_DIR / "exons.bed", + exons=INTERMEDIATES_DIR / "exons.bed", params: cluster_log=CLUSTER_LOG / "exons_gtf_to_bed.log", log: @@ -242,9 +242,9 @@ rule convert_exons_gtf_to_bed: rule create_genome_header: input: - genome=OUT_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", output: - header=OUT_DIR / "genome_header.sam", + header=INTERMEDIATES_DIR / "genome_header.sam", params: cluster_log=CLUSTER_LOG / "create_genome_header.log", log: @@ -268,7 +268,7 @@ rule map_chr_names: script=SCRIPTS_DIR / "map_chromosomes.pl", map_chr=config["map_chr_file"], output: - gff=OUT_DIR / "mirna_annotations.gff3", + gff=INTERMEDIATES_DIR / "mirna_annotations.gff3", params: cluster_log=CLUSTER_LOG / "map_chr_names.log", column="1", @@ -296,9 +296,9 @@ rule map_chr_names: rule create_index_genome_fasta: input: - genome=OUT_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", output: - genome=OUT_DIR / "genome_processed.fa.fai", + genome=INTERMEDIATES_DIR / "genome_processed.fa.fai", params: cluster_log=CLUSTER_LOG / "create_index_genome_fasta.log", log: @@ -318,9 +318,9 @@ rule create_index_genome_fasta: rule extract_chr_len: input: - genome=OUT_DIR / "genome_processed.fa.fai", + genome=INTERMEDIATES_DIR / "genome_processed.fa.fai", output: - chrsize=OUT_DIR / "chr_size.txt", + chrsize=INTERMEDIATES_DIR / "chr_size.txt", params: cluster_log=CLUSTER_LOG / "extract_chr_len.log", log: @@ -338,21 +338,21 @@ rule extract_chr_len: rule extend_mirs_annotations: input: - gff3=OUT_DIR / "mirna_annotations.gff3", - chrsize=OUT_DIR / "chr_size.txt", + gff3=INTERMEDIATES_DIR / "mirna_annotations.gff3", + chrsize=INTERMEDIATES_DIR / "chr_size.txt", script=SCRIPTS_DIR / "mirna_extension.py", output: extended_mir=expand( - OUT_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), extended_primir=expand( - OUT_DIR / "extended_primir_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], ), params: cluster_log=CLUSTER_LOG / "extend_mirs_annotations.log", - out_dir=OUT_DIR, + out_dir=INTERMEDIATES_DIR, extension=config["extension"], log: LOCAL_LOG / "extend_mirs_annotations.log", diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index 15aa1e7..e3fe767 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -25,6 +25,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") OUT_DIR = Path(config["output_dir"]) +INTERMEDIATES_DIR = Path(config["intermediates_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -63,9 +64,6 @@ rule finish_quantify: input: primir_intersect_sam=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", mirna_intersect_sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", - intersect_sam=OUT_DIR - / "{sample}" - / "alignments_intersecting_mirna_sorted_tag.sam", table=OUT_DIR / "TABLES" / "all_{mir}_counts.tab", uncollapsed_bam=expand( OUT_DIR @@ -88,13 +86,13 @@ rule finish_quantify: rule intersect_extended_primir: input: - alignment=OUT_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", + alignment=INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", primir=expand( - OUT_DIR / "extended_primir_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], ), output: - intersect=OUT_DIR / "{sample}" / "intersected_extended_primir.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_primir.bed", params: cluster_log=CLUSTER_LOG / "intersect_extended_primir_{sample}.log", log: @@ -122,8 +120,8 @@ rule intersect_extended_primir: rule filter_sam_by_intersecting_primir: input: - alignments=OUT_DIR / "{sample}" / "alignments_all.sam", - intersect=OUT_DIR / "{sample}" / "intersected_extended_primir.bed", + alignments=INTERMEDIATES_DIR / "{sample}" / "alignments_all.sam", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_primir.bed", output: sam=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", params: @@ -152,7 +150,7 @@ rule convert_intersecting_primir_sam_to_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_intersecting_primir.bam", params: cluster_log=CLUSTER_LOG / "convert_intersecting_primir_sam_to_bam_{sample}.log", log: @@ -172,9 +170,11 @@ rule convert_intersecting_primir_sam_to_bam: rule sort_intersecting_primir_bam_by_position: input: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_intersecting_primir.bam", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_primir_sorted.bam", params: cluster_log=CLUSTER_LOG / "sort_intersecting_primir_bam_by_position_{sample}.log", @@ -195,9 +195,13 @@ rule sort_intersecting_primir_bam_by_position: rule index_intersecting_primir_bam: input: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_primir_sorted.bam", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam.bai", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_primir_sorted.bam.bai", params: cluster_log=CLUSTER_LOG / "index_intersecting_primir_bam_{sample}.log", log: @@ -217,13 +221,15 @@ rule index_intersecting_primir_bam: rule intersect_extended_mirna: input: - alignment=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + alignment=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_primir_sorted.bam", mirna=expand( - OUT_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), output: - intersect=OUT_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_mirna.bed", params: cluster_log=CLUSTER_LOG / "intersect_extended_mirna_{sample}.log", log: @@ -252,7 +258,7 @@ rule intersect_extended_mirna: rule filter_sam_by_intersecting_mirna: input: alignments=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", - intersect=OUT_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_mirna.bed", output: sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", params: @@ -280,10 +286,10 @@ rule filter_sam_by_intersecting_mirna: rule add_intersecting_mirna_tag: input: alignments=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", - intersect=OUT_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_mirna.bed", script=SCRIPTS_DIR / "iso_name_tagging.py", output: - sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", + sam=INTERMEDIATES_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", params: extension=config["extension"], cluster_log=CLUSTER_LOG / "add_intersecting_mirna_tag_{sample}.log", @@ -309,9 +315,11 @@ rule add_intersecting_mirna_tag: rule sort_intersecting_mirna_by_feat_tag: input: - sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", + sam=INTERMEDIATES_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", output: - sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", + sam=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_sorted_tag.sam", params: cluster_log=CLUSTER_LOG / "sort_intersecting_mirna_by_feat_tag_{sample}.log", log: @@ -331,15 +339,17 @@ rule sort_intersecting_mirna_by_feat_tag: rule quantify_mirna: input: - alignments=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", + alignments=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_sorted_tag.sam", script=SCRIPTS_DIR / "mirna_quantification.py", output: - table=OUT_DIR / "TABLES" / "mirna_counts_{sample}", + table=INTERMEDIATES_DIR / "TABLES" / "mirna_counts_{sample}", params: cluster_log=CLUSTER_LOG / "quantify_mirna_{sample}.log", mir_list=config["mir_list"], library="{sample}", - out_dir=OUT_DIR / "TABLES", + out_dir=INTERMEDIATES_DIR / "TABLES", log: LOCAL_LOG / "quantify_mirna_{sample}.log", container: @@ -365,10 +375,10 @@ rule quantify_mirna: rule quantify_primir: input: - intersect=OUT_DIR / "{sample}" / "intersected_extended_primir.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_primir.bed", script=SCRIPTS_DIR / "primir_quantification.py", output: - table=OUT_DIR / "TABLES" / "pri-mir_counts_{sample}", + table=INTERMEDIATES_DIR / "TABLES" / "pri-mir_counts_{sample}", params: cluster_log=CLUSTER_LOG / "quantify_primir_{sample}.log", log: @@ -395,7 +405,7 @@ rule quantify_primir: rule merge_tables: input: table=expand( - OUT_DIR / "TABLES" / "{mir}_counts_{sample}", + INTERMEDIATES_DIR / "TABLES" / "{mir}_counts_{sample}", sample=pd.unique(samples_table.index.values), mir=[mir for mir in config["mir_list"] if mir != "isomir"], ), @@ -405,7 +415,7 @@ rule merge_tables: params: cluster_log=CLUSTER_LOG / "merge_tables_{mirna}.log", prefix="{mirna}_counts_", - input_dir=OUT_DIR / "TABLES", + input_dir=INTERMEDIATES_DIR / "TABLES", log: LOCAL_LOG / "merge_tables_{mirna}.log", container: @@ -432,7 +442,9 @@ rule uncollapse_reads: maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", script=SCRIPTS_DIR / "sam_uncollapse.pl", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_uncollapsed.sam", params: cluster_log=CLUSTER_LOG / "uncollapse_reads_{sample}.log", log: @@ -456,9 +468,13 @@ rule uncollapse_reads: rule convert_uncollpased_reads_sam_to_bam: input: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_uncollapsed.sam", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_uncollapsed.bam", params: cluster_log=CLUSTER_LOG / "convert_uncollapsed_reads_sam_to_bam_{sample}.log", log: @@ -478,7 +494,9 @@ rule convert_uncollpased_reads_sam_to_bam: rule sort_uncollpased_reads_bam_by_position: input: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_uncollapsed.bam", output: maps=OUT_DIR / "{sample}"