From 701ab06728187108a384b3c007c7fe45fe621e84 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sun, 19 Nov 2023 20:47:48 +0100 Subject: [PATCH 01/21] test: add --notemp to test script --- test/test_workflow_local_with_conda.sh | 3 ++- test/test_workflow_local_with_singularity.sh | 3 ++- test/test_workflow_slurm_with_conda.sh | 3 ++- test/test_workflow_slurm_with_singularity.sh | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test_workflow_local_with_conda.sh b/test/test_workflow_local_with_conda.sh index 6b179966..805a141f 100755 --- a/test/test_workflow_local_with_conda.sh +++ b/test/test_workflow_local_with_conda.sh @@ -25,7 +25,8 @@ snakemake \ --use-conda \ --printshellcmds \ --rerun-incomplete \ - --verbose + --verbose \ + --notemp # Snakemake report diff --git a/test/test_workflow_local_with_singularity.sh b/test/test_workflow_local_with_singularity.sh index 306caac2..ccacd006 100755 --- a/test/test_workflow_local_with_singularity.sh +++ b/test/test_workflow_local_with_singularity.sh @@ -26,7 +26,8 @@ snakemake \ --singularity-args "--bind ${PWD}/../" \ --printshellcmds \ --rerun-incomplete \ - --verbose + --verbose \ + --notemp # Snakemake report diff --git a/test/test_workflow_slurm_with_conda.sh b/test/test_workflow_slurm_with_conda.sh index 190ffdc8..d909963d 100755 --- a/test/test_workflow_slurm_with_conda.sh +++ b/test/test_workflow_slurm_with_conda.sh @@ -41,7 +41,8 @@ snakemake \ --use-conda \ --printshellcmds \ --rerun-incomplete \ - --verbose + --verbose \ + --notemp # Snakemake report snakemake \ diff --git a/test/test_workflow_slurm_with_singularity.sh b/test/test_workflow_slurm_with_singularity.sh index 4ea8a9b1..e68809b8 100755 --- a/test/test_workflow_slurm_with_singularity.sh +++ b/test/test_workflow_slurm_with_singularity.sh @@ -42,7 +42,8 @@ snakemake \ --singularity-args="--bind ${PWD}/../" \ --printshellcmds \ --rerun-incomplete \ - --verbose + --verbose \ + --notemp # Snakemake report snakemake \ From 
c2ba640a1827f5701a8a6b0b3ba4b50c042e16b6 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sun, 19 Nov 2023 20:49:26 +0100 Subject: [PATCH 02/21] refactor: set intermediate files as temp --- workflow/rules/map.smk | 56 ++++++++++++++++++------------------- workflow/rules/prepare.smk | 30 ++++++++++---------- workflow/rules/quantify.smk | 16 +++++------ 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index 672b9c56..60ba520a 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -89,7 +89,7 @@ rule start: format=convert_lib_format(get_sample("format")), ), output: - reads=OUT_DIR / "{sample}" / "{format}" / "reads.{format}", + reads=temp(OUT_DIR / "{sample}" / "{format}" / "reads.{format}"), params: cluster_log=CLUSTER_LOG / "uncompress_zipped_files_{sample}_{format}.log", @@ -110,7 +110,7 @@ rule fastq_quality_filter: input: reads=OUT_DIR / "{sample}" / "fastq" / "reads.fastq", output: - reads=OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", + reads=temp(OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq"), params: cluster_log=CLUSTER_LOG / "fastq_quality_filter_{sample}.log", p=config["p_value"], @@ -140,7 +140,7 @@ rule fastq_to_fasta: input: reads=OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", output: - reads=OUT_DIR / "{sample}" / "fastq" / "reads.fa", + reads=temp(OUT_DIR / "{sample}" / "fastq" / "reads.fa"), params: cluster_log=CLUSTER_LOG / "fastq_to_fasta_{sample}.log", log: @@ -165,7 +165,7 @@ rule format_fasta: / convert_lib_format(get_sample("format", wildcards.sample)) / "reads.fa", output: - reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", + reads=temp(OUT_DIR / "{sample}" / "reads_formatted.fasta"), params: cluster_log=CLUSTER_LOG / "format_fasta_{sample}.log", log: @@ -187,7 +187,7 @@ rule remove_adapters: input: reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", output: - reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", + 
reads=temp(OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta"), params: adapter=lambda wildcards: get_sample( "adapter", wildcards.sample @@ -226,7 +226,7 @@ rule collapse_identical_reads: input: reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", output: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", + reads=temp(OUT_DIR / "{sample}" / "reads_collapsed.fasta"), params: cluster_log=CLUSTER_LOG / "collapse_identical_reads_{sample}.log", log: @@ -250,7 +250,7 @@ rule map_genome_segemehl: genome=OUT_DIR / "genome_processed.fa", genome_index_segemehl=OUT_DIR / "segemehl_genome_index.idx", output: - gmap=OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam", + gmap=temp(OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam"), params: cluster_log=CLUSTER_LOG / "map_genome_segemehl_{sample}.log", log: @@ -286,7 +286,7 @@ rule map_transcriptome_segemehl: transcriptome_index_segemehl=OUT_DIR / "segemehl_transcriptome_index.idx", output: - tmap=OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", + tmap=temp(OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam"), params: cluster_log=CLUSTER_LOG / "map_transcriptome_segemehl_{sample}.log", log: @@ -320,7 +320,7 @@ rule filter_fasta_for_oligomap: reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", script=SCRIPTS_DIR / "validation_fasta.py", output: - reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + reads=temp(OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta"), params: cluster_log=CLUSTER_LOG / "filter_fasta_for_oligomap_{sample}.log", max_length_reads=config["max_length_reads"], @@ -348,8 +348,8 @@ rule map_genome_oligomap: reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", target=OUT_DIR / "genome_processed.fa", output: - gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", + gmap=temp(OUT_DIR / "{sample}" / "oligomap_genome_mappings.fasta"), + report=temp(OUT_DIR / "{sample}" 
/ "oligomap_genome_report.txt"), params: cluster_log=CLUSTER_LOG / "map_genome_oligomap_{sample}.log", log: @@ -382,7 +382,7 @@ rule sort_genome_oligomap: report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", + sort=temp(OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta"), params: cluster_log=CLUSTER_LOG / "sort_genome_oligomap_{sample}.log", log: @@ -410,7 +410,7 @@ rule convert_genome_to_sam_oligomap: sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: - gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", + gmap=temp(OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam"), params: cluster_log=CLUSTER_LOG / "oligomap_genome_to_sam_{sample}.log", nh=config["nh"], @@ -440,8 +440,8 @@ rule map_transcriptome_oligomap: reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", target=OUT_DIR / "transcriptome_trimmed_id.fa", output: - tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", + tmap=temp(OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta"), + report=temp(OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt"), params: cluster_log=CLUSTER_LOG / "map_transcriptome_oligomap_{sample}.log", log: @@ -475,7 +475,7 @@ rule sort_transcriptome_oligomap: report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", + sort=temp(OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta"), params: cluster_log=CLUSTER_LOG / "sort_transcriptome_oligomap_{sample}.log", log: @@ -502,7 +502,7 @@ rule convert_transcriptome_to_sam_oligomap: sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", script=SCRIPTS_DIR / 
"oligomap_output_to_sam_nh_filtered.py", output: - tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", + tmap=temp(OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam"), params: cluster_log=CLUSTER_LOG / "oligomap_transcriptome_to_sam_{sample}.log", nh=config["nh"], @@ -529,7 +529,7 @@ rule merge_genome_maps: gmap1=OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam", gmap2=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", output: - gmaps=OUT_DIR / "{sample}" / "genome_mappings.sam", + gmaps=temp(OUT_DIR / "{sample}" / "genome_mappings.sam"), params: cluster_log=CLUSTER_LOG / "merge_genome_maps_{sample}.log", log: @@ -550,7 +550,7 @@ rule merge_transcriptome_maps: tmap1=OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", tmap2=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", output: - tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings.sam", + tmaps=temp(OUT_DIR / "{sample}" / "transcriptome_mappings.sam"), params: cluster_log=CLUSTER_LOG / "merge_transcriptome_maps_{sample}.log", log: @@ -571,7 +571,7 @@ rule filter_genome_by_nh: gmaps=OUT_DIR / "{sample}" / "genome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - gmaps=OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", + gmaps=temp(OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam"), params: cluster_log=CLUSTER_LOG / "filter_genome_by_nh_{sample}.log", nh=config["nh"], @@ -599,7 +599,7 @@ rule filter_transcriptome_by_nh: tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", + tmaps=temp(OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam"), params: cluster_log=CLUSTER_LOG / "filter_transcriptome_by_nh_{sample}.log", nh=config["nh"], @@ -626,7 +626,7 @@ rule remove_header_genome_mappings: input: gmap=OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", output: - gmap=OUT_DIR / "{sample}" / 
"genome_mappings_no_header.sam", + gmap=temp(OUT_DIR / "{sample}" / "genome_mappings_no_header.sam"), params: cluster_log=CLUSTER_LOG / "remove_header_genome_mappings_{sample}.log", log: @@ -648,7 +648,7 @@ rule remove_header_transcriptome_mappings: input: tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", output: - tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", + tmap=temp(OUT_DIR / "{sample}" / "transcriptome_mappings_no_header.sam"), params: cluster_log=CLUSTER_LOG / "remove_header_transcriptome_mappings_{sample}.log", @@ -673,7 +673,7 @@ rule transcriptome_to_genome_maps: script=SCRIPTS_DIR / "sam_trx_to_sam_gen.pl", exons=OUT_DIR / "exons.bed", output: - genout=OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", + genout=temp(OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam"), params: cluster_log=CLUSTER_LOG / "transcriptome_to_genome_maps_{sample}.log", log: @@ -700,7 +700,7 @@ rule merge_all_maps: gmap1=OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", gmap2=OUT_DIR / "{sample}" / "genome_mappings_no_header.sam", output: - catmaps=OUT_DIR / "{sample}" / "mappings_all_no_header.sam", + catmaps=temp(OUT_DIR / "{sample}" / "mappings_all_no_header.sam"), params: cluster_log=CLUSTER_LOG / "merge_all_mappings_{sample}.log", log: @@ -721,7 +721,7 @@ rule add_header_all_maps: header=OUT_DIR / "genome_header.sam", catmaps=OUT_DIR / "{sample}" / "mappings_all_no_header.sam", output: - concatenate=OUT_DIR / "{sample}" / "mappings_all.sam", + concatenate=temp(OUT_DIR / "{sample}" / "mappings_all.sam"), params: cluster_log=CLUSTER_LOG / "add_header_{sample}.log", log: @@ -741,7 +741,7 @@ rule sort_maps_by_id: input: concatenate=OUT_DIR / "{sample}" / "mappings_all.sam", output: - sort=OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", + sort=temp(OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam"), params: cluster_log=CLUSTER_LOG / "sort_maps_by_id_{sample}.log", log: @@ -765,7 +765,7 @@ 
rule remove_inferiors: script=SCRIPTS_DIR / "sam_remove_duplicates_inferior_alignments_multimappers.pl", output: - remove_inf=OUT_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", + remove_inf=temp(OUT_DIR / "{sample}" / "mappings_all_removed_inferiors.sam"), params: cluster_log=CLUSTER_LOG / "remove_inferiors_{sample}.log", log: diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk index 0f98a4b0..e3e44cc2 100644 --- a/workflow/rules/prepare.smk +++ b/workflow/rules/prepare.smk @@ -74,7 +74,7 @@ rule trim_genome_seq_ids: genome=config["genome_file"], script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - genome=OUT_DIR / "genome_processed.fa", + genome=temp(OUT_DIR / "genome_processed.fa"), params: cluster_log=CLUSTER_LOG / "genome_process.log", log: @@ -95,7 +95,7 @@ rule extract_transcriptome_seqs: genome=OUT_DIR / "genome_processed.fa", gtf=config["gtf_file"], output: - fasta=OUT_DIR / "transcriptome.fa", + fasta=temp(OUT_DIR / "transcriptome.fa"), params: cluster_log=CLUSTER_LOG / "extract_transcriptome_seqs.log", log: @@ -118,7 +118,7 @@ rule trim_transcriptome_seq_ids: fasta=OUT_DIR / "transcriptome.fa", script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - fasta=OUT_DIR / "transcriptome_trimmed_id.fa", + fasta=temp(OUT_DIR / "transcriptome_trimmed_id.fa"), params: cluster_log=CLUSTER_LOG / "trim_transcriptome.log", log: @@ -138,7 +138,7 @@ rule generate_segemehl_index_transcriptome: input: fasta=OUT_DIR / "transcriptome_trimmed_id.fa", output: - idx=OUT_DIR / "segemehl_transcriptome_index.idx", + idx=temp(OUT_DIR / "segemehl_transcriptome_index.idx"), params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_transcriptome.log", log: @@ -164,7 +164,7 @@ rule generate_segemehl_index_genome: input: genome=OUT_DIR / "genome_processed.fa", output: - idx=OUT_DIR / "segemehl_genome_index.idx", + idx=temp(OUT_DIR / "segemehl_genome_index.idx"), params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_genome.log", log: @@ -191,7 +191,7 @@ rule 
get_exons_gtf: gtf=config["gtf_file"], script=SCRIPTS_DIR / "get_lines_w_pattern.sh", output: - exons=OUT_DIR / "exons.gtf", + exons=temp(OUT_DIR / "exons.gtf"), params: cluster_log=CLUSTER_LOG / "get_exons_gtf.log", log: @@ -218,7 +218,7 @@ rule convert_exons_gtf_to_bed: exons=OUT_DIR / "exons.gtf", script=SCRIPTS_DIR / "gtf_exons_bed.1.1.2.R", output: - exons=OUT_DIR / "exons.bed", + exons=temp(OUT_DIR / "exons.bed"), params: cluster_log=CLUSTER_LOG / "exons_gtf_to_bed.log", log: @@ -244,7 +244,7 @@ rule create_genome_header: input: genome=OUT_DIR / "genome_processed.fa", output: - header=OUT_DIR / "genome_header.sam", + header=temp(OUT_DIR / "genome_header.sam"), params: cluster_log=CLUSTER_LOG / "create_genome_header.log", log: @@ -268,7 +268,7 @@ rule map_chr_names: script=SCRIPTS_DIR / "map_chromosomes.pl", map_chr=config["map_chr_file"], output: - gff=OUT_DIR / "mirna_annotations.gff3", + gff=temp(OUT_DIR / "mirna_annotations.gff3"), params: cluster_log=CLUSTER_LOG / "map_chr_names.log", column="1", @@ -298,7 +298,7 @@ rule create_index_genome_fasta: input: genome=OUT_DIR / "genome_processed.fa", output: - genome=OUT_DIR / "genome_processed.fa.fai", + genome=temp(OUT_DIR / "genome_processed.fa.fai"), params: cluster_log=CLUSTER_LOG / "create_index_genome_fasta.log", log: @@ -320,7 +320,7 @@ rule extract_chr_len: input: genome=OUT_DIR / "genome_processed.fa.fai", output: - chrsize=OUT_DIR / "chr_size.txt", + chrsize=temp(OUT_DIR / "chr_size.txt"), params: cluster_log=CLUSTER_LOG / "extract_chr_len.log", log: @@ -342,14 +342,14 @@ rule extend_mirs_annotations: chrsize=OUT_DIR / "chr_size.txt", script=SCRIPTS_DIR / "mirna_extension.py", output: - extended_mir=expand( + extended_mir=temp(expand( OUT_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], - ), - extended_primir=expand( + )), + extended_primir=temp(expand( OUT_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], - ), + )), params: 
cluster_log=CLUSTER_LOG / "extend_mirs_annotations.log", out_dir=OUT_DIR, diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index 422635a4..6b8522e6 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -157,7 +157,7 @@ rule convert_intersecting_primir_sam_to_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam", + maps=temp(OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam"), params: cluster_log=CLUSTER_LOG / "convert_intersecting_primir_sam_to_bam_{sample}.log", @@ -180,7 +180,7 @@ rule sort_intersecting_primir_bam_by_position: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + maps=temp(OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam"), params: cluster_log=CLUSTER_LOG / "sort_intersecting_primir_bam_by_position_{sample}.log", @@ -203,9 +203,9 @@ rule index_intersecting_primir_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", output: - maps=OUT_DIR + maps=temp(OUT_DIR / "{sample}" - / "alignments_intersecting_primir_sorted.bam.bai", + / "alignments_intersecting_primir_sorted.bam.bai"), params: cluster_log=CLUSTER_LOG / "index_intersecting_primir_bam_{sample}.log", log: @@ -350,7 +350,7 @@ rule quantify_mirna: / "alignments_intersecting_mirna_sorted_tag.sam", script=SCRIPTS_DIR / "mirna_quantification.py", output: - table=OUT_DIR / "TABLES" / "mirna_counts_{sample}", + table=temp(OUT_DIR / "TABLES" / "mirna_counts_{sample}"), params: cluster_log=CLUSTER_LOG / "quantify_mirna_{sample}.log", mir_list=config["mir_list"], @@ -384,7 +384,7 @@ rule quantify_primir: intersect=OUT_DIR / "{sample}" / "intersected_extended_primir.bed", script=SCRIPTS_DIR / "primir_quantification.py", output: - table=OUT_DIR / "TABLES" / "pri-mir_counts_{sample}", + table=temp(OUT_DIR 
/ "TABLES" / "pri-mir_counts_{sample}"), params: cluster_log=CLUSTER_LOG / "quantify_primir_{sample}.log", log: @@ -478,9 +478,9 @@ rule convert_uncollpased_reads_sam_to_bam: / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", output: - maps=OUT_DIR + maps=temp(OUT_DIR / "{sample}" - / "alignments_intersecting_mirna_uncollapsed.bam", + / "alignments_intersecting_mirna_uncollapsed.bam"), params: cluster_log=CLUSTER_LOG / "convert_uncollapsed_reads_sam_to_bam_{sample}.log", From 0ccb2e001420cf3c1159a3ef3e0f3724106b4b01 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Wed, 29 Nov 2023 18:51:32 +0100 Subject: [PATCH 03/21] refactor: start tmp files --- workflow/rules/map.smk | 57 +++++++++++++++++++------------------ workflow/rules/prepare.smk | 31 ++++++++++---------- workflow/rules/quantify.smk | 17 +++++------ 3 files changed, 54 insertions(+), 51 deletions(-) diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index e64f7199..1f4ab53f 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -25,6 +25,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") OUT_DIR = Path(config["output_dir"]) +TMP_DIR = Path(config["tmp_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -87,7 +88,7 @@ rule start: format=convert_lib_format(get_sample("format")), ), output: - reads=temp(OUT_DIR / "{sample}" / "{format}" / "reads.{format}"), + reads=OUT_DIR / "{sample}" / "{format}" / "reads.{format}", params: cluster_log=CLUSTER_LOG / "uncompress_zipped_files_{sample}_{format}.log", log: @@ -107,7 +108,7 @@ rule fastq_quality_filter: input: reads=OUT_DIR / "{sample}" / "fastq" / "reads.fastq", output: - reads=temp(OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq"), + reads=OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", params: cluster_log=CLUSTER_LOG / "fastq_quality_filter_{sample}.log", p=config["p_value"], @@ -137,7 +138,7 @@ rule 
fastq_to_fasta: input: reads=OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", output: - reads=temp(OUT_DIR / "{sample}" / "fastq" / "reads.fa"), + reads=OUT_DIR / "{sample}" / "fastq" / "reads.fa", params: cluster_log=CLUSTER_LOG / "fastq_to_fasta_{sample}.log", log: @@ -162,7 +163,7 @@ rule format_fasta: / convert_lib_format(get_sample("format", wildcards.sample)) / "reads.fa", output: - reads=temp(OUT_DIR / "{sample}" / "reads_formatted.fasta"), + reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", params: cluster_log=CLUSTER_LOG / "format_fasta_{sample}.log", log: @@ -184,7 +185,7 @@ rule remove_adapters: input: reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", output: - reads=temp(OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta"), + reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", params: adapter=lambda wildcards: get_sample("adapter", wildcards.sample).upper(), error_rate=config["error_rate"], @@ -221,7 +222,7 @@ rule collapse_identical_reads: input: reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", output: - reads=temp(OUT_DIR / "{sample}" / "reads_collapsed.fasta"), + reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", params: cluster_log=CLUSTER_LOG / "collapse_identical_reads_{sample}.log", log: @@ -245,7 +246,7 @@ rule map_genome_segemehl: genome=OUT_DIR / "genome_processed.fa", genome_index_segemehl=OUT_DIR / "segemehl_genome_index.idx", output: - gmap=temp(OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam"), + gmap=OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "map_genome_segemehl_{sample}.log", log: @@ -280,7 +281,7 @@ rule map_transcriptome_segemehl: transcriptome=OUT_DIR / "transcriptome_trimmed_id.fa", transcriptome_index_segemehl=OUT_DIR / "segemehl_transcriptome_index.idx", output: - tmap=temp(OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam"), + tmap=OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / 
"map_transcriptome_segemehl_{sample}.log", log: @@ -314,7 +315,7 @@ rule filter_fasta_for_oligomap: reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", script=SCRIPTS_DIR / "validation_fasta.py", output: - reads=temp(OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta"), + reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", params: cluster_log=CLUSTER_LOG / "filter_fasta_for_oligomap_{sample}.log", max_length_reads=config["max_length_reads"], @@ -342,8 +343,8 @@ rule map_genome_oligomap: reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", target=OUT_DIR / "genome_processed.fa", output: - gmap=temp(OUT_DIR / "{sample}" / "oligomap_genome_mappings.fasta"), - report=temp(OUT_DIR / "{sample}" / "oligomap_genome_report.txt"), + gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.fasta", + report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", params: cluster_log=CLUSTER_LOG / "map_genome_oligomap_{sample}.log", log: @@ -376,7 +377,7 @@ rule sort_genome_oligomap: report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=temp(OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta"), + sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", params: cluster_log=CLUSTER_LOG / "sort_genome_oligomap_{sample}.log", log: @@ -404,7 +405,7 @@ rule convert_genome_to_sam_oligomap: sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: - gmap=temp(OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam"), + gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_genome_to_sam_{sample}.log", nh=config["nh"], @@ -434,8 +435,8 @@ rule map_transcriptome_oligomap: reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", target=OUT_DIR / "transcriptome_trimmed_id.fa", output: - tmap=temp(OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta"), - 
report=temp(OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt"), + tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", + report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", params: cluster_log=CLUSTER_LOG / "map_transcriptome_oligomap_{sample}.log", log: @@ -469,7 +470,7 @@ rule sort_transcriptome_oligomap: report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=temp(OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta"), + sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", params: cluster_log=CLUSTER_LOG / "sort_transcriptome_oligomap_{sample}.log", log: @@ -496,7 +497,7 @@ rule convert_transcriptome_to_sam_oligomap: sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: - tmap=temp(OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam"), + tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_transcriptome_to_sam_{sample}.log", nh=config["nh"], @@ -523,7 +524,7 @@ rule merge_genome_maps: gmap1=OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam", gmap2=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", output: - gmaps=temp(OUT_DIR / "{sample}" / "genome_mappings.sam"), + gmaps=OUT_DIR / "{sample}" / "genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "merge_genome_maps_{sample}.log", log: @@ -544,7 +545,7 @@ rule merge_transcriptome_maps: tmap1=OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", tmap2=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", output: - tmaps=temp(OUT_DIR / "{sample}" / "transcriptome_mappings.sam"), + tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "merge_transcriptome_maps_{sample}.log", log: @@ -565,7 +566,7 @@ rule filter_genome_by_nh: gmaps=OUT_DIR / "{sample}" / "genome_mappings.sam", 
script=SCRIPTS_DIR / "nh_filter.py", output: - gmaps=temp(OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam"), + gmaps=OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", params: cluster_log=CLUSTER_LOG / "filter_genome_by_nh_{sample}.log", nh=config["nh"], @@ -593,7 +594,7 @@ rule filter_transcriptome_by_nh: tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - tmaps=temp(OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam"), + tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", params: cluster_log=CLUSTER_LOG / "filter_transcriptome_by_nh_{sample}.log", nh=config["nh"], @@ -620,7 +621,7 @@ rule remove_header_genome_mappings: input: gmap=OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", output: - gmap=temp(OUT_DIR / "{sample}" / "genome_mappings_no_header.sam"), + gmap=OUT_DIR / "{sample}" / "genome_mappings_no_header.sam", params: cluster_log=CLUSTER_LOG / "remove_header_genome_mappings_{sample}.log", log: @@ -642,7 +643,7 @@ rule remove_header_transcriptome_mappings: input: tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", output: - tmap=temp(OUT_DIR / "{sample}" / "transcriptome_mappings_no_header.sam"), + tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", params: cluster_log=CLUSTER_LOG / "remove_header_transcriptome_mappings_{sample}.log", log: @@ -666,7 +667,7 @@ rule transcriptome_to_genome_maps: script=SCRIPTS_DIR / "sam_trx_to_sam_gen.pl", exons=OUT_DIR / "exons.bed", output: - genout=temp(OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam"), + genout=OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", params: cluster_log=CLUSTER_LOG / "transcriptome_to_genome_maps_{sample}.log", log: @@ -693,7 +694,7 @@ rule merge_all_maps: gmap1=OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", gmap2=OUT_DIR / "{sample}" / "genome_mappings_no_header.sam", output: - catmaps=temp(OUT_DIR / "{sample}" / 
"mappings_all_no_header.sam"), + catmaps=OUT_DIR / "{sample}" / "mappings_all_no_header.sam", params: cluster_log=CLUSTER_LOG / "merge_all_mappings_{sample}.log", log: @@ -714,7 +715,7 @@ rule add_header_all_maps: header=OUT_DIR / "genome_header.sam", catmaps=OUT_DIR / "{sample}" / "mappings_all_no_header.sam", output: - concatenate=temp(OUT_DIR / "{sample}" / "mappings_all.sam"), + concatenate=OUT_DIR / "{sample}" / "mappings_all.sam", params: cluster_log=CLUSTER_LOG / "add_header_{sample}.log", log: @@ -734,7 +735,7 @@ rule sort_maps_by_id: input: concatenate=OUT_DIR / "{sample}" / "mappings_all.sam", output: - sort=temp(OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam"), + sort=OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", params: cluster_log=CLUSTER_LOG / "sort_maps_by_id_{sample}.log", log: @@ -757,7 +758,7 @@ rule remove_inferiors: sort=OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", script=SCRIPTS_DIR / "sam_remove_duplicates_inferior_alignments_multimappers.pl", output: - remove_inf=temp(OUT_DIR / "{sample}" / "mappings_all_removed_inferiors.sam"), + remove_inf=OUT_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", params: cluster_log=CLUSTER_LOG / "remove_inferiors_{sample}.log", log: diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk index e3e44cc2..1202b5cc 100644 --- a/workflow/rules/prepare.smk +++ b/workflow/rules/prepare.smk @@ -27,6 +27,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") OUT_DIR = Path(config["output_dir"]) +TMP_DIR = Path(config["tmp_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -74,7 +75,7 @@ rule trim_genome_seq_ids: genome=config["genome_file"], script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - genome=temp(OUT_DIR / "genome_processed.fa"), + genome=OUT_DIR / "genome_processed.fa", params: cluster_log=CLUSTER_LOG / "genome_process.log", log: @@ -95,7 +96,7 @@ rule 
extract_transcriptome_seqs: genome=OUT_DIR / "genome_processed.fa", gtf=config["gtf_file"], output: - fasta=temp(OUT_DIR / "transcriptome.fa"), + fasta=OUT_DIR / "transcriptome.fa", params: cluster_log=CLUSTER_LOG / "extract_transcriptome_seqs.log", log: @@ -118,7 +119,7 @@ rule trim_transcriptome_seq_ids: fasta=OUT_DIR / "transcriptome.fa", script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - fasta=temp(OUT_DIR / "transcriptome_trimmed_id.fa"), + fasta=OUT_DIR / "transcriptome_trimmed_id.fa", params: cluster_log=CLUSTER_LOG / "trim_transcriptome.log", log: @@ -138,7 +139,7 @@ rule generate_segemehl_index_transcriptome: input: fasta=OUT_DIR / "transcriptome_trimmed_id.fa", output: - idx=temp(OUT_DIR / "segemehl_transcriptome_index.idx"), + idx=OUT_DIR / "segemehl_transcriptome_index.idx", params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_transcriptome.log", log: @@ -164,7 +165,7 @@ rule generate_segemehl_index_genome: input: genome=OUT_DIR / "genome_processed.fa", output: - idx=temp(OUT_DIR / "segemehl_genome_index.idx"), + idx=OUT_DIR / "segemehl_genome_index.idx", params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_genome.log", log: @@ -191,7 +192,7 @@ rule get_exons_gtf: gtf=config["gtf_file"], script=SCRIPTS_DIR / "get_lines_w_pattern.sh", output: - exons=temp(OUT_DIR / "exons.gtf"), + exons=OUT_DIR / "exons.gtf", params: cluster_log=CLUSTER_LOG / "get_exons_gtf.log", log: @@ -218,7 +219,7 @@ rule convert_exons_gtf_to_bed: exons=OUT_DIR / "exons.gtf", script=SCRIPTS_DIR / "gtf_exons_bed.1.1.2.R", output: - exons=temp(OUT_DIR / "exons.bed"), + exons=OUT_DIR / "exons.bed", params: cluster_log=CLUSTER_LOG / "exons_gtf_to_bed.log", log: @@ -244,7 +245,7 @@ rule create_genome_header: input: genome=OUT_DIR / "genome_processed.fa", output: - header=temp(OUT_DIR / "genome_header.sam"), + header=OUT_DIR / "genome_header.sam", params: cluster_log=CLUSTER_LOG / "create_genome_header.log", log: @@ -268,7 +269,7 @@ rule map_chr_names: script=SCRIPTS_DIR / 
"map_chromosomes.pl", map_chr=config["map_chr_file"], output: - gff=temp(OUT_DIR / "mirna_annotations.gff3"), + gff=OUT_DIR / "mirna_annotations.gff3", params: cluster_log=CLUSTER_LOG / "map_chr_names.log", column="1", @@ -298,7 +299,7 @@ rule create_index_genome_fasta: input: genome=OUT_DIR / "genome_processed.fa", output: - genome=temp(OUT_DIR / "genome_processed.fa.fai"), + genome=OUT_DIR / "genome_processed.fa.fai", params: cluster_log=CLUSTER_LOG / "create_index_genome_fasta.log", log: @@ -320,7 +321,7 @@ rule extract_chr_len: input: genome=OUT_DIR / "genome_processed.fa.fai", output: - chrsize=temp(OUT_DIR / "chr_size.txt"), + chrsize=OUT_DIR / "chr_size.txt", params: cluster_log=CLUSTER_LOG / "extract_chr_len.log", log: @@ -342,14 +343,14 @@ rule extend_mirs_annotations: chrsize=OUT_DIR / "chr_size.txt", script=SCRIPTS_DIR / "mirna_extension.py", output: - extended_mir=temp(expand( + extended_mir=expand( OUT_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], - )), - extended_primir=temp(expand( + ), + extended_primir=expand( OUT_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], - )), + ), params: cluster_log=CLUSTER_LOG / "extend_mirs_annotations.log", out_dir=OUT_DIR, diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index 9d546d89..c30b048c 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -25,6 +25,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") OUT_DIR = Path(config["output_dir"]) +TMP_DIR = Path(config["tmp_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -152,7 +153,7 @@ rule convert_intersecting_primir_sam_to_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", output: - maps=temp(OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam"), + maps=OUT_DIR / "{sample}" / 
"alignments_intersecting_primir.bam", params: cluster_log=CLUSTER_LOG / "convert_intersecting_primir_sam_to_bam_{sample}.log", log: @@ -174,7 +175,7 @@ rule sort_intersecting_primir_bam_by_position: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam", output: - maps=temp(OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam"), + maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", params: cluster_log=CLUSTER_LOG / "sort_intersecting_primir_bam_by_position_{sample}.log", @@ -197,9 +198,9 @@ rule index_intersecting_primir_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", output: - maps=temp(OUT_DIR + maps=OUT_DIR / "{sample}" - / "alignments_intersecting_primir_sorted.bam.bai"), + / "alignments_intersecting_primir_sorted.bam.bai", params: cluster_log=CLUSTER_LOG / "index_intersecting_primir_bam_{sample}.log", log: @@ -336,7 +337,7 @@ rule quantify_mirna: alignments=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", script=SCRIPTS_DIR / "mirna_quantification.py", output: - table=temp(OUT_DIR / "TABLES" / "mirna_counts_{sample}"), + table=OUT_DIR / "TABLES" / "mirna_counts_{sample}", params: cluster_log=CLUSTER_LOG / "quantify_mirna_{sample}.log", mir_list=config["mir_list"], @@ -370,7 +371,7 @@ rule quantify_primir: intersect=OUT_DIR / "{sample}" / "intersected_extended_primir.bed", script=SCRIPTS_DIR / "primir_quantification.py", output: - table=temp(OUT_DIR / "TABLES" / "pri-mir_counts_{sample}"), + table=OUT_DIR / "TABLES" / "pri-mir_counts_{sample}", params: cluster_log=CLUSTER_LOG / "quantify_primir_{sample}.log", log: @@ -460,9 +461,9 @@ rule convert_uncollpased_reads_sam_to_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", output: - maps=temp(OUT_DIR + maps=OUT_DIR / "{sample}" - / "alignments_intersecting_mirna_uncollapsed.bam"), + / "alignments_intersecting_mirna_uncollapsed.bam", params: cluster_log=CLUSTER_LOG / 
"convert_uncollapsed_reads_sam_to_bam_{sample}.log", log: From 2f2fb15672fc6e0586f445b4f837808f134b549a Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Wed, 29 Nov 2023 18:52:19 +0100 Subject: [PATCH 04/21] refactor: start tmp files --- test/test_workflow_local_with_conda.sh | 4 +--- test/test_workflow_local_with_singularity.sh | 5 ++--- test/test_workflow_slurm_with_conda.sh | 4 ++-- test/test_workflow_slurm_with_singularity.sh | 4 ++-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/test/test_workflow_local_with_conda.sh b/test/test_workflow_local_with_conda.sh index 805a141f..f92c4249 100755 --- a/test/test_workflow_local_with_conda.sh +++ b/test/test_workflow_local_with_conda.sh @@ -25,9 +25,7 @@ snakemake \ --use-conda \ --printshellcmds \ --rerun-incomplete \ - --verbose \ - --notemp - + --verbose # Snakemake report snakemake \ diff --git a/test/test_workflow_local_with_singularity.sh b/test/test_workflow_local_with_singularity.sh index ccacd006..b8f3deaf 100755 --- a/test/test_workflow_local_with_singularity.sh +++ b/test/test_workflow_local_with_singularity.sh @@ -26,9 +26,8 @@ snakemake \ --singularity-args "--bind ${PWD}/../" \ --printshellcmds \ --rerun-incomplete \ - --verbose \ - --notemp - + --no-hooks \ + --verbose # Snakemake report snakemake \ diff --git a/test/test_workflow_slurm_with_conda.sh b/test/test_workflow_slurm_with_conda.sh index d909963d..2e6c7154 100755 --- a/test/test_workflow_slurm_with_conda.sh +++ b/test/test_workflow_slurm_with_conda.sh @@ -41,8 +41,8 @@ snakemake \ --use-conda \ --printshellcmds \ --rerun-incomplete \ - --verbose \ - --notemp + --no-hooks \ + --verbose # Snakemake report snakemake \ diff --git a/test/test_workflow_slurm_with_singularity.sh b/test/test_workflow_slurm_with_singularity.sh index e68809b8..88781fda 100755 --- a/test/test_workflow_slurm_with_singularity.sh +++ b/test/test_workflow_slurm_with_singularity.sh @@ -42,8 +42,8 @@ snakemake \ --singularity-args="--bind ${PWD}/../" \ 
--printshellcmds \ --rerun-incomplete \ - --verbose \ - --notemp + --no-hooks \ + --verbose # Snakemake report snakemake \ From 0eadf76958f27c5610902f550bce3cac91175251 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Thu, 30 Nov 2023 14:28:22 +0100 Subject: [PATCH 05/21] ci: update paths for expected output --- test/expected_output.md5 | 110 +++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 0f328284..8ccb8fa7 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -1,58 +1,58 @@ 68f943f89b52d628851dd97fb1399d68 results/TABLES/all_mirna_counts.tab -eec9be6cda61d2728290c92c1209f455 results/TABLES/mirna_counts_test_lib 363ecee318c57ee7e2e45ca468007baa results/TABLES/all_pri-mir_counts.tab -a844e3a29159e36e2f17a0646d1e8c5f results/TABLES/pri-mir_counts_test_lib 0d76977b2e36046cc176112776c5fa4e results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam.bai -36f7d024fe6ddfd3e788aebf61c61061 results/test_lib/oligomap_genome_sorted.fasta -48e605df55bf2dd37ea5a5a74eb5872a results/test_lib/mappings_all.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_mappings.fasta -eea903fc0ab81054cf8e34193f80f4a7 results/test_lib/mappings_all_removed_inferiors.sam -98498ac521f451426a9dbabcbecb5f25 results/test_lib/alignments_intersecting_primir.bam -defdc8c46e1d73692edde0e0278f2d5e results/test_lib/oligomap_genome_mappings.fasta -1649738f226e8979d4d88a3ae47fa423 results/test_lib/segemehl_transcriptome_mappings.sam -9ecee9ab80daba0a53076b05c9f6ff53 results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam -1649738f226e8979d4d88a3ae47fa423 results/test_lib/transcriptome_mappings_filtered_nh.sam -8e22ddfa7c39ce7e4ec5945dff1576ef results/test_lib/alignments_all.bam -a124a5afdb5f7bfbcc5683260556c9c4 results/test_lib/mappings_all_no_header.sam -dd00dea3549dc1ad14f9e1505d397de5 results/test_lib/alignments_all.sam 
-8c24d619073f4c5ca1f439fe429d0ef4 results/test_lib/alignments_intersecting_mirna_tag.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_sorted.fasta -c218718d93f48e5987fc18b33dc488f0 results/test_lib/segemehl_genome_mappings.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/transcriptome_mappings_to_genome.sam -63a32839360a985b68e0685aafad5c54 results/test_lib/fa/reads.fa -5cc557ec2073144f47fe28ac145f4869 results/test_lib/alignments_intersecting_mirna_uncollapsed.sam -edcb854702519c0002d8ce89a21e54ef results/test_lib/reads_formatted.fasta -1a547487b8e92ad85bb26ff9b1db1f93 results/test_lib/intersected_extended_mirna.bed -721071f3ead528aa71978508db8d73f9 results/test_lib/alignments_all_sorted_test_lib.bam -ec0e9bcc8ea857da897035c8fca4078f results/test_lib/reads_trimmed_adapters.fasta -bbfc27c84b66ff41bfeee73f701b4b29 results/test_lib/alignments_intersecting_mirna_uncollapsed.bam -81bed7fc879f7a16c12d2ba912263c46 results/test_lib/alignments_intersecting_mirna.sam -dd560414078330bf3138f039da109093 results/test_lib/genome_mappings.sam -f5cb65466d328036a15b66cfbd4d8419 results/test_lib/oligomap_genome_report.txt -6cbdb9299e09b3e39b79a50db69226b5 results/test_lib/transcriptome_mappings_no_header.sam -1649738f226e8979d4d88a3ae47fa423 results/test_lib/transcriptome_mappings.sam -947607be69c16246f8dc9adbd9b971c8 results/test_lib/oligomap_genome_mappings.sam -9833208a79143eaf3f2a5fdeca0b2d94 results/test_lib/alignments_intersecting_mirna_sorted_tag.sam -02096523b293082629d5b895085468a3 results/test_lib/alignments_intersecting_primir_sorted.bam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_mappings.sam -a124a5afdb5f7bfbcc5683260556c9c4 results/test_lib/genome_mappings_no_header.sam -dd560414078330bf3138f039da109093 results/test_lib/genome_mappings_filtered_nh.sam -ae4c4963ca2cd206952b2ea2c58301dd results/test_lib/mappings_all_sorted_by_id.sam -2c77ffa021dda190d82f3f54a3312393 results/test_lib/reads_collapsed.fasta 
-f68693cfaa1e6ea78e1a5562ade6d9ed results/test_lib/intersected_extended_primir.bed -61f12595db9421926073d6675f7c3c42 results/test_lib/alignments_intersecting_primir.sam -c2a5770a755ada66ef63d96eec4afb00 results/test_lib/reads_filtered_for_oligomap.fasta -fe5388094985e9604a302d39d2abc82c results/test_lib/oligomap_transcriptome_report.txt -be7a0d92e57480190de57eb30baffa36 results/extended_mirna_annotation_6_nt.gff3 -8148cd880602255be166beb59bbed95a results/genome_header.sam -09e24a504bfec37fee3d5ff1b5c7738e results/exons.bed -4fb453846e88593d0cac13220ec2d685 results/segemehl_genome_index.idx -d34fc868b861b1bc46db07a397dc0f10 results/genome_processed.fa.fai -21e102e4ebd3508bb06f46366a3d578d results/exons.gtf -003b92b245ac336e3d70a513033e1cee results/transcriptome_trimmed_id.fa -44dbf7c3eae00d0bc8d5e1319123746c results/chr_size.txt -cc5c3512dab0e269d82bd625de74198e results/extended_primir_annotation_6_nt.gff3 -f28cc0143ab6659bef3de3a7afa1dccc results/mirna_annotations.gff3 -2d437f8681f4248d4f2075f86debb920 results/transcriptome.fa -7eb64c112830266bcf416ded60b4cf77 results/segemehl_transcriptome_index.idx -4fba145540a2c61f29bfddfd0f5a4d4e results/genome_processed.fa +ddb9272db1b04e67aaa65a8dbcee69b6 results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam +d025851bf31e88874bcc972b36c3f9ee results/test_lib/alignments_intersecting_mirna_uncollapsed.sam +80a664262886fe0b1b7dad80b6cc0b39 results/test_lib/alignments_intersecting_mirna.sam +ae3f3374170ce0ae90087b5672163ba3 results/test_lib/alignments_intersecting_primir.sam +eec9be6cda61d2728290c92c1209f455 results/tmp/TABLES/mirna_counts_test_lib +a844e3a29159e36e2f17a0646d1e8c5f results/tmp/TABLES/pri-mir_counts_test_lib +36f7d024fe6ddfd3e788aebf61c61061 results/tmp/test_lib/oligomap_genome_sorted.fasta +48e605df55bf2dd37ea5a5a74eb5872a results/tmp/test_lib/mappings_all.sam +d41d8cd98f00b204e9800998ecf8427e results/tmp/test_lib/oligomap_transcriptome_mappings.fasta +4b8a81b633b84921ae1b3fa9a15b0a14 
results/tmp/test_lib/mappings_all_removed_inferiors.sam +78124c5993a98d67a47538865d624ef5 results/tmp/test_lib/alignments_intersecting_primir.bam +defdc8c46e1d73692edde0e0278f2d5e results/tmp/test_lib/oligomap_genome_mappings.fasta +48c6346d7326e8718dd06c9b642a2f97 results/tmp/test_lib/segemehl_transcriptome_mappings.sam +48c6346d7326e8718dd06c9b642a2f97 results/tmp/test_lib/transcriptome_mappings_filtered_nh.sam +9e21710edb045a1d7ba653e21b40a8b1 results/tmp/test_lib/alignments_all.bam +a124a5afdb5f7bfbcc5683260556c9c4 results/tmp/test_lib/mappings_all_no_header.sam +ac4cf96f8e35bf1ace8750e72aa27a95 results/tmp/test_lib/alignments_all.sam +552b836f23069c5b569cba621df1e0b0 results/tmp/test_lib/alignments_intersecting_mirna_tag.sam +d41d8cd98f00b204e9800998ecf8427e results/tmp/test_lib/oligomap_transcriptome_sorted.fasta +52f42f222c4a3d89f852a5a31ce685ea results/tmp/test_lib/segemehl_genome_mappings.sam +d41d8cd98f00b204e9800998ecf8427e results/tmp/test_lib/transcriptome_mappings_to_genome.sam +63a32839360a985b68e0685aafad5c54 results/tmp/test_lib/fa/reads.fa +edcb854702519c0002d8ce89a21e54ef results/tmp/test_lib/reads_formatted.fasta +1a547487b8e92ad85bb26ff9b1db1f93 results/tmp/test_lib/intersected_extended_mirna.bed +cc2fbcd9435b8d8e5affc7280d4a59f8 results/tmp/test_lib/alignments_all_sorted_test_lib.bam +ec0e9bcc8ea857da897035c8fca4078f results/tmp/test_lib/reads_trimmed_adapters.fasta +21b9623034d519b6ef4c7bb02b631b27 results/tmp/test_lib/alignments_intersecting_mirna_uncollapsed.bam +4c9b71346d76e90b37a9a3d4e5457a49 results/tmp/test_lib/genome_mappings.sam +f5cb65466d328036a15b66cfbd4d8419 results/tmp/test_lib/oligomap_genome_report.txt +6cbdb9299e09b3e39b79a50db69226b5 results/tmp/test_lib/transcriptome_mappings_no_header.sam +48c6346d7326e8718dd06c9b642a2f97 results/tmp/test_lib/transcriptome_mappings.sam +947607be69c16246f8dc9adbd9b971c8 results/tmp/test_lib/oligomap_genome_mappings.sam +4ac38f4b52af65eae9bb0607863ce3f9 
results/tmp/test_lib/alignments_intersecting_mirna_sorted_tag.sam +db62a036defe14394eec1c3e664e2960 results/tmp/test_lib/alignments_intersecting_primir_sorted.bam +d41d8cd98f00b204e9800998ecf8427e results/tmp/test_lib/oligomap_transcriptome_mappings.sam +a124a5afdb5f7bfbcc5683260556c9c4 results/tmp/test_lib/genome_mappings_no_header.sam +4c9b71346d76e90b37a9a3d4e5457a49 results/tmp/test_lib/genome_mappings_filtered_nh.sam +27bcc29a265451fa46ae75ea3683f87e results/tmp/test_lib/mappings_all_sorted_by_id.sam +2c77ffa021dda190d82f3f54a3312393 results/tmp/test_lib/reads_collapsed.fasta +f68693cfaa1e6ea78e1a5562ade6d9ed results/tmp/test_lib/intersected_extended_primir.bed +c2a5770a755ada66ef63d96eec4afb00 results/tmp/test_lib/reads_filtered_for_oligomap.fasta +fe5388094985e9604a302d39d2abc82c results/tmp/test_lib/oligomap_transcriptome_report.txt +be7a0d92e57480190de57eb30baffa36 results/tmp/extended_mirna_annotation_6_nt.gff3 +8148cd880602255be166beb59bbed95a results/tmp/genome_header.sam +09e24a504bfec37fee3d5ff1b5c7738e results/tmp/exons.bed +4fb453846e88593d0cac13220ec2d685 results/tmp/segemehl_genome_index.idx +d34fc868b861b1bc46db07a397dc0f10 results/tmp/genome_processed.fa.fai +21e102e4ebd3508bb06f46366a3d578d results/tmp/exons.gtf +003b92b245ac336e3d70a513033e1cee results/tmp/transcriptome_trimmed_id.fa +44dbf7c3eae00d0bc8d5e1319123746c results/tmp/chr_size.txt +cc5c3512dab0e269d82bd625de74198e results/tmp/extended_primir_annotation_6_nt.gff3 +f28cc0143ab6659bef3de3a7afa1dccc results/tmp/mirna_annotations.gff3 +2d437f8681f4248d4f2075f86debb920 results/tmp/transcriptome.fa +7eb64c112830266bcf416ded60b4cf77 results/tmp/segemehl_transcriptome_index.idx +4fba145540a2c61f29bfddfd0f5a4d4e results/tmp/genome_processed.fa From 06fcfa3ef1a39f55bb485b6b57ccc2e98bc2d2d0 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Thu, 30 Nov 2023 14:29:44 +0100 Subject: [PATCH 06/21] refactor: change intermediate files to tmp dir --- workflow/Snakefile | 25 +++++- workflow/rules/map.smk 
| 151 ++++++++++++++++++------------------ workflow/rules/prepare.smk | 63 ++++++++------- workflow/rules/quantify.smk | 61 ++++++++------- 4 files changed, 158 insertions(+), 142 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 4dcd95c3..aa852cd7 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -31,6 +31,23 @@ validate(config, Path("../config/config_schema.json")) OUT_DIR = Path(config["output_dir"]) +TMP_DIR = Path(config["tmp_dir"]) +LOG_DIR = Path(f"{config['local_log']}/../") + + +############################################################################### +### onSuccess/onError handlers configuration +############################################################################### + + +onsuccess: + print("\nWORKFLOW SUCCEED. Removing intermediate files.\n") + shell("rm -rf {TMP_DIR}") + + +onerror: + print("\nWORKFLOW FAILED. Check the log file in the LOGS/ directory.\n") + shell("cat {log} > {LOG_DIR}/failed_workflow.log") ############################################################################### @@ -67,14 +84,14 @@ rule finish: OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", sample=pd.unique(samples_table.index.values), ), - intersect_sam=expand( - OUT_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", - sample=pd.unique(samples_table.index.values), - ), table=expand( OUT_DIR / "TABLES" / "all_{mir}_counts.tab", mir=[mir for mir in config["mir_list"] if mir != "isomir"], ), + uncollapsed_sam=expand( + OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", + sample=pd.unique(samples_table.index.values), + ), uncollapsed_bam=expand( OUT_DIR / "{sample}" diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index 1f4ab53f..cdf515eb 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -24,7 +24,6 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") -OUT_DIR = Path(config["output_dir"]) TMP_DIR = 
Path(config["tmp_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) @@ -71,7 +70,7 @@ localrules: rule finish_map: input: maps=expand( - OUT_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam.bai", + TMP_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam.bai", sample=pd.unique(samples_table.index.values), ), @@ -88,7 +87,7 @@ rule start: format=convert_lib_format(get_sample("format")), ), output: - reads=OUT_DIR / "{sample}" / "{format}" / "reads.{format}", + reads=TMP_DIR / "{sample}" / "{format}" / "reads.{format}", params: cluster_log=CLUSTER_LOG / "uncompress_zipped_files_{sample}_{format}.log", log: @@ -106,9 +105,9 @@ rule start: rule fastq_quality_filter: input: - reads=OUT_DIR / "{sample}" / "fastq" / "reads.fastq", + reads=TMP_DIR / "{sample}" / "fastq" / "reads.fastq", output: - reads=OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", + reads=TMP_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", params: cluster_log=CLUSTER_LOG / "fastq_quality_filter_{sample}.log", p=config["p_value"], @@ -136,9 +135,9 @@ rule fastq_quality_filter: rule fastq_to_fasta: input: - reads=OUT_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", + reads=TMP_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", output: - reads=OUT_DIR / "{sample}" / "fastq" / "reads.fa", + reads=TMP_DIR / "{sample}" / "fastq" / "reads.fa", params: cluster_log=CLUSTER_LOG / "fastq_to_fasta_{sample}.log", log: @@ -158,12 +157,12 @@ rule fastq_to_fasta: rule format_fasta: input: - reads=lambda wildcards: OUT_DIR + reads=lambda wildcards: TMP_DIR / wildcards.sample / convert_lib_format(get_sample("format", wildcards.sample)) / "reads.fa", output: - reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", + reads=TMP_DIR / "{sample}" / "reads_formatted.fasta", params: cluster_log=CLUSTER_LOG / "format_fasta_{sample}.log", log: @@ -183,9 +182,9 @@ rule format_fasta: rule remove_adapters: input: - reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", + reads=TMP_DIR / "{sample}" / 
"reads_formatted.fasta", output: - reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", + reads=TMP_DIR / "{sample}" / "reads_trimmed_adapters.fasta", params: adapter=lambda wildcards: get_sample("adapter", wildcards.sample).upper(), error_rate=config["error_rate"], @@ -220,9 +219,9 @@ rule remove_adapters: rule collapse_identical_reads: input: - reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", + reads=TMP_DIR / "{sample}" / "reads_trimmed_adapters.fasta", output: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", + reads=TMP_DIR / "{sample}" / "reads_collapsed.fasta", params: cluster_log=CLUSTER_LOG / "collapse_identical_reads_{sample}.log", log: @@ -242,11 +241,11 @@ rule collapse_identical_reads: rule map_genome_segemehl: input: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", - genome=OUT_DIR / "genome_processed.fa", - genome_index_segemehl=OUT_DIR / "segemehl_genome_index.idx", + reads=TMP_DIR / "{sample}" / "reads_collapsed.fasta", + genome=TMP_DIR / "genome_processed.fa", + genome_index_segemehl=TMP_DIR / "segemehl_genome_index.idx", output: - gmap=OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam", + gmap=TMP_DIR / "{sample}" / "segemehl_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "map_genome_segemehl_{sample}.log", log: @@ -277,11 +276,11 @@ rule map_genome_segemehl: rule map_transcriptome_segemehl: input: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", - transcriptome=OUT_DIR / "transcriptome_trimmed_id.fa", - transcriptome_index_segemehl=OUT_DIR / "segemehl_transcriptome_index.idx", + reads=TMP_DIR / "{sample}" / "reads_collapsed.fasta", + transcriptome=TMP_DIR / "transcriptome_trimmed_id.fa", + transcriptome_index_segemehl=TMP_DIR / "segemehl_transcriptome_index.idx", output: - tmap=OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", + tmap=TMP_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "map_transcriptome_segemehl_{sample}.log", log: @@ 
-312,10 +311,10 @@ rule map_transcriptome_segemehl: rule filter_fasta_for_oligomap: input: - reads=OUT_DIR / "{sample}" / "reads_collapsed.fasta", + reads=TMP_DIR / "{sample}" / "reads_collapsed.fasta", script=SCRIPTS_DIR / "validation_fasta.py", output: - reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + reads=TMP_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", params: cluster_log=CLUSTER_LOG / "filter_fasta_for_oligomap_{sample}.log", max_length_reads=config["max_length_reads"], @@ -340,11 +339,11 @@ rule filter_fasta_for_oligomap: rule map_genome_oligomap: input: - reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", - target=OUT_DIR / "genome_processed.fa", + reads=TMP_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + target=TMP_DIR / "genome_processed.fa", output: - gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", + gmap=TMP_DIR / "{sample}" / "oligomap_genome_mappings.fasta", + report=TMP_DIR / "{sample}" / "oligomap_genome_report.txt", params: cluster_log=CLUSTER_LOG / "map_genome_oligomap_{sample}.log", log: @@ -373,11 +372,11 @@ rule map_genome_oligomap: rule sort_genome_oligomap: input: - tmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", + tmap=TMP_DIR / "{sample}" / "oligomap_genome_mappings.fasta", + report=TMP_DIR / "{sample}" / "oligomap_genome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", + sort=TMP_DIR / "{sample}" / "oligomap_genome_sorted.fasta", params: cluster_log=CLUSTER_LOG / "sort_genome_oligomap_{sample}.log", log: @@ -402,10 +401,10 @@ rule sort_genome_oligomap: rule convert_genome_to_sam_oligomap: input: - sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", + sort=TMP_DIR / "{sample}" / "oligomap_genome_sorted.fasta", script=SCRIPTS_DIR / 
"oligomap_output_to_sam_nh_filtered.py", output: - gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", + gmap=TMP_DIR / "{sample}" / "oligomap_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_genome_to_sam_{sample}.log", nh=config["nh"], @@ -432,11 +431,11 @@ rule convert_genome_to_sam_oligomap: rule map_transcriptome_oligomap: input: - reads=OUT_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", - target=OUT_DIR / "transcriptome_trimmed_id.fa", + reads=TMP_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + target=TMP_DIR / "transcriptome_trimmed_id.fa", output: - tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", + tmap=TMP_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", + report=TMP_DIR / "{sample}" / "oligomap_transcriptome_report.txt", params: cluster_log=CLUSTER_LOG / "map_transcriptome_oligomap_{sample}.log", log: @@ -466,11 +465,11 @@ rule map_transcriptome_oligomap: rule sort_transcriptome_oligomap: input: - tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", - report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", + tmap=TMP_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", + report=TMP_DIR / "{sample}" / "oligomap_transcriptome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", + sort=TMP_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", params: cluster_log=CLUSTER_LOG / "sort_transcriptome_oligomap_{sample}.log", log: @@ -494,10 +493,10 @@ rule sort_transcriptome_oligomap: rule convert_transcriptome_to_sam_oligomap: input: - sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", + sort=TMP_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: - tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", + 
tmap=TMP_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_transcriptome_to_sam_{sample}.log", nh=config["nh"], @@ -521,10 +520,10 @@ rule convert_transcriptome_to_sam_oligomap: rule merge_genome_maps: input: - gmap1=OUT_DIR / "{sample}" / "segemehl_genome_mappings.sam", - gmap2=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", + gmap1=TMP_DIR / "{sample}" / "segemehl_genome_mappings.sam", + gmap2=TMP_DIR / "{sample}" / "oligomap_genome_mappings.sam", output: - gmaps=OUT_DIR / "{sample}" / "genome_mappings.sam", + gmaps=TMP_DIR / "{sample}" / "genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "merge_genome_maps_{sample}.log", log: @@ -542,10 +541,10 @@ rule merge_genome_maps: rule merge_transcriptome_maps: input: - tmap1=OUT_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", - tmap2=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", + tmap1=TMP_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", + tmap2=TMP_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", output: - tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings.sam", + tmaps=TMP_DIR / "{sample}" / "transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "merge_transcriptome_maps_{sample}.log", log: @@ -563,10 +562,10 @@ rule merge_transcriptome_maps: rule filter_genome_by_nh: input: - gmaps=OUT_DIR / "{sample}" / "genome_mappings.sam", + gmaps=TMP_DIR / "{sample}" / "genome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - gmaps=OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", + gmaps=TMP_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", params: cluster_log=CLUSTER_LOG / "filter_genome_by_nh_{sample}.log", nh=config["nh"], @@ -591,10 +590,10 @@ rule filter_genome_by_nh: rule filter_transcriptome_by_nh: input: - tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings.sam", + tmaps=TMP_DIR / "{sample}" / "transcriptome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - 
tmaps=OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", + tmaps=TMP_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", params: cluster_log=CLUSTER_LOG / "filter_transcriptome_by_nh_{sample}.log", nh=config["nh"], @@ -619,9 +618,9 @@ rule filter_transcriptome_by_nh: rule remove_header_genome_mappings: input: - gmap=OUT_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", + gmap=TMP_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", output: - gmap=OUT_DIR / "{sample}" / "genome_mappings_no_header.sam", + gmap=TMP_DIR / "{sample}" / "genome_mappings_no_header.sam", params: cluster_log=CLUSTER_LOG / "remove_header_genome_mappings_{sample}.log", log: @@ -641,9 +640,9 @@ rule remove_header_genome_mappings: rule remove_header_transcriptome_mappings: input: - tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", + tmap=TMP_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", output: - tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", + tmap=TMP_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", params: cluster_log=CLUSTER_LOG / "remove_header_transcriptome_mappings_{sample}.log", log: @@ -663,11 +662,11 @@ rule remove_header_transcriptome_mappings: rule transcriptome_to_genome_maps: input: - tmap=OUT_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", + tmap=TMP_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", script=SCRIPTS_DIR / "sam_trx_to_sam_gen.pl", - exons=OUT_DIR / "exons.bed", + exons=TMP_DIR / "exons.bed", output: - genout=OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", + genout=TMP_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", params: cluster_log=CLUSTER_LOG / "transcriptome_to_genome_maps_{sample}.log", log: @@ -691,10 +690,10 @@ rule transcriptome_to_genome_maps: rule merge_all_maps: input: - gmap1=OUT_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", - gmap2=OUT_DIR / "{sample}" / "genome_mappings_no_header.sam", + gmap1=TMP_DIR 
/ "{sample}" / "transcriptome_mappings_to_genome.sam", + gmap2=TMP_DIR / "{sample}" / "genome_mappings_no_header.sam", output: - catmaps=OUT_DIR / "{sample}" / "mappings_all_no_header.sam", + catmaps=TMP_DIR / "{sample}" / "mappings_all_no_header.sam", params: cluster_log=CLUSTER_LOG / "merge_all_mappings_{sample}.log", log: @@ -712,10 +711,10 @@ rule merge_all_maps: rule add_header_all_maps: input: - header=OUT_DIR / "genome_header.sam", - catmaps=OUT_DIR / "{sample}" / "mappings_all_no_header.sam", + header=TMP_DIR / "genome_header.sam", + catmaps=TMP_DIR / "{sample}" / "mappings_all_no_header.sam", output: - concatenate=OUT_DIR / "{sample}" / "mappings_all.sam", + concatenate=TMP_DIR / "{sample}" / "mappings_all.sam", params: cluster_log=CLUSTER_LOG / "add_header_{sample}.log", log: @@ -733,9 +732,9 @@ rule add_header_all_maps: rule sort_maps_by_id: input: - concatenate=OUT_DIR / "{sample}" / "mappings_all.sam", + concatenate=TMP_DIR / "{sample}" / "mappings_all.sam", output: - sort=OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", + sort=TMP_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", params: cluster_log=CLUSTER_LOG / "sort_maps_by_id_{sample}.log", log: @@ -755,10 +754,10 @@ rule sort_maps_by_id: rule remove_inferiors: input: - sort=OUT_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", + sort=TMP_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", script=SCRIPTS_DIR / "sam_remove_duplicates_inferior_alignments_multimappers.pl", output: - remove_inf=OUT_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", + remove_inf=TMP_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", params: cluster_log=CLUSTER_LOG / "remove_inferiors_{sample}.log", log: @@ -786,10 +785,10 @@ rule remove_inferiors: rule filter_by_indels: input: - sam=OUT_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", + sam=TMP_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", script=SCRIPTS_DIR / "filter_multimappers.py", output: - sam=OUT_DIR / "{sample}" / 
"alignments_all.sam", + sam=TMP_DIR / "{sample}" / "alignments_all.sam", params: cluster_log=CLUSTER_LOG / "remove_multimappers_{sample}.log", log: @@ -816,9 +815,9 @@ rule filter_by_indels: rule convert_all_alns_sam_to_bam: input: - maps=OUT_DIR / "{sample}" / "alignments_all.sam", + maps=TMP_DIR / "{sample}" / "alignments_all.sam", output: - maps=OUT_DIR / "{sample}" / "alignments_all.bam", + maps=TMP_DIR / "{sample}" / "alignments_all.bam", params: cluster_log=CLUSTER_LOG / "convert_all_alns_sam_to_bam_{sample}.log", log: @@ -838,9 +837,9 @@ rule convert_all_alns_sam_to_bam: rule sort_all_alns_bam_by_position: input: - maps=OUT_DIR / "{sample}" / "alignments_all.bam", + maps=TMP_DIR / "{sample}" / "alignments_all.bam", output: - maps=OUT_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", + maps=TMP_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", params: cluster_log=CLUSTER_LOG / "sort_all_alns_bam_by_position_{sample}.log", log: @@ -860,9 +859,9 @@ rule sort_all_alns_bam_by_position: rule index_all_alns_bam: input: - maps=OUT_DIR / "{sample}" / "alignments_all_sorted.bam", + maps=TMP_DIR / "{sample}" / "alignments_all_sorted.bam", output: - maps=OUT_DIR / "{sample}" / "alignments_all_sorted.bam.bai", + maps=TMP_DIR / "{sample}" / "alignments_all_sorted.bam.bai", params: cluster_log=CLUSTER_LOG / "index_all_alns_bam_{sample}.log", log: diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk index 1202b5cc..437ccad5 100644 --- a/workflow/rules/prepare.smk +++ b/workflow/rules/prepare.smk @@ -26,7 +26,6 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") -OUT_DIR = Path(config["output_dir"]) TMP_DIR = Path(config["tmp_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) @@ -50,17 +49,17 @@ localrules: rule finish_prepare: input: - idx_transcriptome=OUT_DIR / "segemehl_transcriptome_index.idx", - idx_genome=OUT_DIR / "segemehl_genome_index.idx", - exons=OUT_DIR / "exons.bed", - 
header=OUT_DIR / "genome_header.sam", - chrsize=OUT_DIR / "chr_size.txt", + idx_transcriptome=TMP_DIR / "segemehl_transcriptome_index.idx", + idx_genome=TMP_DIR / "segemehl_genome_index.idx", + exons=TMP_DIR / "exons.bed", + header=TMP_DIR / "genome_header.sam", + chrsize=TMP_DIR / "chr_size.txt", extended_mir=expand( - OUT_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + TMP_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), extended_primir=expand( - OUT_DIR / "extended_primir_annotation_{extension}_nt.gff3", + TMP_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], ), @@ -75,7 +74,7 @@ rule trim_genome_seq_ids: genome=config["genome_file"], script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - genome=OUT_DIR / "genome_processed.fa", + genome=TMP_DIR / "genome_processed.fa", params: cluster_log=CLUSTER_LOG / "genome_process.log", log: @@ -93,10 +92,10 @@ rule trim_genome_seq_ids: rule extract_transcriptome_seqs: input: - genome=OUT_DIR / "genome_processed.fa", + genome=TMP_DIR / "genome_processed.fa", gtf=config["gtf_file"], output: - fasta=OUT_DIR / "transcriptome.fa", + fasta=TMP_DIR / "transcriptome.fa", params: cluster_log=CLUSTER_LOG / "extract_transcriptome_seqs.log", log: @@ -116,10 +115,10 @@ rule extract_transcriptome_seqs: rule trim_transcriptome_seq_ids: input: - fasta=OUT_DIR / "transcriptome.fa", + fasta=TMP_DIR / "transcriptome.fa", script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - fasta=OUT_DIR / "transcriptome_trimmed_id.fa", + fasta=TMP_DIR / "transcriptome_trimmed_id.fa", params: cluster_log=CLUSTER_LOG / "trim_transcriptome.log", log: @@ -137,9 +136,9 @@ rule trim_transcriptome_seq_ids: rule generate_segemehl_index_transcriptome: input: - fasta=OUT_DIR / "transcriptome_trimmed_id.fa", + fasta=TMP_DIR / "transcriptome_trimmed_id.fa", output: - idx=OUT_DIR / "segemehl_transcriptome_index.idx", + idx=TMP_DIR / "segemehl_transcriptome_index.idx", params: 
cluster_log=CLUSTER_LOG / "generate_segemehl_index_transcriptome.log", log: @@ -163,9 +162,9 @@ rule generate_segemehl_index_transcriptome: rule generate_segemehl_index_genome: input: - genome=OUT_DIR / "genome_processed.fa", + genome=TMP_DIR / "genome_processed.fa", output: - idx=OUT_DIR / "segemehl_genome_index.idx", + idx=TMP_DIR / "segemehl_genome_index.idx", params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_genome.log", log: @@ -192,7 +191,7 @@ rule get_exons_gtf: gtf=config["gtf_file"], script=SCRIPTS_DIR / "get_lines_w_pattern.sh", output: - exons=OUT_DIR / "exons.gtf", + exons=TMP_DIR / "exons.gtf", params: cluster_log=CLUSTER_LOG / "get_exons_gtf.log", log: @@ -216,10 +215,10 @@ rule get_exons_gtf: rule convert_exons_gtf_to_bed: input: - exons=OUT_DIR / "exons.gtf", + exons=TMP_DIR / "exons.gtf", script=SCRIPTS_DIR / "gtf_exons_bed.1.1.2.R", output: - exons=OUT_DIR / "exons.bed", + exons=TMP_DIR / "exons.bed", params: cluster_log=CLUSTER_LOG / "exons_gtf_to_bed.log", log: @@ -243,9 +242,9 @@ rule convert_exons_gtf_to_bed: rule create_genome_header: input: - genome=OUT_DIR / "genome_processed.fa", + genome=TMP_DIR / "genome_processed.fa", output: - header=OUT_DIR / "genome_header.sam", + header=TMP_DIR / "genome_header.sam", params: cluster_log=CLUSTER_LOG / "create_genome_header.log", log: @@ -269,7 +268,7 @@ rule map_chr_names: script=SCRIPTS_DIR / "map_chromosomes.pl", map_chr=config["map_chr_file"], output: - gff=OUT_DIR / "mirna_annotations.gff3", + gff=TMP_DIR / "mirna_annotations.gff3", params: cluster_log=CLUSTER_LOG / "map_chr_names.log", column="1", @@ -297,9 +296,9 @@ rule map_chr_names: rule create_index_genome_fasta: input: - genome=OUT_DIR / "genome_processed.fa", + genome=TMP_DIR / "genome_processed.fa", output: - genome=OUT_DIR / "genome_processed.fa.fai", + genome=TMP_DIR / "genome_processed.fa.fai", params: cluster_log=CLUSTER_LOG / "create_index_genome_fasta.log", log: @@ -319,9 +318,9 @@ rule create_index_genome_fasta: rule 
extract_chr_len: input: - genome=OUT_DIR / "genome_processed.fa.fai", + genome=TMP_DIR / "genome_processed.fa.fai", output: - chrsize=OUT_DIR / "chr_size.txt", + chrsize=TMP_DIR / "chr_size.txt", params: cluster_log=CLUSTER_LOG / "extract_chr_len.log", log: @@ -339,21 +338,21 @@ rule extract_chr_len: rule extend_mirs_annotations: input: - gff3=OUT_DIR / "mirna_annotations.gff3", - chrsize=OUT_DIR / "chr_size.txt", + gff3=TMP_DIR / "mirna_annotations.gff3", + chrsize=TMP_DIR / "chr_size.txt", script=SCRIPTS_DIR / "mirna_extension.py", output: extended_mir=expand( - OUT_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + TMP_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), extended_primir=expand( - OUT_DIR / "extended_primir_annotation_{extension}_nt.gff3", + TMP_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], ), params: cluster_log=CLUSTER_LOG / "extend_mirs_annotations.log", - out_dir=OUT_DIR, + out_dir=TMP_DIR, extension=config["extension"], log: LOCAL_LOG / "extend_mirs_annotations.log", diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index c30b048c..d290f03e 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -64,10 +64,11 @@ rule finish_quantify: input: primir_intersect_sam=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", mirna_intersect_sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", - intersect_sam=OUT_DIR - / "{sample}" - / "alignments_intersecting_mirna_sorted_tag.sam", table=OUT_DIR / "TABLES" / "all_{mir}_counts.tab", + uncollapsed_sam=expand( + OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", + sample=pd.unique(samples_table.index.values), + ), uncollapsed_bam=expand( OUT_DIR / "{sample}" @@ -89,13 +90,13 @@ rule finish_quantify: rule intersect_extended_primir: input: - alignment=OUT_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", + alignment=TMP_DIR / "{sample}" / 
"alignments_all_sorted_{sample}.bam", primir=expand( - OUT_DIR / "extended_primir_annotation_{extension}_nt.gff3", + TMP_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], ), output: - intersect=OUT_DIR / "{sample}" / "intersected_extended_primir.bed", + intersect=TMP_DIR / "{sample}" / "intersected_extended_primir.bed", params: cluster_log=CLUSTER_LOG / "intersect_extended_primir_{sample}.log", log: @@ -123,8 +124,8 @@ rule intersect_extended_primir: rule filter_sam_by_intersecting_primir: input: - alignments=OUT_DIR / "{sample}" / "alignments_all.sam", - intersect=OUT_DIR / "{sample}" / "intersected_extended_primir.bed", + alignments=TMP_DIR / "{sample}" / "alignments_all.sam", + intersect=TMP_DIR / "{sample}" / "intersected_extended_primir.bed", output: sam=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", params: @@ -153,7 +154,7 @@ rule convert_intersecting_primir_sam_to_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam", + maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir.bam", params: cluster_log=CLUSTER_LOG / "convert_intersecting_primir_sam_to_bam_{sample}.log", log: @@ -173,9 +174,9 @@ rule convert_intersecting_primir_sam_to_bam: rule sort_intersecting_primir_bam_by_position: input: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.bam", + maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir.bam", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", params: cluster_log=CLUSTER_LOG / "sort_intersecting_primir_bam_by_position_{sample}.log", @@ -196,9 +197,9 @@ rule sort_intersecting_primir_bam_by_position: rule index_intersecting_primir_bam: input: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + maps=TMP_DIR / "{sample}" / 
"alignments_intersecting_primir_sorted.bam", output: - maps=OUT_DIR + maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam.bai", params: @@ -220,13 +221,13 @@ rule index_intersecting_primir_bam: rule intersect_extended_mirna: input: - alignment=OUT_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + alignment=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", mirna=expand( - OUT_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + TMP_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), output: - intersect=OUT_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=TMP_DIR / "{sample}" / "intersected_extended_mirna.bed", params: cluster_log=CLUSTER_LOG / "intersect_extended_mirna_{sample}.log", log: @@ -255,7 +256,7 @@ rule intersect_extended_mirna: rule filter_sam_by_intersecting_mirna: input: alignments=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", - intersect=OUT_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=TMP_DIR / "{sample}" / "intersected_extended_mirna.bed", output: sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", params: @@ -283,10 +284,10 @@ rule filter_sam_by_intersecting_mirna: rule add_intersecting_mirna_tag: input: alignments=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", - intersect=OUT_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=TMP_DIR / "{sample}" / "intersected_extended_mirna.bed", script=SCRIPTS_DIR / "iso_name_tagging.py", output: - sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", + sam=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", params: extension=config["extension"], cluster_log=CLUSTER_LOG / "add_intersecting_mirna_tag_{sample}.log", @@ -312,9 +313,9 @@ rule add_intersecting_mirna_tag: rule sort_intersecting_mirna_by_feat_tag: input: - sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", + sam=TMP_DIR / 
"{sample}" / "alignments_intersecting_mirna_tag.sam", output: - sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", + sam=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", params: cluster_log=CLUSTER_LOG / "sort_intersecting_mirna_by_feat_tag_{sample}.log", log: @@ -334,15 +335,15 @@ rule sort_intersecting_mirna_by_feat_tag: rule quantify_mirna: input: - alignments=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", + alignments=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", script=SCRIPTS_DIR / "mirna_quantification.py", output: - table=OUT_DIR / "TABLES" / "mirna_counts_{sample}", + table=TMP_DIR / "TABLES" / "mirna_counts_{sample}", params: cluster_log=CLUSTER_LOG / "quantify_mirna_{sample}.log", mir_list=config["mir_list"], library="{sample}", - out_dir=OUT_DIR / "TABLES", + out_dir=TMP_DIR / "TABLES", log: LOCAL_LOG / "quantify_mirna_{sample}.log", container: @@ -368,10 +369,10 @@ rule quantify_mirna: rule quantify_primir: input: - intersect=OUT_DIR / "{sample}" / "intersected_extended_primir.bed", + intersect=TMP_DIR / "{sample}" / "intersected_extended_primir.bed", script=SCRIPTS_DIR / "primir_quantification.py", output: - table=OUT_DIR / "TABLES" / "pri-mir_counts_{sample}", + table=TMP_DIR / "TABLES" / "pri-mir_counts_{sample}", params: cluster_log=CLUSTER_LOG / "quantify_primir_{sample}.log", log: @@ -398,7 +399,7 @@ rule quantify_primir: rule merge_tables: input: table=expand( - OUT_DIR / "TABLES" / "{mir}_counts_{sample}", + TMP_DIR / "TABLES" / "{mir}_counts_{sample}", sample=pd.unique(samples_table.index.values), mir=[mir for mir in config["mir_list"] if mir != "isomir"], ), @@ -408,7 +409,7 @@ rule merge_tables: params: cluster_log=CLUSTER_LOG / "merge_tables_{mirna}.log", prefix="{mirna}_counts_", - input_dir=OUT_DIR / "TABLES", + input_dir=TMP_DIR / "TABLES", log: LOCAL_LOG / "merge_tables_{mirna}.log", container: @@ -461,7 +462,7 @@ rule 
convert_uncollpased_reads_sam_to_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", output: - maps=OUT_DIR + maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", params: @@ -483,7 +484,7 @@ rule convert_uncollpased_reads_sam_to_bam: rule sort_uncollpased_reads_bam_by_position: input: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", + maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", output: maps=OUT_DIR / "{sample}" From bc5230396cbe8557c22c2b05874f43a353d1fa2f Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Thu, 30 Nov 2023 14:31:25 +0100 Subject: [PATCH 07/21] test: add --no-hooks CLI option --- test/test_workflow_local_with_conda.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_workflow_local_with_conda.sh b/test/test_workflow_local_with_conda.sh index f92c4249..20c0555b 100755 --- a/test/test_workflow_local_with_conda.sh +++ b/test/test_workflow_local_with_conda.sh @@ -25,6 +25,7 @@ snakemake \ --use-conda \ --printshellcmds \ --rerun-incomplete \ + --no-hooks \ --verbose # Snakemake report From b162f13e600d347cb5f022a25faa361ea9a01efe Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Thu, 30 Nov 2023 14:47:58 +0100 Subject: [PATCH 08/21] docs: update rule graph --- images/rule_graph.svg | 760 +++++++++++++++++++++--------------------- 1 file changed, 374 insertions(+), 386 deletions(-) diff --git a/images/rule_graph.svg b/images/rule_graph.svg index 8926ad42..98360322 100644 --- a/images/rule_graph.svg +++ b/images/rule_graph.svg @@ -4,796 +4,784 @@ - + snakemake_dag - + 0 - -finish + +finish 1 - -filter_sam_by_intersecting_primir + +filter_sam_by_intersecting_primir - + 1->0 - - + + 42 - -filter_sam_by_intersecting_mirna + +filter_sam_by_intersecting_mirna - + 1->42 - - + + 45 - -convert_intersecting_primir_sam_to_bam + +convert_intersecting_primir_sam_to_bam - + 1->45 - - + + 2 - -filter_by_indels + +filter_by_indels - + 2->1 - - + 
+ 37 - -convert_all_alns_sam_to_bam + +convert_all_alns_sam_to_bam - + 2->37 - - + + 3 - -remove_inferiors + +remove_inferiors 3->2 - - + + 4 - -sort_maps_by_id + +sort_maps_by_id 4->3 - - + + 5 - -add_header_all_maps + +add_header_all_maps 5->4 - - + + 6 - -create_genome_header + +create_genome_header - + 6->5 - - + + 7 - -trim_genome_seq_id + +trim_genome_seq_ids 7->6 - - + + 19 - -extract_transcriptome_seqs + +extract_transcriptome_seqs 7->19 - - + + 30 - -map_genome_segemehl + +map_genome_segemehl 7->30 - - + + 31 - -generate_segemehl_index_genome + +generate_segemehl_index_genome - + 7->31 - - + + - - -33 - -map_genome_oligomap + + +34 + +map_genome_oligomap - + -7->33 - - +7->34 + + 41 - -create_index_genome_fasta + +create_index_genome_fasta - + 7->41 - - + + 8 - -merge_all_maps + +merge_all_maps - + 8->5 - - + + 9 - -transcriptome_to_genome_maps + +transcriptome_to_genome_maps 9->8 - - + + 10 - -remove_header_transcriptome_mappings + +remove_header_transcriptome_mappings - + 10->9 - - + + 11 - -filter_transcriptome_by_nh + +filter_transcriptome_by_nh 11->10 - - + + 12 - -merge_transcriptome_maps + +merge_transcriptome_maps 12->11 - - + + 13 - -map_transcriptome_segemehl + +map_transcriptome_segemehl - + 13->12 - - + + 14 - -collapse_identical_reads + +collapse_identical_reads - + 14->13 - - + + - - -23 - -filter_fasta_for_oligomap + + +24 + +filter_fasta_for_oligomap - + -14->23 - - +14->24 + + 14->30 - - + + 15 - -remove_adapters + +remove_adapters 15->14 - - + + 16 - -format_fasta + +format_fasta 16->15 - - + + 17 - -start + +start 17->16 - - + + 18 - -trim_fasta_seq_ids + +trim_transcriptome_seq_ids 18->13 - - + + 20 - -generate_segemehl_index_transcriptome + +generate_segemehl_index_transcriptome 18->20 - - + + - - -22 - -map_transcriptome_oligomap + + +23 + +map_transcriptome_oligomap - + -18->22 - - +18->23 + + 19->18 - - + + - + 20->13 - - + + 21 - -oligomap_transcriptome_to_sam + +convert_transcriptome_to_sam_oligomap - + 21->12 - - + + + + + +22 + 
+sort_transcriptome_oligomap 22->21 - - - - - -24 - -sort_transcriptome_oligomap - - - -22->24 - - + + - + 23->22 - - + + - - -23->33 - - + + +24->23 + + - - -24->21 - - + + +24->34 + + 25 - -exons_gtf_to_bed + +convert_exons_gtf_to_bed - + 25->9 - - + + 26 - -get_exons_gtf + +get_exons_gtf - + 26->25 - - + + 27 - -remove_header_genome_mappings + +remove_header_genome_mappings 27->8 - - + + 28 - -filter_genome_by_nh + +filter_genome_by_nh - + 28->27 - - + + 29 - -merge_genome_maps + +merge_genome_maps - + 29->28 - - + + 30->29 - - + + - + 31->30 - - + + 32 - -oligomap_genome_to_sam + +convert_genome_to_sam_oligomap - + 32->29 - - + + + + + +33 + +sort_genome_oligomap - + 33->32 - - - - - -34 - -sort_genome_oligomap - - - -33->34 - - + + - - -34->32 - - + + +34->33 + + 35 - -intersect_extended_primir + +intersect_extended_primir - + 35->1 - - + + 50 - -quantify_primir + +quantify_primir - + 35->50 - - + + 36 - -sort_all_alns_bam_by_position + +sort_all_alns_bam_by_position - + 36->35 - - + + - + 37->36 - - + + 38 - -extend_mirs_annotations + +extend_mirs_annotations - + 38->35 - - + + 43 - -intersect_extended_mirna + +intersect_extended_mirna - + 38->43 - - + + 39 - -map_chr_names + +map_chr_names - + 39->38 - - + + 40 - -extract_chr_len + +extract_chr_len - + 40->38 - - + + - + 41->40 - - + + - + 42->0 - - + + - - -47 - -add_intersecting_mirna_tag + + +49 + +add_intersecting_mirna_tag - - -42->47 - - + + +42->49 + + - - -53 - -uncollapse_reads + + +51 + +uncollapse_reads - - -42->53 - - + + +42->51 + + - + 43->42 - - + + - - -43->47 - - + + +43->49 + + 44 - -sort_intersecting_primir_bam_by_position + +sort_intersecting_primir_bam_by_position - + 44->43 - - + + - + 45->44 - - + + 46 - -sort_intersecting_mirna_by_feat_tag + +merge_tables 46->0 - - - - - -49 - -quantify_mirna + + - - -46->49 - - + + +47 + +quantify_mirna - + 47->46 - - + + 48 - -merge_tables + +sort_intersecting_mirna_by_feat_tag - - -48->0 - - + + +48->47 + + - + 49->48 - - + + - - -50->48 - - - - - 
-51 - -sort_uncollpased_reads_bam_by_position + + +50->46 + + - + 51->0 - - + + - - -54 - -index_uncollapsed_reads_bam + + +53 + +convert_uncollpased_reads_sam_to_bam - - -51->54 - - + + +51->53 + + 52 - -convert_uncollpased_reads_sam_to_bam + +sort_uncollpased_reads_bam_by_position - - -52->51 - - + + +52->0 + + - + + +54 + +index_uncollapsed_reads_bam + + +52->54 + + + + + 53->52 - - + + - + 54->0 - - + + From 93316f371767d18cc6866410183544996487bc48 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Thu, 30 Nov 2023 16:29:36 +0100 Subject: [PATCH 09/21] test: restore expected output --- test/expected_output.md5 | 110 +++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 8ccb8fa7..0f328284 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -1,58 +1,58 @@ 68f943f89b52d628851dd97fb1399d68 results/TABLES/all_mirna_counts.tab +eec9be6cda61d2728290c92c1209f455 results/TABLES/mirna_counts_test_lib 363ecee318c57ee7e2e45ca468007baa results/TABLES/all_pri-mir_counts.tab +a844e3a29159e36e2f17a0646d1e8c5f results/TABLES/pri-mir_counts_test_lib 0d76977b2e36046cc176112776c5fa4e results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam.bai -ddb9272db1b04e67aaa65a8dbcee69b6 results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam -d025851bf31e88874bcc972b36c3f9ee results/test_lib/alignments_intersecting_mirna_uncollapsed.sam -80a664262886fe0b1b7dad80b6cc0b39 results/test_lib/alignments_intersecting_mirna.sam -ae3f3374170ce0ae90087b5672163ba3 results/test_lib/alignments_intersecting_primir.sam -eec9be6cda61d2728290c92c1209f455 results/tmp/TABLES/mirna_counts_test_lib -a844e3a29159e36e2f17a0646d1e8c5f results/tmp/TABLES/pri-mir_counts_test_lib -36f7d024fe6ddfd3e788aebf61c61061 results/tmp/test_lib/oligomap_genome_sorted.fasta -48e605df55bf2dd37ea5a5a74eb5872a results/tmp/test_lib/mappings_all.sam -d41d8cd98f00b204e9800998ecf8427e 
results/tmp/test_lib/oligomap_transcriptome_mappings.fasta -4b8a81b633b84921ae1b3fa9a15b0a14 results/tmp/test_lib/mappings_all_removed_inferiors.sam -78124c5993a98d67a47538865d624ef5 results/tmp/test_lib/alignments_intersecting_primir.bam -defdc8c46e1d73692edde0e0278f2d5e results/tmp/test_lib/oligomap_genome_mappings.fasta -48c6346d7326e8718dd06c9b642a2f97 results/tmp/test_lib/segemehl_transcriptome_mappings.sam -48c6346d7326e8718dd06c9b642a2f97 results/tmp/test_lib/transcriptome_mappings_filtered_nh.sam -9e21710edb045a1d7ba653e21b40a8b1 results/tmp/test_lib/alignments_all.bam -a124a5afdb5f7bfbcc5683260556c9c4 results/tmp/test_lib/mappings_all_no_header.sam -ac4cf96f8e35bf1ace8750e72aa27a95 results/tmp/test_lib/alignments_all.sam -552b836f23069c5b569cba621df1e0b0 results/tmp/test_lib/alignments_intersecting_mirna_tag.sam -d41d8cd98f00b204e9800998ecf8427e results/tmp/test_lib/oligomap_transcriptome_sorted.fasta -52f42f222c4a3d89f852a5a31ce685ea results/tmp/test_lib/segemehl_genome_mappings.sam -d41d8cd98f00b204e9800998ecf8427e results/tmp/test_lib/transcriptome_mappings_to_genome.sam -63a32839360a985b68e0685aafad5c54 results/tmp/test_lib/fa/reads.fa -edcb854702519c0002d8ce89a21e54ef results/tmp/test_lib/reads_formatted.fasta -1a547487b8e92ad85bb26ff9b1db1f93 results/tmp/test_lib/intersected_extended_mirna.bed -cc2fbcd9435b8d8e5affc7280d4a59f8 results/tmp/test_lib/alignments_all_sorted_test_lib.bam -ec0e9bcc8ea857da897035c8fca4078f results/tmp/test_lib/reads_trimmed_adapters.fasta -21b9623034d519b6ef4c7bb02b631b27 results/tmp/test_lib/alignments_intersecting_mirna_uncollapsed.bam -4c9b71346d76e90b37a9a3d4e5457a49 results/tmp/test_lib/genome_mappings.sam -f5cb65466d328036a15b66cfbd4d8419 results/tmp/test_lib/oligomap_genome_report.txt -6cbdb9299e09b3e39b79a50db69226b5 results/tmp/test_lib/transcriptome_mappings_no_header.sam -48c6346d7326e8718dd06c9b642a2f97 results/tmp/test_lib/transcriptome_mappings.sam -947607be69c16246f8dc9adbd9b971c8 
results/tmp/test_lib/oligomap_genome_mappings.sam -4ac38f4b52af65eae9bb0607863ce3f9 results/tmp/test_lib/alignments_intersecting_mirna_sorted_tag.sam -db62a036defe14394eec1c3e664e2960 results/tmp/test_lib/alignments_intersecting_primir_sorted.bam -d41d8cd98f00b204e9800998ecf8427e results/tmp/test_lib/oligomap_transcriptome_mappings.sam -a124a5afdb5f7bfbcc5683260556c9c4 results/tmp/test_lib/genome_mappings_no_header.sam -4c9b71346d76e90b37a9a3d4e5457a49 results/tmp/test_lib/genome_mappings_filtered_nh.sam -27bcc29a265451fa46ae75ea3683f87e results/tmp/test_lib/mappings_all_sorted_by_id.sam -2c77ffa021dda190d82f3f54a3312393 results/tmp/test_lib/reads_collapsed.fasta -f68693cfaa1e6ea78e1a5562ade6d9ed results/tmp/test_lib/intersected_extended_primir.bed -c2a5770a755ada66ef63d96eec4afb00 results/tmp/test_lib/reads_filtered_for_oligomap.fasta -fe5388094985e9604a302d39d2abc82c results/tmp/test_lib/oligomap_transcriptome_report.txt -be7a0d92e57480190de57eb30baffa36 results/tmp/extended_mirna_annotation_6_nt.gff3 -8148cd880602255be166beb59bbed95a results/tmp/genome_header.sam -09e24a504bfec37fee3d5ff1b5c7738e results/tmp/exons.bed -4fb453846e88593d0cac13220ec2d685 results/tmp/segemehl_genome_index.idx -d34fc868b861b1bc46db07a397dc0f10 results/tmp/genome_processed.fa.fai -21e102e4ebd3508bb06f46366a3d578d results/tmp/exons.gtf -003b92b245ac336e3d70a513033e1cee results/tmp/transcriptome_trimmed_id.fa -44dbf7c3eae00d0bc8d5e1319123746c results/tmp/chr_size.txt -cc5c3512dab0e269d82bd625de74198e results/tmp/extended_primir_annotation_6_nt.gff3 -f28cc0143ab6659bef3de3a7afa1dccc results/tmp/mirna_annotations.gff3 -2d437f8681f4248d4f2075f86debb920 results/tmp/transcriptome.fa -7eb64c112830266bcf416ded60b4cf77 results/tmp/segemehl_transcriptome_index.idx -4fba145540a2c61f29bfddfd0f5a4d4e results/tmp/genome_processed.fa +36f7d024fe6ddfd3e788aebf61c61061 results/test_lib/oligomap_genome_sorted.fasta +48e605df55bf2dd37ea5a5a74eb5872a results/test_lib/mappings_all.sam 
+d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_mappings.fasta +eea903fc0ab81054cf8e34193f80f4a7 results/test_lib/mappings_all_removed_inferiors.sam +98498ac521f451426a9dbabcbecb5f25 results/test_lib/alignments_intersecting_primir.bam +defdc8c46e1d73692edde0e0278f2d5e results/test_lib/oligomap_genome_mappings.fasta +1649738f226e8979d4d88a3ae47fa423 results/test_lib/segemehl_transcriptome_mappings.sam +9ecee9ab80daba0a53076b05c9f6ff53 results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam +1649738f226e8979d4d88a3ae47fa423 results/test_lib/transcriptome_mappings_filtered_nh.sam +8e22ddfa7c39ce7e4ec5945dff1576ef results/test_lib/alignments_all.bam +a124a5afdb5f7bfbcc5683260556c9c4 results/test_lib/mappings_all_no_header.sam +dd00dea3549dc1ad14f9e1505d397de5 results/test_lib/alignments_all.sam +8c24d619073f4c5ca1f439fe429d0ef4 results/test_lib/alignments_intersecting_mirna_tag.sam +d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_sorted.fasta +c218718d93f48e5987fc18b33dc488f0 results/test_lib/segemehl_genome_mappings.sam +d41d8cd98f00b204e9800998ecf8427e results/test_lib/transcriptome_mappings_to_genome.sam +63a32839360a985b68e0685aafad5c54 results/test_lib/fa/reads.fa +5cc557ec2073144f47fe28ac145f4869 results/test_lib/alignments_intersecting_mirna_uncollapsed.sam +edcb854702519c0002d8ce89a21e54ef results/test_lib/reads_formatted.fasta +1a547487b8e92ad85bb26ff9b1db1f93 results/test_lib/intersected_extended_mirna.bed +721071f3ead528aa71978508db8d73f9 results/test_lib/alignments_all_sorted_test_lib.bam +ec0e9bcc8ea857da897035c8fca4078f results/test_lib/reads_trimmed_adapters.fasta +bbfc27c84b66ff41bfeee73f701b4b29 results/test_lib/alignments_intersecting_mirna_uncollapsed.bam +81bed7fc879f7a16c12d2ba912263c46 results/test_lib/alignments_intersecting_mirna.sam +dd560414078330bf3138f039da109093 results/test_lib/genome_mappings.sam +f5cb65466d328036a15b66cfbd4d8419 results/test_lib/oligomap_genome_report.txt 
+6cbdb9299e09b3e39b79a50db69226b5 results/test_lib/transcriptome_mappings_no_header.sam +1649738f226e8979d4d88a3ae47fa423 results/test_lib/transcriptome_mappings.sam +947607be69c16246f8dc9adbd9b971c8 results/test_lib/oligomap_genome_mappings.sam +9833208a79143eaf3f2a5fdeca0b2d94 results/test_lib/alignments_intersecting_mirna_sorted_tag.sam +02096523b293082629d5b895085468a3 results/test_lib/alignments_intersecting_primir_sorted.bam +d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_mappings.sam +a124a5afdb5f7bfbcc5683260556c9c4 results/test_lib/genome_mappings_no_header.sam +dd560414078330bf3138f039da109093 results/test_lib/genome_mappings_filtered_nh.sam +ae4c4963ca2cd206952b2ea2c58301dd results/test_lib/mappings_all_sorted_by_id.sam +2c77ffa021dda190d82f3f54a3312393 results/test_lib/reads_collapsed.fasta +f68693cfaa1e6ea78e1a5562ade6d9ed results/test_lib/intersected_extended_primir.bed +61f12595db9421926073d6675f7c3c42 results/test_lib/alignments_intersecting_primir.sam +c2a5770a755ada66ef63d96eec4afb00 results/test_lib/reads_filtered_for_oligomap.fasta +fe5388094985e9604a302d39d2abc82c results/test_lib/oligomap_transcriptome_report.txt +be7a0d92e57480190de57eb30baffa36 results/extended_mirna_annotation_6_nt.gff3 +8148cd880602255be166beb59bbed95a results/genome_header.sam +09e24a504bfec37fee3d5ff1b5c7738e results/exons.bed +4fb453846e88593d0cac13220ec2d685 results/segemehl_genome_index.idx +d34fc868b861b1bc46db07a397dc0f10 results/genome_processed.fa.fai +21e102e4ebd3508bb06f46366a3d578d results/exons.gtf +003b92b245ac336e3d70a513033e1cee results/transcriptome_trimmed_id.fa +44dbf7c3eae00d0bc8d5e1319123746c results/chr_size.txt +cc5c3512dab0e269d82bd625de74198e results/extended_primir_annotation_6_nt.gff3 +f28cc0143ab6659bef3de3a7afa1dccc results/mirna_annotations.gff3 +2d437f8681f4248d4f2075f86debb920 results/transcriptome.fa +7eb64c112830266bcf416ded60b4cf77 results/segemehl_transcriptome_index.idx +4fba145540a2c61f29bfddfd0f5a4d4e 
results/genome_processed.fa From 35953db09164ec75edb7bff98c528bb908393bb5 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sat, 2 Dec 2023 19:26:53 +0100 Subject: [PATCH 10/21] docs: add expected output files section --- README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/README.md b/README.md index eef5a214..8bb7ece5 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ _MIRFLOWZ_ is a [Snakemake][snakemake] workflow for mapping miRNAs and isomiRs. 2. [Usage](#usage) - [Preparing inputs](#preparing-inputs) - [Running the workflow](#running-the-workflow) + - [Expected output files](#expected-output-files) - [Creating a Snakemake report](#creating-a-snakemake-report) 3. [Workflow description](#workflow-description) 4. [Contributing](#contributing) @@ -251,6 +252,50 @@ snakemake \ After successful execution of the workflow, results and logs will be found in the `results/` and `logs/` directories, respectively. +### Expected output files + +Upon successful execution of _MIRFLOWZ_, the tool automatically removes all +intermediate files generated during the process. The final output comprises: + +1. A SAM file containing alignments intersecting a pri-miR locus. These +alignments intersect with extended start and/or end positions specified in the +provided pri-miR annotations. Please note that they may not contribute to the +final counting and will not appear in the final table. + +2. A SAM file containing alignments intersecting a miRNA locus. Similar to the +previous file, these alignments intersect with extended start and/or end +positions specified in the provided miRNA annotations. They may not contribute +to the final counting and might be absent from the final table. + +3. A SAM file containing the uncollapsed set of alignments that contribute to +the final counting. + +4. A BAM file containing the uncollapsed set of alignments contributing to the +final counting and its corresponding index file (`.bam.bai`).
+ +5. Table(s) containing the counting data from all libraries for (iso)miRs +and/or pri-miRs. Each row corresponds to a miRNA species, and each column +represents a sample library. Counting involves aggregating contributions from +all alignments, calculated as the ratio of collapsed reads in the alignment to +the number of hits (NH value). + +To retain all intermediate files, include `--no-hooks` in the workflow call. + +```bash +snakemake \ + --snakefile="path/to/Snakefile" \ + --cores 4 \ + --configfile="path/to/config.yaml" \ + --use-conda \ + --printshellcmds \ + --rerun-incomplete \ + --no-hooks \ + --verbose +``` + +After successful execution of the workflow, the intermediate files will be +found in the `results/inter_files` directory. + ### Creating a Snakemake report Snakemake provides the option to generate a detailed HTML report on runtime From d3e9b1b68f1c006fbe5ea9fa59d4925447f2b970 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sat, 2 Dec 2023 19:27:22 +0100 Subject: [PATCH 11/21] build: rename temporary directory --- config/config_schema.json | 5 +++++ config/config_template.yaml | 1 + 2 files changed, 6 insertions(+) diff --git a/config/config_schema.json b/config/config_schema.json index 2d56367e..86a6f18c 100644 --- a/config/config_schema.json +++ b/config/config_schema.json @@ -30,6 +30,11 @@ "default": "results/", "description": "Path to the output directory." }, + "tmp_dir":{ + "type": "string", + "default": "results/inter_files", + "description": "Path to the temporary directory storing the intermediate files."
+ }, "local_log":{ "type": "string", "default": "logs/local/", diff --git a/config/config_template.yaml b/config/config_template.yaml index d6d01c97..70ddac51 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -32,6 +32,7 @@ map_chr_file: path/to/ucsc_ensembl_mappings.tsv #### DIRECTORIES #### output_dir: results/ +tmp_dir: results/inter_files local_log: logs/local/ cluster_log: logs/cluster/ scripts_dir: ../scripts/ From 41174d0b81b6f503b51b80165dc2a2c7074c61bb Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sat, 2 Dec 2023 19:31:56 +0100 Subject: [PATCH 12/21] test: update expected output with new tmp dir name --- test/expected_output.md5 | 110 +++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 0f328284..9a0bdd11 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -1,58 +1,58 @@ 68f943f89b52d628851dd97fb1399d68 results/TABLES/all_mirna_counts.tab -eec9be6cda61d2728290c92c1209f455 results/TABLES/mirna_counts_test_lib 363ecee318c57ee7e2e45ca468007baa results/TABLES/all_pri-mir_counts.tab -a844e3a29159e36e2f17a0646d1e8c5f results/TABLES/pri-mir_counts_test_lib 0d76977b2e36046cc176112776c5fa4e results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam.bai -36f7d024fe6ddfd3e788aebf61c61061 results/test_lib/oligomap_genome_sorted.fasta -48e605df55bf2dd37ea5a5a74eb5872a results/test_lib/mappings_all.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_mappings.fasta -eea903fc0ab81054cf8e34193f80f4a7 results/test_lib/mappings_all_removed_inferiors.sam -98498ac521f451426a9dbabcbecb5f25 results/test_lib/alignments_intersecting_primir.bam -defdc8c46e1d73692edde0e0278f2d5e results/test_lib/oligomap_genome_mappings.fasta -1649738f226e8979d4d88a3ae47fa423 results/test_lib/segemehl_transcriptome_mappings.sam -9ecee9ab80daba0a53076b05c9f6ff53 
results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam -1649738f226e8979d4d88a3ae47fa423 results/test_lib/transcriptome_mappings_filtered_nh.sam -8e22ddfa7c39ce7e4ec5945dff1576ef results/test_lib/alignments_all.bam -a124a5afdb5f7bfbcc5683260556c9c4 results/test_lib/mappings_all_no_header.sam -dd00dea3549dc1ad14f9e1505d397de5 results/test_lib/alignments_all.sam -8c24d619073f4c5ca1f439fe429d0ef4 results/test_lib/alignments_intersecting_mirna_tag.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_sorted.fasta -c218718d93f48e5987fc18b33dc488f0 results/test_lib/segemehl_genome_mappings.sam -d41d8cd98f00b204e9800998ecf8427e results/test_lib/transcriptome_mappings_to_genome.sam -63a32839360a985b68e0685aafad5c54 results/test_lib/fa/reads.fa -5cc557ec2073144f47fe28ac145f4869 results/test_lib/alignments_intersecting_mirna_uncollapsed.sam -edcb854702519c0002d8ce89a21e54ef results/test_lib/reads_formatted.fasta -1a547487b8e92ad85bb26ff9b1db1f93 results/test_lib/intersected_extended_mirna.bed -721071f3ead528aa71978508db8d73f9 results/test_lib/alignments_all_sorted_test_lib.bam -ec0e9bcc8ea857da897035c8fca4078f results/test_lib/reads_trimmed_adapters.fasta -bbfc27c84b66ff41bfeee73f701b4b29 results/test_lib/alignments_intersecting_mirna_uncollapsed.bam -81bed7fc879f7a16c12d2ba912263c46 results/test_lib/alignments_intersecting_mirna.sam -dd560414078330bf3138f039da109093 results/test_lib/genome_mappings.sam -f5cb65466d328036a15b66cfbd4d8419 results/test_lib/oligomap_genome_report.txt -6cbdb9299e09b3e39b79a50db69226b5 results/test_lib/transcriptome_mappings_no_header.sam -1649738f226e8979d4d88a3ae47fa423 results/test_lib/transcriptome_mappings.sam -947607be69c16246f8dc9adbd9b971c8 results/test_lib/oligomap_genome_mappings.sam -9833208a79143eaf3f2a5fdeca0b2d94 results/test_lib/alignments_intersecting_mirna_sorted_tag.sam -02096523b293082629d5b895085468a3 results/test_lib/alignments_intersecting_primir_sorted.bam 
-d41d8cd98f00b204e9800998ecf8427e results/test_lib/oligomap_transcriptome_mappings.sam -a124a5afdb5f7bfbcc5683260556c9c4 results/test_lib/genome_mappings_no_header.sam -dd560414078330bf3138f039da109093 results/test_lib/genome_mappings_filtered_nh.sam -ae4c4963ca2cd206952b2ea2c58301dd results/test_lib/mappings_all_sorted_by_id.sam -2c77ffa021dda190d82f3f54a3312393 results/test_lib/reads_collapsed.fasta -f68693cfaa1e6ea78e1a5562ade6d9ed results/test_lib/intersected_extended_primir.bed -61f12595db9421926073d6675f7c3c42 results/test_lib/alignments_intersecting_primir.sam -c2a5770a755ada66ef63d96eec4afb00 results/test_lib/reads_filtered_for_oligomap.fasta -fe5388094985e9604a302d39d2abc82c results/test_lib/oligomap_transcriptome_report.txt -be7a0d92e57480190de57eb30baffa36 results/extended_mirna_annotation_6_nt.gff3 -8148cd880602255be166beb59bbed95a results/genome_header.sam -09e24a504bfec37fee3d5ff1b5c7738e results/exons.bed -4fb453846e88593d0cac13220ec2d685 results/segemehl_genome_index.idx -d34fc868b861b1bc46db07a397dc0f10 results/genome_processed.fa.fai -21e102e4ebd3508bb06f46366a3d578d results/exons.gtf -003b92b245ac336e3d70a513033e1cee results/transcriptome_trimmed_id.fa -44dbf7c3eae00d0bc8d5e1319123746c results/chr_size.txt -cc5c3512dab0e269d82bd625de74198e results/extended_primir_annotation_6_nt.gff3 -f28cc0143ab6659bef3de3a7afa1dccc results/mirna_annotations.gff3 -2d437f8681f4248d4f2075f86debb920 results/transcriptome.fa -7eb64c112830266bcf416ded60b4cf77 results/segemehl_transcriptome_index.idx -4fba145540a2c61f29bfddfd0f5a4d4e results/genome_processed.fa +f91c144e491e447a50369a67220a832f results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam +a8b1a66aecf4d7b583362ea8619228ed results/test_lib/alignments_intersecting_mirna_uncollapsed.sam +9f0bad0ed3c62d0410060d8b332315e8 results/test_lib/alignments_intersecting_mirna.sam +4ae56cdb8de0fbaac24b4a49d356f7f8 results/test_lib/alignments_intersecting_primir.sam +eec9be6cda61d2728290c92c1209f455 
results/inter_files/TABLES/mirna_counts_test_lib +a844e3a29159e36e2f17a0646d1e8c5f results/inter_files/TABLES/pri-mir_counts_test_lib +36f7d024fe6ddfd3e788aebf61c61061 results/inter_files/test_lib/oligomap_genome_sorted.fasta +48e605df55bf2dd37ea5a5a74eb5872a results/inter_files/test_lib/mappings_all.sam +d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/oligomap_transcriptome_mappings.fasta +f54bacf9bf4188541a0c0fedc203e3ed results/inter_files/test_lib/mappings_all_removed_inferiors.sam +4b86be9b7ed15ddc0067b8de4aad431c results/inter_files/test_lib/alignments_intersecting_primir.bam +defdc8c46e1d73692edde0e0278f2d5e results/inter_files/test_lib/oligomap_genome_mappings.fasta +3aca095999e737c5d9cdb66540e8b195 results/inter_files/test_lib/segemehl_transcriptome_mappings.sam +3aca095999e737c5d9cdb66540e8b195 results/inter_files/test_lib/transcriptome_mappings_filtered_nh.sam +698711937e6d98dd65b70b3a738388b4 results/inter_files/test_lib/alignments_all.bam +a124a5afdb5f7bfbcc5683260556c9c4 results/inter_files/test_lib/mappings_all_no_header.sam +cb542d2dd6b4405d690086de0bb5ec70 results/inter_files/test_lib/alignments_all.sam +d8ab74abfa3ed2b2a92c83142af1c638 results/inter_files/test_lib/alignments_intersecting_mirna_tag.sam +d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/oligomap_transcriptome_sorted.fasta +f34a0091f633db03a940d0c790ad265a results/inter_files/test_lib/segemehl_genome_mappings.sam +d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/transcriptome_mappings_to_genome.sam +63a32839360a985b68e0685aafad5c54 results/inter_files/test_lib/fa/reads.fa +edcb854702519c0002d8ce89a21e54ef results/inter_files/test_lib/reads_formatted.fasta +1a547487b8e92ad85bb26ff9b1db1f93 results/inter_files/test_lib/intersected_extended_mirna.bed +a71a2dd39c82baee52d5dbe2e3a39457 results/inter_files/test_lib/alignments_all_sorted_test_lib.bam +ec0e9bcc8ea857da897035c8fca4078f results/inter_files/test_lib/reads_trimmed_adapters.fasta 
+acf1608593f39294e0137069f6351058 results/inter_files/test_lib/alignments_intersecting_mirna_uncollapsed.bam +0454bc9f3edd9348a7b3e08d9c3007d8 results/inter_files/test_lib/genome_mappings.sam +f5cb65466d328036a15b66cfbd4d8419 results/inter_files/test_lib/oligomap_genome_report.txt +6cbdb9299e09b3e39b79a50db69226b5 results/inter_files/test_lib/transcriptome_mappings_no_header.sam +3aca095999e737c5d9cdb66540e8b195 results/inter_files/test_lib/transcriptome_mappings.sam +947607be69c16246f8dc9adbd9b971c8 results/inter_files/test_lib/oligomap_genome_mappings.sam +fa14b33623fd12b068a6d4ae301e7f49 results/inter_files/test_lib/alignments_intersecting_mirna_sorted_tag.sam +b6de7f5615b4b05834f4af11df993345 results/inter_files/test_lib/alignments_intersecting_primir_sorted.bam +d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/oligomap_transcriptome_mappings.sam +a124a5afdb5f7bfbcc5683260556c9c4 results/inter_files/test_lib/genome_mappings_no_header.sam +0454bc9f3edd9348a7b3e08d9c3007d8 results/inter_files/test_lib/genome_mappings_filtered_nh.sam +09c89a2769c919e58c3a3d3cbe2ceaf6 results/inter_files/test_lib/mappings_all_sorted_by_id.sam +2c77ffa021dda190d82f3f54a3312393 results/inter_files/test_lib/reads_collapsed.fasta +f68693cfaa1e6ea78e1a5562ade6d9ed results/inter_files/test_lib/intersected_extended_primir.bed +c2a5770a755ada66ef63d96eec4afb00 results/inter_files/test_lib/reads_filtered_for_oligomap.fasta +fe5388094985e9604a302d39d2abc82c results/inter_files/test_lib/oligomap_transcriptome_report.txt +be7a0d92e57480190de57eb30baffa36 results/inter_files/extended_mirna_annotation_6_nt.gff3 +8148cd880602255be166beb59bbed95a results/inter_files/genome_header.sam +09e24a504bfec37fee3d5ff1b5c7738e results/inter_files/exons.bed +4fb453846e88593d0cac13220ec2d685 results/inter_files/segemehl_genome_index.idx +d34fc868b861b1bc46db07a397dc0f10 results/inter_files/genome_processed.fa.fai +21e102e4ebd3508bb06f46366a3d578d results/inter_files/exons.gtf 
+003b92b245ac336e3d70a513033e1cee results/inter_files/transcriptome_trimmed_id.fa +44dbf7c3eae00d0bc8d5e1319123746c results/inter_files/chr_size.txt +cc5c3512dab0e269d82bd625de74198e results/inter_files/extended_primir_annotation_6_nt.gff3 +f28cc0143ab6659bef3de3a7afa1dccc results/inter_files/mirna_annotations.gff3 +2d437f8681f4248d4f2075f86debb920 results/inter_files/transcriptome.fa +7eb64c112830266bcf416ded60b4cf77 results/inter_files/segemehl_transcriptome_index.idx +4fba145540a2c61f29bfddfd0f5a4d4e results/inter_files/genome_processed.fa From a487f7bf5eb9887eef022215090a7829d47640cc Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sat, 2 Dec 2023 20:07:12 +0100 Subject: [PATCH 13/21] style: format to pass snakefmt test --- workflow/rules/quantify.smk | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index d290f03e..faf1352b 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -199,9 +199,7 @@ rule index_intersecting_primir_bam: input: maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", output: - maps=TMP_DIR - / "{sample}" - / "alignments_intersecting_primir_sorted.bam.bai", + maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam.bai", params: cluster_log=CLUSTER_LOG / "index_intersecting_primir_bam_{sample}.log", log: @@ -462,9 +460,7 @@ rule convert_uncollpased_reads_sam_to_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", output: - maps=TMP_DIR - / "{sample}" - / "alignments_intersecting_mirna_uncollapsed.bam", + maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", params: cluster_log=CLUSTER_LOG / "convert_uncollapsed_reads_sam_to_bam_{sample}.log", log: From bda17549f0c67da7c534a664a17e62f63fcfd6f0 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Mon, 4 Dec 2023 17:27:45 +0100 Subject: [PATCH 14/21] test: update uncollapsed sam dir --- 
test/expected_output.md5 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 9a0bdd11..3129cc56 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -1,8 +1,7 @@ 68f943f89b52d628851dd97fb1399d68 results/TABLES/all_mirna_counts.tab 363ecee318c57ee7e2e45ca468007baa results/TABLES/all_pri-mir_counts.tab 0d76977b2e36046cc176112776c5fa4e results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam.bai -f91c144e491e447a50369a67220a832f results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam -a8b1a66aecf4d7b583362ea8619228ed results/test_lib/alignments_intersecting_mirna_uncollapsed.sam +f448bbeab20b0db75a5ca8bfb83c6ceb results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam 9f0bad0ed3c62d0410060d8b332315e8 results/test_lib/alignments_intersecting_mirna.sam 4ae56cdb8de0fbaac24b4a49d356f7f8 results/test_lib/alignments_intersecting_primir.sam eec9be6cda61d2728290c92c1209f455 results/inter_files/TABLES/mirna_counts_test_lib @@ -23,11 +22,12 @@ d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/oligomap_transcri f34a0091f633db03a940d0c790ad265a results/inter_files/test_lib/segemehl_genome_mappings.sam d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/transcriptome_mappings_to_genome.sam 63a32839360a985b68e0685aafad5c54 results/inter_files/test_lib/fa/reads.fa +a8b1a66aecf4d7b583362ea8619228ed results/inter_files/test_lib/alignments_intersecting_mirna_uncollapsed.sam edcb854702519c0002d8ce89a21e54ef results/inter_files/test_lib/reads_formatted.fasta 1a547487b8e92ad85bb26ff9b1db1f93 results/inter_files/test_lib/intersected_extended_mirna.bed a71a2dd39c82baee52d5dbe2e3a39457 results/inter_files/test_lib/alignments_all_sorted_test_lib.bam ec0e9bcc8ea857da897035c8fca4078f results/inter_files/test_lib/reads_trimmed_adapters.fasta -acf1608593f39294e0137069f6351058 
results/inter_files/test_lib/alignments_intersecting_mirna_uncollapsed.bam +6c6284e7328dbcb903afaadd4df857b8 results/inter_files/test_lib/alignments_intersecting_mirna_uncollapsed.bam 0454bc9f3edd9348a7b3e08d9c3007d8 results/inter_files/test_lib/genome_mappings.sam f5cb65466d328036a15b66cfbd4d8419 results/inter_files/test_lib/oligomap_genome_report.txt 6cbdb9299e09b3e39b79a50db69226b5 results/inter_files/test_lib/transcriptome_mappings_no_header.sam From 9d7b8223f2e1adc8846a7f2f61f16c4ffeb11bd6 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Mon, 4 Dec 2023 17:28:14 +0100 Subject: [PATCH 15/21] refactor: remove uncollapsed sam from final output --- workflow/Snakefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index aa852cd7..8e41580c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -88,10 +88,6 @@ rule finish: OUT_DIR / "TABLES" / "all_{mir}_counts.tab", mir=[mir for mir in config["mir_list"] if mir != "isomir"], ), - uncollapsed_sam=expand( - OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", - sample=pd.unique(samples_table.index.values), - ), uncollapsed_bam=expand( OUT_DIR / "{sample}" From d0b274f51345199cbddf7fa9f8c68f68ede99fed Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Mon, 4 Dec 2023 17:28:29 +0100 Subject: [PATCH 16/21] refactor: remove uncollapsed sam from final output --- workflow/rules/quantify.smk | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index faf1352b..043d7744 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -65,10 +65,6 @@ rule finish_quantify: primir_intersect_sam=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", mirna_intersect_sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", table=OUT_DIR / "TABLES" / "all_{mir}_counts.tab", - uncollapsed_sam=expand( - OUT_DIR / "{sample}" /
"alignments_intersecting_mirna_uncollapsed.sam", - sample=pd.unique(samples_table.index.values), - ), uncollapsed_bam=expand( OUT_DIR / "{sample}" @@ -434,7 +430,7 @@ rule uncollapse_reads: maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", script=SCRIPTS_DIR / "sam_uncollapse.pl", output: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", + maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", params: cluster_log=CLUSTER_LOG / "uncollapse_reads_{sample}.log", log: @@ -458,7 +454,7 @@ rule uncollapse_reads: rule convert_uncollpased_reads_sam_to_bam: input: - maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", + maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", output: maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", params: From f6fc8124fcae7f3c99fc3553707e13560cb2c0de Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Wed, 6 Dec 2023 03:47:25 +0100 Subject: [PATCH 17/21] refactor: change intermediates directory --- workflow/Snakefile | 4 +- workflow/rules/map.smk | 153 ++++++++++++++++++------------------ workflow/rules/prepare.smk | 64 +++++++-------- workflow/rules/quantify.smk | 80 ++++++++++++------- 4 files changed, 161 insertions(+), 140 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 8e41580c..9f116726 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -31,7 +31,7 @@ validate(config, Path("../config/config_schema.json")) OUT_DIR = Path(config["output_dir"]) -TMP_DIR = Path(config["tmp_dir"]) +INTERMEDIATES_DIR = Path(config["intermediates_dir"]) LOG_DIR = Path(f"{config['local_log']}/../") @@ -42,7 +42,7 @@ LOG_DIR = Path(f"{config['local_log']}/../") onsuccess: print("\nWORKFLOW SUCCEED. 
Removing intermediate files.\n") - shell("rm -rf {TMP_DIR}") + shell("rm -rf {INTERMEDIATES_DIR}") onerror: diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index cdf515eb..0c1131ff 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -24,7 +24,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") -TMP_DIR = Path(config["tmp_dir"]) +INTERMEDIATES_DIR = Path(config["intermediates_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -70,7 +70,7 @@ localrules: rule finish_map: input: maps=expand( - TMP_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam.bai", + INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam.bai", sample=pd.unique(samples_table.index.values), ), @@ -87,7 +87,7 @@ rule start: format=convert_lib_format(get_sample("format")), ), output: - reads=TMP_DIR / "{sample}" / "{format}" / "reads.{format}", + reads=INTERMEDIATES_DIR / "{sample}" / "{format}" / "reads.{format}", params: cluster_log=CLUSTER_LOG / "uncompress_zipped_files_{sample}_{format}.log", log: @@ -105,9 +105,9 @@ rule start: rule fastq_quality_filter: input: - reads=TMP_DIR / "{sample}" / "fastq" / "reads.fastq", + reads=INTERMEDIATES_DIR / "{sample}" / "fastq" / "reads.fastq", output: - reads=TMP_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", + reads=INTERMEDIATES_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", params: cluster_log=CLUSTER_LOG / "fastq_quality_filter_{sample}.log", p=config["p_value"], @@ -135,9 +135,9 @@ rule fastq_quality_filter: rule fastq_to_fasta: input: - reads=TMP_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", + reads=INTERMEDIATES_DIR / "{sample}" / "fastq" / "filtered_reads.fastq", output: - reads=TMP_DIR / "{sample}" / "fastq" / "reads.fa", + reads=INTERMEDIATES_DIR / "{sample}" / "fastq" / "reads.fa", params: cluster_log=CLUSTER_LOG / "fastq_to_fasta_{sample}.log", log: @@ -157,12 +157,12 @@ rule 
fastq_to_fasta: rule format_fasta: input: - reads=lambda wildcards: TMP_DIR + reads=lambda wildcards: INTERMEDIATES_DIR / wildcards.sample / convert_lib_format(get_sample("format", wildcards.sample)) / "reads.fa", output: - reads=TMP_DIR / "{sample}" / "reads_formatted.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_formatted.fasta", params: cluster_log=CLUSTER_LOG / "format_fasta_{sample}.log", log: @@ -182,9 +182,9 @@ rule format_fasta: rule remove_adapters: input: - reads=TMP_DIR / "{sample}" / "reads_formatted.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_formatted.fasta", output: - reads=TMP_DIR / "{sample}" / "reads_trimmed_adapters.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_trimmed_adapters.fasta", params: adapter=lambda wildcards: get_sample("adapter", wildcards.sample).upper(), error_rate=config["error_rate"], @@ -219,9 +219,9 @@ rule remove_adapters: rule collapse_identical_reads: input: - reads=TMP_DIR / "{sample}" / "reads_trimmed_adapters.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_trimmed_adapters.fasta", output: - reads=TMP_DIR / "{sample}" / "reads_collapsed.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_collapsed.fasta", params: cluster_log=CLUSTER_LOG / "collapse_identical_reads_{sample}.log", log: @@ -241,11 +241,11 @@ rule collapse_identical_reads: rule map_genome_segemehl: input: - reads=TMP_DIR / "{sample}" / "reads_collapsed.fasta", - genome=TMP_DIR / "genome_processed.fa", - genome_index_segemehl=TMP_DIR / "segemehl_genome_index.idx", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_collapsed.fasta", + genome=INTERMEDIATES_DIR / "genome_processed.fa", + genome_index_segemehl=INTERMEDIATES_DIR / "segemehl_genome_index.idx", output: - gmap=TMP_DIR / "{sample}" / "segemehl_genome_mappings.sam", + gmap=INTERMEDIATES_DIR / "{sample}" / "segemehl_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "map_genome_segemehl_{sample}.log", log: @@ -276,11 +276,12 @@ rule map_genome_segemehl: rule 
map_transcriptome_segemehl: input: - reads=TMP_DIR / "{sample}" / "reads_collapsed.fasta", - transcriptome=TMP_DIR / "transcriptome_trimmed_id.fa", - transcriptome_index_segemehl=TMP_DIR / "segemehl_transcriptome_index.idx", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_collapsed.fasta", + transcriptome=INTERMEDIATES_DIR / "transcriptome_trimmed_id.fa", + transcriptome_index_segemehl=INTERMEDIATES_DIR + / "segemehl_transcriptome_index.idx", output: - tmap=TMP_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "map_transcriptome_segemehl_{sample}.log", log: @@ -311,10 +312,10 @@ rule map_transcriptome_segemehl: rule filter_fasta_for_oligomap: input: - reads=TMP_DIR / "{sample}" / "reads_collapsed.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_collapsed.fasta", script=SCRIPTS_DIR / "validation_fasta.py", output: - reads=TMP_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", params: cluster_log=CLUSTER_LOG / "filter_fasta_for_oligomap_{sample}.log", max_length_reads=config["max_length_reads"], @@ -339,11 +340,11 @@ rule filter_fasta_for_oligomap: rule map_genome_oligomap: input: - reads=TMP_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", - target=TMP_DIR / "genome_processed.fa", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + target=INTERMEDIATES_DIR / "genome_processed.fa", output: - gmap=TMP_DIR / "{sample}" / "oligomap_genome_mappings.fasta", - report=TMP_DIR / "{sample}" / "oligomap_genome_report.txt", + gmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_mappings.fasta", + report=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_report.txt", params: cluster_log=CLUSTER_LOG / "map_genome_oligomap_{sample}.log", log: @@ -372,11 +373,11 @@ rule map_genome_oligomap: rule sort_genome_oligomap: input: - tmap=TMP_DIR 
/ "{sample}" / "oligomap_genome_mappings.fasta", - report=TMP_DIR / "{sample}" / "oligomap_genome_report.txt", + tmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_mappings.fasta", + report=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=TMP_DIR / "{sample}" / "oligomap_genome_sorted.fasta", + sort=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_sorted.fasta", params: cluster_log=CLUSTER_LOG / "sort_genome_oligomap_{sample}.log", log: @@ -401,10 +402,10 @@ rule sort_genome_oligomap: rule convert_genome_to_sam_oligomap: input: - sort=TMP_DIR / "{sample}" / "oligomap_genome_sorted.fasta", + sort=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_sorted.fasta", script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: - gmap=TMP_DIR / "{sample}" / "oligomap_genome_mappings.sam", + gmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_genome_to_sam_{sample}.log", nh=config["nh"], @@ -431,11 +432,11 @@ rule convert_genome_to_sam_oligomap: rule map_transcriptome_oligomap: input: - reads=TMP_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", - target=TMP_DIR / "transcriptome_trimmed_id.fa", + reads=INTERMEDIATES_DIR / "{sample}" / "reads_filtered_for_oligomap.fasta", + target=INTERMEDIATES_DIR / "transcriptome_trimmed_id.fa", output: - tmap=TMP_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", - report=TMP_DIR / "{sample}" / "oligomap_transcriptome_report.txt", + tmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", + report=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_report.txt", params: cluster_log=CLUSTER_LOG / "map_transcriptome_oligomap_{sample}.log", log: @@ -465,11 +466,11 @@ rule map_transcriptome_oligomap: rule sort_transcriptome_oligomap: input: - tmap=TMP_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", - report=TMP_DIR / "{sample}" / 
"oligomap_transcriptome_report.txt", + tmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_mappings.fasta", + report=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_report.txt", script=SCRIPTS_DIR / "blocksort.sh", output: - sort=TMP_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", + sort=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", params: cluster_log=CLUSTER_LOG / "sort_transcriptome_oligomap_{sample}.log", log: @@ -493,10 +494,10 @@ rule sort_transcriptome_oligomap: rule convert_transcriptome_to_sam_oligomap: input: - sort=TMP_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", + sort=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: - tmap=TMP_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_transcriptome_to_sam_{sample}.log", nh=config["nh"], @@ -520,10 +521,10 @@ rule convert_transcriptome_to_sam_oligomap: rule merge_genome_maps: input: - gmap1=TMP_DIR / "{sample}" / "segemehl_genome_mappings.sam", - gmap2=TMP_DIR / "{sample}" / "oligomap_genome_mappings.sam", + gmap1=INTERMEDIATES_DIR / "{sample}" / "segemehl_genome_mappings.sam", + gmap2=INTERMEDIATES_DIR / "{sample}" / "oligomap_genome_mappings.sam", output: - gmaps=TMP_DIR / "{sample}" / "genome_mappings.sam", + gmaps=INTERMEDIATES_DIR / "{sample}" / "genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "merge_genome_maps_{sample}.log", log: @@ -541,10 +542,10 @@ rule merge_genome_maps: rule merge_transcriptome_maps: input: - tmap1=TMP_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", - tmap2=TMP_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", + tmap1=INTERMEDIATES_DIR / "{sample}" / "segemehl_transcriptome_mappings.sam", + tmap2=INTERMEDIATES_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", 
output: - tmaps=TMP_DIR / "{sample}" / "transcriptome_mappings.sam", + tmaps=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "merge_transcriptome_maps_{sample}.log", log: @@ -562,10 +563,10 @@ rule merge_transcriptome_maps: rule filter_genome_by_nh: input: - gmaps=TMP_DIR / "{sample}" / "genome_mappings.sam", + gmaps=INTERMEDIATES_DIR / "{sample}" / "genome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - gmaps=TMP_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", + gmaps=INTERMEDIATES_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", params: cluster_log=CLUSTER_LOG / "filter_genome_by_nh_{sample}.log", nh=config["nh"], @@ -590,10 +591,10 @@ rule filter_genome_by_nh: rule filter_transcriptome_by_nh: input: - tmaps=TMP_DIR / "{sample}" / "transcriptome_mappings.sam", + tmaps=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings.sam", script=SCRIPTS_DIR / "nh_filter.py", output: - tmaps=TMP_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", + tmaps=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", params: cluster_log=CLUSTER_LOG / "filter_transcriptome_by_nh_{sample}.log", nh=config["nh"], @@ -618,9 +619,9 @@ rule filter_transcriptome_by_nh: rule remove_header_genome_mappings: input: - gmap=TMP_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", + gmap=INTERMEDIATES_DIR / "{sample}" / "genome_mappings_filtered_nh.sam", output: - gmap=TMP_DIR / "{sample}" / "genome_mappings_no_header.sam", + gmap=INTERMEDIATES_DIR / "{sample}" / "genome_mappings_no_header.sam", params: cluster_log=CLUSTER_LOG / "remove_header_genome_mappings_{sample}.log", log: @@ -640,9 +641,9 @@ rule remove_header_genome_mappings: rule remove_header_transcriptome_mappings: input: - tmap=TMP_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_filtered_nh.sam", output: - tmap=TMP_DIR / "{sample}" / 
"transcriptome_mappings_no_header.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", params: cluster_log=CLUSTER_LOG / "remove_header_transcriptome_mappings_{sample}.log", log: @@ -662,11 +663,11 @@ rule remove_header_transcriptome_mappings: rule transcriptome_to_genome_maps: input: - tmap=TMP_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", + tmap=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_no_header.sam", script=SCRIPTS_DIR / "sam_trx_to_sam_gen.pl", - exons=TMP_DIR / "exons.bed", + exons=INTERMEDIATES_DIR / "exons.bed", output: - genout=TMP_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", + genout=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", params: cluster_log=CLUSTER_LOG / "transcriptome_to_genome_maps_{sample}.log", log: @@ -690,10 +691,10 @@ rule transcriptome_to_genome_maps: rule merge_all_maps: input: - gmap1=TMP_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", - gmap2=TMP_DIR / "{sample}" / "genome_mappings_no_header.sam", + gmap1=INTERMEDIATES_DIR / "{sample}" / "transcriptome_mappings_to_genome.sam", + gmap2=INTERMEDIATES_DIR / "{sample}" / "genome_mappings_no_header.sam", output: - catmaps=TMP_DIR / "{sample}" / "mappings_all_no_header.sam", + catmaps=INTERMEDIATES_DIR / "{sample}" / "mappings_all_no_header.sam", params: cluster_log=CLUSTER_LOG / "merge_all_mappings_{sample}.log", log: @@ -711,10 +712,10 @@ rule merge_all_maps: rule add_header_all_maps: input: - header=TMP_DIR / "genome_header.sam", - catmaps=TMP_DIR / "{sample}" / "mappings_all_no_header.sam", + header=INTERMEDIATES_DIR / "genome_header.sam", + catmaps=INTERMEDIATES_DIR / "{sample}" / "mappings_all_no_header.sam", output: - concatenate=TMP_DIR / "{sample}" / "mappings_all.sam", + concatenate=INTERMEDIATES_DIR / "{sample}" / "mappings_all.sam", params: cluster_log=CLUSTER_LOG / "add_header_{sample}.log", log: @@ -732,9 +733,9 @@ rule add_header_all_maps: rule sort_maps_by_id: 
input: - concatenate=TMP_DIR / "{sample}" / "mappings_all.sam", + concatenate=INTERMEDIATES_DIR / "{sample}" / "mappings_all.sam", output: - sort=TMP_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", + sort=INTERMEDIATES_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", params: cluster_log=CLUSTER_LOG / "sort_maps_by_id_{sample}.log", log: @@ -754,10 +755,10 @@ rule sort_maps_by_id: rule remove_inferiors: input: - sort=TMP_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", + sort=INTERMEDIATES_DIR / "{sample}" / "mappings_all_sorted_by_id.sam", script=SCRIPTS_DIR / "sam_remove_duplicates_inferior_alignments_multimappers.pl", output: - remove_inf=TMP_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", + remove_inf=INTERMEDIATES_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", params: cluster_log=CLUSTER_LOG / "remove_inferiors_{sample}.log", log: @@ -785,10 +786,10 @@ rule remove_inferiors: rule filter_by_indels: input: - sam=TMP_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", + sam=INTERMEDIATES_DIR / "{sample}" / "mappings_all_removed_inferiors.sam", script=SCRIPTS_DIR / "filter_multimappers.py", output: - sam=TMP_DIR / "{sample}" / "alignments_all.sam", + sam=INTERMEDIATES_DIR / "{sample}" / "alignments_all.sam", params: cluster_log=CLUSTER_LOG / "remove_multimappers_{sample}.log", log: @@ -815,9 +816,9 @@ rule filter_by_indels: rule convert_all_alns_sam_to_bam: input: - maps=TMP_DIR / "{sample}" / "alignments_all.sam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all.sam", output: - maps=TMP_DIR / "{sample}" / "alignments_all.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all.bam", params: cluster_log=CLUSTER_LOG / "convert_all_alns_sam_to_bam_{sample}.log", log: @@ -837,9 +838,9 @@ rule convert_all_alns_sam_to_bam: rule sort_all_alns_bam_by_position: input: - maps=TMP_DIR / "{sample}" / "alignments_all.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all.bam", output: - maps=TMP_DIR / "{sample}" / 
"alignments_all_sorted_{sample}.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", params: cluster_log=CLUSTER_LOG / "sort_all_alns_bam_by_position_{sample}.log", log: @@ -859,9 +860,9 @@ rule sort_all_alns_bam_by_position: rule index_all_alns_bam: input: - maps=TMP_DIR / "{sample}" / "alignments_all_sorted.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted.bam", output: - maps=TMP_DIR / "{sample}" / "alignments_all_sorted.bam.bai", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted.bam.bai", params: cluster_log=CLUSTER_LOG / "index_all_alns_bam_{sample}.log", log: diff --git a/workflow/rules/prepare.smk b/workflow/rules/prepare.smk index 437ccad5..85d56259 100644 --- a/workflow/rules/prepare.smk +++ b/workflow/rules/prepare.smk @@ -26,7 +26,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") -TMP_DIR = Path(config["tmp_dir"]) +INTERMEDIATES_DIR = Path(config["intermediates_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -49,17 +49,17 @@ localrules: rule finish_prepare: input: - idx_transcriptome=TMP_DIR / "segemehl_transcriptome_index.idx", - idx_genome=TMP_DIR / "segemehl_genome_index.idx", - exons=TMP_DIR / "exons.bed", - header=TMP_DIR / "genome_header.sam", - chrsize=TMP_DIR / "chr_size.txt", + idx_transcriptome=INTERMEDIATES_DIR / "segemehl_transcriptome_index.idx", + idx_genome=INTERMEDIATES_DIR / "segemehl_genome_index.idx", + exons=INTERMEDIATES_DIR / "exons.bed", + header=INTERMEDIATES_DIR / "genome_header.sam", + chrsize=INTERMEDIATES_DIR / "chr_size.txt", extended_mir=expand( - TMP_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), extended_primir=expand( - TMP_DIR / "extended_primir_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_primir_annotation_{extension}_nt.gff3", 
extension=config["extension"], ), @@ -74,7 +74,7 @@ rule trim_genome_seq_ids: genome=config["genome_file"], script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - genome=TMP_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", params: cluster_log=CLUSTER_LOG / "genome_process.log", log: @@ -92,10 +92,10 @@ rule trim_genome_seq_ids: rule extract_transcriptome_seqs: input: - genome=TMP_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", gtf=config["gtf_file"], output: - fasta=TMP_DIR / "transcriptome.fa", + fasta=INTERMEDIATES_DIR / "transcriptome.fa", params: cluster_log=CLUSTER_LOG / "extract_transcriptome_seqs.log", log: @@ -115,10 +115,10 @@ rule extract_transcriptome_seqs: rule trim_transcriptome_seq_ids: input: - fasta=TMP_DIR / "transcriptome.fa", + fasta=INTERMEDIATES_DIR / "transcriptome.fa", script=SCRIPTS_DIR / "trim_id_fasta.sh", output: - fasta=TMP_DIR / "transcriptome_trimmed_id.fa", + fasta=INTERMEDIATES_DIR / "transcriptome_trimmed_id.fa", params: cluster_log=CLUSTER_LOG / "trim_transcriptome.log", log: @@ -136,9 +136,9 @@ rule trim_transcriptome_seq_ids: rule generate_segemehl_index_transcriptome: input: - fasta=TMP_DIR / "transcriptome_trimmed_id.fa", + fasta=INTERMEDIATES_DIR / "transcriptome_trimmed_id.fa", output: - idx=TMP_DIR / "segemehl_transcriptome_index.idx", + idx=INTERMEDIATES_DIR / "segemehl_transcriptome_index.idx", params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_transcriptome.log", log: @@ -162,9 +162,9 @@ rule generate_segemehl_index_transcriptome: rule generate_segemehl_index_genome: input: - genome=TMP_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", output: - idx=TMP_DIR / "segemehl_genome_index.idx", + idx=INTERMEDIATES_DIR / "segemehl_genome_index.idx", params: cluster_log=CLUSTER_LOG / "generate_segemehl_index_genome.log", log: @@ -191,7 +191,7 @@ rule get_exons_gtf: gtf=config["gtf_file"], script=SCRIPTS_DIR / 
"get_lines_w_pattern.sh", output: - exons=TMP_DIR / "exons.gtf", + exons=INTERMEDIATES_DIR / "exons.gtf", params: cluster_log=CLUSTER_LOG / "get_exons_gtf.log", log: @@ -215,10 +215,10 @@ rule get_exons_gtf: rule convert_exons_gtf_to_bed: input: - exons=TMP_DIR / "exons.gtf", + exons=INTERMEDIATES_DIR / "exons.gtf", script=SCRIPTS_DIR / "gtf_exons_bed.1.1.2.R", output: - exons=TMP_DIR / "exons.bed", + exons=INTERMEDIATES_DIR / "exons.bed", params: cluster_log=CLUSTER_LOG / "exons_gtf_to_bed.log", log: @@ -242,9 +242,9 @@ rule convert_exons_gtf_to_bed: rule create_genome_header: input: - genome=TMP_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", output: - header=TMP_DIR / "genome_header.sam", + header=INTERMEDIATES_DIR / "genome_header.sam", params: cluster_log=CLUSTER_LOG / "create_genome_header.log", log: @@ -268,7 +268,7 @@ rule map_chr_names: script=SCRIPTS_DIR / "map_chromosomes.pl", map_chr=config["map_chr_file"], output: - gff=TMP_DIR / "mirna_annotations.gff3", + gff=INTERMEDIATES_DIR / "mirna_annotations.gff3", params: cluster_log=CLUSTER_LOG / "map_chr_names.log", column="1", @@ -296,9 +296,9 @@ rule map_chr_names: rule create_index_genome_fasta: input: - genome=TMP_DIR / "genome_processed.fa", + genome=INTERMEDIATES_DIR / "genome_processed.fa", output: - genome=TMP_DIR / "genome_processed.fa.fai", + genome=INTERMEDIATES_DIR / "genome_processed.fa.fai", params: cluster_log=CLUSTER_LOG / "create_index_genome_fasta.log", log: @@ -318,9 +318,9 @@ rule create_index_genome_fasta: rule extract_chr_len: input: - genome=TMP_DIR / "genome_processed.fa.fai", + genome=INTERMEDIATES_DIR / "genome_processed.fa.fai", output: - chrsize=TMP_DIR / "chr_size.txt", + chrsize=INTERMEDIATES_DIR / "chr_size.txt", params: cluster_log=CLUSTER_LOG / "extract_chr_len.log", log: @@ -338,21 +338,21 @@ rule extract_chr_len: rule extend_mirs_annotations: input: - gff3=TMP_DIR / "mirna_annotations.gff3", - chrsize=TMP_DIR / "chr_size.txt", + 
gff3=INTERMEDIATES_DIR / "mirna_annotations.gff3", + chrsize=INTERMEDIATES_DIR / "chr_size.txt", script=SCRIPTS_DIR / "mirna_extension.py", output: extended_mir=expand( - TMP_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), extended_primir=expand( - TMP_DIR / "extended_primir_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], ), params: cluster_log=CLUSTER_LOG / "extend_mirs_annotations.log", - out_dir=TMP_DIR, + out_dir=INTERMEDIATES_DIR, extension=config["extension"], log: LOCAL_LOG / "extend_mirs_annotations.log", diff --git a/workflow/rules/quantify.smk b/workflow/rules/quantify.smk index 043d7744..e3fe7675 100644 --- a/workflow/rules/quantify.smk +++ b/workflow/rules/quantify.smk @@ -25,7 +25,7 @@ validate(config, Path("../../config/config_schema.json")) ENV_DIR = Path(f"{workflow.basedir}/envs") OUT_DIR = Path(config["output_dir"]) -TMP_DIR = Path(config["tmp_dir"]) +INTERMEDIATES_DIR = Path(config["intermediates_dir"]) SCRIPTS_DIR = Path(config["scripts_dir"]) CLUSTER_LOG = Path(config["cluster_log"]) @@ -86,13 +86,13 @@ rule finish_quantify: rule intersect_extended_primir: input: - alignment=TMP_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", + alignment=INTERMEDIATES_DIR / "{sample}" / "alignments_all_sorted_{sample}.bam", primir=expand( - TMP_DIR / "extended_primir_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_primir_annotation_{extension}_nt.gff3", extension=config["extension"], ), output: - intersect=TMP_DIR / "{sample}" / "intersected_extended_primir.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_primir.bed", params: cluster_log=CLUSTER_LOG / "intersect_extended_primir_{sample}.log", log: @@ -120,8 +120,8 @@ rule intersect_extended_primir: rule filter_sam_by_intersecting_primir: input: - alignments=TMP_DIR 
/ "{sample}" / "alignments_all.sam", - intersect=TMP_DIR / "{sample}" / "intersected_extended_primir.bed", + alignments=INTERMEDIATES_DIR / "{sample}" / "alignments_all.sam", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_primir.bed", output: sam=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", params: @@ -150,7 +150,7 @@ rule convert_intersecting_primir_sam_to_bam: input: maps=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", output: - maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_intersecting_primir.bam", params: cluster_log=CLUSTER_LOG / "convert_intersecting_primir_sam_to_bam_{sample}.log", log: @@ -170,9 +170,11 @@ rule convert_intersecting_primir_sam_to_bam: rule sort_intersecting_primir_bam_by_position: input: - maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir.bam", + maps=INTERMEDIATES_DIR / "{sample}" / "alignments_intersecting_primir.bam", output: - maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_primir_sorted.bam", params: cluster_log=CLUSTER_LOG / "sort_intersecting_primir_bam_by_position_{sample}.log", @@ -193,9 +195,13 @@ rule sort_intersecting_primir_bam_by_position: rule index_intersecting_primir_bam: input: - maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_primir_sorted.bam", output: - maps=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam.bai", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_primir_sorted.bam.bai", params: cluster_log=CLUSTER_LOG / "index_intersecting_primir_bam_{sample}.log", log: @@ -215,13 +221,15 @@ rule index_intersecting_primir_bam: rule intersect_extended_mirna: input: - alignment=TMP_DIR / "{sample}" / "alignments_intersecting_primir_sorted.bam", + alignment=INTERMEDIATES_DIR + 
/ "{sample}" + / "alignments_intersecting_primir_sorted.bam", mirna=expand( - TMP_DIR / "extended_mirna_annotation_{extension}_nt.gff3", + INTERMEDIATES_DIR / "extended_mirna_annotation_{extension}_nt.gff3", extension=config["extension"], ), output: - intersect=TMP_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_mirna.bed", params: cluster_log=CLUSTER_LOG / "intersect_extended_mirna_{sample}.log", log: @@ -250,7 +258,7 @@ rule intersect_extended_mirna: rule filter_sam_by_intersecting_mirna: input: alignments=OUT_DIR / "{sample}" / "alignments_intersecting_primir.sam", - intersect=TMP_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_mirna.bed", output: sam=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", params: @@ -278,10 +286,10 @@ rule filter_sam_by_intersecting_mirna: rule add_intersecting_mirna_tag: input: alignments=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", - intersect=TMP_DIR / "{sample}" / "intersected_extended_mirna.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_mirna.bed", script=SCRIPTS_DIR / "iso_name_tagging.py", output: - sam=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", + sam=INTERMEDIATES_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", params: extension=config["extension"], cluster_log=CLUSTER_LOG / "add_intersecting_mirna_tag_{sample}.log", @@ -307,9 +315,11 @@ rule add_intersecting_mirna_tag: rule sort_intersecting_mirna_by_feat_tag: input: - sam=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", + sam=INTERMEDIATES_DIR / "{sample}" / "alignments_intersecting_mirna_tag.sam", output: - sam=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", + sam=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_sorted_tag.sam", params: cluster_log=CLUSTER_LOG / 
"sort_intersecting_mirna_by_feat_tag_{sample}.log", log: @@ -329,15 +339,17 @@ rule sort_intersecting_mirna_by_feat_tag: rule quantify_mirna: input: - alignments=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_sorted_tag.sam", + alignments=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_sorted_tag.sam", script=SCRIPTS_DIR / "mirna_quantification.py", output: - table=TMP_DIR / "TABLES" / "mirna_counts_{sample}", + table=INTERMEDIATES_DIR / "TABLES" / "mirna_counts_{sample}", params: cluster_log=CLUSTER_LOG / "quantify_mirna_{sample}.log", mir_list=config["mir_list"], library="{sample}", - out_dir=TMP_DIR / "TABLES", + out_dir=INTERMEDIATES_DIR / "TABLES", log: LOCAL_LOG / "quantify_mirna_{sample}.log", container: @@ -363,10 +375,10 @@ rule quantify_mirna: rule quantify_primir: input: - intersect=TMP_DIR / "{sample}" / "intersected_extended_primir.bed", + intersect=INTERMEDIATES_DIR / "{sample}" / "intersected_extended_primir.bed", script=SCRIPTS_DIR / "primir_quantification.py", output: - table=TMP_DIR / "TABLES" / "pri-mir_counts_{sample}", + table=INTERMEDIATES_DIR / "TABLES" / "pri-mir_counts_{sample}", params: cluster_log=CLUSTER_LOG / "quantify_primir_{sample}.log", log: @@ -393,7 +405,7 @@ rule quantify_primir: rule merge_tables: input: table=expand( - TMP_DIR / "TABLES" / "{mir}_counts_{sample}", + INTERMEDIATES_DIR / "TABLES" / "{mir}_counts_{sample}", sample=pd.unique(samples_table.index.values), mir=[mir for mir in config["mir_list"] if mir != "isomir"], ), @@ -403,7 +415,7 @@ rule merge_tables: params: cluster_log=CLUSTER_LOG / "merge_tables_{mirna}.log", prefix="{mirna}_counts_", - input_dir=TMP_DIR / "TABLES", + input_dir=INTERMEDIATES_DIR / "TABLES", log: LOCAL_LOG / "merge_tables_{mirna}.log", container: @@ -430,7 +442,9 @@ rule uncollapse_reads: maps=OUT_DIR / "{sample}" / "alignments_intersecting_mirna.sam", script=SCRIPTS_DIR / "sam_uncollapse.pl", output: - maps=TMP_DIR / "{sample}" / 
"alignments_intersecting_mirna_uncollapsed.sam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_uncollapsed.sam", params: cluster_log=CLUSTER_LOG / "uncollapse_reads_{sample}.log", log: @@ -454,9 +468,13 @@ rule uncollapse_reads: rule convert_uncollpased_reads_sam_to_bam: input: - maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.sam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_uncollapsed.sam", output: - maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_uncollapsed.bam", params: cluster_log=CLUSTER_LOG / "convert_uncollapsed_reads_sam_to_bam_{sample}.log", log: @@ -476,7 +494,9 @@ rule convert_uncollpased_reads_sam_to_bam: rule sort_uncollpased_reads_bam_by_position: input: - maps=TMP_DIR / "{sample}" / "alignments_intersecting_mirna_uncollapsed.bam", + maps=INTERMEDIATES_DIR + / "{sample}" + / "alignments_intersecting_mirna_uncollapsed.bam", output: maps=OUT_DIR / "{sample}" From c41d03f3211c711816fe0c587170b58a5d57c0e5 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Wed, 6 Dec 2023 03:47:57 +0100 Subject: [PATCH 18/21] build: change intermediates directory --- config/config_schema.json | 6 +++--- config/config_template.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/config_schema.json b/config/config_schema.json index 86a6f18c..842c1753 100644 --- a/config/config_schema.json +++ b/config/config_schema.json @@ -30,10 +30,10 @@ "default": "results/", "description": "Path to the output directory." }, - "tmp_dir":{ + "intermediates_dir":{ "type": "string", - "default": "results/inter_files", - "description": "Path to the temporary directory storing the intermediate files." + "default": "results/intermediates", + "description": "Path to the directory storing the intermediate files." 
}, "local_log":{ "type": "string", diff --git a/config/config_template.yaml b/config/config_template.yaml index 70ddac51..06f89bd6 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -32,7 +32,7 @@ map_chr_file: path/to/ucsc_ensembl_mappings.tsv #### DIRECTORIES #### output_dir: results/ -tmp_dir: results/inter_files +intermediates_dir: results/intermediates local_log: logs/local/ cluster_log: logs/cluster/ scripts_dir: ../scripts/ From d299225de0250596f3b780dad2f713e51d1f0873 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Wed, 6 Dec 2023 03:48:19 +0100 Subject: [PATCH 19/21] test: update intermediates directory --- test/expected_output.md5 | 110 +++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 3129cc56..c3fc05e6 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -1,58 +1,58 @@ 68f943f89b52d628851dd97fb1399d68 results/TABLES/all_mirna_counts.tab 363ecee318c57ee7e2e45ca468007baa results/TABLES/all_pri-mir_counts.tab 0d76977b2e36046cc176112776c5fa4e results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam.bai -f448bbeab20b0db75a5ca8bfb83c6ceb results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam -9f0bad0ed3c62d0410060d8b332315e8 results/test_lib/alignments_intersecting_mirna.sam -4ae56cdb8de0fbaac24b4a49d356f7f8 results/test_lib/alignments_intersecting_primir.sam -eec9be6cda61d2728290c92c1209f455 results/inter_files/TABLES/mirna_counts_test_lib -a844e3a29159e36e2f17a0646d1e8c5f results/inter_files/TABLES/pri-mir_counts_test_lib -36f7d024fe6ddfd3e788aebf61c61061 results/inter_files/test_lib/oligomap_genome_sorted.fasta -48e605df55bf2dd37ea5a5a74eb5872a results/inter_files/test_lib/mappings_all.sam -d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/oligomap_transcriptome_mappings.fasta -f54bacf9bf4188541a0c0fedc203e3ed 
results/inter_files/test_lib/mappings_all_removed_inferiors.sam -4b86be9b7ed15ddc0067b8de4aad431c results/inter_files/test_lib/alignments_intersecting_primir.bam -defdc8c46e1d73692edde0e0278f2d5e results/inter_files/test_lib/oligomap_genome_mappings.fasta -3aca095999e737c5d9cdb66540e8b195 results/inter_files/test_lib/segemehl_transcriptome_mappings.sam -3aca095999e737c5d9cdb66540e8b195 results/inter_files/test_lib/transcriptome_mappings_filtered_nh.sam -698711937e6d98dd65b70b3a738388b4 results/inter_files/test_lib/alignments_all.bam -a124a5afdb5f7bfbcc5683260556c9c4 results/inter_files/test_lib/mappings_all_no_header.sam -cb542d2dd6b4405d690086de0bb5ec70 results/inter_files/test_lib/alignments_all.sam -d8ab74abfa3ed2b2a92c83142af1c638 results/inter_files/test_lib/alignments_intersecting_mirna_tag.sam -d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/oligomap_transcriptome_sorted.fasta -f34a0091f633db03a940d0c790ad265a results/inter_files/test_lib/segemehl_genome_mappings.sam -d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/transcriptome_mappings_to_genome.sam -63a32839360a985b68e0685aafad5c54 results/inter_files/test_lib/fa/reads.fa -a8b1a66aecf4d7b583362ea8619228ed results/inter_files/test_lib/alignments_intersecting_mirna_uncollapsed.sam -edcb854702519c0002d8ce89a21e54ef results/inter_files/test_lib/reads_formatted.fasta -1a547487b8e92ad85bb26ff9b1db1f93 results/inter_files/test_lib/intersected_extended_mirna.bed -a71a2dd39c82baee52d5dbe2e3a39457 results/inter_files/test_lib/alignments_all_sorted_test_lib.bam -ec0e9bcc8ea857da897035c8fca4078f results/inter_files/test_lib/reads_trimmed_adapters.fasta -6c6284e7328dbcb903afaadd4df857b8 results/inter_files/test_lib/alignments_intersecting_mirna_uncollapsed.bam -0454bc9f3edd9348a7b3e08d9c3007d8 results/inter_files/test_lib/genome_mappings.sam -f5cb65466d328036a15b66cfbd4d8419 results/inter_files/test_lib/oligomap_genome_report.txt -6cbdb9299e09b3e39b79a50db69226b5 
results/inter_files/test_lib/transcriptome_mappings_no_header.sam -3aca095999e737c5d9cdb66540e8b195 results/inter_files/test_lib/transcriptome_mappings.sam -947607be69c16246f8dc9adbd9b971c8 results/inter_files/test_lib/oligomap_genome_mappings.sam -fa14b33623fd12b068a6d4ae301e7f49 results/inter_files/test_lib/alignments_intersecting_mirna_sorted_tag.sam -b6de7f5615b4b05834f4af11df993345 results/inter_files/test_lib/alignments_intersecting_primir_sorted.bam -d41d8cd98f00b204e9800998ecf8427e results/inter_files/test_lib/oligomap_transcriptome_mappings.sam -a124a5afdb5f7bfbcc5683260556c9c4 results/inter_files/test_lib/genome_mappings_no_header.sam -0454bc9f3edd9348a7b3e08d9c3007d8 results/inter_files/test_lib/genome_mappings_filtered_nh.sam -09c89a2769c919e58c3a3d3cbe2ceaf6 results/inter_files/test_lib/mappings_all_sorted_by_id.sam -2c77ffa021dda190d82f3f54a3312393 results/inter_files/test_lib/reads_collapsed.fasta -f68693cfaa1e6ea78e1a5562ade6d9ed results/inter_files/test_lib/intersected_extended_primir.bed -c2a5770a755ada66ef63d96eec4afb00 results/inter_files/test_lib/reads_filtered_for_oligomap.fasta -fe5388094985e9604a302d39d2abc82c results/inter_files/test_lib/oligomap_transcriptome_report.txt -be7a0d92e57480190de57eb30baffa36 results/inter_files/extended_mirna_annotation_6_nt.gff3 -8148cd880602255be166beb59bbed95a results/inter_files/genome_header.sam -09e24a504bfec37fee3d5ff1b5c7738e results/inter_files/exons.bed -4fb453846e88593d0cac13220ec2d685 results/inter_files/segemehl_genome_index.idx -d34fc868b861b1bc46db07a397dc0f10 results/inter_files/genome_processed.fa.fai -21e102e4ebd3508bb06f46366a3d578d results/inter_files/exons.gtf -003b92b245ac336e3d70a513033e1cee results/inter_files/transcriptome_trimmed_id.fa -44dbf7c3eae00d0bc8d5e1319123746c results/inter_files/chr_size.txt -cc5c3512dab0e269d82bd625de74198e results/inter_files/extended_primir_annotation_6_nt.gff3 -f28cc0143ab6659bef3de3a7afa1dccc results/inter_files/mirna_annotations.gff3 
-2d437f8681f4248d4f2075f86debb920 results/inter_files/transcriptome.fa -7eb64c112830266bcf416ded60b4cf77 results/inter_files/segemehl_transcriptome_index.idx -4fba145540a2c61f29bfddfd0f5a4d4e results/inter_files/genome_processed.fa +25aca3f96e7ed644067d2050393bf7a4 results/test_lib/alignments_intersecting_mirna_uncollapsed_sorted.bam +cc01c7884838a597c587437cb0acf64e results/test_lib/alignments_intersecting_mirna.sam +b1eb81426f890d671bba8c8a815edc1e results/test_lib/alignments_intersecting_primir.sam +eec9be6cda61d2728290c92c1209f455 results/intermediates/TABLES/mirna_counts_test_lib +a844e3a29159e36e2f17a0646d1e8c5f results/intermediates/TABLES/pri-mir_counts_test_lib +36f7d024fe6ddfd3e788aebf61c61061 results/intermediates/test_lib/oligomap_genome_sorted.fasta +48e605df55bf2dd37ea5a5a74eb5872a results/intermediates/test_lib/mappings_all.sam +d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/oligomap_transcriptome_mappings.fasta +e9aac4afeb2053385d60f5e4b07a9774 results/intermediates/test_lib/mappings_all_removed_inferiors.sam +9ebcb4ac877f37921b88ceca3ff03b62 results/intermediates/test_lib/alignments_intersecting_primir.bam +defdc8c46e1d73692edde0e0278f2d5e results/intermediates/test_lib/oligomap_genome_mappings.fasta +e632f8984d423d46bbb377ec75468521 results/intermediates/test_lib/segemehl_transcriptome_mappings.sam +e632f8984d423d46bbb377ec75468521 results/intermediates/test_lib/transcriptome_mappings_filtered_nh.sam +3344bbeb9fe01f07c04831e5b4a795ba results/intermediates/test_lib/alignments_all.bam +a124a5afdb5f7bfbcc5683260556c9c4 results/intermediates/test_lib/mappings_all_no_header.sam +d62630102c33d43d593af14c2a642839 results/intermediates/test_lib/alignments_all.sam +81103749d61bc55ee2cfc84ca1527456 results/intermediates/test_lib/alignments_intersecting_mirna_tag.sam +d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/oligomap_transcriptome_sorted.fasta +76643f87bb2e2bff77d1b1223d7720b5 
results/intermediates/test_lib/segemehl_genome_mappings.sam +d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/transcriptome_mappings_to_genome.sam +63a32839360a985b68e0685aafad5c54 results/intermediates/test_lib/fa/reads.fa +e9e9698d9350b64b64c1f6d96019fce8 results/intermediates/test_lib/alignments_intersecting_mirna_uncollapsed.sam +edcb854702519c0002d8ce89a21e54ef results/intermediates/test_lib/reads_formatted.fasta +1a547487b8e92ad85bb26ff9b1db1f93 results/intermediates/test_lib/intersected_extended_mirna.bed +a287ffc43b6afbdde3e9905bc27c28a5 results/intermediates/test_lib/alignments_all_sorted_test_lib.bam +ec0e9bcc8ea857da897035c8fca4078f results/intermediates/test_lib/reads_trimmed_adapters.fasta +d7a5ab720ff9c96f41f3755a05b8f9e0 results/intermediates/test_lib/alignments_intersecting_mirna_uncollapsed.bam +1f1b873d05ec14ef9b16376a1c98315b results/intermediates/test_lib/genome_mappings.sam +f5cb65466d328036a15b66cfbd4d8419 results/intermediates/test_lib/oligomap_genome_report.txt +6cbdb9299e09b3e39b79a50db69226b5 results/intermediates/test_lib/transcriptome_mappings_no_header.sam +e632f8984d423d46bbb377ec75468521 results/intermediates/test_lib/transcriptome_mappings.sam +947607be69c16246f8dc9adbd9b971c8 results/intermediates/test_lib/oligomap_genome_mappings.sam +ce3fcd037e0a6a0b1a7a3253219e7053 results/intermediates/test_lib/alignments_intersecting_mirna_sorted_tag.sam +53764354c520d9700f13761c2721d8aa results/intermediates/test_lib/alignments_intersecting_primir_sorted.bam +d41d8cd98f00b204e9800998ecf8427e results/intermediates/test_lib/oligomap_transcriptome_mappings.sam +a124a5afdb5f7bfbcc5683260556c9c4 results/intermediates/test_lib/genome_mappings_no_header.sam +1f1b873d05ec14ef9b16376a1c98315b results/intermediates/test_lib/genome_mappings_filtered_nh.sam +6cc6165e8942a08420552aa810e629f8 results/intermediates/test_lib/mappings_all_sorted_by_id.sam +2c77ffa021dda190d82f3f54a3312393 results/intermediates/test_lib/reads_collapsed.fasta 
+f68693cfaa1e6ea78e1a5562ade6d9ed results/intermediates/test_lib/intersected_extended_primir.bed +c2a5770a755ada66ef63d96eec4afb00 results/intermediates/test_lib/reads_filtered_for_oligomap.fasta +fe5388094985e9604a302d39d2abc82c results/intermediates/test_lib/oligomap_transcriptome_report.txt +be7a0d92e57480190de57eb30baffa36 results/intermediates/extended_mirna_annotation_6_nt.gff3 +8148cd880602255be166beb59bbed95a results/intermediates/genome_header.sam +09e24a504bfec37fee3d5ff1b5c7738e results/intermediates/exons.bed +4fb453846e88593d0cac13220ec2d685 results/intermediates/segemehl_genome_index.idx +d34fc868b861b1bc46db07a397dc0f10 results/intermediates/genome_processed.fa.fai +21e102e4ebd3508bb06f46366a3d578d results/intermediates/exons.gtf +003b92b245ac336e3d70a513033e1cee results/intermediates/transcriptome_trimmed_id.fa +44dbf7c3eae00d0bc8d5e1319123746c results/intermediates/chr_size.txt +cc5c3512dab0e269d82bd625de74198e results/intermediates/extended_primir_annotation_6_nt.gff3 +f28cc0143ab6659bef3de3a7afa1dccc results/intermediates/mirna_annotations.gff3 +2d437f8681f4248d4f2075f86debb920 results/intermediates/transcriptome.fa +7eb64c112830266bcf416ded60b4cf77 results/intermediates/segemehl_transcriptome_index.idx +4fba145540a2c61f29bfddfd0f5a4d4e results/intermediates/genome_processed.fa From 29d6f1fe4113e986ab25e3d1f7b63af2ed5e9bc2 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Wed, 6 Dec 2023 03:48:46 +0100 Subject: [PATCH 20/21] docs: rewrite output files section --- README.md | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 8bb7ece5..12b39325 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ We recommend creating a copy of the ```bash cp config/config_template.yaml path/to/config.yaml -``` +```
Open the new copy in your editor of choice and adjust the configuration parameters to your liking. The template explains what each of the @@ -255,31 +255,30 @@ the `results/` and `logs/` directories, respectively. ### Expected output files Upon successful execution of _MIRFLOWZ_, the tool automatically removes all -intermediate files generated during the process. The final output comprises: +intermediate files generated during the process. The final outputs comprise: 1. A SAM file containing alignments intersecting a pri-miR locus. These alignments intersect with extended start and/or end positions specified in the provided pri-miR annotations. Please note that they may not contribute to the -final counting and will not appear in the final table. +final counting and may not appear in the final table. Alignments are discarded +if their start and/or end positions differ from the ends of the provided +pri-miR annotations by more bases than the extension used. -2. A SAM file containing alignments intersecting a miRNA locus. Similar to the -previous file, these alignments intersect with extended start and/or end +2. A SAM file containing alignments intersecting a mature miRNA locus. Similar +to the previous file, these alignments intersect with extended start and/or end positions specified in the provided miRNA annotations. They may not contribute to the final counting and might be absent from the final table. -3. A SAM file containing the uncollapsed set of alignments that contribute to -the final counting. - -4. A BAM file containing the uncollapsed set of alignments contributing to the -final counting and its corresponding index file (`bam..bai`). +3. A BAM file containing the set of alignments contributing to the final +counting and its corresponding index file (`.bam.bai`). -5. Table(s) containing the counting data from all libraries for (iso)miRs +4. Table(s) containing the counting data from all libraries for (iso)miRs and/or pri-miRs. 
Each row corresponds to a miRNA species, and each column -represents a sample library. Counting involves aggregating contributions from -all alignments, calculated as the ratio of collapsed reads in th alignment to -the number of hits (NH value). +represents a sample library. Each read is counted towards all the annotated +miRNA species it aligns to, with 1/n, where n is the number of genomic and/or +transcriptomic loci that read aligns to. -To retain all intermediate files, include --no-hooks in the workflow call. +To retain all intermediate files, include `--no-hooks` in the workflow call. ```bash snakemake \ @@ -294,7 +293,7 @@ snakemake \ ``` After successful execution of the workflow, the intermediate files will be -found in the `results/inter_files` directory. +found in the `results/intermediates` directory. ### Creating a Snakemake report From d0862157eb197beb31b18d1a73746ea26b8ff855 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Fri, 8 Dec 2023 15:09:05 +0100 Subject: [PATCH 21/21] change logs dir --- workflow/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 9f116726..e9648d69 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -46,7 +46,7 @@ onsuccess: onerror: - print("\nWORKFLOW FAILED. Check the log file in the LOGS/ directory.\n") + print("\nWORKFLOW FAILED. Check the log file in the log directory.\n") shell("cat {log} > {LOG_DIR}/failed_workflow.log")