From b2f667ea0f3b06a6c84eba47426136cdb87d8185 Mon Sep 17 00:00:00 2001 From: deliaBlue <103108590+deliaBlue@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:45:37 +0100 Subject: [PATCH] feat: generate alignment pileups (#132) * feat: add pileup workflow * docs: add pileups data and workflow rules * build: add pileups parameters * test: update test config file * test: add BED test file * docs: update rule graph * feat: add pileup script * refactor: add create empty BED function * test: update expected output --- README.md | 25 +- config/README.md | 8 +- config/config_schema.json | 21 +- config/config_template.yaml | 13 + images/rule_graph.svg | 844 ++++++++++++--------- pipeline_documentation.md | 179 ++++- scripts/ascii_alignment_pileup.R | 355 +++++++++ scripts/tests/test_mirna_quantification.py | 8 +- test/config.yaml | 4 +- test/expected_output.md5 | 12 + test/test_files/pileup_regions.bed | 2 + workflow/Snakefile | 14 +- workflow/rules/common.smk | 10 + workflow/rules/pileup.smk | 258 +++++++ 14 files changed, 1376 insertions(+), 377 deletions(-) create mode 100644 scripts/ascii_alignment_pileup.R create mode 100644 test/test_files/pileup_regions.bed create mode 100644 workflow/rules/pileup.smk diff --git a/README.md b/README.md index 12b3932..2a8588d 100644 --- a/README.md +++ b/README.md @@ -209,10 +209,15 @@ There are 4 files you must provide: resource][chrMap] provides such files for various organisms, and in the expected format. +5. **OPTIONAL**: A **BED6** file with regions for which to produce + [ASCII-style alignment pileups][ascii-pileups]. If not provided, no pileups + will be generated. See [here][bed-format] for the expected format. + > General note: If you want to process the genome resources before use (e.g., > filtering), you can do that, but make sure the formats of any modified > resource files meet the formatting expectations outlined above! + #### 3. 
Prepare a configuration file We recommend creating a copy of the @@ -220,7 +225,7 @@ We recommend creating a copy of the ```bash cp config/config_template.yaml path/to/config.yaml -``` So on that PR I could move this information in the section/file all of this will be written. +``` Open the new copy in your editor of choice and adjust the configuration parameters to your liking. The template explains what each of the @@ -278,6 +283,13 @@ represents a sample library. Each read is counted towards all the annotated miRNA species it aligns to, with 1/n, where n is the number of genomic and/or transcriptomic loci that read aligns to. +5. **OPTIONAL**. ASCII-style pileups of read alignments produced for individual +libraries, combinations of libraries and/or all libraries of a given run. The +exact number and nature of the outputs depends on the workflow +inputs/parameters. See the +[pileups section](pipeline_documentation.md/#pileup-workflow) for a detailed +description. + To retain all intermediate files, include `--no-hooks` in the workflow call. ```bash @@ -319,10 +331,11 @@ be aligned separately against the genome and transcriptome. For increased fidelity, two separated aligners, [Segemehl][segemehl] and our in-house tool [Oligomap][oligomap], are used. All the resulting alignments are merged such that only the best alignments of each read are kept (smallest edit distance). -Finally, alignments are intersected with the user-provided, pre-processed -miRNA annotation file using [BEDTools][bedtools]. Counts are tabulated -separately for reads consistent with either miRNA precursors, mature miRNA -and/or isomiRs. +Alignments are intersected with the user-provided, pre-processed miRNA +annotation file using [BEDTools][bedtools]. Counts are tabulated separately for +reads consistent with either miRNA precursors, mature miRNA and/or isomiRs. +Finally, ASCII-style alignment pileups are optionally generated for +user-defined regions of interest. 
> **NOTE:** For a detailed description of each rule, please, refer to the > [workflow documentation](pipeline_documentation.md) @@ -350,6 +363,8 @@ For questions or suggestions regarding the code, please use the [issue tracker][ © 2023 [Zavolab, Biozentrum, University of Basel][zavolab] +[ascii-pileups]: +[bed-format]: [bedtools]: [chrMap]: [conda]: diff --git a/config/README.md b/config/README.md index a52e86e..1d896c5 100644 --- a/config/README.md +++ b/config/README.md @@ -100,6 +100,10 @@ There are 4 files you must provide: resource][chrMap] provides such files for various organisms, and in the expected format. +5. **OPTIONAL**: A **BED6** file with regions for which to produce + [ASCII-style alignment pileups][ascii-pileups]. If not provided, no pileups + will be generated. See [here][bed-format] for the expected format. + > General note: If you want to process the genome resources before use (e.g., > filtering), you can do that, but make sure the formats of any modified > resource files meet the formatting expectations outlined above! @@ -118,8 +122,10 @@ Open the new copy in your editor of choice and adjust the configuration parameters to your liking. The template explains what each of the parameters mean and how you can meaningfully adjust them. - +[ascii-pileups]: +[bed-format]: [chrMap]: [ensembl]: +[ensembl-bed]: [mamba]: [mirbase]: diff --git a/config/config_schema.json b/config/config_schema.json index 842c175..00054ef 100644 --- a/config/config_schema.json +++ b/config/config_schema.json @@ -9,6 +9,11 @@ "type": "string", "description": "Path to the samples table." }, + "bed_file":{ + "type": "string", + "default": "", + "description": "Path to the genomic regions file to do the pileups for." + }, "genome_file":{ "type": "string", "description": "Path to the reference genome file." @@ -35,6 +40,11 @@ "default": "results/intermediates", "description": "Path to the directory storing the intermediate files." 
}, + "pileups_dir":{ + "type": "string", + "default": "results/pileups", + "description": "Path to the directory storing the ASCII-style pileups." + }, "local_log":{ "type": "string", "default": "logs/local/", @@ -101,7 +111,16 @@ "type": "string", "enum": ["isomir", "mirna", "pri-mir"] }, - "default": ["isomir", "mirna", "pri-mir"] + "default": ["isomir", "mirna", "pri-mir"], + "description": "miRNA species to be quantified." + }, + "lib_dict":{ + "type": "object", + "additionalProperties":{ + "type": "array" + }, + "default": {}, + "description": "Dictionary of arbitrary condition names (keys) and library names to aggregate alignment pileups for (values; MUST correspond to names in samples table)." + } } } diff --git a/config/config_template.yaml b/config/config_template.yaml index 06f89bd..4c4f2eb 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -15,6 +15,7 @@ samples: path/to/samples_table.tsv genome_file: path/to/gzipped/ensembl/genome.fa.gz gtf_file: path/to/gzipped/ensembl/gene_annotations.gtf.gz mirna_file: path/to/unzipped/mirbase/mirna_annotations.gff3 +bed_file: path/to/pileups/genomic_regions.bed # Tab-separated mappings table between UCSC (column 1) # and Ensembl (coulm 2) chromosome names @@ -32,6 +33,7 @@ map_chr_file: path/to/ucsc_ensembl_mappings.tsv #### DIRECTORIES #### output_dir: results/ +pileups_dir: results/pileups intermediates_dir: results/intermediates local_log: logs/local/ cluster_log: logs/cluster/ @@ -63,4 +65,15 @@ nh: 100 # discard reads with more mappings than the indicated number # If 'isomir' and 'mirna' are both in the list, a single table with both types # is made. mir_list: ['isomir', 'mirna', 'pri-mir'] + +#### ASCII-STYLE ALIGNMENT PILEUPS PARAMETERS #### + +# Dictionary with the list of library names to aggregate when performing the +# pileups as values and the condition as keys. Library names must match the +# ones in the samples table `sample` column. 
The dictionary keys will be used +# as the pileup's output directory name. +# e.g. lib_dict: {"group_A": ["lib_1", "lib_3"], "group_B": ["lib_2"]} +# +# Leave as an empty dictionary if no pileups are desired. +lib_dict: {} ... diff --git a/images/rule_graph.svg b/images/rule_graph.svg index 9836032..4b4bcb2 100644 --- a/images/rule_graph.svg +++ b/images/rule_graph.svg @@ -4,784 +4,922 @@ - - + + snakemake_dag - + 0 - -finish + +finish 1 - -filter_sam_by_intersecting_primir + +filter_sam_by_intersecting_primir 1->0 - - + + 42 - -filter_sam_by_intersecting_mirna + +filter_sam_by_intersecting_mirna - + 1->42 - - + + 45 - -convert_intersecting_primir_sam_to_bam + +convert_intersecting_primir_sam_to_bam - + 1->45 - - + + 2 - -filter_by_indels + +filter_by_indels - + 2->1 - - + + 37 - -convert_all_alns_sam_to_bam + +convert_all_alns_sam_to_bam - + 2->37 - - + + 3 - -remove_inferiors + +remove_inferiors - + 3->2 - - + + 4 - -sort_maps_by_id + +sort_maps_by_id - + 4->3 - - + + 5 - -add_header_all_maps + +add_header_all_maps - + 5->4 - - + + 6 - -create_genome_header + +create_genome_header - + 6->5 - - + + 7 - -trim_genome_seq_ids + +trim_genome_seq_ids - + 7->6 - - + + 19 - -extract_transcriptome_seqs + +extract_transcriptome_seqs - + 7->19 - - + + 30 - -map_genome_segemehl + +map_genome_segemehl - + 7->30 - - + + 31 - -generate_segemehl_index_genome + +generate_segemehl_index_genome - + 7->31 - - + + 34 - -map_genome_oligomap + +map_genome_oligomap - + 7->34 - - + + 41 - -create_index_genome_fasta + +create_index_genome_fasta - + 7->41 - - + + + + + +56 + +compress_reference_genome + + + +7->56 + + 8 - -merge_all_maps + +merge_all_maps - + 8->5 - - + + 9 - -transcriptome_to_genome_maps + +transcriptome_to_genome_maps - + 9->8 - - + + 10 - -remove_header_transcriptome_mappings + +remove_header_transcriptome_mappings - + 10->9 - - + + 11 - -filter_transcriptome_by_nh + +filter_transcriptome_by_nh - + 11->10 - - + + 12 - -merge_transcriptome_maps + 
+merge_transcriptome_maps - + 12->11 - - + + 13 - -map_transcriptome_segemehl + +map_transcriptome_segemehl - + 13->12 - - + + 14 - -collapse_identical_reads + +collapse_identical_reads 14->13 - - + + 24 - -filter_fasta_for_oligomap + +filter_fasta_for_oligomap - + 14->24 - - + + 14->30 - - + + 15 - -remove_adapters + +remove_adapters - + 15->14 - - + + 16 - -format_fasta + +format_fasta - + 16->15 - - + + 17 - -start + +start - + 17->16 - - + + 18 - -trim_transcriptome_seq_ids + +trim_transcriptome_seq_ids - + 18->13 - - + + 20 - -generate_segemehl_index_transcriptome + +generate_segemehl_index_transcriptome - + 18->20 - - + + 23 - -map_transcriptome_oligomap + +map_transcriptome_oligomap - + 18->23 - - + + - + 19->18 - - + + - + 20->13 - - + + 21 - -convert_transcriptome_to_sam_oligomap + +convert_transcriptome_to_sam_oligomap - + 21->12 - - + + 22 - -sort_transcriptome_oligomap + +sort_transcriptome_oligomap - + 22->21 - - + + - + 23->22 - - + + - + 24->23 - - + + - + 24->34 - - + + 25 - -convert_exons_gtf_to_bed + +convert_exons_gtf_to_bed - + 25->9 - - + + 26 - -get_exons_gtf + +get_exons_gtf - + 26->25 - - + + 27 - -remove_header_genome_mappings + +remove_header_genome_mappings - + 27->8 - - + + 28 - -filter_genome_by_nh + +filter_genome_by_nh - + 28->27 - - + + 29 - -merge_genome_maps + +merge_genome_maps - + 29->28 - - + + - + 30->29 - - + + - + 31->30 - - + + 32 - -convert_genome_to_sam_oligomap + +convert_genome_to_sam_oligomap - + 32->29 - - + + 33 - -sort_genome_oligomap + +sort_genome_oligomap - + 33->32 - - + + - + 34->33 - - + + 35 - -intersect_extended_primir + +intersect_extended_primir - + 35->1 - - + + 50 - -quantify_primir + +quantify_primir - + 35->50 - - + + 36 - -sort_all_alns_bam_by_position + +sort_all_alns_bam_by_position - + 36->35 - - + + - + 37->36 - - + + 38 - -extend_mirs_annotations + +extend_mirs_annotations - + 38->35 - - + + 43 - -intersect_extended_mirna + +intersect_extended_mirna - + 38->43 - - + + 39 - -map_chr_names + 
+map_chr_names - + 39->38 - - + + + + + +55 + +create_per_run_ascii_pileups + + + +39->55 + + + + + +58 + +create_per_library_ascii_pileups + + + +39->58 + + + + + +59 + +create_per_condition_ascii_pileups + + + +39->59 + + 40 - -extract_chr_len + +extract_chr_len - + 40->38 - - + + - + 41->40 - - + + - + 42->0 - - + + 49 - -add_intersecting_mirna_tag + +add_intersecting_mirna_tag - + 42->49 - - + + - - -51 - -uncollapse_reads + + +53 + +uncollapse_reads - - -42->51 - - + + +42->53 + + - + 43->42 - - + + - + 43->49 - - + + 44 - -sort_intersecting_primir_bam_by_position + +sort_intersecting_primir_bam_by_position - + 44->43 - - + + - + 45->44 - - + + 46 - -merge_tables + +merge_tables - + 46->0 - - + + 47 - -quantify_mirna + +quantify_mirna - + 47->46 - - + + 48 - -sort_intersecting_mirna_by_feat_tag + +sort_intersecting_mirna_by_feat_tag - + 48->47 - - + + - + 49->48 - - + + - + 50->46 - - + + + + + +51 + +sort_uncollpased_reads_bam_by_position - + 51->0 - - - - - -53 - -convert_uncollpased_reads_sam_to_bam + + - - -51->53 - - + + +54 + +index_uncollapsed_reads_bam + + + +51->54 + + + + + +51->55 + + + + + +51->58 + + + + + +51->59 + + 52 - -sort_uncollpased_reads_bam_by_position - - - -52->0 - - - - - -54 - -index_uncollapsed_reads_bam + +convert_uncollpased_reads_sam_to_bam - - -52->54 - - + + +52->51 + + - + 53->52 - - + + - + 54->0 - - + + + + + +54->55 + + + + + +54->58 + + + + + +54->59 + + + + + +55->0 + + + + + +56->55 + + + + + +56->58 + + + + + +56->59 + + + + + +57 + +create_empty_bed + + + +57->55 + + + + + +57->58 + + + + + +57->59 + + + + + +58->0 + + + + + +59->0 + + diff --git a/pipeline_documentation.md b/pipeline_documentation.md index a3f50ba..2e2ba72 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -78,6 +78,13 @@ on installation and usage please see [here](README.md). 
- [`convert_uncollapse_reads_sam_to_bam`](#convert_uncollapse_reads_sam_to_bam) - [`sort_uncollapse_reads_bam_by_position`](#sort_uncollapse_reads_bam_by_position) - [`index_uncollapse_reads_bam`](#index_uncollapse_reads_bam) + - [Pileup workflow](#pileup-workflow) + - [`finish_pileup`](#finish_pileup) + - [`create_empty_bed`](#create_empty_bed) + - [`compress_reference_genome`](#compress_reference_genome) + - [`create_per_library_ascii_pileups`](#create_per_library_ascii_pileups) + - [`create_per_run_ascii_pileups`](#create_per_run_ascii_pileups) + - [`create_per_condition_ascii_pileups`](#create_per_condition_ascii_pileups) @@ -87,6 +94,7 @@ on installation and usage please see [here](README.md). | Name | License | Tag line | More info | | --- | --- | --- | --- | +| **ASCII-style alignment pileups** | [Apache 2.0][license-apache2] | _"Generates ASCII-style pileups of read alignments in one or more BAM files for one or more genomic regions."_ | [code][code-ascii] | | **BEDTools** | [GPLv2][license-gpl2] | _"[...] intersect, merge, count, complement, and shuffle genomic intervals from multiple files in widely-used genomic file formats such as BAM, BED, GFF/GTF, VCF"_ | [code][code-bedtools] / [manual][manual-bedtools] / [publication][pub-bedtools] | | **cufflinks** | [BSL-1.0][license-bsl1] | _"[...] assembles transcripts, estimates their abundances, and tests for differential expression and regulation in RNA-Seq samples"_ | [code][code-cufflinks] / [manual][docs-cufflinks] / [publication][pub-cufflinks] | | **cutadapt** | [MIT][license-mit] | _"[...] finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads"_ | [code][code-cutadapt] / [manual][docs-cutadapt] / [publication][pub-cutadapt] | @@ -100,12 +108,12 @@ on installation and usage please see [here](README.md). 
> The workflow consists of four Snakemake files: A main `Snakefile` and an individual Snakemake file for each step in the workflow (the file preparation, -the reads mapping and the miRNA quantification). The main `Snakefile` contains -the configuration file validation along with the inclusion of the -sub-workflows. Individual steps of the workflow are described briefly, and -links to the respective software manuals are given. Parameters that can be -modified by the user (via the samples table and the configuration file) are -also described. +the reads mapping, the miRNA quantification and the ASCII-style alignment +pileups generation). The main `Snakefile` contains the configuration file +validation along with the inclusion of the sub-workflows. Individual steps of +the workflow are described briefly, and links to the respective software +manuals are given. Parameters that can be modified by the user (via the samples +table and the configuration file) are also described. ### Rule graph @@ -160,6 +168,10 @@ Target rule as required by [Snakemake][docs-snakemake]. [**sort_uncollapsed_reads_bam_by_position**](#sort_uncollapsed_reads_bam_by_position) - `BAM` index file (`.bam.bai`); from [**index_uncollapsed_reads_bam**](#index_uncollapsed_reads_bam) + - Empty text file (`.txt`); from + [**create_per_library_ascii_pileups**](#create_per_library_ascii_pileups), + [**create_per_run_ascii_pileups**](#create_per_run_ascii_pileups) and/or + [**create_per_condition_ascii_pileups**](#create_per_condition_ascii_pileups) ### Prepare workflow @@ -199,8 +211,9 @@ Trim genome sequence IDs with a [**custom script**][custom-script-trim-id]. 
[**create_genome_header**](#create_genome_header), [**create_index_genome_fasta**](#create_index_genome), [**generate_segemehl_index_genome**](#generate_segemehl_index_genome), - [**mapping_genome_segemehl**](#mapping_genome_segemehl) and - [**mapping_genome_oligomap**](#mapping_genome_oligomap) + [**mapping_genome_segemehl**](#mapping_genome_segemehl), + [**mapping_genome_oligomap**](#mapping_genome_oligomap) and + [**compress_reference_genome**](#compress_reference_genome) #### `extract_transcriptome_seqs` @@ -312,7 +325,10 @@ miRNA annotations. Several mapping tables are available [here][chr-maps]. - Tab-separated mappings table (`.tsv`) - **Output** - miRNA annotations with mapped genes(`.gff3`); used in - [**extend_mirs_annotations**](#extend_mirs_annotations) + [**extend_mirs_annotations**](#extend_mirs_annotations), + [**create_per_library_ascii_pileups**](#create_per_library_ascii_pileups), + [**create_per_run_ascii_pileups**](#create_per_run_ascii_pileups) and/or + [**create_per_condition_ascii_pileups**](#create_per_condition_ascii_pileups) #### `create_index_genome_fasta` @@ -1103,7 +1119,10 @@ Sort alignments by position with [**SAMtools**](#third-party-software-used). [**convert_uncollapsed_reads_sam_to_bam**](#convert_uncollapsed_reads_sam_to_bam) - **Output** - Alignments file, sorted (`.bam`); used in - [**index_uncollapsed_reads_bam**](#index_uncollapsed_reads_bam) + [**index_uncollapsed_reads_bam**](#index_uncollapsed_reads_bam), + [**create_per_library_ascii_pileups**](#create_per_library_ascii_pileups), + [**create_per_run_ascii_pileups**](#create_per_run_ascii_pileups) and/or + [**create_per_condition_ascii_pileups**](#create_per_condition_ascii_pileups) #### `index_uncollapsed_reads_bam` @@ -1117,7 +1136,140 @@ alignments in a genomic region of interest. 
- Alignments file (`.bam`); from [**sort_uncollapsed_reads_bam_by_position**](#sort_uncollapsed_reads_bam_by_position) - **Output** - - `BAM` index file (`.bam.bai`) + - `BAM` index file (`.bam.bai`); used in + [**create_per_library_ascii_pileups**](#create_per_library_ascii_pileups), + [**create_per_run_ascii_pileups**](#create_per_run_ascii_pileups) and/or + [**create_per_condition_ascii_pileups**](#create_per_condition_ascii_pileups) + + +### Pileup workflow + +#### `finish_pileup` + +Target rule as required by [Snakemake][docs-snakemake]. + +> Local rule + +- **Input** + - Empty text files (`.txt`); from + [**create_per_library_ascii_pileups**](#create_per_library_ascii_pileups) and + [**create_per_run_ascii_pileups**](#create_per_run_ascii_pileups) + + +#### `create_empty_bed` + +Create an empty BED file if the user has not provided one. + +> **OPTIONAL RULE.** This rule will be executed if, and only if, the user has +> not provided a BED file in the [configuration file](#configuration-file) +> with the regions the ASCII-style alignment pileups must be performed on. + +- **Condition** + - **config_template.yaml** + - `bed_file`: BED6 file with all the desired annotation regions to perform + the ASCII-style alignment pileups on. (Default: None) +- **Output** + - BED empty file (`.bed`); used in + [**create_per_library_ascii_pileups**](#create_per_library_ascii_pileups), + [**create_per_run_ascii_pileups**](#create_per_run_ascii_pileups) and/or + [**create_per_condition_ascii_pileups**](#create_per_condition_ascii_pileups) + + +#### `compress_reference_genome` + +Compress the processed genome with trimmed IDs using `bgzip`. + +> Required to perform the ASCII-style alignment pileups. +> In order to be able to use the `bgzip` command when running the workflow +> using Singularity, [**SAMtools**](#third-party-software-used) is used. 
+ +- **Input** + - Genome sequence file, trimmed IDs (`.fa`); from + [**trim_genome_seq_ids**](#trim_genome_seq_ids) +- **Output** + - Genome sequence file, trimmed IDs, `bgzip`ed (`.fa.bz`); used in + [**create_per_library_ascii_pileups**](#create_per_library_ascii_pileups), + [**create_per_run_ascii_pileups**](#create_per_run_ascii_pileups) and/or + [**create_per_condition_ascii_pileups**](#create_per_condition_ascii_pileups) + + +#### `create_per_library_ascii_pileups` + +Create ASCII-style pileups for all the desired annotated regions across +libraries with [**ASCII-style alignment pileups**](#third-party-software-used). + +> A directory containing the ASCII-style pileups is created for each +> library. If no BED file is provided, the pileups' output directories will +> only contain an empty file. + +- **Input** + - Genome sequence file, trimmed IDs, `bgzip`ed (`.fa.bz`); from + [**compress_reference_genome**](#compress_reference_genome) + - miRNA annotations with mapped genes (`.gff3`); from + [**map_chr_names**](#map_chr_names) + - Alignments file (`.bam`); from + [**sort_uncollapsed_reads_bam_by_position**](#sort_uncollapsed_reads_bam_by_position) + - `BAM` index file (`.bam.bai`); from + [**index_uncollapsed_reads_bam**](#index_uncollapsed_reads_bam) + - Annotated genomic regions (`.bed`); from workflow input files or + [**create_empty_bed**](#create_empty_bed) +- **Output** + - Empty text file (`.txt`) + + +#### `create_per_run_ascii_pileups` + +Create ASCII-style pileups for all the desired annotated regions for the whole +run with [**ASCII-style alignment pileups**](#third-party-software-used). + +> If no BED file is provided, the pileups' output directory will only contain +> an empty file. 
+ +- **Input** + - Genome sequence file, trimmed IDs, `bgzip`ed (`.fa.bz`); from + [**compress_reference_genome**](#compress_reference_genome) + - miRNA annotations with mapped genes (`.gff3`); from + [**map_chr_names**](#map_chr_names) + - Alignments file (`.bam`); from + [**sort_uncollapsed_reads_bam_by_position**](#sort_uncollapsed_reads_bam_by_position) + - `BAM` index file (`.bam.bai`); from + [**index_uncollapsed_reads_bam**](#index_uncollapsed_reads_bam) + - Annotated genomic regions (`.bed`); from workflow input files or + [**create_empty_bed**](#create_empty_bed) +- **Output** + - Empty text file (`.txt`) + + +#### `create_per_condition_ascii_pileups` + +Create ASCII-style pileups for all the desired annotated regions across the +different library subsets if provided with +[**ASCII-style alignment pileups**](#third-party-software-used). + +> **OPTIONAL RULE.** The ASCII-style pileups for each annotated region are +> made if, and only if, at least one library subset is specified in the +> [configuration file](#configuration-file). Otherwise, this rule will not be +> executed, and no output will be generated. + +- **Input** + - Genome sequence file, trimmed IDs, `bgzip`ed (`.fa.bz`); from + [**compress_reference_genome**](#compress_reference_genome) + - miRNA annotations with mapped genes (`.gff3`); from + [**map_chr_names**](#map_chr_names) + - Alignments file (`.bam`); from + [**sort_uncollapsed_reads_bam_by_position**](#sort_uncollapsed_reads_bam_by_position) + - `BAM` index file (`.bam.bai`); from + [**index_uncollapsed_reads_bam**](#index_uncollapsed_reads_bam) + - Annotated genomic regions (`.bed`); from workflow input files or + [**create_empty_bed**](#create_empty_bed) + - **Parameters** + - **config_template.yaml** + - `lib_dict`: Subset(s) of library name(s), as specified in the samples' + table column `sample` and the subset identifier stored in a dictionary. 
+ (default: None) +- **Output** + - Empty text file (`.txt`) + [chr-maps]: [custom-script-blocksort]: scripts/blocksort.sh @@ -1137,6 +1289,7 @@ alignments in a genomic region of interest. [custom-script-trim-id]: scripts/trim_id_fasta.sh [custom-script-uncollapse]: scripts/sam_uncollapse.pl [custom-script-validation]: scripts/validation_fasta.py +[code-ascii]: [code-bedtools]: [code-cufflinks]: [code-cutadapt]: @@ -1154,6 +1307,7 @@ alignments in a genomic region of interest. [docs-snakemake]: [license-afl3]: [license-agpl3]: +[license-apache2]: [license-bsl1]: [license-gpl2]: [license-gpl3]: @@ -1161,7 +1315,8 @@ alignments in a genomic region of interest. [pub-bedtools]: [pub-cufflinks]: [pub-cutadapt]: -[pub-oligomap]: +[pub-oligomap]: [pub-samtools]: [pub-segemehl]: [rule-graph]: images/rule_graph.svg + diff --git a/scripts/ascii_alignment_pileup.R b/scripts/ascii_alignment_pileup.R new file mode 100644 index 0000000..ff7bb87 --- /dev/null +++ b/scripts/ascii_alignment_pileup.R @@ -0,0 +1,355 @@ +#!/usr/bin/env Rscript + +#==================# +# HEADER START # +#==================# +### Created: Sep 29, 2019 +### Author: Alexander Kanitz +### Affiliation: Zavolan Group, Biozentrum, University of Basel +### Email: alexander.kanitz@alumni.ethz.ch +#==================# +# HEADER END # +#==================# + + +#==========================# +# PRE-REQUISITES START # +#==========================# +#---> LOAD OPTION PARSER <---# +if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) { stop("Package 'optparse' required!\nExecution aborted.") } + +#---> GET SCRIPT NAME <---# +args.all <- commandArgs(trailingOnly = FALSE) +name.field <- "--file=" +script <- basename(sub(name.field, "", args.all[grep(name.field, args.all)])) + +#---> DESCRIPTION <---# +description <- "Generates an ASCII-style pileup of read alignments in one or more BAM files +against one or more regions specified in a BED file.\n" +author <- "Author: Alexander Kanitz" 
+affiliation <- "Affiliation: Biozentrum, University of Basel" +email <- "Email: alexander.kanitz@alumni.ethz.ch" +version <- "1.2.1" +version_formatted <- paste("Version:", version, sep=" ") +requirements <- c("optparse", "rtracklayer", "GenomicAlignments", "tools") +requirements_txt <- paste("Requires:", paste(requirements, collapse=", "), sep=" ") +msg <- paste(description, author, affiliation, email, version_formatted, requirements_txt, sep="\n") +notes <- "Notes: +- For the input queries, consider the `--maximum-region-width` parameter, which + is provided for safety. While it is possible to increase it, wide regions may + require excessive memory. +" + +#---> COMMAND-LINE ARGUMENTS <---# +## List of allowed/recognized arguments +option_list <- list( + make_option( + "--reference", + action="store", + type="character", + default=NULL, + help="Reference genome sequence in FASTA format. The file *MUST* be compressed with + BGZIP. If supplied, the reference sequence for the query region(s) will be added to + the output. Note that on the first run with a specific reference genome file, an FAI + index is generated which will take some time.", + metavar="file" + ), + make_option( + "--annotations", + action="store", + type="character", + default=NULL, + help="Annotation file in GFF/GTF format used to annotate sequences. If supplied, + features overlapping the query region(s) will be visualized in the output. Ensure that + the argument to option `annotation-name-field` corresponds to a field in the + annotations, otherwise the script will fail.", + metavar="file" + ), + make_option( + "--output-directory", + action="store", + type="character", + default=getwd(), + help="Output directory. One output file will be created for each region in `--bed` and + the filenames will be generated from the basenames of the supplied BAM file(s) and the + name field (4th column) of the BED file. 
[default \"%default\"]", + metavar="dir" + ), + make_option( + "--maximum-region-width", + action="store", + type="integer", + default=200, + help="Maximum input region width. Use with care as wide regions will use excessive + resources. [default %default]", + metavar="int" + ), + make_option( + "--do-not-collapse-alignments", + action="store_true", + type="logical", + default=FALSE, + help="Show alignments of reads with identical sequences individually." + ), + make_option( + "--minimum-count", + action="store", + type="integer", + default=1, + help="Alignments of reads with less copies than the specified number will not be + printed. Option is not considered if `do-not-collapse-alignments` is set. + [default %default]", + metavar="int" + ), + make_option( + "--annotation-name-field", + action="store", + type="character", + default="Name", + help="Annotation field used to populate the `name` column in the output. + [default \"%default\"]", + metavar="str" + ), + make_option( + "--padding-character", + action="store", + default=".", + help="Character used for padding alignments. [default \"%default\"]", + metavar="char" + ), + make_option( + "--indel-character", + action="store", + default="-", + help="Character to denote insertions and deletions in alignments. [default \"%default\"]", + metavar="char" + ), + make_option( + "--prefix", + action="store", + type="character", + default=NULL, + help="Prefix to be used in the output file name(s). If not provided + the input BAM file(s) name will be used instead,", + metavar="string" + ), + make_option( + c("-h", "--help"), + action="store_true", + default=FALSE, + help="Show this information and die." + ), + make_option( + c("-v", "--verbose"), + action="store_true", + default=FALSE, + help="Print log messages to STDOUT." 
+ ) +) + +## Parse options +opt_parser <- OptionParser( + usage=paste(script, "[--help] [--verbose] [OPTIONS] BED BAM [BAM2 ...]\n"), + option_list=option_list, + add_help_option=FALSE, + description=msg, + epilogue=notes +) +cli <- parse_args(opt_parser, positional_arguments=c(2, Inf)) + +# Re-assign CLI arguments +fl.query <- cli$args[[1]] +fl.bam <- cli$args[2:length(cli$args)] +fl.ref <- cli$options[["reference"]] +fl.anno <- cli$options[["annotations"]] +dir.out <- cli$options[["output-directory"]] +prefix.out <- cli$options[["prefix"]] +width.max <- cli$options[["maximum-region-width"]] +collapse <- ! cli$options[["do-not-collapse-alignments"]] +count.min <- cli$options[["minimum-count"]] +char.pad <- cli$options[["padding-character"]] +char.indel <- cli$options[["indel-character"]] +field.name.anno <- cli$options[["annotation-name-field"]] +verb <- cli$options[["verbose"]] +#==========================# +# PRE-REQUISITES END # +#==========================# + + +#================# +# MAIN START # +#================# +#---> START MESSAGE <---# +if (verb) cat("Starting '", script, "'...\n", sep="") + +#---> LOAD REQUIRED LIBRARIES <---# +if (verb) cat("Loading libraries...\n", sep="") +for (req in requirements) { + if ( suppressWarnings(suppressPackageStartupMessages(require(req, character.only=TRUE))) == FALSE ) { stop("Package '", req, "' required!") } +} + +#---> IMPORT FILES <---# +# Print status message +if (verb) cat("Importing input files...\n", sep="") +# Load files +bed <- import(con=fl.query) +if (! is.null(fl.ref)) {ref <- FaFile(fl.ref)} +if (! is.null(fl.anno)) {anno <- import(con=fl.anno)} + +# Get file prefix from BAM files or from CLI argument +fl.prefix <- if (! 
is.null(prefix.out)) prefix.out else paste(basename(file_path_sans_ext(fl.bam)), collapse=".") + +#---> <---# +# Print status message +if (verb) cat("Iterating over regions in BED file...\n") +# Iterate over input regions +for(index in seq_along(bed)) { + + #---> PREPARE STACKING OF ALIGNMENTS <---# + # Assign current region + region <- bed[index] + # Print status message + if (verb) cat("Processing region '", mcols(region)[["name"]], "'...\n", sep="") + # Exit with error if region is too wide + if (width(region) > width.max) {stop("Supplied region too large. Consider increasing the `width.max` parameter, but note that for very large regions, the memory footprint may be excessive.")} + # Initialize DNAStringSet container object + seq.out <- DNAStringSet() + + #---> ADD READ ALIGNMENTS <---# + # Print status message + if (verb) cat("Iterating over BAM files...\n") + # Iterate over BAM files + for (bam in fl.bam) { + # Print status message + if (verb) cat("Adding read alignments for file '", bam,"'...\n", sep="") + # Get alignments for current BAM file + seq.stack <- stackStringsFromBam( + bam, + param=region, + D.letter=char.indel, + N.letter=char.indel, + Lpadding.letter=char.pad, + Rpadding.letter=char.pad + ) + # Add to container + seq.out <- append(seq.out, seq.stack) + } + # Get complement if on minus strand + if (as.character(strand(region))[[1]] == "-") { + seq.out <- complement(seq.out) + } + # Convert to character + seq.out <- as.character(seq.out) + # Convert to dataframe for writing output + df <- data.frame(seq=seq.out, count=rep(NA, length(seq.out)), stringsAsFactors=FALSE) + + #---> COLLAPSE READ ALIGNMENTS <---# + if (collapse) { + # Print status message + if (verb) cat("Collapsing identical reads/alignments...\n") + # Get unique alignments + df <- data.frame(table(df[["seq"]]), stringsAsFactors=FALSE) + if (! 
ncol(df) == 2) df <- data.frame(matrix(ncol = 2, nrow = 0)) + colnames(df) <- c("seq", "count") + df[["seq"]] <- as.character(df[["seq"]]) + # Filter out any alignments that do not make the specified minimum count cutoff + df <- df[df[["count"]] >= count.min, ] + } + + #---> SORTING ALIGNMENTS <---# + # Print status message + if (verb) cat("Sorting alignments...\n") + # Reverse sequence if on minus strand + if (as.character(strand(region))[[1]] == "-") { + df[["seq"]] <- reverse(df[["seq"]]) + } + # Sort by position of first nucleotide, count and position of last nucleotide + if (nrow(df)) { + last_char <- nchar(df[["seq"]][[1]]) + pos.nuc.first <- regexpr(paste0("[^", char.pad, "\\.]"), df[["seq"]]) + pos.nuc.last <- last_char - regexpr(paste0("[^", char.pad, "\\.]"), unlist(lapply(df[["seq"]], reverse))) + 1 + df <- df[order( + pos.nuc.first, df[["count"]], pos.nuc.last, + decreasing=c(FALSE, TRUE, FALSE) + ), ] + } + # Reverse sequence again if on minus strand + if (as.character(strand(region))[[1]] == "-") { + df[["seq"]] <- reverse(df[["seq"]]) + } + + #---> ADD REFERENCE SEQUENCE <---# + if (! is.null(fl.ref)) { + # Print status message + if (verb) cat("Adding reference sequence...\n") + # Generate name for reference sequence + name.ref <- paste( + seqnames(region), + paste(start(region), end(region), sep="-"), + strand(region), sep=":" + ) + # Get sequence + row.ref <- getSeq(ref, region) # will create .fai index if not present + # Get reverse if on minus strand + if (as.character(strand(region))[[1]] == "-") { + row.ref <- reverse(row.ref) + } + # Compile row and add to dataframe + df.ref <- data.frame(seq=as.character(row.ref), count=name.ref, stringsAsFactors=FALSE) + df <- rbind(df.ref, df) + } + + #---> ADD ANNOTATIONS FOR REGION <---# + # Add annotations + if (! 
is.null(fl.anno)) { + # Print status message + if (verb) cat("Adding overlapping features...\n") + # Find overlapping features + features <- anno[to(findOverlaps(region, anno))] + # Set order of addition from most upstream to most downstream + if (as.character(strand(region)) == "+") { + order.features <- order(start(features)) + } else { + order.features <- rev(order(end(features))) + } + # Order features + features <- features[order.features] + # Iterate over features + seqs <- sapply(seq_along(features), function(index) { + feat <- features[index] + # Set markup character depending on strand + char.anno <- ifelse(strand(region) == "+", ">", "<") + # Calculate lengths of feature (in region) & left/right padding + diff.start <- start(feat) - start(region) + n.padl <- max(0, diff.start) + n.anno <- min(min(0, diff.start) + width(feat), width(region) - n.padl) + n.padr <- width(region) - n.padl - n.anno + # Generate string encompassing feature + str.padl <- paste(rep(char.pad, n.padl), collapse="") + str.anno <- paste(rep(char.anno, n.anno), collapse="") + str.padr <- paste(rep(char.pad, n.padr), collapse="") + str.final <- paste0(str.padl, str.anno, str.padr) + }) + df.anno <- data.frame(seq=seqs, count=mcols(features)[[field.name.anno]], stringsAsFactors=FALSE) + df <- rbind(df.anno, df) + } + + #---> WRITE OUTPUT <---# + if (collapse) { + name.file <- paste(fl.prefix, mcols(region)[["name"]], "min", as.character(count.min), "pileup", "tab", sep=".") + } else { + name.file <- paste(fl.prefix, mcols(region)[["name"]], "uncollapsed", "pileup", "tab", sep=".") + } + fl.out <- file.path(dir.out, name.file) + # Print status message + if (verb) cat("Writing output to file '", fl.out, "'...\n", sep="") + # Write tab-separated output + write.table(df, fl.out, quote=FALSE, sep="\t", col.names=FALSE, row.names=FALSE) +} + +#---> END MESSAGE <---# +if (verb) cat("Done.\n\nSession info:\n") +if (verb) print(sessionInfo()) +#================# +# MAIN END # +#================# 
diff --git a/scripts/tests/test_mirna_quantification.py b/scripts/tests/test_mirna_quantification.py index cc044b1..e08d630 100644 --- a/scripts/tests/test_mirna_quantification.py +++ b/scripts/tests/test_mirna_quantification.py @@ -1,4 +1,5 @@ """Unit tests for module 'mirna_quantification.py'.""" + import argparse from pathlib import Path import sys @@ -344,9 +345,10 @@ def test_main_iso_sam_file( args = parse_arguments().parse_args() main(args) - with open(iso_out_table, "r") as expected, open( - mirna_output, "r" - ) as out_file: + with ( + open(iso_out_table, "r") as expected, + open(mirna_output, "r") as out_file, + ): assert out_file.read() == expected.read() def test_main_mirna_sam_file( diff --git a/test/config.yaml b/test/config.yaml index ad09200..17a0ec0 100644 --- a/test/config.yaml +++ b/test/config.yaml @@ -1,6 +1,8 @@ -samples: test_files/samples_table.tsv +samples: test_files/samples_table.tsv +bed_file: test_files/pileup_regions.bed genome_file: test_files/genome.fa.gz gtf_file: test_files/gene_annotations.gtf.gz mirna_file: test_files/mirna_annotations.gff3 map_chr_file: test_files/ucsc_to_ensembl.tsv mir_list: ['isomir', 'mirna', 'pri-mir'] +lib_dict: {"group_A": ['test_lib', 'test_lib']} diff --git a/test/expected_output.md5 b/test/expected_output.md5 index 6071fd6..1e41f88 100644 --- a/test/expected_output.md5 +++ b/test/expected_output.md5 @@ -43,11 +43,14 @@ a124a5afdb5f7bfbcc5683260556c9c4 results/intermediates/test_lib/genome_mappings f68693cfaa1e6ea78e1a5562ade6d9ed results/intermediates/test_lib/intersected_extended_primir.bed c2a5770a755ada66ef63d96eec4afb00 results/intermediates/test_lib/reads_filtered_for_oligomap.fasta fe5388094985e9604a302d39d2abc82c results/intermediates/test_lib/oligomap_transcriptome_report.txt +96a178aabc41d3020992efd6acbecb5a results/intermediates/genome_processed.fa.bz +d34fc868b861b1bc46db07a397dc0f10 results/intermediates/genome_processed.fa.bz.fai be7a0d92e57480190de57eb30baffa36 
results/intermediates/extended_mirna_annotation_6_nt.gff3 8148cd880602255be166beb59bbed95a results/intermediates/genome_header.sam 09e24a504bfec37fee3d5ff1b5c7738e results/intermediates/exons.bed 4fb453846e88593d0cac13220ec2d685 results/intermediates/segemehl_genome_index.idx 44dbf7c3eae00d0bc8d5e1319123746c results/intermediates/chr_size.tsv +f5749f5d2096386444a3d021e0a01817 results/intermediates/genome_processed.fa.bz.gzi d34fc868b861b1bc46db07a397dc0f10 results/intermediates/genome_processed.fa.fai 21e102e4ebd3508bb06f46366a3d578d results/intermediates/exons.gtf 003b92b245ac336e3d70a513033e1cee results/intermediates/transcriptome_trimmed_id.fa @@ -56,3 +59,12 @@ f28cc0143ab6659bef3de3a7afa1dccc results/intermediates/mirna_annotations.gff3 2d437f8681f4248d4f2075f86debb920 results/intermediates/transcriptome.fa 7eb64c112830266bcf416ded60b4cf77 results/intermediates/segemehl_transcriptome_index.idx 4fba145540a2c61f29bfddfd0f5a4d4e results/intermediates/genome_processed.fa +fbbaf665220bc2c2222b04e394515f69 results/pileups/test_lib/test_lib.hsa-mir-516a-1.min.1.pileup.tab +d41d8cd98f00b204e9800998ecf8427e results/pileups/test_lib/check_file.txt +b974c1abfed36e33b37e040b7a1afb2d results/pileups/test_lib/test_lib.hsa-mir-1323.min.1.pileup.tab +fbbaf665220bc2c2222b04e394515f69 results/pileups/group_A/group_A.hsa-mir-516a-1.min.1.pileup.tab +ced1f017e7758c03686310072e20f92d results/pileups/group_A/group_A.hsa-mir-1323.min.1.pileup.tab +d41d8cd98f00b204e9800998ecf8427e results/pileups/group_A/check_file_group_A.txt +d41d8cd98f00b204e9800998ecf8427e results/pileups/all/check_file.txt +b974c1abfed36e33b37e040b7a1afb2d results/pileups/all/all_samples.hsa-mir-1323.min.1.pileup.tab +fbbaf665220bc2c2222b04e394515f69 results/pileups/all/all_samples.hsa-mir-516a-1.min.1.pileup.tab diff --git a/test/test_files/pileup_regions.bed b/test/test_files/pileup_regions.bed new file mode 100644 index 0000000..3f9f552 --- /dev/null +++ b/test/test_files/pileup_regions.bed @@ -0,0 +1,2 @@ 
19 5327 5400 hsa-mir-1323 . +
19 90100 90190 hsa-mir-516a-1 . +

# =============================================================================
# workflow/Snakefile — additions
# =============================================================================

OUT_DIR = Path(config["output_dir"])
PILEUP_DIR = Path(config["pileups_dir"])
INTERMEDIATES_DIR = Path(config["intermediates_dir"])

include: Path("rules/pileup.smk")

# Additional targets appended to the existing inputs of `rule finish`:
#   piles_run=PILEUP_DIR / "all" / "check_file.txt",
#   piles_lib=expand(
#       PILEUP_DIR / "{sample}" / "check_file.txt",
#       sample=pd.unique(samples_table.index.values),
#   ),
#   # The conditional must wrap the whole expand(): evaluating
#   # `config["lib_dict"].keys()` when `lib_dict` is None raises
#   # AttributeError even though the conditional discards the pattern.
#   piles_design=(
#       expand(
#           PILEUP_DIR / "{condition}" / "check_file_{condition}.txt",
#           condition=list(config["lib_dict"].keys()),
#       )
#       if config["lib_dict"] is not None
#       else []
#   ),

# =============================================================================
# workflow/rules/common.smk — additions
# =============================================================================

from pathlib import Path


def create_empty_bed_file(config_file: dict, dir: Path) -> str:
    """Register the path of a placeholder BED file in the configuration.

    Builds the path ``<dir>/empty_file.bed``, stores it under the
    ``bed_file`` key of ``config_file`` (so rules defined afterwards pick
    it up) and returns it.

    NOTE(review): the file itself is NOT created here — the
    ``create_empty_bed`` rule touches it on disk.

    Args:
        config_file: The (mutable) workflow configuration mapping.
        dir: Directory in which the placeholder BED file will live.

    Returns:
        The path of the placeholder BED file, as a string.
    """
    empty_bed = f"{dir}/empty_file.bed"
    config_file["bed_file"] = empty_bed

    return empty_bed


# =============================================================================
# workflow/rules/pileup.smk
# =============================================================================

###############################################################################
# (c) 2024 Iris Mestres, Zavolan Lab, Biozentrum, University of Basel
# (@) iris.mestres@alumn.esci.upf.edu
#
# Workflow to create ASCII-style pileups of read alignments.
###############################################################################

import pandas as pd
from snakemake.utils import validate

from pathlib import Path


###############################################################################
### Configuration validation
###############################################################################

validate(config, Path("../../config/config_schema.json"))


###############################################################################
### Paths configuration
###############################################################################


ENV_DIR = Path(f"{workflow.basedir}/envs")
INTERMEDIATES_DIR = Path(config["intermediates_dir"])
OUT_DIR = Path(config["output_dir"])
PILEUP_DIR = Path(config["pileups_dir"])
SCRIPTS_DIR = Path(config["scripts_dir"])

CLUSTER_LOG = Path(config["cluster_log"])
LOCAL_LOG = Path(config["local_log"])


###############################################################################
### Including functions
###############################################################################


include: "common.smk"


###############################################################################
### Reading samples table
###############################################################################

samples_table = pd.read_csv(
    config["samples"],
    header=0,
    index_col=0,
    comment="#",
    engine="python",
    sep="\t",
)

###############################################################################
### Global configuration
###############################################################################


localrules:
    finish_pileup,


###############################################################################
### Finish rule
###############################################################################


rule finish_pileup:
    input:
        piles_run=PILEUP_DIR / "all" / "check_file.txt",
        piles_lib=expand(
            PILEUP_DIR / "{sample}" / "check_file.txt",
            sample=pd.unique(samples_table.index.values),
        ),


###############################################################################
### Generate empty BED file if not user-provided
###############################################################################

# NOTE: `create_empty_bed_file()` mutates `config["bed_file"]` at parse time;
# this works because the pileup rules below are defined after this statement
# and therefore read the updated value.
if config["bed_file"] == "":

    rule create_empty_bed:
        output:
            create_empty_bed_file(config, INTERMEDIATES_DIR),
        params:
            cluster_log=CLUSTER_LOG / "create_empty_bed.log",
        log:
            LOCAL_LOG / "create_empty_bed.log",
        container:
            "docker://ubuntu:lunar-20221207"
        shell:
            "(touch {output})"


###############################################################################
### Compress reference genome with trimmed IDs
###############################################################################


rule compress_reference_genome:
    input:
        genome=INTERMEDIATES_DIR / "genome_processed.fa",
    output:
        genome=INTERMEDIATES_DIR / "genome_processed.fa.bz",
    params:
        cluster_log=CLUSTER_LOG / "compress_reference_genome.log",
    log:
        LOCAL_LOG / "compress_reference_genome.log",
    container:
        "docker://quay.io/biocontainers/samtools:1.16.1--h00cdaf9_2"
    shell:
        # bgzip (not bzip2) so the R script can random-access the FASTA
        "(bgzip < {input.genome} > {output.genome}) &> {log}"


###############################################################################
### Generate ASCII-style pileups (per library)
###############################################################################


rule create_per_library_ascii_pileups:
    input:
        annotations=INTERMEDIATES_DIR / "mirna_annotations.gff3",
        maps=OUT_DIR
        / "{sample}"
        / "alignments_intersecting_mirna_uncollapsed_sorted.bam",
        # The .bai is not passed to the script; listing it only guarantees
        # the index exists before the R script opens the BAM.
        maps_index=OUT_DIR
        / "{sample}"
        / "alignments_intersecting_mirna_uncollapsed_sorted.bam.bai",
        reference=INTERMEDIATES_DIR / "genome_processed.fa.bz",
        regions=config["bed_file"],
        script=SCRIPTS_DIR / "ascii_alignment_pileup.R",
    output:
        piles=PILEUP_DIR / "{sample}" / "check_file.txt",
    params:
        cluster_log=CLUSTER_LOG / "pileups_{sample}.log",
        # Each job writes into its own sample directory. Expanding over all
        # samples here would pass every sample's directory to a single job.
        out_dir=lambda wildcards: PILEUP_DIR / wildcards.sample,
        prefix="{sample}",
    log:
        LOCAL_LOG / "pileups_{sample}.log",
    container:
        "docker://zavolab/ascii-alignment-pileup:1.1.1"
    conda:
        ENV_DIR / "r.yaml"
    shell:
        "(touch {output.piles} && Rscript {input.script} \
        --verbose \
        --annotations={input.annotations} \
        --reference={input.reference} \
        --prefix={params.prefix} \
        --output-directory {params.out_dir} \
        {input.regions} \
        {input.maps} \
        ) &> {log}"


###############################################################################
### Generate ASCII-style pileups (per run)
###############################################################################


rule create_per_run_ascii_pileups:
    input:
        annotations=INTERMEDIATES_DIR / "mirna_annotations.gff3",
        maps=expand(
            OUT_DIR
            / "{sample}"
            / "alignments_intersecting_mirna_uncollapsed_sorted.bam",
            sample=pd.unique(samples_table.index.values),
        ),
        maps_index=expand(
            OUT_DIR
            / "{sample}"
            / "alignments_intersecting_mirna_uncollapsed_sorted.bam.bai",
            sample=pd.unique(samples_table.index.values),
        ),
        reference=INTERMEDIATES_DIR / "genome_processed.fa.bz",
        regions=config["bed_file"],
        script=SCRIPTS_DIR / "ascii_alignment_pileup.R",
    output:
        piles=PILEUP_DIR / "all/check_file.txt",
    params:
        cluster_log=CLUSTER_LOG / "pileups_whole_run.log",
        out_dir=PILEUP_DIR / "all",
        prefix="all_samples",
    log:
        LOCAL_LOG / "pileups_whole_run.log",
    container:
        "docker://zavolab/ascii-alignment-pileup:1.1.1"
    conda:
        ENV_DIR / "r.yaml"
    shell:
        # {input.maps} expands to a space-separated list of all BAM files
        "(touch {output.piles} && Rscript {input.script} \
        --verbose \
        --annotations={input.annotations} \
        --reference={input.reference} \
        --prefix={params.prefix} \
        --output-directory {params.out_dir} \
        {input.regions} \
        {input.maps} \
        ) &> {log}"


###############################################################################
### Generate ASCII-style pileups (per experiment design)
###############################################################################

if config["lib_dict"] is not None:
    condition = list(config["lib_dict"].keys())

    rule create_per_condition_ascii_pileups:
        input:
            annotations=INTERMEDIATES_DIR / "mirna_annotations.gff3",
            # One BAM (plus index) per library belonging to the condition
            maps=lambda wildcards: expand(
                OUT_DIR
                / "{group}"
                / "alignments_intersecting_mirna_uncollapsed_sorted.bam",
                group=config["lib_dict"][wildcards.condition],
            ),
            maps_index=lambda wildcards: expand(
                OUT_DIR
                / "{group}"
                / "alignments_intersecting_mirna_uncollapsed_sorted.bam.bai",
                group=config["lib_dict"][wildcards.condition],
            ),
            reference=INTERMEDIATES_DIR / "genome_processed.fa.bz",
            regions=config["bed_file"],
            script=SCRIPTS_DIR / "ascii_alignment_pileup.R",
        output:
            piles=PILEUP_DIR / "{condition}" / "check_file_{condition}.txt",
        params:
            cluster_log=CLUSTER_LOG / "pileups_condition_{condition}.log",
            # Direct path instead of a needless one-element expand()
            out_dir=lambda wildcards: PILEUP_DIR / wildcards.condition,
            prefix="{condition}",
        log:
            LOCAL_LOG / "pileups_condition_{condition}.log",
        container:
            "docker://zavolab/ascii-alignment-pileup:1.1.1"
        conda:
            ENV_DIR / "r.yaml"
        shell:
            "(touch {output.piles} && Rscript {input.script} \
            --verbose \
            --annotations={input.annotations} \
            --reference={input.reference} \
            --prefix={params.prefix} \
            --output-directory {params.out_dir} \
            {input.regions} \
            {input.maps} \
            ) &> {log}"