diff --git a/BSBolt/CallMethylation/ProcessMethylationContigs.py b/BSBolt/CallMethylation/ProcessMethylationContigs.py index 698d500..7c3bf10 100644 --- a/BSBolt/CallMethylation/ProcessMethylationContigs.py +++ b/BSBolt/CallMethylation/ProcessMethylationContigs.py @@ -44,6 +44,7 @@ class ProcessContigs: * *verbose (bool)*: Verbose processing, [False] * *cg_only (bool)*: only return CG sites to queue, [False] * *ignore_oprhans (bool)*: ignore orphaned reads (not properly paired), [True] + * *bedgraph_output* (bool)*: output bedgraph format inplace of CGmap, [False] Usage: @@ -57,7 +58,8 @@ def __init__(self, input_file: str = None, genome_database: str = None, output_p ignore_overlap: bool = True, text_output: bool = False, remove_ccgg: bool = False, min_read_depth: int = 10, max_read_depth: int = 8000, threads: int = 1, verbose: bool = True, min_base_quality: int = 10, min_mapping_quality: int = 10, - ATCGmap: bool = False, cg_only: bool = True, ignore_orphans: bool = False): + ATCGmap: bool = False, cg_only: bool = True, ignore_orphans: bool = False, + bedgraph_output: bool = False): assert isinstance(input_file, str), 'Path to input file not valid' assert isinstance(text_output, bool), 'Not valid bool' assert isinstance(threads, int), 'Threads must be specified with integer' @@ -84,6 +86,7 @@ def __init__(self, input_file: str = None, genome_database: str = None, output_p cg_only=cg_only) self.min_read_depth = min_read_depth self.ATCGmap = ATCGmap + self.bedgraph_output = bedgraph_output self.methylation_calling = True self.contigs = self.get_contigs self.completed_contigs = None @@ -173,7 +176,10 @@ def write_output(self, methylation_lines: List[Tuple[Union[int, float, str]]]): self.write_line(self.output_objects['ATCGmap'], self.format_atcg(meth_line)) # if methylation level greater than or equal to min_read_depth output CGmap and wig lines if meth_line['all_cytosines'] >= self.min_read_depth: - self.write_line(self.output_objects['CGmap'], self.format_cgmap(meth_line)) + if self.bedgraph_output: + self.write_line(self.output_objects['CGmap'], self.format_bedgraph(meth_line)) + else: + self.write_line(self.output_objects['CGmap'], self.format_cgmap(meth_line)) @staticmethod def unpack_meth_line(meth_line: Tuple[Union[int, float, str]]) -> Dict[str, Union[int, float, str]]: @@ -199,18 +205,27 @@ def format_cgmap(meth_line: Dict[str, Union[int, float, str]]) -> str: f'\t{meth_line["meth_cytosines"]}\t{meth_line["all_cytosines"]}\n' return CGmap_line + @staticmethod + def format_bedgraph(meth_line: Dict[str, Union[int, float, str]]) -> str: + """Bedgraph line formatting""" + bedgraph_line = f'{meth_line["chrom"]}\t{meth_line["pos"] - 1}\t{meth_line["pos"]}' \ + f'\t{meth_line["meth_level"] * 100:.3f}\t{meth_line["meth_cytosines"]}' \ + f'\t{meth_line["unmeth_cytosines"]}\n' + return bedgraph_line + @property def get_output_objects(self): output_objects = {} output_path = self.input_file + output_suffix = '.CGmap' if not self.bedgraph_output else '.bg' if self.output_prefix: output_path = self.output_prefix if self.text_output: - output_objects['CGmap'] = open(f'{output_path}.CGmap', 'w') + output_objects['CGmap'] = open(f'{output_path}{output_suffix}', 'w') if self.ATCGmap: output_objects['ATCGmap'] = open(f'{output_path}.ATCGmap', 'w') else: - output_objects['CGmap'] = io.BufferedWriter(gzip.open(f'{output_path}.CGmap.gz', 'wb')) + output_objects['CGmap'] = io.BufferedWriter(gzip.open(f'{output_path}{output_suffix}.gz', 'wb')) if self.ATCGmap: output_objects['ATCGmap'] = io.BufferedWriter(gzip.open(f'{output_path}.ATCGmap.gz', 'wb')) return output_objects diff --git a/BSBolt/Utils/Launcher.py b/BSBolt/Utils/Launcher.py index 2ff9f8c..20f24a2 100644 --- a/BSBolt/Utils/Launcher.py +++ b/BSBolt/Utils/Launcher.py @@ -130,7 +130,8 @@ def launch_methylation_call(arguments): min_mapping_quality=arguments.MQ, cg_only=arguments.CG, ATCGmap=arguments.ATCG, - ignore_orphans=arguments.IO) + ignore_orphans=arguments.IO, + bedgraph_output=arguments.BG) methylation_call.process_contigs() methylation_call.watch_pool() diff --git a/BSBolt/Utils/Parser.py b/BSBolt/Utils/Parser.py index eca68e8..58991a8 100644 --- a/BSBolt/Utils/Parser.py +++ b/BSBolt/Utils/Parser.py @@ -3,7 +3,7 @@ from BSBolt.Utils.ParserHelpMessages import aggregate_help, alignment_help, impute_help, index_help, meth_help, sim_help -parser = argparse.ArgumentParser(description='BiSulfite Bolt v1.3.4', +parser = argparse.ArgumentParser(description='BiSulfite Bolt v1.3.5', usage='BSBolt Module {Module Arguments}') subparsers = parser.add_subparsers(description='Please invoke BSBolt module for help,' @@ -155,6 +155,7 @@ call_meth_parser.add_argument('-MQ', type=int, default=20, help='Minimum alignment quality for an alignment to be ' 'considered for methylation calling, default=20') call_meth_parser.add_argument('-CG', action="store_true", default=False, help='Only output CpG sites in CGmap file') +call_meth_parser.add_argument('-BG', action="store_true", default=False, help='Output call in bedGraph format') call_meth_parser.add_argument('-ATCG', action="store_true", default=False, help='Output ATCGmap file') call_meth_parser.add_argument('-IO', action="store_true", default=False, help='Ignore orphans during methylation call') diff --git a/BSBolt/Utils/ParserHelpMessages.py b/BSBolt/Utils/ParserHelpMessages.py index b46d7d7..87f3271 100644 --- a/BSBolt/Utils/ParserHelpMessages.py +++ b/BSBolt/Utils/ParserHelpMessages.py @@ -81,6 +81,7 @@ -DB File path to index directory -O File output prefix -text output plain text files [False] + -BG output calls in bedGraph format [False] -CG only output CpG sites in CGmap file [False] Algorithm Options: -remove-ccgg remove methylation calls in ccgg sites [False] diff --git a/docs/align.md b/docs/align.md index c650ea1..375ca0b 100644 --- a/docs/align.md +++ b/docs/align.md @@ -80,12 +80,12 @@ Algorithm Options ```shell # Paired End Alignment Using Default Commands -python3 -m BSBolt Align -DB ~/Tests/TestData/BSB_Test_DB -F1 ~/Tests/TestSimulations/BSB_pe_meth_1.fastq -F2 ~/Tests/TestSimulations/BSB_pe_meth_2.fastq -O ~/Tests/BSB_pe_test +python3 -m BSBolt Align -DB ~/tests/TestData/BSB_Test_DB -F1 ~/tests/TestSimulations/BSB_pe_meth_1.fastq -F2 ~/tests/TestSimulations/BSB_pe_meth_2.fastq -O ~/tests/BSB_pe_test ``` #### **Single End Alignment** ```shell # Single End Alignment Using Default Commands -python3 -m BSBolt Align -DB ~/Tests/TestData/BSB_Test_DB -F1 ~/Tests/TestSimulations/BSB_pe_meth_1.fastq -O ~/Tests/BSB_pe_test +python3 -m BSBolt Align -DB ~/tests/TestData/BSB_Test_DB -F1 ~/tests/TestSimulations/BSB_pe_meth_1.fastq -O ~/tests/BSB_pe_test ``` diff --git a/docs/bsb_index.md b/docs/bsb_index.md index 47f7494..824549e 100644 --- a/docs/bsb_index.md +++ b/docs/bsb_index.md @@ -39,18 +39,18 @@ Index Options: #### **WGBS Index Generation Example** ```shell -python3 -m BSBolt Index -G ~/Tests/TestData/BSB_test.fa -DB ~/Tests/TestData/BSB_Test_DB +python3 -m BSBolt Index -G ~/tests/TestData/BSB_test.fa -DB ~/tests/TestData/BSB_Test_DB ``` #### **Masked Alignment Index Generation Example** ```shell -python3 -m BSBolt Index -G ~/Tests/TestData/BSB_test.fa -DB ~/Tests/TestData/BSB_Test_DB -MR /Tests/TestData/test_wgbs_madking.bed +python3 -m BSBolt Index -G ~/tests/TestData/BSB_test.fa -DB ~/tests/TestData/BSB_Test_DB -MR /tests/TestData/test_wgbs_madking.bed ``` #### **RRBS Index Generation Example** ```shell # RRBS Index, MSPI Cut Format, 40bp Lower Fragment Bound, and 400bp Upper Fragment Bound -python3 -m BSBolt Index -G ~/Tests/TestData/BSB_test.fa -DB ~/Tests/TestData/BSB_Test_DB -rrbs -rrbs-cut-format C-CGG -rrbs-lower 40 -rrbs-upper 400 +python3 -m BSBolt Index -G ~/tests/TestData/BSB_test.fa -DB ~/tests/TestData/BSB_Test_DB -rrbs -rrbs-cut-format C-CGG -rrbs-lower 40 -rrbs-upper 400 ``` diff --git a/docs/index.md b/docs/index.md index 82aa2ff..2d93775 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,6 @@ # **BSBolt (BiSulfite Bolt)** -## A fast and safe bisulfite sequencing processing platform +## A fast and safe bisulfite sequencing analysis platform [BiSuflite Bolt (BSBolt)](https://github.com/NuttyLogic/BSBolt); a fast and scalable bisulfite sequencing analysis platform. BSBolt is an integrated analysis platform that offers support for bisulfite sequencing diff --git a/docs/methylation_calling.md b/docs/methylation_calling.md index e3f3faa..f981ede 100644 --- a/docs/methylation_calling.md +++ b/docs/methylation_calling.md @@ -25,9 +25,8 @@ samtools index BSB_pe_test.dup.bam ### BSBolt CallMethylation -Methylation calling outputs a .CGmap file by default. To maintain compatibility with downstream analysis tools -ATCGmap files can by output, but this feature will be deprecation in a future update. Methylation values are called for -all mapped reads at a position by default. +Methylation calling outputs a .CGmap file by default, and makes calls using +all mapped reads not flagged as duplicate at a position by default. #### **BSB CallMethylation Commands** @@ -41,6 +40,7 @@ Input / Output Options: -DB File path to index directory -O File output prefix -text output plain text files [False] + -BG output calls in bedGraph format [False] -CG only output CpG sites in CGmap file [False] Algorithm Options: -remove-ccgg remove methylation calls in ccgg sites [False] @@ -62,10 +62,12 @@ can only be called using reads mapped to Watson and Crick strands respectively. ```shell # Methylation Calling with 2 threads, -python3 -m BSBolt CallMethylation -I ~/Tests/BSB_pe_test.sorted.bam -O ~/Tests/BSB_pe_test -DB ~/Tests/TestData/BSB_Test_DB -t 2 -verbose > methylation_stats.txt +python3 -m BSBolt CallMethylation -I ~/tests/BSB_pe_test.sorted.bam -O ~/tests/BSB_pe_test -DB ~/tests/TestData/BSB_Test_DB -t 2 -verbose > methylation_stats.txt ``` -### **Output Files** +### **Output File** + +**CGmap** CGmap is a tab separated text format describing the methylation status of observed cyotsines @@ -74,8 +76,8 @@ CGmap is a tab separated text format describing the methylation status of observ 3. Position, base-pairs from start 4. Context, three base pair methylation context 5. Sub-Context, two base pair methylation context - 6. Methylation Value, proportion of methylation reads to total reads - 7. Methylation Bases, methylated nucleotides observed + 6. Methylation Value, proportion of methylated bases to total bases + 7. Methylated Bases, methylated nucleotides observed 8. All Bases, total number of nucleotides observed at the mapping position ```text @@ -85,3 +87,15 @@ chr12 G 389290 CHH CT 0.0 0 10 chr13 G 200552 CHH CT 0.0 0 10 chr11 C 142826 CG CG 0.0 0 10 ``` + +**Bedgraph** + +BedGraph is functionally similar to CGmap format without cytosine context information and zero indexed positions. + + 1. Chromosome + 2. Start Position + 4. End Position + 6. Methylation Percentage, percentage of methylated bases to total observed bases + 7. Methylated Bases, methylated nucleotides observed + 8. Unmethylated Bases, total unmethylated bases + diff --git a/docs/read_simulation.md b/docs/read_simulation.md index 8cc92fe..353ae7c 100644 --- a/docs/read_simulation.md +++ b/docs/read_simulation.md @@ -64,5 +64,5 @@ Algorithm Options: #### **Simulate Paired End, Undirectional Bisulfite Reads** ```shell -python3 -m BSBolt Simulate -G ~/Tests/TestData/BSB_test.fa -O ~/Tests/TestSimulations/BSB_pe -U -PE +python3 -m BSBolt Simulate -G ~/tests/TestData/BSB_test.fa -O ~/tests/TestSimulations/BSB_pe -U -PE ``` diff --git a/docs/simulation.md b/docs/simulation.md index 4ff298e..1ab2783 100644 --- a/docs/simulation.md +++ b/docs/simulation.md @@ -76,5 +76,5 @@ error simulation. **Simulate Paired End, Undirectional Bisulfite Reads** ```shell -python3 -m BSBolt Simulate -G ~/Tests/TestData/BSB_test.fa -O ~/Tests/TestSimulations/BSB_pe -U -PE +python3 -m BSBolt Simulate -G ~/tests/TestData/BSB_test.fa -O ~/tests/TestSimulations/BSB_pe -U -PE ``` \ No newline at end of file diff --git a/setup.py b/setup.py index 6c075ab..d8c09f7 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,6 @@ """ - class BuildError(Exception): """Build error""" pass @@ -110,7 +109,7 @@ def run(self): setup(name='BSBolt', - version='1.3.4', + version='1.3.5', description='Bisulfite Sequencing Processing Platform', long_description=long_description, long_description_content_type="text/markdown", diff --git a/tests/test_alignment_pipeline.py b/tests/test_alignment_pipeline.py index ffeaf91..c3b4be3 100644 --- a/tests/test_alignment_pipeline.py +++ b/tests/test_alignment_pipeline.py @@ -48,6 +48,13 @@ subprocess.run(bs_call_methylation_args) print('Methylation Values Called') +bs_call_methylation_args = ['python3', '-m', 'BSBolt', 'CallMethylation', '-I', + f'{bsb_directory}tests/BSB_pe_test.sorted.bam', + '-O', f'{bsb_directory}tests/BSB_pe_test', + '-DB', f'{bsb_directory}tests/TestData/BSB_Test_DB', + '-t', '6', '-verbose', '-BQ', '10', '-MQ', '20', '-BG'] +subprocess.run(bs_call_methylation_args) + # retrieve reference and test alignments test_alignments = f'{bsb_directory}tests/BSB_pe_test.sorted.bam' test_fastqs = [f'{bsb_directory}tests/TestSimulations/BSB_pe_1.fq',