Skip to content

Commit

Permalink
Merge pull request #129 from NuttyLogic/prosper
Browse files Browse the repository at this point in the history
Prosper
  • Loading branch information
NuttyLogic authored Feb 14, 2022
2 parents ca323ed + c2c6853 commit 5a191b0
Show file tree
Hide file tree
Showing 17 changed files with 781 additions and 43 deletions.
15 changes: 14 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ __pycache__/

# C extensions
*.so
config*
*.cache
*.dylib

#htslib external
bsbolt/External/HTSLIB/config.h

working.py

# Distribution / packaging
.Python
Expand Down Expand Up @@ -144,4 +152,9 @@ scratch*
/bsbolt/External/BWA/src/*.o

/bsbolt/External/HTSLIB/config.log
/bsbolt/External/HTSLIB/config.status
/bsbolt/External/HTSLIB/config.status
bsbolt/External/HTSLIB/config.h
bsbolt/External/HTSLIB/config.mk
bsbolt/External/HTSLIB/htslib_static.mk
bsbolt/External/HTSLIB/htslib-uninstalled.pc
bsbolt/External/HTSLIB/htslib.pc.tmp
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,15 @@ doi:10.1101/2020.10.06.328559](https://academic.oup.com/gigascience/article/10/5
Documentation can be found at [https://bsbolt.readthedocs.io](https://bsbolt.readthedocs.io/en/latest/).

## Release Notes
- v1.5.0
- Improved thread handling for methylation / variant calling.
- Experimental bisulfite aware SNP caller.
- v1.4.8
- Fixed bug ending alignment when the reference template end greater than reference boundary.
- v1.4.7
- Alignment stats fix.
- Alignment stats fix.
- v1.4.6
- Alignment statistics now output as generated.
- Alignment statistics now output as generated.
- Fixed bug where alignment would stop when observed mappability was low.
- v1.4.5
- Fixed maximum read depth bug that prevented methylation call on site covered by greater than 8000 reads
Expand All @@ -48,7 +51,7 @@ pip3 install bsbolt --user

### **Conda Installation**

BSBolt can be installed using the conda package manager using the instructions below.
BSBolt can be installed using the conda package manager using the instructions below.

```shell
conda config --add channels bioconda
Expand Down Expand Up @@ -91,14 +94,14 @@ xcode-select --install
# install autoconf
brew install autoconf
# install automake
brew installa automake
brew install automake
# optionally install python > 3.5
brew install python3.8
# clone the repository
git clone https://github.com/NuttyLogic/BSBolt.git
cd bsbolt
cd BSBolt
# compile and install package
pip3 install .
pip3 install -e .
```
## Usage

Expand All @@ -120,6 +123,3 @@ Impute kNN Imputation
Sort Sort BAM File
BamIndex Index BAM file
```



5 changes: 3 additions & 2 deletions bsbolt/CallMethylation/ProcessMethylationContigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,15 +142,16 @@ def watch_pool(self):
if self.verbose:
pbar = tqdm(total=len(self.contigs), desc='Processing Contigs')
while self.methylation_calling:
methylation_lines: list = self.return_queue.get(block=True)
if self.verbose:
if len(self.completed_contigs) != contigs_complete:
update_number = len(self.completed_contigs) - contigs_complete
contigs_complete = len(self.completed_contigs)
pbar.update(update_number)
self.write_output(methylation_lines)
if len(self.completed_contigs) == len(self.contigs) and self.return_queue.empty():
self.methylation_calling = False
elif not self.return_queue.empty():
methylation_lines: list = self.return_queue.get(block=True)
self.write_output(methylation_lines)
if self.verbose:
pbar.close()
for out in self.output_objects.values():
Expand Down
10 changes: 5 additions & 5 deletions bsbolt/External/WGSIM/src/wgsim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ void wgsim_core(const char *fn, int is_hap, uint64_t N, int dist, int std_dev, i
s[0] = size[0]; s[1] = size[1];

// generate the read sequences
target = rseq[0].s; // haplotype from which the reads are generated
target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
int start[2] = {pos, pos + insert_size + size_l};
int end[2] = {start[0], start[1]};
Expand Down Expand Up @@ -477,7 +477,7 @@ void wgsim_core(const char *fn, int is_hap, uint64_t N, int dist, int std_dev, i
static int simu_usage()
{
fprintf(stderr, "\n");
fprintf(stderr, "Forked wgsim (Heng Li) (short read simulator) for simulation of bisuflite treated reads\n");
fprintf(stderr, "Forked wgsim (Heng Li) (short read simulator) for simulation of bisulfite treated reads\n");
fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
fprintf(stderr, "Contact: Colin Farrell <[email protected]>\n\n");
fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> \n\n");
Expand All @@ -492,7 +492,7 @@ static int simu_usage()
fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
fprintf(stderr, " -S INT seed for random generator [-1]\n");
fprintf(stderr, " -A FLOAT disgard if the fraction of ambiguous bases higher than FLOAT [%.2f]\n", MAX_N_RATIO);
fprintf(stderr, " -h haplotype mode\n");
fprintf(stderr, " -h INT haplotype mode\n");
fprintf(stderr, "\n");
return 1;
}
Expand All @@ -507,7 +507,7 @@ int main(int argc, char *argv[])
N = 1000000; dist = 500; std_dev = 50;
mean_insert = (dist - size_l * 2) / 2;
size_l = size_r = 70;
while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:A:I:")) >= 0) {
while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:h:X:S:A:I:")) >= 0) {
switch (c) {
case 'd': dist = atoi(optarg); break;
case 's': std_dev = atoi(optarg); break;
Expand All @@ -520,7 +520,7 @@ int main(int argc, char *argv[])
case 'X': INDEL_EXTEND = atof(optarg); break;
case 'A': MAX_N_RATIO = atof(optarg); break;
case 'S': seed = atoi(optarg); break;
case 'h': is_hap = 1; break;
case 'h': is_hap = atoi(optarg); break;
case 'I': mean_insert = atoi(optarg); break;
}
}
Expand Down
18 changes: 9 additions & 9 deletions bsbolt/Simulate/SimulateMethylatedReads.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ def __init__(self, reference_file: str = None, sim_output: str = None,
'-X', str(indel_extension_probability), '-S', str(random_seed),
'-A', str(ambiguous_base_cutoff), '-I', str(mean_insert_size)]
if haplotype_mode:
self.sim_command.append('-h')
self.sim_command.extend(['-h', '1'])
else:
self.sim_command.extend(['-h', '0'])
self.sim_db = SetCytosineMethylation(reference_file=reference_file,
sim_dir=sim_output,
methylation_reference=methylation_reference,
Expand Down Expand Up @@ -127,10 +129,11 @@ def output_reference(self):
self.sim_db.sim_db.output_contig(self.contig_values, self.current_contig, values=True)
self.contig_values = {}

def process_read_group(self, sim_data):
def process_read_group(self, sim_data, stranded_capture=False):
"""Set read methylation values, randomly assign reads to Watson or Crick strand"""
# randomly select reference strand
sub_pattern = ('C', 'T') if self.random_roll(0.5) else ('G', 'A')
random_roll = self.random_roll(0.5) if not stranded_capture else True
sub_pattern = ('C', 'T') if random_roll else ('G', 'A')
if sub_pattern[0] == 'G':
temp_sim = sim_data[1]
sim_data[1] = sim_data[2]
Expand Down Expand Up @@ -217,8 +220,7 @@ def set_variant_methylation(self, meth_pos, seq_base, variant_type='X') -> Tuple
methyl_status, context = self.set_base_methylation(meth_pos)
if methyl_status:
return seq_base.lower(), cigar_type
else:
return seq_base, cigar_type.lower()
return seq_base, cigar_type.lower()

def handle_match(self, seq_base, ref_base, pos):
if seq_base != ref_base:
Expand All @@ -227,8 +229,7 @@ def handle_match(self, seq_base, ref_base, pos):
methyl_status, context = self.set_base_methylation(pos)
if methyl_status:
return seq_base.lower(), 'C' if context else 'Y'
else:
return seq_base, 'c' if context else 'y'
return seq_base, 'c' if context else 'y'

def set_base_methylation(self, methyl_position):
"""Perform random roll to set methylated bases. Update reference values if *self.collect_sim_stats=True*"""
Expand Down Expand Up @@ -263,8 +264,7 @@ def get_methylation_reference(self, contig: str, variant_data: Union[bool, Dict]
contig_profile = self.sim_db.get_contig_methylation(contig)
self.sim_db.set_variant_methylation(variant_data, contig_profile, self.current_contig)
return contig_profile
else:
return self.sim_db.get_contig_methylation(contig)
return self.sim_db.get_contig_methylation(contig)

@property
def get_output_objects(self):
Expand Down
3 changes: 2 additions & 1 deletion bsbolt/Simulate/StreamSim.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def collect_variant_info(self, formatted_line):
@staticmethod
def process_variant_line(formatted_line):
chrom, pos, reference, alt, heterozygous = formatted_line.split('\t')
het = True if heterozygous == '+' else False
pos = int(pos)
indel = 0
iupac = None
Expand All @@ -75,7 +76,7 @@ def process_variant_line(formatted_line):
indel = -1
else:
iupac = retrieve_iupac(alt)
return dict(chrom=chrom, pos=pos, reference=reference, alt=alt, heterozygous=heterozygous,
return dict(chrom=chrom, pos=pos, reference=reference, alt=alt, heterozygous=het,
indel=indel, iupac=iupac)

@staticmethod
Expand Down
18 changes: 17 additions & 1 deletion bsbolt/Utils/Launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import time
from bsbolt.Align.AlignReads import BisulfiteAlignmentAndProcessing
from bsbolt.CallMethylation.ProcessMethylationContigs import ProcessContigs
from bsbolt.Variant.ProcessVariantRegions import ProcessVarContigs
from bsbolt.Impute.kNN_Impute import ImputeMissingValues
from bsbolt.Index.RRBSIndex import RRBSBuild
from bsbolt.Index.WholeGenomeIndex import WholeGenomeBuild
Expand Down Expand Up @@ -125,6 +126,7 @@ def launch_methylation_call(arguments):
genome_database=arguments.DB,
output_prefix=arguments.O,
remove_ccgg=arguments.remove_ccgg,
ignore_overlap=arguments.ignore_ov,
text_output=arguments.text,
min_read_depth=arguments.min,
threads=arguments.t,
Expand Down Expand Up @@ -198,11 +200,25 @@ def launch_imputation(arguments):
impute.output_imputed_matrix()


def launch_variant_call(arguments):
var_call = ProcessVarContigs(input_file=arguments.I, genome_database=arguments.DB, output_prefix=arguments.O,
ignore_overlap=arguments.ignore_ov, text_output=arguments.text,
min_read_depth=arguments.min, max_read_depth=arguments.max, threads=arguments.t,
verbose=arguments.verbose, min_base_quality=arguments.BQ,
min_mapping_quality=arguments.MQ, ignore_orphans=arguments.IO,
bed_output=arguments.BED, vcf_output=arguments.VCF,
output_reference_calls=arguments.OR, call_region_bed=arguments.BR,
min_pval=arguments.MP)
var_call.process_contigs()
var_call.watch_pool()


bsb_launch = {'Index': launch_index,
'Align': launch_alignment,
'CallMethylation': launch_methylation_call,
'AggregateMatrix': launch_matrix_aggregation,
'Simulate': launch_simulation,
'Impute': launch_imputation,
'Sort': launch_sort_bam,
'BamIndex': launch_index_bam}
'BamIndex': launch_index_bam,
'CallVariation':launch_variant_call}
40 changes: 36 additions & 4 deletions bsbolt/Utils/Parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import argparse

from bsbolt.Utils.ParserHelpMessages import aggregate_help, alignment_help, impute_help, index_help, meth_help, sim_help
from bsbolt.Utils.ParserHelpMessages import aggregate_help, alignment_help, impute_help
from bsbolt.Utils.ParserHelpMessages import index_help, meth_help, sim_help, variant_help


parser = argparse.ArgumentParser(description='BiSulfite Bolt v1.4.8',
parser = argparse.ArgumentParser(description='BiSulfite Bolt v1.5.0',
usage='bsbolt Module {Module Arguments}')

subparsers = parser.add_subparsers(description='Please invoke bsbolt module for help,'
Expand All @@ -21,6 +22,8 @@
imputation_parser = subparsers.add_parser('Impute', help='kNN Imputation', add_help=False, usage=impute_help)
sort_parser = subparsers.add_parser('Sort', help='BAM Sort')
bam_index = subparsers.add_parser('BamIndex', help='BAM Index')
variant_parser = subparsers.add_parser('CallVariation', help="Genetic Variation Calling",
add_help=False, usage=variant_help)
# Add Alignment Parser Commands

align_parser.add_argument('-F1', type=str, default=None, help='path to fastq 1', required=True)
Expand Down Expand Up @@ -95,7 +98,7 @@
'output all in XA [100,200]',
required=False)
align_parser.add_argument('-DR', type=float, default=0.95, help='drop ratio for alternative hits reported in XA tag, '
'for best bisulifte alignment performance set at or '
'for best bisulfite alignment performance set at or '
'above default [0.95]')
align_parser.add_argument('-M', action='store_true', default=False,
help='mark shorter split hits as secondary',
Expand Down Expand Up @@ -210,7 +213,7 @@
sim_parser.add_argument('-NS', default=True, action='store_false', help='By default observed methylation counts are '
'saved, disable this behavior')
sim_parser.add_argument('-SE', type=float, default=0.001, help='Sequencing Error [0.001]')
sim_parser.add_argument('-NF', type=float, default=0.05, help='Cutoff threshold for amibiguous bases, simulated reads '
sim_parser.add_argument('-NF', type=float, default=0.05, help='Cutoff threshold for ambiguous bases, simulated reads '
'with a proportion of ambiguous bases above this '
'threshold will not be output [0.05]')
sim_parser.add_argument('-FM', type=int, default=400, help='Max fragment size [400]')
Expand Down Expand Up @@ -242,3 +245,32 @@
# add bam indexing args

bam_index.add_argument('-I', type=str, required=True, help='BAM input path')

# variant parser args

variant_parser.add_argument('-I', type=str, required=True,
help='Input BAM, input file must be in BAM format with index file')
variant_parser.add_argument('-DB', type=str, required=True, help='Path to index directory')
variant_parser.add_argument('-O', type=str, required=True, help='Output prefix')
variant_parser.add_argument('-verbose', action="store_true", default=False, help='Verbose Output')
variant_parser.add_argument('-text', action="store_true", default=False,
help='Output plain text files, default=False')
variant_parser.add_argument('-ignore-ov', action="store_true", default=True, help='Only consider higher quality base '
'when paired end reads overlap, '
'default=True')
variant_parser.add_argument('-max', type=int, default=8000, help='Max read depth to call variation, default=8000')
variant_parser.add_argument('-min', type=int, default=10, help='Minimum read depth required to '
'call variation, default=10')
variant_parser.add_argument('-t', type=int, default=1, help='Number of threads to use when calling variation, default=1')
variant_parser.add_argument('-BQ', type=int, default=10, help='Minimum base quality for a base to considered for'
'variant calling, default=0')
variant_parser.add_argument('-MQ', type=int, default=20, help='Minimum alignment quality for an alignment to be '
'considered for variant calling, default=20')
variant_parser.add_argument('-BED', action="store_false", default=True, help='Do not output calls in bed format, '
'default=True')
variant_parser.add_argument('-VCF', action="store_true", default=False, help='Output VCF file, default=False')
variant_parser.add_argument('-IO', action="store_true", default=False, help='Ignore orphans during variation call')
variant_parser.add_argument('-BR', type=str, default=None, help='Regions to call variation in bed format')
variant_parser.add_argument('-OR', action='store_true', help='Output calls for bases without observed variation')
variant_parser.add_argument('-MP', type=float, default=0.001, help='Minimum genotype call p value '
'to output, default 0.001')
27 changes: 26 additions & 1 deletion bsbolt/Utils/ParserHelpMessages.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
'''

meth_help = '''
bsbolt Module CallMethylation -I {input.bam} -DB {bsbolt DB} -O {output prefix}
bsbolt CallMethylation -I {input.bam} -DB {bsbolt DB} -O {output prefix}
-h, --help show this help message and exit
Expand Down Expand Up @@ -162,4 +162,29 @@
-verbose verbose imputation
-O File output path for imputed matrix
-R randomize batches
'''

variant_help = '''
bsbolt CallVariation -I {input.bam} -DB {bsbolt DB} -O {output prefix}
-h, --help show this help message and exit
Options
-I File Input BAM, input file must be in BAM format with index file
-DB File Path to index directory
-O File Output prefix
-verbose Verbose Output
-text Output plain text files, default=False
-ignore-ov Only consider higher quality base when paired end reads overlap, default=True
-max Int Max read depth to call variation, default=8000
-min Int Minimum read depth required to call variation, default=10
-t Int Number of threads to use when calling variation, default=1
-BQ Int Minimum base quality for a base to considered for variant calling, default=0
-MQ Int Minimum alignment quality for an alignment to be considered for variant calling, default=20
-BED Do not output calls in bed format, default=True
-VCF Output VCF file, default=False
-IO Ignore orphans during variation call
-OR Output reference calls, default=False
-BR File Regions to call variation in bed format
-MR Float Minimum genotype call p value to output, default 0.00
'''
Loading

0 comments on commit 5a191b0

Please sign in to comment.