diff --git a/src/Snakefile b/src/Snakefile index 77fceaab..e0129be4 100644 --- a/src/Snakefile +++ b/src/Snakefile @@ -238,5 +238,5 @@ include: 'Snakefiles/7-extractONT.sm' include: 'Snakefiles/7-buildPackage.sm' include: 'Snakefiles/7-generateConsensus.sm' include: 'Snakefiles/7-combineConsensus.sm' -if config['withHIC'] == "True": +if config['withHIC'] == "True" or config['withPOREC'] == 'True': include: 'Snakefiles/8-hicPipeline.sm' diff --git a/src/Snakefiles/8-hicPipeline.sm b/src/Snakefiles/8-hicPipeline.sm index 4dc54fd6..122f141a 100644 --- a/src/Snakefiles/8-hicPipeline.sm +++ b/src/Snakefiles/8-hicPipeline.sm @@ -20,7 +20,7 @@ HIC_READS2 = config.get('HIC_READS2') HAPLO_DIV = float(config.get('haplo_divergence')) VERKKO = config.get('VERKKO') - + rule copyFiles: input: unitig_scfmap = '6-layoutContigs/unitig-popped.layout.scfmap', @@ -75,7 +75,7 @@ chmod +x ./prepare_hic.sh ./prepare_hic.sh > ../{log.err} 2>&1 ''' - + rule runMashMap: input: unitigs_hpc = '8-hicPipeline/unitigs.hpc.fasta' @@ -194,15 +194,15 @@ def splitHICoutputs(wildcards): checkpoint splitHIC: input: hic1 = HIC_READS1, - hic2 = HIC_READS2 + hic2 = HIC_READS2 if config['withPOREC'] == "False" else rules.emptyfile.output[0] output: split_finished = '8-hicPipeline/splitHIC.finished' log: err = '8-hicPipeline/splitHIC.err' params: - bases = config['shc_bases'], + bases = config['shc_bases'] if config['withPOREC'] == "False" else config['spl_bases'], reads = config['shc_reads'], - length = config['shc_min_length'] + length = config['shc_min_length'], threads: int(config['fhc_n_cpus']) resources: @@ -219,16 +219,23 @@ cat > ./splitHIC.sh < ../{log.err} 2>&1 ''' - + rule alignBWA: input: unitigs = '8-hicPipeline/unitigs.fasta', hic1 = '8-hicPipeline/split/hic1{nnnn}.fasta.gz', - hic2 = '8-hicPipeline/split/hic2{nnnn}.fasta.gz', - index = '8-hicPipeline/unitigs.fasta.bwt' + hic2 = '8-hicPipeline/split/hic2{nnnn}.fasta.gz' if config['withPOREC'] == "False" else rules.emptyfile.output[0], + index = '8-hicPipeline/unitigs.fasta.bwt' if config['withPOREC'] == "False" else rules.emptyfile.output[0] output: bwa_mapping = '8-hicPipeline/mapped{nnnn}.bam' log: err = '8-hicPipeline/align_bwa{nnnn}.err' params: - BWA = config.get('BWA', "{VERKKO}/bin/bwa"), - SAMTOOLS = config.get('SAMTOOLS', "{VERKKO}/bin/samtools") + BWA = config.get('BWA', "{VERKKO}/bin/bwa"), + SAMTOOLS = config.get('SAMTOOLS', "{VERKKO}/bin/samtools"), + WINNOWMAP = config.get('WINNOWMAP', "{VERKKO}/bin/winnowmap"), + withporec = config['withPOREC'] threads: int(config['ahc_n_cpus']) resources: @@ -293,13 +302,19 @@ cd 8-hicPipeline cat > ./align_bwa{wildcards.nnnn}.sh < ../{log.err} 2>&1 +./align_bwa{wildcards.nnnn}.sh > ../{log.err} 2>&1 ''' #TODO: bwa and mashmap and samtools bins @@ -308,7 +323,7 @@ rule mergeBWA: split_finished = rules.splitHIC.output.split_finished, index = '8-hicPipeline/unitigs.fasta.bwt', alignments = lambda wildcards: expand("8-hicPipeline/mapped{nnnn}.bam", nnnn = splitHICoutputs(wildcards)) - + output: alignments = '8-hicPipeline/hic_to_assembly.sorted_by_read.bam' log: @@ -324,7 +339,7 @@ rule mergeBWA: n_cpus = int(config['fhc_n_cpus']), mem_gb = lambda wildcards, input, attempt: getMemoryRequest(attempt, 'fhc'), time_h = lambda wildcards, input, attempt: getTimeRequest(attempt, 'fhc') - shell: + shell: ''' cd 8-hicPipeline @@ -332,7 +347,7 @@ cat > ./mergeBWA.sh < ./transform_bwa.sh < ../{output.byread_mapping} +{params.SAMTOOLS} view -q 1 ../{input.bwa_mapping} | {PYTHON} {VERKKO}/scripts/parse_sam_pairs.py > ../{output.byread_mapping} EOF chmod +x ./transform_bwa.sh @@ -401,7 +416,7 @@ rule hicPhasing: n_cpus = int(config['fhc_n_cpus']), mem_gb = lambda wildcards, input, attempt: getMemoryRequest(attempt, 'fhc'), time_h = lambda wildcards, input, attempt: getTimeRequest(attempt, 'fhc') - shell: + shell: ''' cd 8-hicPipeline @@ -415,7 +430,7 @@ chmod +x ./hic_phasing.sh ./hic_phasing.sh > ../{log.err} 2>&1 ''' - + rule runRukkiHIC: input: graph = '8-hicPipeline/unitigs.hpc.noseq.gfa', @@ -424,7 +439,7 @@ rule runRukkiHIC: tsv_path = '8-hicPipeline/rukki.paths.tsv', gaf_path = '8-hicPipeline/rukki.paths.gaf', final_paths_tsv = 'assembly.paths.tsv' - log: + log: err = '8-hicPipeline/rukki_hic.err' threads: int(config['fhc_n_cpus']) @@ -437,7 +452,7 @@ rule runRukkiHIC: ''' cd 8-hicPipeline -cat > ./rukki_hic.sh < ./rukki_hic.sh < j_split[1]): + print (f"{i_split[0]}\t{j_split[1]}\t{i_split[1]}") + # if they are equal we don't want to print anything, not informative + if not sys.stdin.isatty(): input_stream = sys.stdin @@ -18,11 +31,27 @@ else: input_stream = open(input_filename, 'rU') +name = "" +names = [ ] +seen = {} +out_of_order = 0 + for line in input_stream: - spltd[0] = spltd[1] - spltd[1] = line.split() - if spltd[0][0] == spltd[1][0]: - if spltd[0][2] != spltd[1][2]: - print (f"{spltd[0][0]}\t{spltd[0][2]}\t{spltd[1][2]}") -# print (f"{spltd[0][0]}\t{spltd[0][2]}\t{spltd[1][2]}\t{spltd[0][4]}\t{spltd[1][4]}") + line=line.split() + if name == "": + name = line[0] + if name != line[0]: + seen[name] = 1 + print_results(names) + name = line[0] + names = [ ] + if name in seen: + print("Warning: read %s already seen but encountered it again, please confirm your bam file is sorted by read."%(name), file=sys.stderr) + out_of_order += 1 + names.append("%s\t%s"%(line[0], line[2])) + +if out_of_order > 1000: + print("Error: encountered too many unsorted reads (%d), exiting. Please confirm the input bam is sorted by read."%(out_of_order), file=sys.stderr) + sys.exit(1) +print_results(names) diff --git a/src/verkko.sh b/src/verkko.sh index 106161b9..87b16a0a 100755 --- a/src/verkko.sh +++ b/src/verkko.sh @@ -50,6 +50,7 @@ hic2="" withont="False" withhic="False" +withporec="False" keepinter="True" @@ -379,6 +380,17 @@ while [ $# -gt 0 ] ; do shift arg=$1 done + elif [ "$opt" = "--porec" ] ; then + withporec="True" + while [ -e "$arg" ]; do + if [ -e "/$arg" ]; then + hic1="$hic1 $arg" + else + hic1="$hic1 `pwd`/$arg" + fi + shift + arg=$1 + done elif [ "$opt" = "--hic1" ] ; then withhic="True" while [ -e "$arg" ] ; do @@ -709,7 +721,7 @@ elif [ ! -e "$mbg" ] ; then fi # graphaligner and winnowmap are required when we have ONT data -if [ "x$withont" = "xTrue" ] ; then +if [ "x$withont" = "xTrue" -o "x$withporec" = "xTrue" ] ; then if [ "x$graphaligner" = "x" ] ; then errors="${errors}Can't find GraphAligner executable in \$PATH or \$VERKKO/bin/GraphAligner.\n" elif [ ! -e "$graphaligner" ] ; then @@ -733,7 +745,15 @@ if [ ! -z "$screen" -o "x$withhic" = "xTrue" ] ; then fi # bwa samtools and seqtk required for HiC data -if [ "x$withhic" = "xTrue" ] ; then +if [ "x$withhic" = "xTrue" -o "x$withporec" = "xTrue" ] ; then + if [ "x$withhic" = "xTrue" ]; then + if [ "x$hic1" = "x" -o "x$hic2" = "x" ]; then + errors="${errors}Only one of --hic1 and --hic2 specified, both must be specified to run with Hi-C\n" + fi + fi + if [ "x$withhic" = "xTrue" -a "x$withporec" = "xTrue" ]; then + errors="${errors}Both --hic1/--hic2 and --porec cannot be specified at the same time, please only specify one\n" + fi # check that ref for rdna is present and can be found if [ ! -z "$rdna_scaff_ref" ] ; then dtpath="$verkko/data/$rdna_scaff_ref" @@ -781,9 +801,12 @@ if [ "x$help" = "xhelp" -o "x$errors" != "x" ] ; then echo " -d Directory to use for verkko intermediate and final results." echo " Will be created if needed." echo " --hifi List of files containing PacBio HiFi reads." + echo " Input reads can be any combination of FASTA/FASTQ," + echo " uncompressed or gzip/bzip2/xz compressed. Any" + echo " number of files can be supplied; *.gz works." echo " --nano List of files containing Oxford Nanopore reads." echo "" - echo " Input reads can be any combination of FASTA/FASTQ," + echo " Input reads can be any combination of FASTA/FASTQ/SAM/BAM," echo " uncompressed or gzip/bzip2/xz compressed. Any" echo " number of files can be supplied; *.gz works." echo "" @@ -798,11 +821,15 @@ if [ "x$help" = "xhelp" -o "x$errors" != "x" ] ; then echo " --hic1 List of files containing left hic reads." echo " --hic2 List of files containing right hic reads." echo " Order of left and right files should be same, no interlaced files allowed." - echo " Input reads can be any combination of FASTA/FASTQ," + echo " Input reads can be any combination of FASTA/FASTQ/SAM/BAM," + echo " uncompressed or gzip/bzip2/xz compressed. Any" + echo " number of files can be supplied; *.gz works." + echo "" + echo " --porec List of files containing Pore-C reads." + echo " Input reads can be any combination of FASTA/FASTQ/SAM/BAM," echo " uncompressed or gzip/bzip2/xz compressed. Any" echo " number of files can be supplied; *.gz works." echo " --no-rdna-tangle Switch off option that helps to proceed large rDNA tangles which may connect multiple chromosomes." -# echo " --rdna-scaff Switch on experimental HiC scaffolding over rDNA clusters for acrocentric chromosomes. Tested only on primates!" echo " --rdna-scaff-ref Switch to user-supplied reference for HiC scaffolding rather than human rDNA, experimental use to scaffold across other repeat classes." echo " By default, rDNA representatives from CHM13 human assembly are used" echo " Use '--rdna-scaff-ref none' to switch off scaffolding over rdna or other large repeat clusters" @@ -848,7 +875,7 @@ if [ "x$help" = "xhelp" -o "x$errors" != "x" ] ; then echo " snakemake command. Options MUST be quoted." echo "" echo " --sto-run Set resource limits for various stages." - echo " --ovb-run Format: number-of-cpus memory-in-gb time-in-hours" + echo " --ovb-run Format: number-of-cpus memory-in-gb time-in-hours" echo " --ovs-run --cns-run 8 32 2" echo " --red-run" echo " --mbg-run" @@ -961,6 +988,8 @@ for o in ${hic2} ; do echo >> ${outd}/verkko.yml " - '$o'" done echo >> ${outd}/verkko.yml "" +echo >> ${outd}/verkko.yml "withPOREC: '${withporec}'" +echo >> ${outd}/verkko.yml "" echo >> ${outd}/verkko.yml "# Algorithm parameters." echo >> ${outd}/verkko.yml "" echo >> ${outd}/verkko.yml "# buildStore, countKmers and computeOverlaps" @@ -1131,7 +1160,7 @@ echo >> ${outd}/verkko.yml "# This is the end." target="verkko" -if [ "x$withhic" = "xTrue" ] ; then +if [ "x$withhic" = "xTrue" -o "x$withporec" = "xTrue" ] ; then if [ "x$rdna_scaff" = "xTrue" ] ; then target="HiC_rdnascaff" else @@ -1239,7 +1268,7 @@ cd ${outd} #Failed to do it with snakemake -if [ "x$withhic" = "xTrue" ] ; then +if [ "x$withhic" = "xTrue" -o "x$withporec" = "xTrue" ] ; then if [ ! -e "8-hicPipeline/rukki.paths.gaf" ]; then echo "ERROR!" echo "Not running final consensus since no rukki paths provided!"