Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial porec support #250

Merged
merged 3 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -238,5 +238,5 @@ include: 'Snakefiles/7-extractONT.sm'
include: 'Snakefiles/7-buildPackage.sm'
include: 'Snakefiles/7-generateConsensus.sm'
include: 'Snakefiles/7-combineConsensus.sm'
if config['withHIC'] == "True":
if config['withHIC'] == "True" or config['withPOREC'] == 'True':
include: 'Snakefiles/8-hicPipeline.sm'
91 changes: 53 additions & 38 deletions src/Snakefiles/8-hicPipeline.sm
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ HIC_READS2 = config.get('HIC_READS2')
HAPLO_DIV = float(config.get('haplo_divergence'))
VERKKO = config.get('VERKKO')


rule copyFiles:
input:
unitig_scfmap = '6-layoutContigs/unitig-popped.layout.scfmap',
Expand Down Expand Up @@ -75,7 +75,7 @@ chmod +x ./prepare_hic.sh

./prepare_hic.sh > ../{log.err} 2>&1
'''

rule runMashMap:
input:
unitigs_hpc = '8-hicPipeline/unitigs.hpc.fasta'
Expand Down Expand Up @@ -194,15 +194,15 @@ def splitHICoutputs(wildcards):
checkpoint splitHIC:
input:
hic1 = HIC_READS1,
hic2 = HIC_READS2
hic2 = HIC_READS2 if config['withPOREC'] == "False" else rules.emptyfile.output[0]
output:
split_finished = '8-hicPipeline/splitHIC.finished'
log:
err = '8-hicPipeline/splitHIC.err'
params:
bases = config['shc_bases'],
bases = config['shc_bases'] if config['withPOREC'] == "False" else config['spl_bases'],
reads = config['shc_reads'],
length = config['shc_min_length']
length = config['shc_min_length'],
threads:
int(config['fhc_n_cpus'])
resources:
Expand All @@ -219,16 +219,23 @@ cat > ./splitHIC.sh <<EOF
set -e

mkdir -p split

{PYTHON} {VERKKO}/scripts/fasta_partition.py \\\\
partition split/hic1 {params.bases} {params.reads} {params.length} \\\\
{input.hic1} \\\\
&& \\\\
{PYTHON} {VERKKO}/scripts/fasta_partition.py \\\\
partition split/hic2 {params.bases} {params.reads} {params.length} \\\\
{input.hic2} \\\\
&& \\\\
touch ../{output.split_finished}
if [ "{input.hic2}" = "emptyfile" ]; then
{PYTHON} {VERKKO}/scripts/fasta_partition.py \\\\
partition split/hic1 {params.bases} {params.reads} {params.length} \\\\
{input.hic1} \\\\
&& \\\\
touch ../{output.split_finished}
else
{PYTHON} {VERKKO}/scripts/fasta_partition.py \\\\
partition split/hic1 {params.bases} {params.reads} {params.length} \\\\
{input.hic1} \\\\
&& \\\\
{PYTHON} {VERKKO}/scripts/fasta_partition.py \\\\
partition split/hic2 {params.bases} {params.reads} {params.length} \\\\
{input.hic2} \\\\
&& \\\\
touch ../{output.split_finished}
fi
EOF

chmod +x ./splitHIC.sh
Expand Down Expand Up @@ -265,20 +272,22 @@ chmod +x ./index_bwa.sh
./index_bwa.sh > ../{log.err} 2>&1

'''

rule alignBWA:
input:
unitigs = '8-hicPipeline/unitigs.fasta',
hic1 = '8-hicPipeline/split/hic1{nnnn}.fasta.gz',
hic2 = '8-hicPipeline/split/hic2{nnnn}.fasta.gz',
index = '8-hicPipeline/unitigs.fasta.bwt'
hic2 = '8-hicPipeline/split/hic2{nnnn}.fasta.gz' if config['withPOREC'] == "False" else rules.emptyfile.output[0],
index = '8-hicPipeline/unitigs.fasta.bwt' if config['withPOREC'] == "False" else rules.emptyfile.output[0]
output:
bwa_mapping = '8-hicPipeline/mapped{nnnn}.bam'
log:
err = '8-hicPipeline/align_bwa{nnnn}.err'
params:
BWA = config.get('BWA', "{VERKKO}/bin/bwa"),
SAMTOOLS = config.get('SAMTOOLS', "{VERKKO}/bin/samtools")
BWA = config.get('BWA', "{VERKKO}/bin/bwa"),
SAMTOOLS = config.get('SAMTOOLS', "{VERKKO}/bin/samtools"),
WINNOWMAP = config.get('WINNOWMAP', "{VERKKO}/bin/winnowmap"),
withporec = config['withPOREC']
threads:
int(config['ahc_n_cpus'])
resources:
Expand All @@ -293,13 +302,19 @@ cd 8-hicPipeline
cat > ./align_bwa{wildcards.nnnn}.sh <<EOF
#!/bin/sh
set -e
echo "---Mapping {input.hic1} and {input.hic2} to {input.unitigs} with BWA"
{params.BWA} mem -t {threads} -5 -S -P ../{input.unitigs} ../{input.hic1} ../{input.hic2} | {params.SAMTOOLS} view -bh -@ {threads} -q 1 -F 0x800 - | {params.SAMTOOLS} sort -n -@ {threads} - -o ../{output.bwa_mapping}
if [ {params.withporec} = "False" ] ; then
echo "---Mapping {input.hic1} and {input.hic2} to {input.unitigs} with BWA"
{params.BWA} mem -t {threads} -5 -S -P ../{input.unitigs} ../{input.hic1} ../{input.hic2} | {params.SAMTOOLS} view -bh -@ {threads} -q 1 -F 0x900 - | {params.SAMTOOLS} sort -n -@ {threads} - -o ../{output.bwa_mapping}
else
echo "--Mapping {input.hic1} to {input.unitigs} with winnowmap"
mem=\`expr {resources.mem_gb} \/ {threads} | awk '{{print \$1"G"}}' \`
{params.WINNOWMAP} -ax map-ont -t {threads} -f 0.0002 -K \$mem ../{input.unitigs} ../{input.hic1} | {params.SAMTOOLS} view -T ../{input.unitigs} -bh -@ {threads} -q 5 -F 0x100 - | {params.SAMTOOLS} sort -n -@ {threads} - -o ../{output.bwa_mapping}
fi
EOF

chmod +x ./align_bwa{wildcards.nnnn}.sh

./align_bwa{wildcards.nnnn}.sh > ../{log.err} 2>&1
./align_bwa{wildcards.nnnn}.sh > ../{log.err} 2>&1
'''

#TODO: bwa and mashmap and samtools bins
Expand All @@ -308,7 +323,7 @@ rule mergeBWA:
split_finished = rules.splitHIC.output.split_finished,
index = '8-hicPipeline/unitigs.fasta.bwt',
alignments = lambda wildcards: expand("8-hicPipeline/mapped{nnnn}.bam", nnnn = splitHICoutputs(wildcards))

output:
alignments = '8-hicPipeline/hic_to_assembly.sorted_by_read.bam'
log:
Expand All @@ -324,15 +339,15 @@ rule mergeBWA:
n_cpus = int(config['fhc_n_cpus']),
mem_gb = lambda wildcards, input, attempt: getMemoryRequest(attempt, 'fhc'),
time_h = lambda wildcards, input, attempt: getTimeRequest(attempt, 'fhc')
shell:
shell:
'''
cd 8-hicPipeline

cat > ./mergeBWA.sh <<EOF
#!/bin/sh
set -e

{params.SAMTOOLS} merge -@ {threads} ../{output.alignments} {params.alignments}
{params.SAMTOOLS} merge -n -@ {threads} ../{output.alignments} {params.alignments}

if [ {params.keepinter} = False ] ; then
rm -f {params.alignments}
Expand Down Expand Up @@ -373,7 +388,7 @@ cd 8-hicPipeline
cat > ./transform_bwa.sh <<EOF
#!/bin/sh
set -e
{params.SAMTOOLS} view -F 0x800 -q 1 ../{input.bwa_mapping} | {PYTHON} {VERKKO}/scripts/parse_sam_pairs.py > ../{output.byread_mapping}
{params.SAMTOOLS} view -q 1 ../{input.bwa_mapping} | {PYTHON} {VERKKO}/scripts/parse_sam_pairs.py > ../{output.byread_mapping}
EOF

chmod +x ./transform_bwa.sh
Expand Down Expand Up @@ -401,7 +416,7 @@ rule hicPhasing:
n_cpus = int(config['fhc_n_cpus']),
mem_gb = lambda wildcards, input, attempt: getMemoryRequest(attempt, 'fhc'),
time_h = lambda wildcards, input, attempt: getTimeRequest(attempt, 'fhc')
shell:
shell:
'''
cd 8-hicPipeline

Expand All @@ -415,7 +430,7 @@ chmod +x ./hic_phasing.sh

./hic_phasing.sh > ../{log.err} 2>&1
'''

rule runRukkiHIC:
input:
graph = '8-hicPipeline/unitigs.hpc.noseq.gfa',
Expand All @@ -424,7 +439,7 @@ rule runRukkiHIC:
tsv_path = '8-hicPipeline/rukki.paths.tsv',
gaf_path = '8-hicPipeline/rukki.paths.gaf',
final_paths_tsv = 'assembly.paths.tsv'
log:
log:
err = '8-hicPipeline/rukki_hic.err'
threads:
int(config['fhc_n_cpus'])
Expand All @@ -437,7 +452,7 @@ rule runRukkiHIC:
'''
cd 8-hicPipeline

cat > ./rukki_hic.sh <<EOF
cat > ./rukki_hic.sh <<EOF
#!/bin/sh
set -e
echo "---Running rukki on the resulting clustering"
Expand Down Expand Up @@ -470,17 +485,17 @@ chmod +x ./rukki_hic.sh

rule HiC_rdnascaff:
input:
graph = '8-hicPipeline/unitigs.hpc.noseq.gfa',
graph = '8-hicPipeline/unitigs.hpc.noseq.gfa',
tsv_path = '8-hicPipeline/rukki.paths.tsv',
gaf_path = '8-hicPipeline/rukki.paths.gaf',
unitigs_telo = '8-hicPipeline/unitigs.telo',
unitigs_rdnamatches = '8-hicPipeline/rdna_mashmap.out'
unitigs_rdnamatches = '8-hicPipeline/rdna_mashmap.out'
output:
scaff_tsv_path = '8-hicPipeline/scaff_rukki.paths.tsv',
scaff_gaf_path = '8-hicPipeline/scaff_rukki.paths.gaf',
prescaf_tsv_path = '8-hicPipeline/prescaf_rukki.paths.tsv',
prescaf_gaf_path = '8-hicPipeline/prescaf_rukki.paths.gaf',
log:
log:
err = '8-hicPipeline/hic_scaff.err'
threads:
int(config['fhc_n_cpus'])
Expand All @@ -499,11 +514,11 @@ set -e
{PYTHON} {VERKKO}/scripts/rdna_scaff.py .


cp ../{input.tsv_path} ../{output.prescaf_tsv_path}
cp ../{input.gaf_path} ../{output.prescaf_gaf_path}
cp ../{input.tsv_path} ../{output.prescaf_tsv_path}
cp ../{input.gaf_path} ../{output.prescaf_gaf_path}

cp ../{output.scaff_tsv_path} ../{input.tsv_path}
cp ../{output.scaff_gaf_path} ../{input.gaf_path}
cp ../{output.scaff_tsv_path} ../{input.tsv_path}
cp ../{output.scaff_gaf_path} ../{input.gaf_path}

cp ../{input.tsv_path} ../{rules.runRukkiHIC.output.final_paths_tsv}
EOF
Expand Down
43 changes: 36 additions & 7 deletions src/scripts/parse_sam_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,20 @@
import re
import os

spltd =[['no_read'],['no_read']]
def print_results(names):
for i in range(len(names)):
for j in range(i+1, len(names)):
i_split = names[i].split()
j_split = names[j].split()
if (i_split[0] != j_split[0]):
print("Error: name read %s and %s don't match"%(i_split, j_split), file=sys.stderr)
sys.exit(1)
if (i_split[1] < j_split[1]):
print (f"{i_split[0]}\t{i_split[1]}\t{j_split[1]}")
elif (i_split[1] > j_split[1]):
print (f"{i_split[0]}\t{j_split[1]}\t{i_split[1]}")
# if they are equal we don't want to print anything, not informative

if not sys.stdin.isatty():
input_stream = sys.stdin

Expand All @@ -18,11 +31,27 @@
else:
input_stream = open(input_filename, 'rU')

name = ""
names = [ ]
seen = {}
out_of_order = 0

for line in input_stream:
spltd[0] = spltd[1]
spltd[1] = line.split()
if spltd[0][0] == spltd[1][0]:
if spltd[0][2] != spltd[1][2]:
print (f"{spltd[0][0]}\t{spltd[0][2]}\t{spltd[1][2]}")
# print (f"{spltd[0][0]}\t{spltd[0][2]}\t{spltd[1][2]}\t{spltd[0][4]}\t{spltd[1][4]}")
line=line.split()
if name == "":
name = line[0]
if name != line[0]:
seen[name] = 1
print_results(names)
name = line[0]
names = [ ]
if name in seen:
print("Warning: read %s already seen but encountered it again, please confirm your bam file is sorted by read."%(name), file=sys.stderr)
out_of_order += 1
names.append("%s\t%s"%(line[0], line[2]))

if out_of_order > 1000:
print("Error: encountered too many unsorted reads (%d), exiting. Please confirm the input bam is sorted by read."%(out_of_order), file=sys.stderr)
sys.exit(1)

print_results(names)
Loading
Loading