# Snakefile-base

from collections import namedtuple
import pandas as pd
from snakemake.utils import validate
import sys
import time
from spellbook import spells
from shell import shell # override the default shell

# Run shell commands through bash, using the `shell` object imported above
# (which overrides the default shell).
shell.executable('bash')

# Import config
configfile: "config.yaml"

# Set the working directory to "working/"; you can change this to anything.
# The rationale is that you can git clone the pipeline and run it without doing any juggling.
# Note that most things need to be prefixed with ../ to refer to the current directory,
# but there are exceptions like when reading cfg files and conda environments.
WORKDIR = os.path.abspath("working/")
workdir: WORKDIR

# Globally define constraints on common wildcards.
# The patterns are raw strings so that regex escapes such as \. and \d are passed
# through verbatim instead of being parsed as (invalid) Python string escapes.
wildcard_constraints:
    # Match a polishing prefix that ends with a fullstop, otherwise match an empty group.
    # NOTE One cannot use ^$ or the like here, as this pattern is inserted into another pattern by snakemake
    polishedprefix=r".*\.|()",
    polisher=r"\w+",
    # NOTE(review): [A-z] also matches the ASCII punctuation that sits between "Z" and "a"
    # (including "_"); left as-is because narrowing it could reject existing names.
    readtype=r"[A-z]+",
    # Optionally match an integer added after readtype
    depth=r"\d*?",
    iteration=r"\d+",

def base_onstart():
    """Prepare the run: wrap shell commands, patch benchmarking and index jobs by sample.

    Side effects, in order:
      * every shell command is prefixed with ``set -euo pipefail`` and a touch of
        ``flags/{jobid}.start`` under WORKDIR, and suffixed with a touch of
        ``flags/{jobid}.finish``;
      * when the ``benchmark`` global resource is set, the prefix also runs
        ``drop_cache_client.py`` before each command;
      * ``BenchmarkRecord`` and ``benchmarked`` in ``snakemake.benchmark`` are
        replaced with the versions from the local ``benchmark`` module;
      * the ``flags`` directory is created and ``flags/q.txt`` is opened for writing
        (it currently stays empty because the ``print_sjob`` call is commented out);
      * the name of every sample for which a job tree was built is printed.

    Exits with status 80 if the first job yielded by the DAG is not a ``finish*`` rule.
    """
    #for a in enumerate_assemblies(base_only=False, unroll=False, suffix=".fa"):
    #    sys.stderr.write("*\t%s\n" % a)

    rprefix = "set -euo pipefail; touch %s; " % os.path.join(WORKDIR, "flags", "{jobid}.start")
    if workflow.global_resources.get('benchmark', False):
        rprefix += "python3 ../scripts/benchmarking/drop_cache_client.py; "

    shell.prefix(rprefix)
    shell.suffix("; touch %s;" % os.path.join(WORKDIR, "flags", "{jobid}.finish"))

    # Swap snakemake's benchmark implementation for the local one.
    import snakemake.benchmark
    import benchmark as rbenchmark
    snakemake.benchmark.BenchmarkRecord = rbenchmark.BenchmarkRecord
    snakemake.benchmark.benchmarked = rbenchmark.benchmarked
    sys.modules['snakemake.benchmark'] = snakemake.benchmark

    print("snakefile: ", workflow.snakefile)
    os.makedirs('flags', exist_ok=True)

    dag = workflow.persistence.dag

    # Group jobs by their uuid wildcard; jobs without one land in "default".
    job_bins = {}
    for job in dag.jobs:
        job_bins.setdefault(job.wildcards.get('uuid', 'default'), []).append(job)

    sample_deps = {}
    def tree_sjob(job, ptr=None, depth=0, sample=None):
        """Record `job` under `ptr`, then recurse into the jobs in dag.depending[job]."""
        if job.jobid == 0:
            return

        if not sample:
            sample = job.wildcards.get('uuid', 'default')
        if sample not in sample_deps:
            sample_deps[sample] = {}
        if ptr is None:
            ptr = sample_deps[sample]

        # Always step into the job's own node. Previously the pointer only moved
        # when the node was newly created, so for an already-recorded job the
        # depth update below hit the parent node (or raised KeyError at the root).
        node = ptr.setdefault(job, {"__depth": depth})
        if node["__depth"] < depth:
            node["__depth"] = depth

        for dependent in dag.depending[job]:
            #if dependent in dag.needrun_jobs:
            tree_sjob(dependent, node, depth+1)

    finish = next(dag.jobs)
    if not finish.name.startswith("finish"):
        print("Not like this.")
        sys.exit(80)

    # Start a tree at every per-sample job that is not waiting on any job that
    # still needs to run (other than the finish job).
    for sample, jobs in job_bins.items():
        if sample == "default":
            continue
        for job in jobs:
            waiting = set(dag.dependencies[job]).intersection(dag.needrun_jobs)
            if not waiting or waiting == {finish}:
                tree_sjob(job, sample=sample)

    # The context manager closes the status file even if printing fails.
    with open("flags/q.txt", "w") as status_fh:
        def print_sjob(head, seen, depth=0, sample=""):
            """Print and log one line per unseen job in the tree rooted at `head`."""
            for job in head:
                # str keys starting with "__" are bookkeeping entries, not jobs
                if job in seen or str(job).startswith("__"):
                    continue
                seen.add(job)
                state = '*' if job in dag.needrun_jobs else '-'
                row = [head[job]["__depth"], state, job.jobid, job.rule]
                print("\t".join([str(x) for x in row]))
                status_fh.write("\t".join([str(x) for x in [sample] + row]) + '\n')
                print_sjob(head[job], seen, depth+1, sample)

        for sample in sample_deps:
            print(sample)
            #print_sjob(sample_deps[sample], set([]), sample=sample)

# Callables to run, in order, when the workflow starts.
ONSTART_QUEUE = [base_onstart]
onstart:
    # Plain loop: the hooks are called for their side effects only.
    for hook in ONSTART_QUEUE:
        hook()

onsuccess:
    # Post a "completed successfully" message to Slack when a token is configured.
    # TODO Need to update https://github.com/slackapi/python-slackclient/wiki/Migrating-to-2.x
    cwd = os.getcwd()
    # .get() so that a config without a slack_token entry skips the notification
    # instead of failing this handler with a KeyError.
    if config.get("slack_token"):
        try:
            from slackclient import SlackClient

            token = config["slack_token"]
            sc = SlackClient(token)
            sc.api_call(
                    "chat.postMessage", channel=config["slack_channel"],
                    attachments= [{
                        "title": "Snakemake pipeline completed successfully",
                        "color": "#36a64f",
                        "text": "Hooray!",
                        "footer": "Completed pipeline spotted by SnakeEyes",
                        "footer_icon": "https://avatars.slack-edge.com/2019-05-07/627979128852_e1d616abbd7312343db8_512.png",
                        "ts": int(time.time()),
                        "fields": [
                            { "title": "Working Directory", "value": cwd, "short": False },
                        ],
                    }]
            )
        except ModuleNotFoundError:
            # slackclient is optional: without it the notification is skipped.
            pass

onerror:
    # Post a "terminated unexpectedly" message to Slack when a token is configured.
    cwd = os.getcwd()
    # .get() so that a config without a slack_token entry skips the notification
    # instead of failing this handler with a KeyError.
    if config.get("slack_token"):
        try:
            from slackclient import SlackClient

            token = config["slack_token"]
            sc = SlackClient(token)
            sc.api_call(
                    "chat.postMessage", channel=config["slack_channel"],
                    attachments= [{
                        "title": "Snakemake pipeline terminated unexpectedly",
                        "color": "#e20b00",
                        "text": "Boo :(",
                        "footer": "Trashed pipeline spotted by SnakeEyes",
                        "footer_icon": "https://avatars.slack-edge.com/2019-05-07/627979128852_e1d616abbd7312343db8_512.png",
                        "ts": int(time.time()),
                        "fields": [
                            { "title": "Working Directory", "value": cwd, "short": False },
                        ],
                    }]
            )
        except ModuleNotFoundError:
            # slackclient is optional: without it the notification is skipped.
            pass

# Load manifest(s)
# Both files are read as tab-separated tables. Each is indexed by the named column,
# and drop=False keeps that column available as a regular column too.
samples = pd.read_csv("../manifest.cfg", sep="\t").set_index("uuid", drop=False)  # defines the assembly and polishing strategies
reads_lookup = pd.read_csv("../reads.cfg", sep="\t").set_index("samplename", drop=False)

# Utility functions
def input_polish(w):
    """Return the contigs file that the polishing step described by `w` starts from.

    * iteration > 1: the output of the previous iteration of the same polisher;
    * first iteration with a polishing prefix: the assembly named by that prefix;
    * first iteration without a prefix: the unpolished assembly, taken from the
      sample named in the manifest's "repolish" column when that is not '-'.
    """
    previous_iteration = int(w.iteration) - 1
    if previous_iteration >= 1:
        return "%s.%s.ctg.cns.%s%s-%s%s-%d.fa" % (
            w.uuid, w.conf, w.polishedprefix, w.polisher, w.readtype,
            w.get("depth", ""), previous_iteration,
        )

    if w.polishedprefix != "":
        return "%s.%s.ctg.cns.%sfa" % (w.uuid, w.conf, w.polishedprefix)

    # Polish an existing assembly if repolish is set
    repolish = samples.loc[w.uuid]["repolish"]
    source_uuid = w.uuid if repolish == '-' else repolish
    return "%s.%s.ctg.cns.fa" % (source_uuid, w.conf)

def minimap2_mode(w):
    """Return the minimap2 ``-x`` preset for the read type in `w.readtype`.

    Read types starting with "ont" map to "map-ont" and "ill" maps to "sr".

    Raises:
        ValueError: for any other read type (previously this surfaced as an
            UnboundLocalError on the return statement).
    """
    if w.readtype.startswith("ont"):
        return "map-ont"
    if w.readtype == "ill":
        return "sr"
    raise ValueError("minimap2_mode: unsupported readtype %r" % (w.readtype,))


def get_reads(w, readtype, samplename=None):
    """Look up the read path(s) of one read type in the reads table.

    With `samplename`: read that sample's `readtype` entry and return the
    comma-separated items of its second ':'-separated field as a list
    (the entry must contain a ':').

    Without `samplename`: resolve the sample through the manifest row for
    `w.uuid`. If the entry contains a ':', return the path of the merged file
    "<samplename>.cat-<readtype>.fq.gz" inside the directory named before the
    first ':'; otherwise return the entry unchanged.
    """
    if samplename:
        r = reads_lookup.loc[samplename][readtype]
        # This line was tab-indented under a space-indented block (a TabError in Python 3).
        return r.split(':')[1].split(',')

    r = reads_lookup.loc[ samples.loc[w.uuid]['samplename'] ][readtype]
    if ':' in r:
        r = os.path.join(r.split(':')[0], '%s.cat-%s.fq.gz' % (samples.loc[w.uuid]['samplename'], readtype))
    return r

def polish_reads_input(w):
    """Return the reads used to polish with the read type in `w.readtype`.

    Returns the 'ont' reads when the read type is exactly "ont" with no depth,
    or starts with "ont" and has a depth; returns the ['i1', 'i2'] pair for "ill".

    Raises:
        ValueError: for any other combination (previously this surfaced as an
            UnboundLocalError on the return statement).
    """
    depth = w.get("depth", None)
    if (w.readtype == "ont" and not depth) or (w.readtype.startswith("ont") and depth):
        #reads = "%s.%s.ctg.cns.%s%s-%s%s-%s.reads.fa" % (w.uuid, w.conf, w.polishedprefix, w.polisher, w.readtype, w.depth, w.iteration)
        return get_reads(w, 'ont')
    if w.readtype == "ill":
        return [get_reads(w, 'i1'), get_reads(w, 'i2')]
    raise ValueError("polish_reads_input: unsupported readtype/depth %r/%r" % (w.readtype, depth))

def align_polish_reads_input(w):
    """Return the reads to align against the contigs for the read type in `w.readtype`.

    Read types starting with "ont" use the 'ont' reads; "ill" uses the ['i1', 'i2'] pair.

    Raises:
        ValueError: for any other read type (previously this surfaced as an
            UnboundLocalError on the return statement).
    """
    if w.readtype.startswith("ont"):
        return get_reads(w, 'ont')
    if w.readtype == "ill":
        return [get_reads(w, 'i1'), get_reads(w, 'i2')]
    raise ValueError("align_polish_reads_input: unsupported readtype %r" % (w.readtype,))

def n_threads(w, key=""):
    """Return the thread count for a job.

    Order of precedence: the manifest "cpu" value for `w.uuid` (when `w` is given
    and the value is not '-'), then `config[key]`, then the workflow's `_cores`
    global resource. A non-integer "cpu" value falls back to `_cores`.
    """
    all_cores = workflow.global_resources['_cores']

    if w and samples.loc[w.uuid].get("cpu", '-') != '-':
        try:
            return int(samples.loc[w.uuid]["cpu"])
        except ValueError:
            return all_cores

    if key in config:
        return config[key]
    return all_cores

def n_gpu(w, key=""):
    """Return the number of GPUs for a job, or 0 when config "cuda" is unset/falsy.

    Order of precedence: the manifest "gpu" value for `w.uuid` (when not '-'),
    then `config[key]`, then the `gpu` global resource, defaulting to the number
    of GPUs reported by GPUtil. A non-integer "gpu" value falls back to that default.
    """
    if not config.get("cuda"):
        return 0

    import GPUtil
    fallback = workflow.global_resources.get('gpu', len(GPUtil.getGPUs()))

    if samples.loc[w.uuid].get("gpu", '-') != '-':
        try:
            return int(samples.loc[w.uuid]["gpu"])
        except ValueError:
            return fallback

    if key in config:
        return config[key]
    return fallback

def unroll_assemblies(w, name, unroll=True):
    """Expand a polishing pipeline name into itself plus every intermediate step.

    `name` is a "."-separated chain of steps, each ending in "-<iteration>".
    Walking backwards, the last step's iteration is decremented until it would
    drop below 1, at which point the step is removed; the walk stops when no
    steps remain. With `unroll` false only `[name]` is returned. `w` is unused.
    """
    if not unroll:
        return [name]

    unrolled = []
    current = name
    while True:
        steps = current.split(".")
        if not steps[0]:
            break

        *earlier, last = steps
        fields = last.split("-")
        iteration = int(fields[-1])
        unrolled.append(current)

        if iteration - 1 < 1:
            # first iteration of this step: fall back to the chain before it
            current = ".".join(earlier)
        else:
            fields[-1] = str(iteration - 1)
            current = ".".join(earlier + ["-".join(fields)])
    return unrolled

def enumerate_assemblies(w=None, prefix="", suffix="", unroll=True, base_only=False, refgroup=False):
    """List assembly file names for every manifest row, base assemblies first.

    A row contributes a base assembly when its "repolish" is '-', and (unless
    `base_only`) one polished assembly per step returned by `unroll_assemblies`
    for its "polishpipe" when that is not '-'. With `refgroup` set, rows whose
    "refgroup" is '-' are skipped. `prefix` and `suffix` wrap every name.
    """
    base = []
    polished = []
    for uuid in samples["uuid"]:
        conf = samples.loc[uuid]["spell"]
        if refgroup and samples.loc[uuid]["refgroup"] == "-":
            continue

        if samples.loc[uuid]["repolish"] == '-':
            base.append("%s%s.%s.ctg.cns%s" % (prefix, uuid, conf, suffix))

        if samples.loc[uuid]["polishpipe"] == '-' or base_only:
            continue
        for step in unroll_assemblies(w, samples.loc[uuid]["polishpipe"], unroll=unroll):
            polished.append('%s%s.%s.ctg.cns.%s%s' % (prefix, uuid, conf, step, suffix))
    return base + polished

def enumerate_reads(cols=("ont", "i1", "i2"), suffix="", refgroup=False):
    """Return the set of read paths across all manifest rows.

    For each row and each column in `cols`, the path from `get_reads` is kept
    when it is truthy and not '-'. With `refgroup` set, ".<refgroup>" is inserted
    before `suffix` and rows whose "refgroup" is '-' are skipped.

    The default for `cols` is a tuple rather than a list so that the default
    cannot be mutated between calls; any iterable of column names is accepted.
    """
    read_paths = set()
    # Minimal stand-in for a wildcards object: get_reads only reads `.uuid` here.
    FakeWildcard = namedtuple('Wildcard', 'uuid')

    for uuid in samples["uuid"]:
        fw = FakeWildcard(uuid=uuid)
        group = ""
        if refgroup:
            group = ".%s" % samples.loc[uuid]["refgroup"]
            if group == ".-":
                continue
        for col in cols:
            path = get_reads(fw, col)
            if path and path != '-':
                read_paths.add(path + group + suffix)
    return read_paths

def get_samplename_from_readpath(path):
    """Return the samplename that a read path belongs to.

    First looks for a reads-table row whose "ont" entry equals `path`. Failing
    that, matches the part of the file name before its first "." against the
    known samplenames.

    Raises:
        LookupError: if neither lookup finds a sample.
    """
    try:
        # .iloc[0] makes the positional access explicit: the table is indexed by
        # samplename, so a bare [0] is a label lookup that only worked via fallback.
        return reads_lookup.loc[reads_lookup['ont'] == path]["samplename"].iloc[0]
    except (IndexError, KeyError):
        # no matching row (or no "ont" column): fall through to the name-based match
        pass

    stem = os.path.basename(path).split(".")[0]
    for samplename in reads_lookup["samplename"]:
        if stem == samplename:
            return samplename
    raise LookupError("No samplename found for read path %r" % (path,))


# TODO samstudio8
# This function is fired during construction of the DAG when searching the parameter space of the GPU jobs,
# it might be worth trying to catch the fact that execution hasn't started yet to suppress this output?
def select_gpu_device(wildcards, resources):
    """Pick up to `resources.gpu` available GPUs and return their ids comma-joined.

    Returns None when config "cuda" is unset/falsy or no GPU was requested.
    Devices are chosen by GPUtil in random order among those with load and
    memory use of at most 0.5.

    Raises:
        Exception: if GPUs were requested but none is available.
    """
    # .get() for consistency with n_gpu, so a config without "cuda" means "no GPU".
    if not config.get("cuda") or resources.gpu == 0:
        return None
    import GPUtil
    available_l = GPUtil.getAvailable(order = 'random', limit = resources.gpu, maxLoad = 0.5, maxMemory = 0.5, includeNan=False, excludeID=[], excludeUUID=[])
    available_str = ",".join([str(x) for x in available_l])

    if len(available_l) == 0 and resources.gpu > 0:
        raise Exception("select_gpu_device did not select any GPUs")
    elif len(available_l) < resources.gpu:
        sys.stderr.write("[WARN] select_gpu_device selected fewer GPU devices than requested\n")
    # Report how many devices were actually assigned, not how many were requested.
    print("Assigning %d available GPU devices: %s" % (len(available_l), available_str))
    return available_str


# Final target of the base workflow. Requests the per-assembly test flags, the
# summary tables, the per-read-file kraken counts, the base assembly graph images
# and every final polished assembly, then touches reticulated.base.ok.
rule finish:
    input:
        tests=enumerate_assemblies(base_only=True, unroll=False, prefix="tests/", suffix=".test.txt"),
        stat="assembly_stats.txt",
        meta="assembly_md5size.txt",
        kraken="kraken_summary.bond.tsv",
        reads_kraken=enumerate_reads(suffix=".k2kc"),
        base_graphs=enumerate_assemblies(unroll=False, base_only=True, suffix=".gfa.svg"),
        polished=enumerate_assemblies(base_only=False, unroll=False, suffix=".fa"),
    output: touch("reticulated.base.ok")
    shell: 'echo "Reticulated successfully."'

# Concatenate the read files listed for a sample and read type into one file.
# The output pattern is a raw string so the regex escape \. is not parsed as an
# (invalid) Python string escape.
rule merge_reads:
    input: lambda w: get_reads(w, w.readtype, samplename=os.path.basename(w.samplename))
    output:
        r"{samplename}.cat-{readtype}.{ext,(fq|fastq)(\.gz|)}"
    shell:
        "cat {input} > {output}"
    

# New and improved version of this rule: it no longer destroys your input reads.
# Sample a proportion of a read file with seqkit; {ratio} is a percentage and is
# divided by 100 before being passed as -p.
# The output pattern is a raw string so the regex escapes \d and \. are not
# parsed as (invalid) Python string escapes.
rule subset_reads:
    input:
        "{fq}.{ext}"
    output:
        r"{fq}.subset-{ratio,\d+}.{ext,(fq|fastq)(\.gz|)}"
    params:
        ratio=lambda w: float(w.ratio)/100
    shell:
        "seqkit sample {input} -p {params.ratio} -o {output}"

# Remove duplicate reads with `seqkit rmdup -n -i`, writing the kept reads to
# `out` and the file given to -d to `dup`.
# The output patterns are raw strings so the regex escape \. is not parsed as an
# (invalid) Python string escape.
rule rmdup_reads:
    input:
        "{fq}.{ext}"
    output:
        out=r"{fq}.rmdup.{ext,(fq|fastq)(\.gz|)}",
        dup=r"{fq}.dup.{ext,(fq|fastq)(\.gz|)}",
    shell:
        "seqkit rmdup -n -i {input} -o {output.out} -d {output.dup}"


# Align the polishing reads to the contigs with minimap2, sort the alignments with
# samtools (extra flags from config["sort_flags"]) into a temporary BAM, and index it.
# Only the samtools sort stderr is redirected to the log.
rule minimap2_presubsample_bam:
    input: contigs=input_polish, reads=align_polish_reads_input
    params:
        mode=minimap2_mode,
        #reads=lambda w, input: input.reads.replace(".gz", ".b.gz")
    threads: lambda w: n_threads(w, "minimap2_threads")
    output:
        b=temp("{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa.bam"),
        bai=temp("{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa.bam.bai"),
    resources:
        benchmark=1,
    benchmark: "benchmarks/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa.bam"
    log: "log/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa.bam"
    shell: "minimap2 -a -2 --sam-hit-only --secondary=no -Q -t {threads} -x {params.mode} {input.contigs} {input.reads} | samtools sort -T tmpsort-{output.b} %s -@ {threads} -o {output.b} 2> {log}; samtools index -@ {threads} {output.b}" % config["sort_flags"]
    #shell: "cudamapper -w 10 -t 1000 -i 5000 {params.reads} {input.contigs} > {output}"

# Align all polishing reads (no depth subsampling) to the contigs with minimap2,
# writing a temporary SAM; minimap2's stderr goes to the log.
# The log pattern is a raw string so the regex escape \. inside its wildcard
# constraint is not parsed as an (invalid) Python string escape.
rule minimap2_prepolish_fullsample:
    input: contigs=input_polish, reads=align_polish_reads_input
    params:
        mode=minimap2_mode,
        #reads=lambda w, input: input.reads.replace(".gz", ".b.gz")
    threads: lambda w: n_threads(w, "minimap2_threads")
    resources:
        benchmark=1,
    output:
        s=temp("{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}-{iteration}.fa.sam"),
    benchmark: "benchmarks/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}-{iteration}.fa.sam"
    log: r"log/{uuid}.{conf}.ctg.cns.{polishedprefix,.*\.|()}{polisher}-{readtype}-{iteration}.fa.sam"
    shell: "minimap2 -a -2 --sam-hit-only --secondary=no -Q -t {threads} -x {params.mode} {input.contigs} {input.reads} > {output.s} 2> {log}"

# Subsample the pre-aligned BAM to the requested depth and write the result as a
# temporary, samtools-sorted SAM (extra sort flags from config["sort_flags"]).
#
# The previous command passed `--output-fasta {output.fa}`, but no `fa` output is
# declared (it is commented out below), so the command could not be formatted.
# The subsample is now streamed to stdout with `--output-override -`, which is how
# the medaka subsample rule in this file drives subsample_bam from the same
# conda environment.
# NOTE(review): confirm `--output-override -` emits what samtools sort expects here.
#
# Patterns containing \d are raw strings so the escape is not parsed by Python.
rule minimap2_prepolish_subsample:
    ##conda: "environments/pomoxis-sam.yaml"
    conda: "environments/pomoxis.yaml"
    input: b="{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa.bam",
        bai="{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa.bam.bai"
    output: s=temp(r"{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth,\d+}-{iteration}.fa.sam"),
    #    fa="{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth,\d+}-{iteration}.reads.fa",
    benchmark: r"benchmarks/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth,\d+}-{iteration}.fa.sam"
    log: r"log/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth,\d+}-{iteration}.fa.sam"
    threads: lambda w: n_threads(w, "minimap2_threads")
    resources:
        benchmark=1,
    shell: "subsample_bam {input.b} {wildcards.depth} -t {threads} --output-override - | samtools sort -T {wildcards.uuid}-B %s -O SAM -o {output.s} -@ {threads} 2> {log}" % config["sort_flags"]
    ##shell: "subsample_bam_fast {input.b} {wildcards.depth} -t {threads} -o - 2> {log} | samtools sort -T {wildcards.uuid}-B %s -O SAM -o {output.s} -@ {threads} 2> {log}" % config["sort_flags"]

# Polish the contigs with racon using the reads and the SAM overlaps for this step.
# When n_gpu(...) is above 0, --cudapoa-batches is added from config["racon_batches"];
# CUDA_VISIBLE_DEVICES is set from select_gpu_device.
# NOTE(review): the `rtbench` output is declared but not written by the shell command
# itself; presumably it is produced elsewhere (e.g. the patched benchmarking) — confirm.
rule polish_racon:
    #input: contigs=input_polish, reads=polish_reads_input, overlaps=lambda w: input_polish(w)+".sam"
    input: contigs=input_polish, reads=polish_reads_input, overlaps="{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa.sam"
    params:
        cuda=lambda w: "--cudapoa-batches %d" % config["racon_batches"] if n_gpu(w, "polish_gpu") > 0 else "",
        devices=select_gpu_device,
    output:
        out="{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,racon}-{readtype}{depth}-{iteration}.fa",
        rtbench="rtbenchmarks/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,racon}-{readtype}{depth}-{iteration}.fa",
    benchmark: "benchmarks/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,racon}-{readtype}{depth}-{iteration}.fa"
    log: "log/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,racon}-{readtype}{depth}-{iteration}.fa"
    threads: lambda w: n_threads(w, "polish_threads")
    resources:
        gpu=lambda w: n_gpu(w, "polish_gpu"),
        benchmark=1,
    shell:
        "export CUDA_VISIBLE_DEVICES={params.devices}; racon -m 8 -x -6 -g -8 -w 500 -t {threads} {input.reads} {input.overlaps} {input.contigs} {params.cuda} > {output.out} 2> {log}"

# Align the reads to the contigs with mini_align (as medaka_consensus does),
# producing pre_calls_to_draft.bam and its index inside the medaka output directory.
# The -p prefix is the BAM output path with ".bam" removed.
# The output patterns are raw strings so the regex escape \d is not parsed as an
# (invalid) Python string escape.
rule prepolish_medaka_override_calls2draft:
    singularity: config["medaka_env"],
    input:
        contigs=input_polish,
        reads=polish_reads_input,
    params:
        prefix=lambda w, output: output.b.replace(".bam", "")
    output:
        b=r"medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}{depth,\d*}-{iteration}.fa/pre_calls_to_draft.bam",
        i=r"medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}{depth,\d*}-{iteration}.fa/pre_calls_to_draft.bam.bai"
    threads: lambda w: n_threads(w, "minimap2_threads")
    shell: "mini_align -i {input.reads} -r {input.contigs} -p {params.prefix} -t {threads} -m -f" # as in medaka_consensus

#TODO Integrate with other subsample_bam rule
# Subsample pre_calls_to_draft.bam to the requested depth, then sort it into the
# temporary calls_to_draft.bam (extra sort flags from config["sort_flags"]) and index it.
# The output patterns are raw strings so the regex escape \d is not parsed as an
# (invalid) Python string escape.
rule prepolish_medaka_override_calls2draft_subsample:
    #conda: "environments/pomoxis-sam.yaml"
    conda: "environments/pomoxis.yaml"
    input:
        b="medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa/pre_calls_to_draft.bam",
        i="medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa/pre_calls_to_draft.bam.bai"
    output:
        b=temp(r"medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}{depth,\d+}-{iteration}.fa/calls_to_draft.bam"),
        i=temp(r"medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}{depth,\d+}-{iteration}.fa/calls_to_draft.bam.bai")
    threads: lambda w: n_threads(w, "minimap2_threads")
    resources:
        benchmark=1,
    shell: "subsample_bam {input.b} {wildcards.depth} -t {threads} --output-override - | samtools sort -T {wildcards.uuid}-B %s -O BAM -o {output.b} -@ {threads}; samtools index -@ {threads} {output.b}" % config["sort_flags"]
    #shell: "subsample_bam_fast {input.b} {wildcards.depth} -t {threads} -o - | samtools sort -T {wildcards.uuid}-B %s -O BAM -o {output.b} -@ {threads}; samtools index -@ {threads} {output.b}" % config["sort_flags"]

# No-depth variant of the medaka subsample step: instead of subsampling, the
# pre_calls_to_draft BAM and index are symlinked (relative, forced) to calls_to_draft.
rule prepolish_medaka_override_calls2draft_subsample_depth0:
    input:
        b="medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}-{iteration}.fa/pre_calls_to_draft.bam",
        i="medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}-{iteration}.fa/pre_calls_to_draft.bam.bai"
    output:
        b=temp("medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}-{iteration}.fa/calls_to_draft.bam"),
        i=temp("medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}-{iteration}.fa/calls_to_draft.bam.bai")
    shell: "ln -srf {input.b} {output.b}; ln -srf {input.i} {output.i}"

# Polish the contigs with medaka_consensus, using the model named in the manifest's
# "medakamodel" column. The command first removes the contigs' .mmi file, runs
# medaka_consensus into the "medaka-<output.fa>" directory (with "-b 100" when
# n_gpu(...) is above 0), then moves consensus.fasta to the output path.
# The calls_to_draft BAM and index are required as inputs but are not named in the
# command. NOTE(review): presumably medaka_consensus picks them up from the output
# directory — confirm.
rule polish_medaka:
    singularity: config["medaka_env"],
    input:
        contigs=input_polish,
        reads=polish_reads_input,
        cbam="medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa/calls_to_draft.bam",
        cbai="medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher}-{readtype}{depth}-{iteration}.fa/calls_to_draft.bam.bai"
    params:
        prefix=lambda w, output: "medaka-%s" % output.fa,
        model=lambda w: samples.loc[w.uuid]['medakamodel'],
        batch=lambda w: "-b 100" if n_gpu(w, "polish_gpu") > 0 else "",
        devices=select_gpu_device,
        mmi=lambda w, input: temp("%s.mmi" % input.contigs)
    output:
        fa="{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}{depth}-{iteration}.fa",
        chdf=temp("medaka-{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}{depth}-{iteration}.fa/consensus_probs.hdf"),
    threads: lambda w: n_threads(w, "polish_threads")
    benchmark: "benchmarks/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}{depth}-{iteration}.fa"
    log: "log/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,medaka}-{readtype}{depth}-{iteration}.fa"
    resources:
        gpu=lambda w: n_gpu(w, "polish_gpu"),
        benchmark=1,
    shell:
        "export CUDA_VISIBLE_DEVICES={params.devices}; export TF_FORCE_GPU_ALLOW_GROWTH=true; rm -f {params.mmi}; SORT='%s' medaka_consensus -i {input.reads} -d {input.contigs} -o {params.prefix} -t {threads} -m {params.model} {params.batch} > {log} 2>&1; mv {params.prefix}/consensus.fasta {output.fa}" % config["sort_flags"]

# Run the dehumanizer reference download script against
# config["dehumanizer_database_root"]; the rule expects three .mmi indexes and a
# manifest.txt there, and touches dehuman.ok when done.
rule download_dehumanizer_database:
    output:
        ok=touch(os.path.join(config["dehumanizer_database_root"], "dehuman.ok")),
        a=os.path.join(config["dehumanizer_database_root"], "GCA_000786075.2_hs38d1_genomic.mmi"),
        b=os.path.join(config["dehumanizer_database_root"], "GCF_000001405.39_GRCh38.p13_genomic.mmi"),
        c=os.path.join(config["dehumanizer_database_root"], "ipd-imgt-3_39_0.hla_gen.mmi"),
        manifest=os.path.join(config["dehumanizer_database_root"], "manifest.txt")
    shell:
        "bash ../scripts/download_dehumanizer_refs.sh %s" % config["dehumanizer_database_root"]

# Run `dehumanize` on the contigs with the downloaded reference manifest.
# The dehuman.ok flag is an input only to force the database download first.
rule polish_dehumanizer:
    input:
        contigs=input_polish,
        ok=os.path.join(config["dehumanizer_database_root"], "dehuman.ok"),
        manifest=os.path.join(config["dehumanizer_database_root"], "manifest.txt")
    output:
        "{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,dehumanizer}-{readtype}-{iteration}.fa"
    threads: 2
    shell:
        "dehumanize {input.manifest} {input.contigs} {output}"

# Fetch the Pilon 1.23 jar from its GitHub release into the working directory.
rule download_pilon:
    output:
        "pilon-1.23.jar"
    shell:
        "wget https://github.com/broadinstitute/pilon/releases/download/v1.23/pilon-1.23.jar"

# Polish the contigs with Pilon: align the reads with minimap2, sort and index the
# alignments (intermediate <output>.sam/.bam files are left next to the output),
# run Pilon with a 16G heap into pilon-<uuid>/, then move pilon.fasta to the output.
# Only Pilon's stderr is redirected to the log.
rule polish_pilon:
    input: contigs=input_polish, reads=polish_reads_input, pilon="pilon-1.23.jar"
    params:
        mode=minimap2_mode,
    output:
        polish="{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,pilon}-{readtype}-{iteration}.fa"
    threads: lambda w: n_threads(w, "polish_threads")
    resources:
        benchmark=1,
    benchmark: "benchmarks/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,pilon}-{readtype}-{iteration}.fa"
    log: "log/{uuid}.{conf}.ctg.cns.{polishedprefix}{polisher,pilon}-{readtype}-{iteration}.fa"
    shell:
        "minimap2 -t {threads} -ax {params.mode} {input.contigs} {input.reads} > {output}.sam; samtools sort {output}.sam -T {wildcards.uuid} %s -@ {threads} -o {output}.bam; samtools index {output}.bam; java -Xmx16G -jar {input.pilon} --genome {input.contigs} --bam {output}.bam --outdir pilon-{wildcards.uuid}/ 2> {log}; mv pilon-{wildcards.uuid}/pilon.fasta {output}" % config["sort_flags"]

# Run assembly-stats.pl on every enumerated assembly (base and unrolled polished)
# and collect the combined output in assembly_stats.txt.
rule summarise_assembly_stats:
    input:
        enumerate_assemblies(suffix=".fa")
    output:
        "assembly_stats.txt"
    shell:
        "for fa in {input}; do perl ../scripts/assembly-stats.pl $fa; done > {output}"

# Write one "basename,size,md5" line per enumerated assembly to assembly_md5size.txt.
# The size comes from `du -Lh`, so symlinks are followed.
rule summarise_assembly_meta:
    input:
        enumerate_assemblies(suffix=".fa")
    output:
        "assembly_md5size.txt"
    shell:
        "for fa in {input}; do base=`basename $fa`; md5=`md5sum $fa | cut -f1 -d' '`; size=`du -Lh $fa | cut -f1`; echo \"$base,$size,$md5\"; done > {output}"

# Compute per-contig read coverage statistics for an assembly: align the sample's
# 'ont' reads with minimap2 (map-ont), sort, take `samtools depth -a`, and summarise
# column 3 per contig with datamash (min, max, median, iqr, mean, pstdev).
# A header line is written first and the statistics are appended to it.
rule assembly_read_coverage:
    input:
        contigs="{uuid}.{prefix}.fa",
        reads=lambda w: get_reads(w, 'ont'),
        #b="{uuid}.{prefix}.fa.bam",
        #bai="{uuid}.{prefix}.fa.bam.bai",
    output:
        depth="{uuid,[A-z0-9_-]*}.{prefix}.fa.depth"
    params:
        mode="map-ont",
    threads: lambda w: n_threads(w, "minimap2_threads")
    shell: "echo -e \"contig\tcov_min\tcov_max\tcov_median\tcov_iqr\tcov_mean\tcov_pstdev\" > {output.depth}; minimap2 -a -2 --sam-hit-only --secondary=no -Q -t {threads} -x {params.mode} {input.contigs} {input.reads} | samtools sort -T tmpsort-{output.depth} %s -@ {threads} | samtools depth -a - | datamash -g 1 min 3 max 3 median 3 iqr 3 mean 3 pstdev 3 >> {output.depth}" % config["sort_flags"]

# Combine the kraken summary with ../manifest.cfg using `bond` (--dcol uuid, --dropid).
rule bond_summarise_kraken:
    input:
        "kraken_summary.tsv"
    output:
        "kraken_summary.bond.tsv"
    shell:
        "bond {input} ../manifest.cfg --dcol uuid --dropid > {output}"

# Concatenate the .k2kr tables of the final (not unrolled) assemblies, keeping only
# the first file's first line and dropping the first line of every later file.
rule summarise_kraken:
    input:
        enumerate_assemblies(suffix=".fa.k2kr", unroll=False)
    output:
        "kraken_summary.tsv"
    shell:
        "awk 'FNR>1 || NR==1' {input} > {output}"

# Run the kraken2 database download script against config["kraken2_database_root"];
# the rule expects hash.k2d, opts.k2d and taxo.k2d there, and touches k2db.ok when done.
rule download_kraken_database:
    output:
        ok=touch(os.path.join(config["kraken2_database_root"], "k2db.ok")),
        h=os.path.join(config["kraken2_database_root"], "hash.k2d"),
        o=os.path.join(config["kraken2_database_root"], "opts.k2d"),
        t=os.path.join(config["kraken2_database_root"], "taxo.k2d"),
    shell: "bash ../scripts/download_kraken2_db.sh %s" % config["kraken2_database_root"]

# Download and unpack the NCBI taxdump archive into config["ktkit_database_root"];
# the rule expects nodes.dmp and names.dmp there, and touches ktkit.ok when done.
rule download_ktkit_database:
    output:
        ok=touch(os.path.join(config["ktkit_database_root"], "ktkit.ok")),
        nodes=os.path.join(config["ktkit_database_root"], "nodes.dmp"),
        names=os.path.join(config["ktkit_database_root"], "names.dmp"),
    shell: "mkdir -p %s; cd %s; wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz; tar xvf taxdump.tar.gz" % (config["ktkit_database_root"], config["ktkit_database_root"])

# Classify any file with kraken2 against config["kraken2_database_root"], writing
# the per-sequence output to <path>.k2 and the report to <path>.k2r.
# The k2db.ok flag is an input only to force the database download first.
rule kraken:
    input:
        fa="{path}", ok=os.path.join(config["kraken2_database_root"], "k2db.ok")
    output:
        out="{path}.k2",
        rep="{path}.k2r",
    threads: 8
    resources:
        benchmark=1, # Don't let this RAM pig near our assembly and polishing benchmarks
    #benchmark: "benchmarks/{path}.k2"
    shell:
        "kraken2 --db %s --use-names -t {threads} --output {output.out} --report {output.rep} {input.fa}" % config["kraken2_database_root"]

# Roll the kraken2 output of an assembly up to species rank with `ktkit rollup`,
# join it with the per-contig depth table via `bond`, tag each row with the uuid
# and suffix, and drop the first column.
# When the sample has a refgroup, --only restricts the rollup to the ids returned by
# get_ref_tids_by_group/get_refgroup_by_uuid (neither is defined in this part of the file).
rule ktkit_rollup:
    input:
        k2o="{uuid}.{prefix}.fa.k2",
        depth="{uuid}.{prefix}.fa.depth",
        ok=os.path.join(config["ktkit_database_root"], "ktkit.ok"),
    params:
        only=lambda w: "--only " + ",".join(str(x) for x in get_ref_tids_by_group(get_refgroup_by_uuid(w.uuid))) if samples.loc[w.uuid]["refgroup"] != "-" else "",
    output:
        k2k="{uuid,[A-z0-9_-]*}.{prefix}.fa.k2kr"
    threads: 1
    shell: "ktkit rollup {input.k2o} --dump %s --rank species {params.only} | bond - {input.depth} --dcol 2 --dheader kstate,contig,ktax,contig_len,kmer,ktkit_tid,ktkit_name --append uuid:{wildcards.uuid},suffix:{wildcards.prefix} --dropid | cut -f 2- > {output}" % config["ktkit_database_root"]

# Count kraken2 assignments at species rank with `ktkit count`, prefixing each row
# with the samplename resolved from the path (.k2kb, written after a header line),
# then join that table with ../reads.cfg via `bond` to produce .k2kc.
rule ktkit_count:
    input:
        k2o="{path}.k2",
        ok=os.path.join(config["ktkit_database_root"], "ktkit.ok"),
    params:
        samplename=lambda w: get_samplename_from_readpath(w.path)
    output:
        k2kb="{path}.k2kb",
        k2kc="{path}.k2kc"
    threads: 1
    shell: "echo -e 'samplename\tktkit_tid\tktkit_name\tmean_seq_len\tn_seq\tprop_seq\tn_seq_unmasked\ttot_bp\tprop_bp\tprop_bp_unmasked' > {output.k2kb}; ktkit count {input.k2o} --dump %s --rank species | sed 's/^/{params.samplename}\t/' >> {output.k2kb}; bond {output.k2kb} ../reads.cfg --dropid > {output.k2kc}" % config["ktkit_database_root"]


# TODO We need to use git/Flye/bin/flye as the script sets up some dir dependent stuff
# Clone Flye into git/<conf>/, reset it to the commit hash given by the spell for
# this conf, and build it; touches <conf>.ok when done.
rule install_flye_hash:
    output:
        ok=touch("{conf,flye[A-z0-9_-]*}.ok"),
        #flye_bin="git/{conf,flye[A-z0-9_-]*}/Flye/bin/flye",
    params:
        hash=lambda w: spells[w.conf]["hash"],
    shell: "mkdir -p git/{wildcards.conf}; cd git/{wildcards.conf}; git clone https://github.com/fenderglass/Flye.git; cd Flye; git reset --hard {params.hash}; make"

# Assemble the sample's 'ont' reads with the Flye build installed for this conf.
# Options come from the spell: -m (unless '-'), --meta, --plasmids, -g genome size,
# and --iterations when the spell defines it. Output goes to the <uuid>.<conf>/ directory.
# The `ready` input is the <spell>.ok flag, so the matching Flye build is installed first.
rule flye_assembly:
    conda: "environments/flye.yaml"
    input:
        reads=lambda w: get_reads(w, 'ont'),
        ready=lambda w: "%s.ok" % samples.loc[w.uuid]["spell"],
    params:
        overlap=lambda w: '-m %s' % spells[w.conf]['m'] if spells[w.conf]['m'] != '-' else '',
        meta=lambda w: '--meta' if spells[w.conf]['meta'] else '',
        plasmids=lambda w: '--plasmids' if spells[w.conf]['plasmids'] else '',
        genome_size=lambda w: spells[w.conf]['genome_size'],
        d = "{uuid}.{conf,flye[A-z0-9_-]*}/",
        iterations=lambda w: '--iterations %s' % spells[w.conf]['iterations'] if 'iterations' in spells[w.conf] else '',
    output:
        fa = "{uuid}.{conf,flye[A-z0-9_-]*}/assembly.fasta",
        gfa = "{uuid}.{conf,flye[A-z0-9_-]*}/assembly_graph.gfa"
    threads: lambda w: n_threads(w, "assembly_threads")
    resources:
        benchmark=1,
    benchmark: "benchmarks/{uuid}.{conf,flye[A-z0-9_-]*}_assembly.fa"
    log: "log/{uuid}.{conf,flye[A-z0-9_-]*}_assembly.fa"
    shell:
        "git/{wildcards.conf}/Flye/bin/flye --nano-raw {input.reads} {params.meta} {params.plasmids} -g {params.genome_size} -o {params.d} -t {threads} {params.overlap} {params.iterations} > {log} 2>&1"

# Expose Flye's assembly.fasta under the pipeline's common <uuid>.<conf>.ctg.cns.fa
# name by symlinking it.
rule link_flye_assembly:
    input:
        "{uuid}.{conf}/assembly.fasta"
    output:
        "{uuid}.{conf,flye[A-z0-9._-]*}.ctg.cns.fa"
    shell:
        "ln -s {input} {output}"

# Download the static Ubuntu build of Bandage v0.8.1 and unzip it into ware/bandage;
# touches bandage.ok when done.
rule install_bandage_linux:
    output:
        ok=touch("bandage.ok"),
        b="ware/bandage/Bandage",
    shell:
        "cd ware; wget https://github.com/rrwick/Bandage/releases/download/v0.8.1/Bandage_Ubuntu_static_v0_8_1.zip; unzip Bandage_Ubuntu_static_v0_8_1.zip -d bandage"

# Convert the wtdbg2 contig graph (.ctg.dot.gz) to GFA with the wtdbg-dot2gfa.pl
# script from the wtdbg2 checkout installed for this conf.
# The placeholder was written as "{ wildcards.conf }"; format fields are not
# whitespace-stripped, so that name could not be resolved when the command was built.
rule prep_wtdbg2_gfa:
    input:
        dot="{uuid}.{conf}.ctg.dot.gz",
        assembly="{uuid}.{conf}.ctg.lay.gz"
    output: "{uuid}.{conf,wtdbg2[A-z0-9._-]*}.ctg.cns.gfa"
    shell: "zcat {input.dot} | perl git/{wildcards.conf}/wtdbg2/scripts/wtdbg-dot2gfa.pl > {output}"

# NOTE We reduce only the flye GFA because it has too many links to draw; the wtdbg2
# graph isn't actually representative of the consensus, so its contigs don't match up at all.
# Produce the GFA for a Flye assembly by passing Flye's assembly_graph.gfa through
# reduce_gfa.py with the argument 50000.
rule prep_flye_gfa:
    input:
        "{uuid}.{conf}/assembly_graph.gfa"
    output:
        "{uuid}.{conf,flye[A-z0-9._-]*}.ctg.cns.gfa"
    shell:
        "python ../scripts/reduce_gfa.py {input} 50000 > {output}"

# Render an assembly graph (GFA) to SVG with Bandage.
# bandage.ok and the assembly FASTA are inputs for ordering only; the command
# reads just the GFA.
rule bandage_assembly:
    input:
        b="bandage.ok",
        gfa="{uuid}.{prefix}.gfa",
        assembly="{uuid}.{prefix}.fa"
    output: "{uuid,[A-z0-9_-]*}.{prefix}.gfa.svg"
    shell: "ware/bandage/Bandage image {input.gfa} {output}"

# Derive the consensus FASTA from a wtdbg2 layout (.ctg.lay.gz) with wtpoa-cns from
# the wtdbg2 checkout installed for this conf.
rule wtdbg2_consensus:
    input:
        "{uuid}.{conf}.ctg.lay.gz"
    output:
        "{uuid}.{conf,wtdbg2[A-z0-9_-]*}.ctg.cns.fa"
    threads: lambda w: n_threads(w, "assembly_threads")
    resources:
        benchmark=1,
    benchmark: "benchmarks/{uuid}.{conf,wtdbg2[A-z0-9_-]*}.ctg.cns.fa"
    shell:
        "git/{wildcards.conf}/wtdbg2/wtpoa-cns -f -i {input} -o {output} -t {threads}"

# Clone wtdbg2 into git/<conf>/, reset it to the commit hash given by the spell for
# this conf, and build it; touches <conf>.ok when done.
rule install_wtdbg2_hash:
    output:
        ok=touch("{conf,wtdbg2[A-z0-9_-]*}.ok"),
        #w2_bin="git/{conf,wtdbg2[A-z0-9_-]*}/wtdbg2/wtdbg2",
        #w2_poa="git/{conf,wtdbg2[A-z0-9_-]*}/wtdbg2/wtpoa-cns",
    params:
        hash=lambda w: spells[w.conf]["hash"],
    shell: "mkdir -p git/{wildcards.conf}; cd git/{wildcards.conf}; git clone https://github.com/ruanjue/wtdbg2.git; cd wtdbg2; git reset --hard {params.hash}; make"

# Assemble the sample's 'ont' reads with wtdbg2 for the conf named exactly "wtdbg2",
# taking all tuning parameters from the spell. Produces the layout and graph files.
# NOTE(review): the `ready` input is "w2.ok", which no rule in this part of the file
# produces (the install rule touches "<conf>.ok") — confirm where it comes from.
# NOTE(review): the output patterns here also match those of wtdbg2_24_assembly when
# conf is "wtdbg2" — confirm how the two rules are disambiguated.
rule wtdbg2_assembly:
    input:
        reads=lambda w: get_reads(w, 'ont'),
        ready="w2.ok"
    params:
        pmer=lambda w: spells[w.conf]['pmer'],
        kmer=lambda w: spells[w.conf]['kmer'],
        sampler=lambda w: spells[w.conf]['sampler'],
        edge=lambda w: spells[w.conf]['edge'],
        length=lambda w: spells[w.conf]['length'],
        max_k=lambda w: spells[w.conf]['max_k'],
        max_node=lambda w: spells[w.conf]['max_node'],
        prefix=lambda w: w.uuid+'.'+w.conf,
    output:
        lay="{uuid}.{conf,wtdbg2}.ctg.lay.gz",
        dot="{uuid}.{conf,wtdbg2}.ctg.dot.gz",
    threads: lambda w: n_threads(w, "assembly_threads")
    resources:
        benchmark=1,
    benchmark: "benchmarks/{uuid}.{conf,wtdbg2}.ctg.lay.gz"
    shell:
        "git/{wildcards.conf}/wtdbg2/wtdbg2 -f -i {input.reads} -o {params.prefix} -S {params.sampler} -e {params.edge} -k {params.kmer} -p {params.pmer} -L {params.length} -K {params.max_k} --node-max {params.max_node} -t {threads}"

# Assemble the sample's 'ont' reads with the wtdbg2 build installed for this conf,
# taking all tuning parameters from the spell. Compared with wtdbg2_assembly, this
# variant also passes -X and -g (genome size) and waits on the <spell>.ok install flag.
# NOTE(review): -X is given {params.max_k}, the same value as -K — confirm this is intended.
rule wtdbg2_24_assembly:
    input:
        reads=lambda w: get_reads(w, 'ont'),
        ready=lambda w: "%s.ok" % samples.loc[w.uuid]["spell"],
    params:
        prefix=lambda w: w.uuid+'.'+w.conf,
        pmer=lambda w: spells[w.conf]['pmer'],
        kmer=lambda w: spells[w.conf]['kmer'],
        sampler=lambda w: spells[w.conf]['sampler'],
        edge=lambda w: spells[w.conf]['edge'],
        length=lambda w: spells[w.conf]['length'],
        max_k=lambda w: spells[w.conf]['max_k'],
        max_node=lambda w: spells[w.conf]['max_node'],
        genome_size=lambda w: spells[w.conf]['genome_size'],
    output:
        lay="{uuid}.{conf,wtdbg2[A-z0-9_-]*}.ctg.lay.gz",
        dot="{uuid}.{conf,wtdbg2[A-z0-9_-]*}.ctg.dot.gz",
    threads: lambda w: n_threads(w, "assembly_threads")
    resources:
        benchmark=1,
    benchmark: "benchmarks/{uuid}.{conf,wtdbg2[A-z0-9_-]*}.ctg.lay.gz"
    shell:
        "git/{wildcards.conf}/wtdbg2/wtdbg2 -f -i {input.reads} -o {params.prefix} -S {params.sampler} -e {params.edge} -k {params.kmer} -p {params.pmer} -L {params.length} -K {params.max_k} --node-max {params.max_node} -X {params.max_k} -g {params.genome_size} -t {threads}"

# Build a reference-guided assembly with rebaler from the sample's 'ont' reads and
# the "<refgroup>.super_ref.fa" reference. The GFA output is only touched (empty).
# NOTE(review): the command does not name the FASTA output; presumably rebaler
# derives it from --base-only-prefix — confirm.
# NOTE(review): the conf constraint in `benchmark` omits the "." that the output and
# log constraints allow — confirm whether that difference is intended.
rule rebaler_assembly:
    conda: "environments/rebaler.yaml"
    input:
        reads=lambda w: get_reads(w, 'ont'),
        ref=lambda w: "%s.super_ref.fa" % samples.loc[w.uuid]["refgroup"],
        #ready=lambda w: "%s.ok" % samples.loc[w.uuid]["spell"],
    output:
        fa="{uuid}.{conf,rebaler[A-z0-9._-]*}.ctg.cns.fa",
        gfa=touch("{uuid}.{conf,rebaler[A-z0-9._-]*}.ctg.cns.gfa")
    threads: lambda w: n_threads(w, "assembly_threads")
    resources:
        benchmark=1,
    log: "log/{uuid}.{conf,rebaler[A-z0-9._-]*}.ctg.cns.fa"
    benchmark: "benchmarks/{uuid}.{conf,rebaler[A-z0-9_-]*}.ctg.cns.fa"
    shell: "rebaler --base-only --base-only-prefix {wildcards.uuid}.{wildcards.conf}.ctg.cns  {input.ref} {input.reads} 2> {log}"

# Placeholder test: once the base assembly exists, touch its flag under tests/.
rule test_assembly:
    input:
        "{uuid}.{conf}.ctg.cns.fa"
    output:
        "tests/{uuid}.{conf}.ctg.cns.test.txt"
    shell:
        "touch {output}"