workflow/workflow_notes

# Plasmid filter criteria during conversion
[?!(features[?type==`source`].qualifiers.plasmid)]

# Conversion to embl for island callers
Biopython currently outputs anticodon qualifiers in a format that is incompatible with SigiHMM.
As a workaround, set bioperl compatibility to true. See https://github.com/brinkmanlab/IslandCompare/issues/135

# SigiHMM
To be consistent with IslandViewer, the following setting is required:
Merge islands of distance: 3

# Scrub version from fasta ID
s/^(>[^\.[:space:]]+)\.[^[:space:]]+/\1/

# Sequence ID extraction is done to hand to ParSNP to properly name its output
See ./scripts/extractSequenceIds.gawk

# Scrub .ref from alignment order
s/\.ref$//

# SigiHMM outputs genes rather than genomic islands
See ./scripts/collapse_sigi_islands.gawk

# GFF3 to fasta sed 
0,/^##FASTA/d

# Cut sequences from GFF3 sed
/^##FASTA/Q

# Filter islands
c5-c4>${minimum_island_size}

# Filter GFF for islands
c3=='genomic_islands'

# Cut identifiers from extractSequenceIds.gawk output
c1

# Generate final GFF variables
minimum_homologous_region=${minimum_homologous_region}
min_cluster_size=${min_cluster_size}

# ParSNP
- use random genome
- use dataset name in output
- Curated genome directory = yes

# Create pairwise invocations
See ./mauve_paring.rule for the Apply rule to collection rules

progressiveMauve pairwise alignments
- output backbone


# Build list:list from merged datasets
Requires merge tool to append suffix to all datasets
See ./listlist_from_merge.rule for the Apply rule to collection rules

# MASH
Sketch individual sequences = True
k-mer size: 16

# Keep MASH columns
c1,c2,c3

# MCL 
Transform input matrix: mul(-1),add(1),gq(0.96)
Inflation: 5

# Final data aggregation done by awk into a GFF
See ./scripts/final.awk for the script

# Workflow list:list input annotation
{ "subinputs": [
{ "id": "data", "label": "Input datasets", "type": "data_collection_input", "annotation": "Select data to analyze.", "format": ["genbank", "gbk", "embl", "gbff"], "order":0, "validator":"(selection && selection.length > 1)?'':'Please select at least two datasets'" },
{ "id": "newick", "label": "Phylogenetic tree in newick format", "type": "data_input", "annotation": "Select tree to use for alignment order.", "optional": true, "format": ["newick"], "order":1, "collapse":true }
]}

# Workflow dbkey input annotation
{ "type": "dbkey_input", "optional": true, "value":"", "annotation": "Select a reference genome to align draft input data", "order":3, "collapse":true }

# Workflow toggle newick identifier annotation
{ "type": "select", "label":"Newick identifier type", "value":"False", "annotation": "Select the identifiers that are present in the optional newick", "order": 2, "options": {"Accession":"False", "File name":"True"} }

# TODO
- invocations not accessible https://github.com/galaxyproject/galaxy/issues/7710
- See data_manager_fetch_genome_dbkeys_all_fasta tool to manage microbedb