From de705f610b42c5e297775784ac850d19f037b62f Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Mon, 27 May 2024 16:01:13 -0400
Subject: [PATCH 01/11] Update parameters for profile_dists and gas_call;
 Add/update descriptions of various parameters for clarity

---
 assets/schema_input.json |  4 +--
 nextflow.config          |  7 +---
 nextflow_schema.json     | 75 +++++++++++++++++++---------------------
 3 files changed, 39 insertions(+), 47 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index 48e9936..6094f92 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -17,8 +17,8 @@
             "mlst_alleles": {
                 "type": "string",
                 "format": "file-path",
-                "pattern": "^\\S+\\.mlst\\.json(\\.gz)?$",
-                "errorMessage": "MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json' or '.mlst.json.gz'"
+                "pattern": "^\\S+\\.mlst(\\.subtyping)?\\.json(\\.gz)?$",
+                "errorMessage": "MLST JSON file from locidex report, cannot contain spaces and must have the extension: '.mlst.json', '.mlst.json.gz', '.mlst.subtyping.json', or '.mlst.subtyping.json.gz'"
             },
             "address": {
                 "type": "string",
diff --git a/nextflow.config b/nextflow.config
index 9f9d8ff..227b0bb 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -11,9 +11,6 @@ params {
 
     // Input options
     input                      = null
-    project_name               = 'assembly'
-    assembler                  = 'stub'
-    random_seed                = 1
 
     // Boilerplate options
     outdir                     = null
@@ -51,19 +48,17 @@ params {
     pd_distm = "hamming"
     pd_missing_threshold = 1.0
     pd_sample_quality_threshold = 1.0
-    pd_match_threshold = -1.0
     pd_file_type = "text"
     pd_mapping_file = null // default is no file
     pd_force = false
     pd_skip = false
     pd_columns = null
-    pd_count_missing = true
+    pd_count_missing = false
 
 
     // GAS Call
     gm_thresholds = "10,5,0"
     gm_delimiter = "'.'" // note the single quotes surrounding the delimiter
-    ref_clusters = ""
 
 }
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index d82d41f..ea67459 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -2,7 +2,7 @@
     "$schema": "http://json-schema.org/draft-07/schema",
     "$id": "https://raw.githubusercontent.com/phac-nml/gasnomenclature/main/nextflow_schema.json",
     "title": "phac-nml/gasnomenclature pipeline parameters",
-    "description": "IRIDA Next Example Pipeline",
+    "description": "Gas Nomenclature assignment pipeline",
     "type": "object",
     "definitions": {
         "gas_call": {
@@ -13,14 +13,15 @@
             "properties": {
                 "gm_thresholds": {
                     "type": "string",
-                    "default": "10,5,0"
+                    "default": "10,5,0",
+                    "description": "Thresholds delimited by ','. Values should match units from '--pd_distm' (either 'hamming' or 'scaled').",
+                    "pattern": "^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$"
                 },
                 "gm_delimiter": {
                     "type": "string",
-                    "default": "\\'.\\"
-                },
-                "ref_clusters": {
-                    "type": "string"
+                    "default": "\\'.\\",
+                    "description": "Delimiter desired for nomenclature code.",
+                    "pattern": "^\\S+$"
                 }
             }
         },
@@ -32,43 +33,60 @@
             "properties": {
                 "pd_outfmt": {
                     "type": "string",
-                    "default": "pairwise"
+                    "description": "The output format for distances",
+                    "enum": ["pairwise"],
+                    "default": "pairwise",
+                    "hidden": true
                 },
                 "pd_distm": {
                     "type": "string",
+                    "description": "The distance method/unit",
+                    "enum": ["hamming", "scaled"],
                     "default": "hamming"
                 },
                 "pd_missing_threshold": {
                     "type": "number",
+                    "description": "The maximum proportion of missing data per locus for a locus to be kept in the analysis",
+                    "minimum": 0,
+                    "maximum": 1,
                     "default": 1
                 },
                 "pd_sample_quality_threshold": {
                     "type": "number",
+                    "description": "The maximum proportion of missing data per sample for a sample to be kept in the analysis",
+                    "minimum": 0,
+                    "maximum": 1,
                     "default": 1
                 },
-                "pd_match_threshold": {
-                    "type": "number",
-                    "default": -1
-                },
                 "pd_file_type": {
                     "type": "string",
+                    "description": "Output format file type",
+                    "enum": ["text", "parquet"],
                     "default": "text"
                 },
                 "pd_mapping_file": {
-                    "type": "string"
-                },
-                "pd_force": {
-                    "type": "boolean"
+                    "type": "string",
+                    "pattern": "^\\S+\\.json(\\.gz)?$",
+                    "description": "A file used to map allele codes to integers for internal distance calculations",
+                    "exists": true,
+                    "hidden": true,
+                    "format": "file-path"
                 },
                 "pd_skip": {
-                    "type": "boolean"
+                    "type": "boolean",
+                    "description": "Skip QA/QC steps"
                 },
                 "pd_columns": {
-                    "type": "string"
+                    "type": "string",
+                    "pattern": "^\\S+$",
+                    "description": "Defines the loci to keep within the analysis. Formatted as a single column file with one locus name per line or list of comma-separated loci",
+                    "exists": true,
+                    "format": "file-path"
                 },
                 "pd_count_missing": {
                     "type": "boolean",
-                    "default": true
+                    "description": "Count missing alleles as different",
+                    "default": false
                 }
             }
         },
@@ -96,27 +114,6 @@
                     "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
                     "fa_icon": "fas fa-folder-open"
                 },
-                "project_name": {
-                    "type": "string",
-                    "default": "assembly",
-                    "pattern": "^\\S+$",
-                    "description": "The name of the project.",
-                    "fa_icon": "fas fa-tag"
-                },
-                "assembler": {
-                    "type": "string",
-                    "default": "stub",
-                    "fa_icon": "fas fa-desktop",
-                    "description": "The sequence assembler to use for sequence assembly.",
-                    "enum": ["default", "stub", "experimental"]
-                },
-                "random_seed": {
-                    "type": "integer",
-                    "default": 1,
-                    "fa_icon": "fas fa-dice-six",
-                    "description": "The random seed to use for sequence assembly.",
-                    "minimum": 1
-                },
                 "email": {
                     "type": "string",
                     "description": "Email address for completion summary.",

From 76c69808f20ef7e7c01b1b0d9b05ec44c508b4f0 Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Mon, 27 May 2024 16:02:47 -0400
Subject: [PATCH 02/11] Remove pd_force from nextflow.config

---
 nextflow.config | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 227b0bb..a90df40 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -50,7 +50,6 @@ params {
     pd_sample_quality_threshold = 1.0
     pd_file_type = "text"
     pd_mapping_file = null // default is no file
-    pd_force = false
     pd_skip = false
     pd_columns = null
     pd_count_missing = false

From e9e14f057b3cd7589c1fc5c897201fd397e6ff6c Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Mon, 27 May 2024 16:07:12 -0400
Subject: [PATCH 03/11] Removed pd_force argument from profile_dists/main.nf

---
 modules/local/profile_dists/main.nf | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/modules/local/profile_dists/main.nf b/modules/local/profile_dists/main.nf
index b7a0933..3d6845c 100644
--- a/modules/local/profile_dists/main.nf
+++ b/modules/local/profile_dists/main.nf
@@ -32,9 +32,6 @@ process PROFILE_DISTS{
     if(columns){
         args = args + " --columns $columns"
     }
-    if(params.pd_force){
-        args = args + " --force"
-    }
     if(params.pd_skip){
         args = args + " --skip"
     }

From 43a0b41bd2ee019b268b1b71f0f0091ba1f0d5a9 Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Mon, 27 May 2024 16:56:53 -0400
Subject: [PATCH 04/11] update documentation

---
 README.md      | 84 ++++++++++++++++++++++++++++----------------------
 docs/output.md | 69 ++++++++++++++++++++++++++---------------
 docs/usage.md  | 24 +++++++--------
 3 files changed, 104 insertions(+), 73 deletions(-)

diff --git a/README.md b/README.md
index 303bd5f..0dd80be 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,58 @@
 [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A523.04.3-brightgreen.svg)](https://www.nextflow.io/)
 
-# Example Pipeline for IRIDA Next
+# Genomic Address Service Nomenclature Workflow
 
-This is an example pipeline to be used for integration with IRIDA Next.
+This workflow takes provided JSON-formatted MLST allelic profiles and assigns cluster addresses to samples based on existing cluster designations. This pipeline is designed to be integrated into IRIDA Next. However, it may be run as a stand-alone pipeline.
+
+A brief overview of the usage of this pipeline is given below. Detailed documentation can be found in the [docs/](docs/) directory.
 
 # Input
 
 The input to the pipeline is a standard sample sheet (passed as `--input samplesheet.csv`) that looks like:
 
-| sample  | fastq_1         | fastq_2         |
-| ------- | --------------- | --------------- |
-| SampleA | file_1.fastq.gz | file_2.fastq.gz |
+| sample  | mlst_alleles      | address |
+| ------- | ----------------- | ------- |
+| sampleA | sampleA.mlst.json | 1.1.1   |
+| sampleQ | sampleQ.mlst.json |         |
+| sampleF | sampleF.mlst.json |         |
 
 The structure of this file is defined in [assets/schema_input.json](assets/schema_input.json). Validation of the sample sheet is performed by [nf-validation](https://nextflow-io.github.io/nf-validation/).
 
+Details on the columns can be found in the [Full samplesheet](docs/usage.md#full-samplesheet) documentation.
+
 # Parameters
 
 The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run.
 
+## Profile dists
+
+The following can be used to adjust parameters for the [profile_dists][] tool.
+
+- `--pd_outfmt`: The output format for distances. For this pipeline the only valid value is _pairwise_ (required by [gas call][]).
+- `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0 and 1.
+- `--pd_missing_threshold`: The maximum proportion of missing data per locus for a locus to be kept in the analysis. Values from 0 to 1.
+- `--pd_sample_quality_threshold`: The maximum proportion of missing data per sample for a sample to be kept in the analysis. Values from 0 to 1.
+- `--pd_file_type`: Output format file type. One of _text_ or _parquet_.
+- `--pd_mapping_file`: A file used to map allele codes to integers for internal distance calculations. This is the same file as produced from the _profile dists_ step (the [allele_map.json](docs/output.md#profile-dists) file). Normally, this is unneeded unless you wish to override the automated process of mapping alleles to integers.
+- `--pd_skip`: Skip QA/QC steps. Can be used as a flag, `--pd_skip`, or passing a boolean, `--pd_skip true` or `--pd_skip false`.
+- `--pd_columns`: Defines the loci to keep within the analysis (default when unset is to keep all loci). Formatted as a single column file with one locus name per line. For example:
+  - **Single column format**
+    ```
+    loci1
+    loci2
+    loci3
+    ```
+- `--pd_count_missing`: Count missing alleles as different. Can be used as a flag, `--pd_count_missing`, or passing a boolean, `--pd_count_missing true` or `--pd_count_missing false`. If true, will consider missing allele calls for the same locus between samples as a difference, increasing the distance counts.
+
+## GAS CALL
+
+The following can be used to adjust parameters for the [gas call][] tool.
+
+- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_).
+- `--gm_delimiter`: Delimiter desired for nomenclature code.
+
+## Other
+
 Other parameters (defaults from nf-core) are defined in [nextflow_schema.json](nextflow_schmea.json).
 
 # Running
@@ -39,51 +74,26 @@ An example of the what the contents of the IRIDA Next JSON file looks like for t
 ```
 {
     "files": {
-        "global": [
-            {
-                "path": "summary/summary.txt.gz"
-            }
-        ],
+        "global": [],
         "samples": {
-            "SAMPLE1": [
-                {
-                    "path": "assembly/SAMPLE1.assembly.fa.gz"
-                }
-            ],
-            "SAMPLE2": [
+            "sampleF": [
                 {
-                    "path": "assembly/SAMPLE2.assembly.fa.gz"
+                    "path": "input/sampleF_error_report.csv"
                 }
-            ],
+            ]
-            "SAMPLE3": [
-                {
-                    "path": "assembly/SAMPLE3.assembly.fa.gz"
-                }
-            ]
         }
     },
     "metadata": {
         "samples": {
-            "SAMPLE1": {
-                "reads.1": "sample1_R1.fastq.gz",
-                "reads.2": "sample1_R2.fastq.gz"
-            },
-            "SAMPLE2": {
-                "reads.1": "sample2_R1.fastq.gz",
-                "reads.2": "sample2_R2.fastq.gz"
-            },
-            "SAMPLE3": {
-                "reads.1": "sample1_R1.fastq.gz",
-                "reads.2": "null"
+            "sampleQ": {
+                "address": "1.1.3"
             }
         }
     }
 }
 ```
 
-Within the `files` section of this JSON file, all of the output paths are relative to the `outdir`. Therefore, `"path": "assembly/SAMPLE1.assembly.fa.gz"` refers to a file located within `outdir/assembly/SAMPLE1.assembly.fa.gz`.
-
-There is also a pipeline execution summary output file provided (specified in the above JSON as `"global": [{"path":"summary/summary.txt.gz"}]`). However, there is no formatting specification for this file.
+Within the `files` section of this JSON file, all of the output paths are relative to the `outdir`. Therefore, `"path": "input/sampleF_error_report.csv"` refers to a file located within `outdir/input/sampleF_error_report.csv`. This file is generated only if a sample fails the input check during samplesheet assessment.
 
 ## Test profile
 
@@ -95,7 +105,7 @@ nextflow run phac-nml/gasnomenclature -profile docker,test -r main -latest --out
 
 # Legal
 
-Copyright 2023 Government of Canada
+Copyright 2024 Government of Canada
 
 Licensed under the MIT License (the "License"); you may not use
 this work except in compliance with the License. You may obtain a copy of the
diff --git a/docs/output.md b/docs/output.md
index 817c382..4ad48e8 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -6,11 +6,13 @@ This document describes the output produced by the pipeline.
 
 The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
 
-- assembly: very small mock assembly files for each sample
-- generate: intermediate files used in generating the IRIDA Next JSON output
-- pipeline_info: information about the pipeline's execution
-- simplify: simplified intermediate files used in generating the IRIDA Next JSON output
-- summary: summary report about the pipeline's execution and results
+- call: The cluster addresses from the [genomic_address_service](https://github.com/phac-nml/genomic_address_service).
+- cluster: The cluster file required by GAS_call.
+- distances: Distances between genomes from [profile_dists](https://github.com/phac-nml/profile_dists).
+- filter: The cluster addresses from only the query samples.
+- input: An error report that is only generated when sample IDs and MLST JSON files do not match.
+- locidex: The merged MLST JSON files for reference and query samples.
+- pipeline_info: Information about the pipeline's execution
 
 The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.gz` and will be written to the top-level of the results directory. This file is compressed using GZIP and conforms to the [IRIDA Next JSON output specifications](https://github.com/phac-nml/pipeline-standards#42-irida-next-json).
 
@@ -18,60 +20,79 @@ The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.g
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
-- [Assembly stub](#assembly-stub) - Performs a stub assembly by generating a mock assembly
-- [Generate sample JSON](#generate-sample-json) - Generates a JSON file for each sample
-- [Generate summary](#generate-summary) - Generates a summary text file describing the samples and assemblies
-- [Simplify IRIDA JSON](#simplify-irida-json) - Simplifies the sample JSONs by limiting nesting depth
+- [Input check](#input-check) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key.
+- [Locidex merge](#locidex-merge) - Merges MLST profile JSON files into a single profiles file for reference and query samples.
+- [Profile dists](#profile-dists) - Computes pairwise distances between genomes using MLST allele differences.
+- [Cluster file](#cluster-file) - Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call.
+- [GAS call](#gas-call) - Generates hierarchical cluster addresses.
+- [Filter query](#filter-query) - Filters and generates a csv file containing only the cluster addresses for query samples.
 - [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
-### Assembly stub
+### Input Check
 
 <details markdown="1">
 <summary>Output files</summary>
 
-- `assembly/`
-  - Mock assembly files: `ID.assembly.fa.gz`
+- `input/`
+  - `sampleID_error_report.csv`
 
 </details>
 
-### Generate sample JSON
+### Locidex merge
 
 <details markdown="1">
 <summary>Output files</summary>
 
-- `generate/`
-  - JSON files: `ID.json.gz`
+- `locidex/merge/`
+  - reference samples: `reference/merged_ref/merged_profiles_ref.tsv`
+  - query samples: `query/merged_value/merged_profile_value.tsv`
 
 </details>
 
-### Generate summary
+### Profile Dists
 
 <details markdown="1">
 <summary>Output files</summary>
 
-- `summary/`
-  - Text summary describing samples and assemblies: `summary.txt.gz`
+- `distances/`
+  - Mapping allele identifiers to integers: `allele_map.json`
+  - The query MLST profiles: `query_profile.text`
+  - The reference MLST profiles: `ref_profile.text`
+  - The computed distances based on MLST allele differences: `results.text`
+  - Information on the profile_dists run: `run.json`
 
 </details>
 
-### Simplify IRIDA JSON
+### Cluster File
 
 <details markdown="1">
 <summary>Output files</summary>
 
-- `simplify/`
-  - Simplified JSON files: `ID.simple.json.gz`
+- `cluster/`
+  - `expected_clusters.txt`
 
 </details>
 
-### IRIDA Next Output
+### GAS call
 
 <details markdown="1">
 <summary>Output files</summary>
 
-- `/`
-  - IRIDA Next-compliant JSON output: `iridanext.output.json.gz`
+- `call/`
+  - The computed cluster addresses: `clusters.text`
+  - Information on the GAS call run: `run.json`
+  - Thresholds used to compute cluster addresses: `thresholds.json`
+
+</details>
+
+### Filter Query
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `filter/`
+  - `new_addresses.csv`
 
 </details>
 
diff --git a/docs/usage.md b/docs/usage.md
index 4fbd758..4f4abd4 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -2,7 +2,7 @@
 
 ## Introduction
 
-This pipeline is an example that illustrates running a nf-core-compliant pipeline on IRIDA Next.
+This workflow takes provided JSON-formatted MLST allelic profiles and assigns cluster addresses to samples based on existing cluster designations. This pipeline is designed to be integrated into IRIDA Next. However, it may be run as a stand-alone pipeline.
 
 ## Samplesheet input
 
@@ -14,22 +14,22 @@ You will need to create a samplesheet with information about the samples you wou
 
 ### Full samplesheet
 
-The input samplesheet must contain three columns: `ID`, `fastq_1`, `fastq_2`. The IDs within a samplesheet should be unique. All other columns will be ignored.
+The input samplesheet must contain three columns: `sample`, `mlst_alleles`, `address`. The sample names within a samplesheet should be unique. All other columns will be ignored.
 
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below.
+A final samplesheet file consisting of mlst_alleles and addresses may look something like the one below:
 
 ```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-SAMPLE1,sample1_R1.fastq.gz,sample1_R2.fastq.gz
-SAMPLE2,sample2_R1.fastq.gz,sample2_R2.fastq.gz
-SAMPLE3,sample1_R1.fastq.gz,
+sample,mlst_alleles,address
+sampleA,sampleA.mlst.json.gz,1.1.1
+sampleQ,sampleQ.mlst.json.gz,2.2.2
+sampleF,sampleF.mlst.json,
 ```
 
-| Column    | Description                                                                                                                |
-| --------- | -------------------------------------------------------------------------------------------------------------------------- |
-| `sample`  | Custom sample name. Samples should be unique within a samplesheet.                                                         |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+| Column         | Description                                                                                                                                                                                                                                                                                                                      |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`       | Custom sample name. Samples should be unique within a samplesheet.                                                                                                                                                                                                                                                               |
+| `mlst_alleles` | Full path to an MLST JSON file describing the loci/alleles for the sample against some MLST scheme. A way to generate this file is via [locidex](https://github.com/phac-nml/locidex). File can optionally be gzipped and must have the extension ".mlst.json", ".mlst.subtyping.json" (or with an additional ".gz" if gzipped). |
+| `address`      | Hierarchal clustering address. If left empty for a sample, the pipeline will perform de novo clustering based on the provided cluster designations and thresholds.                                                                                                                                                               |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 

From f01e12ccbcf99c096923964bc3c67fadc5059728 Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Mon, 27 May 2024 17:05:48 -0400
Subject: [PATCH 05/11] Update gm_delimiter parameter

---
 README.md            | 2 +-
 nextflow_schema.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0dd80be..d8344e5 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ The following can be used to adjust parameters for the [profile_dists][] tool.
 The following can be used to adjust parameters for the [gas call][] tool.
 
 - `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_).
-- `--gm_delimiter`: Delimiter desired for nomenclature code.
+- `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`.
 
 ## Other
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ea67459..5859003 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -21,7 +21,7 @@
                     "type": "string",
                     "default": "\\'.\\",
                     "description": "Delimiter desired for nomenclature code.",
-                    "pattern": "^\\S+$"
+                    "pattern": "^[A-Za-z0-9\\._-]+$"
                 }
             }
         },

From dabbdecf5c07902dee8fbb4e2873571b977b86fe Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Mon, 27 May 2024 17:23:53 -0400
Subject: [PATCH 06/11] Fixed defaults for gm_delimiter

---
 nextflow.config      | 2 +-
 nextflow_schema.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index a90df40..4cd0af2 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -57,7 +57,7 @@ params {
 
     // GAS Call
     gm_thresholds = "10,5,0"
-    gm_delimiter = "'.'" // note the single quotes surrounding the delimiter
+    gm_delimiter = "." 
 
 }
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 5859003..3cbdf77 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -19,7 +19,7 @@
                 },
                 "gm_delimiter": {
                     "type": "string",
-                    "default": "\\'.\\",
+                    "default": ".",
                     "description": "Delimiter desired for nomenclature code.",
                     "pattern": "^[A-Za-z0-9\\._-]+$"
                 }

From 4a199de83a1ef86e804b27ff7555bf70f51d5a3a Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Mon, 27 May 2024 17:26:36 -0400
Subject: [PATCH 07/11] Fix whitespace

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 4cd0af2..a33af43 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -57,7 +57,7 @@ params {
 
     // GAS Call
     gm_thresholds = "10,5,0"
-    gm_delimiter = "." 
+    gm_delimiter = "."
 
 }
 

From 6b7243a5735a625cf6c437021231d3792ce88578 Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Tue, 4 Jun 2024 09:55:01 -0400
Subject: [PATCH 08/11] Add clustering method

---
 README.md                      | 1 +
 modules/local/gas/call/main.nf | 1 +
 nextflow.config                | 1 +
 nextflow_schema.json           | 6 ++++++
 4 files changed, 9 insertions(+)

diff --git a/README.md b/README.md
index d8344e5..4e24c8e 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,7 @@ The following can be used to adjust parameters for the [profile_dists][] tool.
 The following can be used to adjust parameters for the [gas call][] tool.
 
 - `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_).
+- `--gm_method`: The linkage method to use for clustering. Value should be one of _single_, _average_, or _complete_.
 - `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`.
 
 ## Other
diff --git a/modules/local/gas/call/main.nf b/modules/local/gas/call/main.nf
index 33db7a7..56f7d92 100644
--- a/modules/local/gas/call/main.nf
+++ b/modules/local/gas/call/main.nf
@@ -26,6 +26,7 @@ process GAS_CALL{
     gas call --dists $distances \\
                 --rclusters $reference_clusters \\
                 --outdir ${prefix} \\
+                --method ${params.gm_method} \\
                 --threshold ${params.gm_thresholds} \\
                 --delimeter ${params.gm_delimiter}
 
diff --git a/nextflow.config b/nextflow.config
index a33af43..31a7a17 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -57,6 +57,7 @@ params {
 
     // GAS Call
     gm_thresholds = "10,5,0"
+    gm_method = "average"
     gm_delimiter = "."
 
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 3cbdf77..99b1c29 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -17,6 +17,12 @@
                     "description": "Thresholds delimited by ','. Values should match units from '--pd_distm' (either 'hamming' or 'scaled').",
                     "pattern": "^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$"
                 },
+                "gm_method": {
+                    "type": "string",
+                    "default": "average",
+                    "description": "Clustering linkage method.",
+                    "enum": ["single", "average", "complete"]
+                },
                 "gm_delimiter": {
                     "type": "string",
                     "default": ".",

From e32f5d26feb65d0f5500fd630a648e67196a1670 Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Tue, 4 Jun 2024 12:50:45 -0400
Subject: [PATCH 09/11] Comment removed from gas-call main.nf

---
 modules/local/gas/call/main.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/local/gas/call/main.nf b/modules/local/gas/call/main.nf
index 56f7d92..3216c9e 100644
--- a/modules/local/gas/call/main.nf
+++ b/modules/local/gas/call/main.nf
@@ -20,7 +20,6 @@ process GAS_CALL{
     path  "versions.yml", emit: versions
 
     script:
-    // Need to add more args for gas call below
     prefix = "Called"
     """
     gas call --dists $distances \\

From a1b36852f018497b8fd50144b7031a1ae7f97876 Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Fri, 7 Jun 2024 15:01:10 -0400
Subject: [PATCH 10/11] Update README and USAGE documents

---
 README.md     | 3 +--
 docs/usage.md | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 4e24c8e..4aa8b43 100644
--- a/README.md
+++ b/README.md
@@ -28,14 +28,13 @@ The main parameters are `--input` as defined above and `--output` for specifying
 
 The following can be used to adjust parameters for the [profile_dists][] tool.
 
-- `--pd_outfmt`: The output format for distances. For this pipeline the only valid value is _pairwise_ (required by [gas call][]).
 - `--pd_distm`: The distance method/unit, either _hamming_ or _scaled_. For _hamming_ distances, the distance values will be a non-negative integer. For _scaled_ distances, the distance values are between 0 and 1.
 - `--pd_missing_threshold`: The maximum proportion of missing data per locus for a locus to be kept in the analysis. Values from 0 to 1.
 - `--pd_sample_quality_threshold`: The maximum proportion of missing data per sample for a sample to be kept in the analysis. Values from 0 to 1.
 - `--pd_file_type`: Output format file type. One of _text_ or _parquet_.
 - `--pd_mapping_file`: A file used to map allele codes to integers for internal distance calculations. This is the same file as produced from the _profile dists_ step (the [allele_map.json](docs/output.md#profile-dists) file). Normally, this is unneeded unless you wish to override the automated process of mapping alleles to integers.
 - `--pd_skip`: Skip QA/QC steps. Can be used as a flag, `--pd_skip`, or passing a boolean, `--pd_skip true` or `--pd_skip false`.
-- `--pd_columns`: Defines the loci to keep within the analysis (default when unset is to keep all loci). Formatted as a single column file with one locus name per line. For example:
+- `--pd_columns`: Path to a file that defines the loci to keep within the analysis (default when unset is to keep all loci). Formatted as a single column file with one locus name per line. For example:
   - **Single column format**
     ```
     loci1
diff --git a/docs/usage.md b/docs/usage.md
index 4f4abd4..2433443 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -29,7 +29,7 @@ sampleF,sampleF.mlst.json,
 | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `sample`       | Custom sample name. Samples should be unique within a samplesheet.                                                                                                                                                                                                                                                               |
 | `mlst_alleles` | Full path to an MLST JSON file describing the loci/alleles for the sample against some MLST scheme. A way to generate this file is via [locidex](https://github.com/phac-nml/locidex). File can optionally be gzipped and must have the extension ".mlst.json", ".mlst.subtyping.json" (or with an additional ".gz" if gzipped). |
-| `address`      | Hierarchal clustering address. If left empty for a sample, the pipeline will perform de novo clustering based on the provided cluster designations and thresholds.                                                                                                                                                               |
+| `address`      | Hierarchal clustering address. If left empty for a sample, the pipeline will assign a cluster address.                                                                                                                                                                                                                           |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 

From 04ee4fd8044719d6d76a2c786266337af7d829b3 Mon Sep 17 00:00:00 2001
From: kylacochrane <Kyla.Cochrane@phac-aspc.gc.ca>
Date: Fri, 7 Jun 2024 15:21:02 -0400
Subject: [PATCH 11/11] Remove the --pd_outfmt parameter

---
 modules/local/profile_dists/main.nf | 6 +++---
 nextflow.config                     | 1 -
 nextflow_schema.json                | 7 -------
 workflows/gas_nomenclature.nf       | 3 ---
 4 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/modules/local/profile_dists/main.nf b/modules/local/profile_dists/main.nf
index 3d6845c..f43d63b 100644
--- a/modules/local/profile_dists/main.nf
+++ b/modules/local/profile_dists/main.nf
@@ -9,7 +9,6 @@ process PROFILE_DISTS{
     input:
     path query
     path ref
-    val mapping_format
     path mapping_file
     path columns
 
@@ -39,9 +38,10 @@ process PROFILE_DISTS{
         args = args + " --count_missing"
     }
     // --match_threshold $params.profile_dists.match_thresh \\
-    prefix = "distances_${mapping_format}"
+    prefix = "distances_pairwise"
     """
-    profile_dists --query $query --ref $ref $args --outfmt $mapping_format \\
+    profile_dists --query $query --ref $ref $args \\
+                --outfmt pairwise \\
                 --distm $params.pd_distm \\
                 --file_type $params.pd_file_type \\
                 --missing_thresh $params.pd_missing_threshold \\
diff --git a/nextflow.config b/nextflow.config
index 31a7a17..9734b66 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -44,7 +44,6 @@ params {
     validate_params                  = true
 
     // Profile Dists
-    pd_outfmt = "pairwise"
     pd_distm = "hamming"
     pd_missing_threshold = 1.0
     pd_sample_quality_threshold = 1.0
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 99b1c29..b2b8a89 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -37,13 +37,6 @@
             "description": "",
             "default": "",
             "properties": {
-                "pd_outfmt": {
-                    "type": "string",
-                    "description": "The output format for distances",
-                    "enum": ["pairwise"],
-                    "default": "pairwise",
-                    "hidden": true
-                },
                 "pd_distm": {
                     "type": "string",
                     "description": "The distance method/unit",
diff --git a/workflows/gas_nomenclature.nf b/workflows/gas_nomenclature.nf
index 4dd9d5f..813de21 100644
--- a/workflows/gas_nomenclature.nf
+++ b/workflows/gas_nomenclature.nf
@@ -123,11 +123,8 @@ workflow GAS_NOMENCLATURE {
         exit 1, "${params.pd_columns}: Does not exist but was passed to the pipeline. Exiting now."
     }
 
-    mapping_format = Channel.value(params.pd_outfmt)
-
     distances = PROFILE_DISTS(merged_queries.combined_profiles,
                             merged_references.combined_profiles,
-                            mapping_format,
                             mapping_file,
                             columns_file)
     ch_versions = ch_versions.mix(distances.versions)