phac-nml · kylacochrane · Jun 20, 2024 · Jun 17, 2024 · Jun 17, 2024 · Jun 17, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,8 +5,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## In-development
 
-- Fixed nf-core tools linting failures introduced in version 2.12.1.
-- Added phac-nml prefix to nf-core config
+## 1.0.5 - 2024/06/17
+
+- Updated modules to include:
+
+  - `input_assure`: Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key and enforces necessary changes where discrepancies are found.
+  - `cluster_file`: Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call.
+  - `filter_query`: Filters and generates a csv file containing only the cluster addresses for query samples.
+
+- Pinned nf-iridanext plugin
+- Added tests for the full pipeline, independent modules, and input parameters
+- Updated documentation and configuration files
 
 ## 1.0.3 - 2024/02/23
 

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -10,6 +10,18 @@
 
 ## Pipeline tools
 
+- [locidex](https://github.com/phac-nml/locidex) (in-development, citation subject to change)
+
+  > Robertson, James, Wells, Matthew, Peterson, Christy-Lynn, Bessonov, Kyrylo, Reimer, Aleisha, Schonfeld, Justin. LOCIDEX: Distributed allele calling engine. 2024. https://github.com/phac-nml/locidex
+
+- [profile_dists](https://github.com/phac-nml/profile_dists) (in-development, citation subject to change)
+
+  > Robertson, James, Wells, Matthew, Schonfeld, Justin, Reimer, Aleisha. Profile Dists: Convenient package for comparing genetic similarity of samples based on allelic profiles. 2023. https://github.com/phac-nml/profile_dists
+
+- [genomic_address_service (GAS)](https://github.com/phac-nml/genomic_address_service) (in-development, citation subject to change)
+
+  > Robertson, James, Wells, Matthew, Schonfeld, Justin, Reimer, Aleisha. Genomic Address Service: Convenient package for de novo clustering and sample assignment to existing clusters. 2023. https://github.com/phac-nml/genomic_address_service
+
 ## Software packaging/containerisation tools
 
 - [Anaconda](https://anaconda.com)

diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) Aaron Petkau
+Copyright (c) Government of Canada
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/docs/output.md b/docs/output.md
@@ -20,7 +20,7 @@ The IRIDA Next-compliant JSON output file will be named `iridanext.output.json.g
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
-- [Input check](#input-check) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key.
+- [Input assure](#input-assure) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key and enforces necessary changes where discrepancies are found.
 - [Locidex merge](#locidex-merge) - Merges MLST profile JSON files into a single profiles file for reference and query samples.
 - [Profile dists](#profile-dists) - Computes pairwise distances between genomes using MLST allele differences.
 - [Cluster file](#cluster-file) - Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call.
@@ -29,13 +29,14 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
-### Input Check
+### Input Assure
 
 <details markdown="1">
 <summary>Output files</summary>
 
 - `input/`
   - `sampleID_error_report.csv`
+  - `sampleID.mlst.json.gz`
 
 </details>
 

diff --git a/tests/data/called/expected_results_count-missing.txt b/tests/data/called/expected_results_count-missing.txt
@@ -0,0 +1,5 @@
+id	address	level_1
+sample1	1	1
+sample2	1	1
+sample3	2	2
+sampleQ	1	1
diff --git a/tests/data/called/expected_results_loci-missing.txt b/tests/data/called/expected_results_loci-missing.txt
@@ -0,0 +1,5 @@
+id	address	level_1
+sample1	1	1
+sample2	1	1
+sample3	2	2
+sampleQ	1	1
diff --git a/tests/data/called/expected_results_missing.txt b/tests/data/called/expected_results_missing.txt
@@ -0,0 +1,5 @@
+id	address	level_1
+sample1	1	1
+sample2	1	1
+sample3	2	2
+sampleQ	1	1
diff --git a/tests/data/clusters/expected_clusters_missing.txt b/tests/data/clusters/expected_clusters_missing.txt
@@ -0,0 +1,4 @@
+id	address	level_1
+sample1	1	1
+sample2	1	1
+sample3	2	2
diff --git a/tests/data/columns/keep-zero-loci-empty-file.txt b/tests/data/columns/keep-zero-loci-empty-file.txt
@@ -0,0 +1 @@
+
diff --git a/tests/data/distances/expected_dists_count-missing.txt b/tests/data/distances/expected_dists_count-missing.txt
@@ -0,0 +1,5 @@
+query_id	ref_id	dist
+sampleQ	sampleQ	0
+sampleQ	sample1	1
+sampleQ	sample2	2
+sampleQ	sample3	3
diff --git a/tests/data/distances/expected_dists_loci-missing.txt b/tests/data/distances/expected_dists_loci-missing.txt
@@ -0,0 +1,5 @@
+query_id	ref_id	dist
+sampleQ	sampleQ	0
+sampleQ	sample1	1
+sampleQ	sample2	2
+sampleQ	sample3	3
diff --git a/tests/data/distances/expected_dists_missing.txt b/tests/data/distances/expected_dists_missing.txt
@@ -0,0 +1,5 @@
+query_id	ref_id	dist
+sampleQ	sampleQ	0
+sampleQ	sample1	1
+sampleQ	sample2	1
+sampleQ	sample3	2
diff --git a/tests/data/irida/count-missing_iridanext.output.json b/tests/data/irida/count-missing_iridanext.output.json
@@ -0,0 +1,13 @@
+{
+    "files": {
+        "global": [],
+        "samples": {}
+    },
+    "metadata": {
+        "samples": {
+            "sampleQ": {
+                "address": "1"
+            }
+        }
+    }
+}
diff --git a/tests/data/irida/loci-missing_iridanext.output.json b/tests/data/irida/loci-missing_iridanext.output.json
@@ -0,0 +1,13 @@
+{
+    "files": {
+        "global": [],
+        "samples": {}
+    },
+    "metadata": {
+        "samples": {
+            "sampleQ": {
+                "address": "1"
+            }
+        }
+    }
+}
diff --git a/tests/data/irida/missing_iridanext.output.json b/tests/data/irida/missing_iridanext.output.json
@@ -0,0 +1,13 @@
+{
+    "files": {
+        "global": [],
+        "samples": {}
+    },
+    "metadata": {
+        "samples": {
+            "sampleQ": {
+                "address": "1"
+            }
+        }
+    }
+}
diff --git a/tests/data/profiles/expected-profile_missing1.tsv b/tests/data/profiles/expected-profile_missing1.tsv
@@ -0,0 +1,5 @@
+sample_id	l1	l2	l3
+sampleQ	1	2	1
+sample1	1	1	1
+sample2	-	1	1
+sample3	-	1	2
diff --git a/tests/data/profiles/expected-profile_missing2.tsv b/tests/data/profiles/expected-profile_missing2.tsv
@@ -0,0 +1,2 @@
+sample_id	l1	l2	l3
+sampleQ	1	2	1
diff --git a/tests/data/reports/sample2_missing.mlst.json b/tests/data/reports/sample2_missing.mlst.json
@@ -0,0 +1,7 @@
+{
+    "sample2": {
+        "l1": "-",
+        "l2": "1",
+        "l3": "1"
+    }
+}
diff --git a/tests/data/reports/sample3_missing.mlst.json b/tests/data/reports/sample3_missing.mlst.json
@@ -0,0 +1,7 @@
+{
+    "sample3": {
+        "l1": "-",
+        "l2": "1",
+        "l3": "2"
+    }
+}
diff --git a/tests/data/samplesheets/samplesheet-hash_missing.csv b/tests/data/samplesheets/samplesheet-hash_missing.csv
@@ -0,0 +1,5 @@
+sample,mlst_alleles,address
+sampleQ,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sampleQ.mlst.json,
+sample1,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample1.mlst.json,1
+sample2,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample2_missing.mlst.json,1
+sample3,https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/reports/sample3_missing.mlst.json,2
diff --git a/tests/pipelines/main_gm_threshold.nf.test b/tests/pipelines/main_gm_threshold.nf.test
@@ -0,0 +1,135 @@
+nextflow_pipeline {
+
+    name "Integration Tests of adjusting gm_thresholds parameters"
+    script "main.nf"
+
+    test("Test fail pipeline if null threshold set") {
+        tag "pipeline_failure_null_threshold"
+        // Runs main.nf with gm_thresholds explicitly set to null and expects the run to fail.
+        when {
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet1.csv"
+                outdir = "results"
+                // Parameter under test: a null threshold value.
+                gm_thresholds = null
+            }
+        }
+        // The workflow must fail and stdout must carry the null/empty-string error message.
+        then {
+            assert workflow.failed
+            assert workflow.stdout.contains("ERROR ~ --gm_thresholds null: Cannot pass null or empty string")
+        }
+    }
+
+    test("Test fail pipeline if empty threshold set") {
+        tag "pipeline_failure_no_threshold"
+        // Runs main.nf with gm_thresholds set to an empty string and expects the run to fail.
+        when {
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet1.csv"
+                outdir = "results"
+                // Parameter under test: an empty threshold string.
+                gm_thresholds = ""
+            }
+        }
+        // The expected message has nothing between "--gm_thresholds " and ":" because the value is empty.
+        then {
+            assert workflow.failed
+            assert workflow.stdout.contains("ERROR ~ --gm_thresholds : Cannot pass null or empty string")
+        }
+    }
+
+    test("Test fail pipeline if negative threshold set") {
+        tag "pipeline_failure_negative_threshold"
+        // Runs main.nf with a negative gm_thresholds value and expects the run to fail.
+        when {
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet1.csv"
+                outdir = "results"
+                // Parameter under test: a single negative threshold.
+                gm_thresholds = "-1"
+            }
+        }
+        // Unlike the other tests in this file, this one checks stderr for a pattern-mismatch message.
+        then {
+            assert workflow.failed
+            assert workflow.stderr.contains('* --gm_thresholds: string [-1] does not match pattern ^(\\d+(\\.\\d+)?,)*\\d+(\\.\\d+)?$ (-1)')
+        }
+    }
+
+    test("Test fail pipeline if mismatch between thresholds and scaled distm") {
+        tag "pipeline_failure_threshold_scaled"
+        // Combines pd_distm = "scaled" with thresholds 1,0.5,2 and expects the run to fail.
+        when {
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet1.csv"
+                outdir = "results"
+                // Parameters under test: the expected error says these thresholds fall outside [0,1].
+                gm_thresholds = "1,0.5,2"
+                pd_distm = "scaled"
+            }
+        }
+        // The expected stdout message is one string assembled from two concatenated literals.
+        then {
+            assert workflow.failed
+            assert workflow.stdout.contains("ERROR ~ '--pd_distm scaled' is set, but '--gm_thresholds 1,0.5,2' contains thresholds outside of range [0,1]."
+                                            + " Please either set '--pd_distm hamming' or adjust the threshold values.")
+        }
+    }
+
+    test("Test fail pipeline if mismatch between thresholds and hamming distm") {
+        tag "pipeline_failure_threshold_hamming"
+        // Combines pd_distm = "hamming" with thresholds 2,1,0.5 and expects the run to fail.
+        when {
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet1.csv"
+                outdir = "results"
+                // Parameters under test: the expected error says these thresholds contain fractions.
+                gm_thresholds = "2,1,0.5"
+                pd_distm = "hamming"
+            }
+        }
+        // The expected stdout message is one string assembled from two concatenated literals.
+        then {
+            assert workflow.failed
+            assert workflow.stdout.contains("ERROR ~ '--pd_distm hamming' is set, but '--gm_thresholds 2,1,0.5' contains fractions."
+                                            + " Please either set '--pd_distm scaled' or remove fractions from distance thresholds.")
+        }
+    }
+
+    test("Test pipeline with single threshold set to 1") {
+        tag "pipeline_thresh_1"
+        // Although the test name does not say so, this test asserts that the workflow fails.
+        when {
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet1.csv"
+                outdir = "results"
+                // Parameter under test: a single threshold.
+                gm_thresholds = "1"
+            }
+        }
+        // Regex search of stdout for the threshold-count error; the message shows the threshold as [1.0].
+        then {
+            assert workflow.failed
+            assert (workflow.stdout =~ /Error \[1.0\]  supplied thresholds do not equal the number of threshold columns in reference_clusters.txt/).find()
+        }
+    }
+
+    test("Test pipeline with threshold set to 1,0") {
+        tag "pipeline_thresh_1.0"
+        // NOTE(review): the tag reads "1.0" while the thresholds under test are "1,0" - confirm this is intended.
+        when {
+            params {
+                input = "$baseDir/tests/data/samplesheets/samplesheet1.csv"
+                outdir = "results"
+                // Parameter under test: two comma-separated thresholds.
+                gm_thresholds = "1,0"
+            }
+        }
+        // Expects failure; regex search of stdout for the threshold-count error, shown as [1.0, 0.0].
+        then {
+            assert workflow.failed
+            assert (workflow.stdout =~ /Error \[1.0, 0.0\]  supplied thresholds do not equal the number of threshold columns in reference_clusters.txt/).find()
+        }
+    }
+}