Skip to content

Commit

Permalink
Merge pull request #11 from phac-nml/expected-clusters
Browse files Browse the repository at this point in the history
Add module to create the expected_clusters file for gas-call
  • Loading branch information
kylacochrane authored Jun 10, 2024
2 parents b1c888a + 6691dea commit 4acdb8c
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 10 deletions.
3 changes: 1 addition & 2 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ params {
max_time = '1.h'

// Input data
input = "${projectDir}/tests/data/samplesheets/samplesheet1.csv"
ref_clusters = 'https://raw.githubusercontent.com/phac-nml/gasnomenclature/dev/tests/data/clusters/expected_clusters.txt'
input = "${projectDir}/assets/samplesheet.csv"
}


Expand Down
45 changes: 45 additions & 0 deletions modules/local/cluster_file/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
 * CLUSTER_FILE: build the expected_clusters.txt input for GAS call.
 *
 * input:  val meta — a list of sample maps, each with at least
 *         'id' (sample identifier) and 'address' (delimited nomenclature code,
 *         e.g. "1.1.2" with params.gm_delimiter == ".").
 * output: expected_clusters.txt — TSV with columns:
 *         id, address, level_1 .. level_N (N = number of address levels).
 * fails:  via error() when the sample list is empty or when samples have
 *         inconsistent numbers of address levels.
 */
process CLUSTER_FILE {
    tag "Create cluster file for GAS call"
    label 'process_single'

    input:
    val meta

    output:
    path("expected_clusters.txt"), emit: text

    exec:
    // Quote the user-configurable delimiter so split() treats it literally
    // (e.g. "." would otherwise be a regex wildcard).
    def delimiter = java.util.regex.Pattern.quote(params.gm_delimiter)

    // Guard against an empty sample list: max() would return null and the
    // old `?: 0` fallback made (1..maxLevels) the DESCENDING Groovy range
    // [1, 0], silently producing a bogus "level_1  level_0" header.
    if (!meta) {
        error ("CLUSTER_FILE received no reference samples; cannot create expected_clusters.txt")
    }

    // Maximum number of address levels sets the header width for this run.
    int maxLevels = meta.collect { sample -> sample.address.split(delimiter).size() }.max()

    // Every sample must have exactly maxLevels levels; abort otherwise.
    meta.each { sample ->
        int level = sample.address.split(delimiter).size()
        if (level != maxLevels) {
            error ("Inconsistent levels found: expected $maxLevels levels but found $level levels in ${sample.id}")
        }
    }

    def outputLines = []

    // Header row: id, address, level_1 .. level_N.
    def header = ["id", "address"] + (1..maxLevels).collect { "level_$it" }
    outputLines << header.join("\t")

    // One data row per sample: id, full address, then each level component.
    // split() already yields Strings, so no per-element conversion is needed.
    meta.each { sample ->
        def levels = sample.address.split(delimiter)
        outputLines << ([sample.id, sample.address] + levels.toList()).join("\t")
    }

    // Write the file directly into the task work directory so Nextflow
    // picks it up as the declared output path.
    task.workDir.resolve("expected_clusters.txt").withWriter { writer ->
        outputLines.each { line ->
            writer.writeLine(line)
        }
    }
}
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ params {

// GAS Call
gm_thresholds = "10,5,0"
gm_delimiter = "'.'" // note the single quotes surrounding the delimiter
gm_delimiter = "."
ref_clusters = ""

}
Expand Down
4 changes: 3 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
},
"gm_delimiter": {
"type": "string",
"default": "\\'.\\'",
"default": ".",
"description": "Delimiter desired for nomenclature code.",
"pattern": "^[A-Fa-f0-9\\._-]+$"
},
"ref_clusters": {
"type": "string"
Expand Down
2 changes: 1 addition & 1 deletion tests/data/called/expected_results.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
id address level_1 level_2 level_3
sample1 1.1.1 1 1 1
sample2 1.1.1 1 1 1
sample3 2.2.2 2 2 2
sample3 1.1.2 1 1 2
sampleQ 1.1.3 1 1 3
2 changes: 1 addition & 1 deletion tests/data/clusters/expected_clusters.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id address level_1 level_2 level_3
sample1 1.1.1 1 1 1
sample2 1.1.1 1 1 1
sample3 2.2.2 2 2 2
sample3 1.1.2 1 1 2
2 changes: 2 additions & 0 deletions tests/data/profiles/expected-profile2.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample_id l1 l2 l3
sampleQ 1 2 1
58 changes: 58 additions & 0 deletions tests/modules/cluster_file/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// nf-test suite for the local CLUSTER_FILE process, which converts a list of
// sample maps (id + nomenclature address) into an expected_clusters.txt TSV.
nextflow_process {
name "Test Process CLUSTER_FILE"
script "modules/local/cluster_file/main.nf"
process "CLUSTER_FILE"

// Happy path: all samples share the same number of address levels (3),
// so the process should succeed and emit a file matching the committed fixture.
test("Test when sample levels are equal") {

when {
process {
"""
input[0] = Channel.of(
[['id':'sample1', 'address':'1.1.1'],
['id':'sample2', 'address':'1.1.1'],
['id':'sample3', 'address':'1.1.2']]
)
"""
}

params {
outdir = "cluster_results"
}
}

then {
assert process.success
assert path("$launchDir/cluster_results").exists()

// Compare the generated expected_clusters.txt against the committed
// fixture byte-for-byte (tests/data/clusters/expected_clusters.txt).
// Check expected_clusters
def actual_clusters = path("$launchDir/cluster_results/cluster/expected_clusters.txt")
def expected_clusters = path("$baseDir/tests/data/clusters/expected_clusters.txt")
assert actual_clusters.text == expected_clusters.text
}
}

// Failure path: sample3 has only 2 address levels while the others have 3;
// the process must abort with the "Inconsistent levels" error message.
test("Test when sample levels are different") {

when {
process {
"""
input[0] = Channel.of(
[['id':'sample1', 'address':'1.1.1'],
['id':'sample2', 'address':'1.1.1'],
['id':'sample3', 'address':'1.2']]
)
"""
}

params {
outdir = "cluster_results"
}
}

then {
assert process.failed
// Error text must match the error() call in modules/local/cluster_file/main.nf.
assert (process.stdout =~ /Inconsistent levels found: expected 3 levels but found 2 levels in sample3/).find()
}
}
}
6 changes: 5 additions & 1 deletion tests/pipelines/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@ nextflow_pipeline {
assert path("$launchDir/results").exists()

// Check merged profiles
// TODO check query profile is merged
def actual_profile_ref = path("$launchDir/results/locidex/merge/reference/merged_ref/merged_profiles_ref.tsv")
def expected_profile_tsv = path("$baseDir/tests/data/profiles/expected-profile1.tsv")
assert actual_profile_ref.text == expected_profile_tsv.text

// Check query profiles
def actual_profile_query = path("$launchDir/results/locidex/merge/query/merged_value/merged_profiles_value.tsv")
def expected_profile_query_tsv = path("$baseDir/tests/data/profiles/expected-profile2.tsv")
assert actual_profile_query.text == expected_profile_query_tsv.text

// Check computed pairwise distances
def actual_distances = path("$launchDir/results/distances/results.text")
def expected_distances = path("$baseDir/tests/data/distances/expected_pairwise_dists.txt")
Expand Down
13 changes: 10 additions & 3 deletions workflows/gas_nomenclature.nf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ include { INPUT_CHECK } from "../modules/local/input_c
include { LOCIDEX_MERGE as LOCIDEX_MERGE_REF } from "../modules/local/locidex/merge/main"
include { LOCIDEX_MERGE as LOCIDEX_MERGE_QUERY } from "../modules/local/locidex/merge/main"
include { PROFILE_DISTS } from "../modules/local/profile_dists/main"
include { CLUSTER_FILE } from "../modules/local/cluster_file/main"
include { GAS_CALL } from "../modules/local/gas/call/main"
include { FILTER_QUERY } from "../modules/local/filter_query/main"

Expand Down Expand Up @@ -131,10 +132,16 @@ workflow GAS_NOMENCLATURE {
columns_file)
ch_versions = ch_versions.mix(distances.versions)

// GAS CALL
clusters = Channel.fromPath(params.ref_clusters, checkIfExists: true)
// Generate the expected_clusters.txt file from the addresses of the provided reference samples
clusters = input.filter { meta, file ->
meta.address != null
}.collect { meta, file ->
meta }

expected_clusters = CLUSTER_FILE(clusters)

called_data = GAS_CALL(clusters, distances.results)
// GAS CALL
called_data = GAS_CALL(expected_clusters.text, distances.results)
ch_versions = ch_versions.mix(called_data.versions)

// Filter the new queried samples and addresses into a CSV/JSON file for the IRIDANext plug in
Expand Down

0 comments on commit 4acdb8c

Please sign in to comment.