Merge pull request #35 from phac-nml/dev
Release 0.3.0: Added database incorporation
sgsutcliffe authored Jan 9, 2025
2 parents 39c8c16 + bc8d4d2 commit bd9c8ce
Showing 61 changed files with 853 additions and 133 deletions.
5 changes: 5 additions & 0 deletions .editorconfig
@@ -30,3 +30,8 @@ indent_style = unset
# ignore python
[*.{py}]
indent_style = unset

# ignore nf-test json file
[tests/data/irida/*.json]
insert_final_newline = unset
trim_trailing_whitespace = unset
23 changes: 19 additions & 4 deletions .github/workflows/linting.yml
@@ -1,6 +1,6 @@
name: nf-core linting
# This workflow is triggered on pushes and PRs to the repository.
# It runs the `nf-core lint` and markdown lint tests to ensure
# It runs the `nf-core pipelines lint` and markdown lint tests to ensure
# that the code meets the nf-core guidelines.
on:
push:
@@ -41,17 +41,32 @@ jobs:
python-version: "3.12"
architecture: "x64"

- name: read .nf-core.yml
uses: pietrobolcato/[email protected]
id: read_yml
with:
config: ${{ github.workspace }}/.nf-core.yml

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install nf-core
pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }}
- name: Run nf-core pipelines lint
if: ${{ github.base_ref != 'master' }}
env:
GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
run: nf-core -l lint_log.txt pipelines lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md

- name: Run nf-core lint
- name: Run nf-core pipelines lint --release
if: ${{ github.base_ref == 'master' }}
env:
GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
run: nf-core -l lint_log.txt lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
run: nf-core -l lint_log.txt pipelines lint --release --dir ${GITHUB_WORKSPACE} --markdown lint_results.md

- name: Save PR number
if: ${{ always() }}
2 changes: 1 addition & 1 deletion .github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download lint results
uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3
uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # v6
with:
workflow: linting.yml
workflow_conclusion: completed
5 changes: 4 additions & 1 deletion .nf-core.yml
@@ -1,6 +1,6 @@
repository_type: pipeline

nf_core_version: "2.14.1"
nf_core_version: "3.0.1"
lint:
files_exist:
- assets/nf-core-gasnomenclature_logo_light.png
@@ -27,6 +27,9 @@ lint:
- custom_config
- manifest.name
- manifest.homePage
- params.max_cpus
- params.max_memory
- params.max_time
readme:
- nextflow_badge

1 change: 1 addition & 0 deletions .prettierignore
@@ -10,3 +10,4 @@ testing/
testing*
*.pyc
bin/
tests/data/irida/sample_name_add_iridanext.output.json
22 changes: 22 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,27 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.3.0] - 2025/01/09

### `Added`

- Enhanced the pipeline to integrate _optional_ user-provided reference profiles and cluster addresses for additional samples [PR #29](https://github.com/phac-nml/gasnomenclature/pull/29):
- Added support for `--db_profiles` via the `APPEND_PROFILES` process
- Added support for `--db_clusters` via the `APPEND_CLUSTERS` process
- Added tests to verify the additional databases can be incorporated and that both databases are required together for their respective processes.

- Added the ability to include a `sample_name` column in the input samplesheet.csv. Allows for compatibility with IRIDA-Next input configuration [PR #30](https://github.com/phac-nml/gasnomenclature/pull/30):
- `sample_name` special characters will be replaced with `"_"`
- If no `sample_name` is supplied, the value in the `sample` column will be used
- To avoid repeated values, all `sample_name` values will be suffixed with the unique `sample` value from the input file
- Updated `gas/call` to version `0.1.2` and both `CLUSTER_FILE` and `APPEND_CLUSTERS` to comply with the latest formatting requirements.

### `Changed`

- Genomic Address Service version [0.1.1](https://pypi.org/project/genomic-address-service/0.1.1/) -> [0.1.3](https://pypi.org/project/genomic-address-service/0.1.3/)

- Refined the format of `reference_cluster.tsv (rclusters)` used by `GAS CALL` to require only `id` and `address` columns. This change involved updates to both the `append_clusters` and `cluster_file` modules.

## [0.2.3] - 2024/09/25

### `Changed`
@@ -45,3 +66,4 @@ Initial release of the Genomic Address Nomenclature pipeline to be used to assig
[0.2.1]: https://github.com/phac-nml/gasnomenclature/releases/tag/0.2.1
[0.2.2]: https://github.com/phac-nml/gasnomenclature/releases/tag/0.2.2
[0.2.3]: https://github.com/phac-nml/gasnomenclature/releases/tag/0.2.3
[0.3.0]: https://github.com/phac-nml/gasnomenclature/releases/tag/0.3.0
35 changes: 34 additions & 1 deletion README.md
@@ -20,6 +20,16 @@ The structure of this file is defined in [assets/schema_input.json](assets/schem

Details on the columns can be found in the [Full samplesheet](docs/usage.md#full-samplesheet) documentation.

## IRIDA-Next Optional Input Configuration

`gasnomenclature` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets, which can contain an additional column: `sample_name`

`sample_name`: An **optional** column that overrides `sample` for outputs (filenames and sample names) and reference assembly identification.

`sample_name` allows more flexibility in naming output files and identifying samples. Unlike `sample`, `sample_name` is not required to contain unique values. Because `Nextflow` requires unique sample names, any repeated `sample_name` will be suffixed with its unique `sample` value. Non-alphanumeric characters (excluding `_`, `-`, `.`) will be replaced with `"_"`.

An [example samplesheet](tests/data/samplesheets/samplesheet-sample_name.csv) has been provided with the pipeline.
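The renaming rules above can be sketched as follows (a hypothetical helper for illustration only — it is not part of the pipeline, and `sanitize`/`resolve_names` are invented names; suffixing only duplicated names follows the description above):

```python
import re
from collections import Counter

def sanitize(name: str) -> str:
    """Replace characters other than letters, digits, '_', '-' and '.' with '_'."""
    return re.sub(r"[^A-Za-z0-9_.\-]", "_", name)

def resolve_names(rows: list[tuple[str, str]]) -> list[str]:
    """rows: (sample, sample_name) pairs; an empty sample_name falls back to sample.

    Duplicated sample_name values are suffixed with the unique sample ID so the
    resulting output names are unique, as described above.
    """
    cleaned = [(sample, sanitize(name) if name else sample) for sample, name in rows]
    counts = Counter(name for _, name in cleaned)
    return [f"{name}_{sample}" if counts[name] > 1 else name for sample, name in cleaned]
```

For example, `resolve_names([("sampleA", "S1"), ("sampleB", "S1")])` would yield `S1_sampleA` and `S1_sampleB`.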

# Parameters

The main parameters are `--input` as defined above and `--output` for specifying the output results directory. You may wish to provide `-profile singularity` to specify the use of singularity containers and `-r [branch]` to specify which GitHub branch you would like to run.
@@ -71,7 +81,30 @@ The following can be used to adjust parameters for the [gas call][] tool.
- `--gm_thresholds`: Thresholds delimited by `,`. Values should match units from `--pd_distm` (either _hamming_ or _scaled_). Please see the [Distance Method and Thresholds](#distance-method-and-thresholds) section for more information.
- `--gm_method`: The linkage method to use for clustering. Value should be one of _single_, _average_, or _complete_.
- `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`.
- `--gm_delimiter`: Delimiter desired for nomenclature code. Must be alphanumeric or one of `._-`. Must be the same delimiter as used in the samplesheet addresses and the cluster address database.
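As a quick illustration of these constraints (hypothetical helpers, not pipeline code — the function names and the numeric-threshold assumption are mine):

```python
import re

def valid_gm_delimiter(d: str) -> bool:
    """A --gm_delimiter must be a single alphanumeric character or one of '.', '_', '-'."""
    return bool(re.fullmatch(r"[A-Za-z0-9._-]", d))

def parse_gm_thresholds(raw: str) -> list[float]:
    """--gm_thresholds is a comma-delimited list of numeric values (hamming or scaled units)."""
    return [float(t) for t in raw.split(",")]
```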
## Optional Profile and Cluster Address Databases (as used by IRIDA-Next)
In addition to the reference samples included in the input samplesheet (which already contain pre-computed cluster addresses), users can incorporate additional pre-computed reference profiles and cluster addresses by providing them as parameterized databases.
Note that any address levels present in the additional databases but absent from the input samplesheet addresses will be disregarded.
- `--db_profiles`: Specifies the path to the database containing pre-merged MLST profiles in tab-separated format. To ensure compatibility, the database structure must adhere to the expected header format corresponding to the samples included in the input samplesheet:
| sample_id | l1 | l2 | ... | ln |
| --------- | --- | --- | --- | --- |
| sampleA | 1 | 1 | ... | 1 |
| sampleB | 1 | 1 | ... | 2 |
| sampleC | 2 | 1 | ... | 1 |
- `--db_clusters`: Specifies the path to the database containing cluster addresses for additional samples in tab-separated format. To ensure compatibility, the database structure must adhere to the expected header format corresponding to the samples included in the input samplesheet:
| id | address |
| ------- | ------- |
| sampleA | 1.1.1 |
| sampleB | 1.1.2 |
| sampleC | 2.1.1 |
_Note: To add additional reference samples to the pipeline, both `--db_profiles` and `--db_clusters` must be provided together, and all `sample_id` values in `--db_profiles` must match the `id` values in `--db_clusters`._
## Other
9 changes: 7 additions & 2 deletions assets/schema_input.json
@@ -1,5 +1,5 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$schema": "https://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/phac-nml/gasnomenclature/main/assets/schema_input.json",
"title": "phac-nml/gasnomenclature pipeline - params.input schema",
"description": "Schema for the file provided with params.input",
@@ -10,10 +10,15 @@
"sample": {
"type": "string",
"pattern": "^\\S+$",
"meta": ["id"],
"meta": ["irida_id"],
"unique": true,
"errorMessage": "Sample name must be provided and cannot contain spaces"
},
"sample_name": {
"type": "string",
"meta": ["id"],
"errorMessage": "Sample name is optional, if provided will replace sample for filenames and outputs"
},
"mlst_alleles": {
"type": "string",
"format": "file-path",
9 changes: 7 additions & 2 deletions conf/iridanext.config
@@ -4,13 +4,18 @@ iridanext {
path = "${params.outdir}/iridanext.output.json.gz"
overwrite = true
files {
idkey = "irida_id"
samples = ["**/input/*_error_report.csv"]
}
metadata {
samples {
keep = [
"address"
]
csv {
path = "**/filter/new_addresses.csv"
idcol = "id"
path = "**/filter/new_addresses.tsv"
sep = "\t"
idcol = 'irida_id'
}
}
}
25 changes: 24 additions & 1 deletion docs/output.md
@@ -6,6 +6,7 @@ This document describes the output produced by the pipeline.

The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.

- append: Contains reference MLST profile and cluster address files if additional databases were provided by the user.
- call: The cluster addresses from the [genomic_address_service](https://github.com/phac-nml/genomic_address_service).
- cluster: The cluster file required by GAS_call.
- distances: Distances between genomes from [profile_dists](https://github.com/phac-nml/profile_dists).
@@ -22,8 +23,10 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

- [Input assure](#input-assure) - Performs a validation check on the samplesheet inputs to ensure that the sampleID precisely matches the MLST JSON key and enforces necessary changes where discrepancies are found.
- [Locidex merge](#locidex-merge) - Merges MLST profile JSON files into a single profiles file for reference and query samples.
- [Append profiles](#append-profiles) - Appends additional MLST profile information to reference samples if provided by the user.
- [Profile dists](#profile-dists) - Computes pairwise distances between genomes using MLST allele differences.
- [Cluster file](#cluster-file) - Generates the expected_clusters.txt file from reference sample addresses for use in GAS_call.
- [Append clusters](#append-clusters) - Appends additional cluster information to reference samples if provided by the user.
- [GAS call](#gas-call) - Generates hierarchical cluster addresses.
- [Filter query](#filter-query) - Filters and generates a csv file containing only the cluster addresses for query samples.
- [IRIDA Next Output](#irida-next-output) - Generates a JSON output file that is compliant with IRIDA Next
@@ -51,6 +54,16 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

</details>

### Append Profiles

<details markdown="1">
<summary>Output files</summary>

- `append/`
- profiles: `profiles_ref.tsv`

</details>

### Profile Dists

<details markdown="1">
@@ -75,6 +88,16 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

</details>

### Append Clusters

<details markdown="1">
<summary>Output files</summary>

- `append/`
- clusters: `reference_clusters.tsv`

</details>

### GAS call

<details markdown="1">
@@ -93,7 +116,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
<summary>Output files</summary>

- `filter/`
- `new_addresses.csv`
- `new_addresses.tsv`

</details>

26 changes: 25 additions & 1 deletion docs/usage.md
@@ -12,7 +12,7 @@ You will need to create a samplesheet with information about the samples you wou
--input '[path to samplesheet file]'
```

### Full samplesheet
### Full Standard Samplesheet

The input samplesheet must contain three columns: `sample`, `mlst_alleles`, `address`. The sample names within a samplesheet should be unique. All other columns will be ignored.

@@ -33,6 +33,28 @@ sampleF,sampleF.mlst.json,

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

### IRIDA-Next Optional Samplesheet Configuration

`gasnomenclature` accepts the [IRIDA-Next](https://github.com/phac-nml/irida-next) format for samplesheets which contain the following columns: `sample`, `sample_name`, `mlst_alleles`, `address`. The sample IDs within a samplesheet should be unique.

A final samplesheet file consisting of mlst_alleles and addresses may look something like the one below:

```csv title="samplesheet.csv"
sample,sample_name,mlst_alleles,address
sampleA,S1,sampleA.mlst.json.gz,1.1.1
sampleQ,S2,sampleQ.mlst.json.gz,2.2.2
sampleF,,sampleF.mlst.json,
```

| Column | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. Samples should be unique within a samplesheet. |
| `sample_name` | Sample name used in outputs (filenames and sample names) |
| `mlst_alleles` | Full path to an MLST JSON file describing the loci/alleles for the sample against some MLST scheme. A way to generate this file is via [locidex]. File can optionally be gzipped and must have the extension ".mlst.json", ".mlst.subtyping.json" (or with an additional ".gz" if gzipped). |
| `address` | Hierarchical clustering address. If left empty for a sample, the pipeline will assign a cluster address. |

An [example samplesheet](tests/data/samplesheets/samplesheet-sample_name.csv) has been provided with the pipeline.

## Running the pipeline

The typical command for running the pipeline is as follows:
@@ -185,3 +207,5 @@ We recommend adding the following line to your environment to limit this (typica
```bash
NXF_OPTS='-Xms1g -Xmx4g'
```

[locidex]: https://github.com/phac-nml/locidex
8 changes: 8 additions & 0 deletions main.nf
@@ -22,7 +22,15 @@ if (params.validate_params) {
validateParameters()
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Ensure both --db_profiles and --db_clusters are provided together
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

if (params.db_profiles as boolean != params.db_clusters as boolean) {
error "Both '--db_profiles' and '--db_clusters' parameters must be provided together."
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
51 changes: 51 additions & 0 deletions modules/local/append_clusters/main.nf
@@ -0,0 +1,51 @@
process APPEND_CLUSTERS {
tag "Append additional clusters from database"
label 'process_single'

container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/csvtk:0.22.0--h9ee0642_1' :
'biocontainers/csvtk:0.22.0--h9ee0642_1' }"

input:
path(initial_clusters)
path(additional_clusters)

output:
path("reference_clusters.tsv")

script:
"""
# Function to get the first address line from the files, handling gzipped files
get_address() {
if [[ "\${1##*.}" == "gz" ]]; then
zcat "\$1" | awk 'NR>1 {print \$2}' | head -n 1
else
awk 'NR>1 {print \$2}' "\$1" | head -n 1
fi
}
# Check that the two files have consistent delimiter splits in the address column
init_splits=\$(get_address "${initial_clusters}" | awk -F '${params.gm_delimiter}' '{print NF}')
add_splits=\$(get_address "${additional_clusters}" | awk -F '${params.gm_delimiter}' '{print NF}')
if [ "\$init_splits" != "\$add_splits" ]; then
echo "Error: Address levels do not match between initial_clusters and --db_clusters."
exit 1
fi
# Add a "source" column to differentiate the reference profiles and additional profiles
csvtk mutate2 -t -n source -e " 'ref' " ${initial_clusters} > reference_clusters_source.tsv
csvtk mutate2 -t -n source -e " 'db' " ${additional_clusters} > additional_clusters_source.tsv
# Combine profiles from both the reference and database into a single file
csvtk concat -t reference_clusters_source.tsv additional_clusters_source.tsv | csvtk sort -t -k id > combined_profiles.tsv
# Calculate the frequency of each sample_id across both sources
csvtk freq -t -f id combined_profiles.tsv > sample_counts.tsv
# For any sample_id that appears in both the reference and database, add a 'db_' prefix to the sample_id from the database
csvtk join -t -f id combined_profiles.tsv sample_counts.tsv | \
csvtk mutate2 -t -n id -e '(\$source == "db" && \$frequency > 1) ? "db_" + \$id : \$id' | \
csvtk cut -t -f id,address > reference_clusters.tsv
"""
}