Merge pull request #840 from Mitchob/master

Added nci_gadi config for proteinfold pipeline and updated documentation
nf-core · Feb 3, 2025 · 7e29297 · 7e29297
2 parents b3f73ea + 5fccaf9
commit 7e29297
Show file tree

Hide file tree

Showing 4 changed files with 124 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -304,6 +304,7 @@ Currently documentation is available for the following pipelines within specific
   - [hasta](docs/pipeline/taxprofiler/hasta.md)
 - proteinfold
   - [CRG](docs/pipeline/proteinfold/crg.md)
+  - [nci_gadi](docs/pipeline/proteinfold/nci_gadi.md)
 
 ### Pipeline-specific documentation
 

diff --git a/conf/pipeline/proteinfold/nci_gadi.config b/conf/pipeline/proteinfold/nci_gadi.config
@@ -0,0 +1,58 @@
+// nf-core/proteinfold pipeline-specific profile for the NCI Gadi HPC (jobs submitted via the 'pbspro' executor)
+profiles {
+    nci_gadi {
+        params {
+            config_profile_description = 'nf-core/proteinfold NCI Gadi HPC profile provided by nf-core/configs'
+            config_profile_contact = 'Mitchell O\'Brien (@mitchob)'
+            config_profile_url = 'https://opus.nci.org.au/display/Help/Gadi+User+Guide'
+            project = System.getenv("PROJECT") // project code; defaults to the PROJECT environment variable
+            storage_account = '' // optional storage string; when blank, one is derived from params.project below
+        }
+
+        // Defaults for every process: executor, project code, storage string, module, caching and staging
+        process {
+            executor = 'pbspro'
+            project = params.project // was System.getenv("PROJECT"); use the param so project and storage share one source
+            storage = params.storage_account?.trim() ? params.storage_account : "scratch/${params.project}+gdata/${params.project}"
+            module = 'singularity'
+            cache = 'lenient'
+            stageInMode = 'symlink'
+
+            // Per-process resources; queue is 'gpuvolta' when params.use_gpu is truthy, otherwise 'normal'
+            withName: 'RUN_ALPHAFOLD2|RUN_ALPHAFOLD2_PRED|RUN_ALPHAFOLD2_MSA' {
+                queue   = params.use_gpu ? 'gpuvolta' : 'normal'
+                cpus    = 48
+                gpus    = 4 // NOTE(review): set even when the queue is 'normal' - confirm this is intended
+                time    = '4h'
+                memory  = 380.GB
+            }
+
+            withName: COLABFOLD_BATCH {
+                container = "nf-core/proteinfold_colabfold:1.1.1"
+                queue     = params.use_gpu ? 'gpuvolta' : 'normal'
+                cpus      = 48
+                gpus      = 4
+                time      = '4h'
+                memory    = 380.GB
+            }
+
+            withName: RUN_ESMFOLD {
+                container = "nf-core/proteinfold_esmfold:1.1.1"
+                queue     = params.use_gpu ? 'gpuvolta' : 'normal'
+                cpus      = 48
+                gpus      = 4
+                time      = '4h'
+                memory    = 380.GB
+            }
+        }
+
+        // Write a timestamped trace file (existing files are not overwritten) with the fields required for SU calculation
+        def trace_timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
+        trace {
+            enabled = true
+            overwrite = false
+            file = "./gadi-nf-core-trace-${trace_timestamp}.txt"
+            fields = 'name,status,exit,duration,realtime,cpus,%cpu,memory,%mem,rss'
+        }
+    }
+}
diff --git a/docs/pipeline/proteinfold/nci_gadi.md b/docs/pipeline/proteinfold/nci_gadi.md
@@ -0,0 +1,62 @@
+# nf-core/configs: NCI Gadi proteinfold specific configuration
+
+Extra specific configuration for the proteinfold pipeline.
+
+## Usage
+
+To use, run the pipeline with `-profile nci_gadi`.
+
+This will download and launch the proteinfold specific [`nci_gadi.config`](../../../conf/pipeline/proteinfold/nci_gadi.config) which has been pre-configured with a setup suitable for the NCI Gadi HPC cluster.
+
+Example: `nextflow run nf-core/proteinfold -profile nci_gadi`
+
+## proteinfold specific configurations for NCI Gadi
+
+Specific configurations for NCI Gadi have been made for proteinfold.
+
+### Project accounting
+
+As described [here](https://github.com/nf-core/configs/blob/master/docs/nci_gadi.md#project-accounting), the config uses the PBS environment variable `$PROJECT` to assign a project code to all task job submissions for billing purposes. If you are a member of multiple Gadi projects, you should confirm which project will be charged for your pipeline execution. You can do this using:
+
+```bash
+echo $PROJECT
+```
+
+The version of Nextflow installed on Gadi has been modified to make it easier to specify resource options for jobs submitted to the cluster. See NCI's [Gadi user guide](https://opus.nci.org.au/display/DAE/Nextflow) for more details. You can manually override the `$PROJECT` specification by editing your local copy of the `nci_gadi.config` and replacing `$PROJECT` with your project code. For example:
+
+```nextflow
+process {
+    project = '<abc>'
+    storage = 'scratch/<abc>+gdata/<abc>'
+    ...
+}
+```
+
+or set the `PROJECT` environment variable manually before launching the pipeline:
+
+```bash
+export PROJECT='<abc>'
+```
+
+### Storage considerations
+
+When running proteinfold on NCI Gadi it is expected that all your data will be contained within the project's `/scratch` and `/g/data` directories, as specified in `$PROJECT`. However, if you are working across multiple project codes, you will need to manually edit this line in the `nci_gadi.config` to reflect this:
+
+```bash
+storage = "scratch/<abc>+gdata/<def>"
+```
+
+Alternatively, you can use the `--storage_account "scratch/abc+gdata/def"` parameter to specify access to storage as required.
+
+### ⚠️ Expected Warnings
+
+When running the pipeline, you may encounter the following warnings:
+
+```
+WARN: The following invalid input values have been detected:
+
+* --storage_account: scratch/abc+gdata/def
+* --project: abc
+```
+
+These warnings can be safely ignored. The parameters are required for job allocations and billing purposes on NCI Gadi, but they do not affect execution.
diff --git a/pipeline/proteinfold.config b/pipeline/proteinfold.config
@@ -15,4 +15,7 @@ profiles {
     unsw_katana {
         includeConfig "${params.custom_config_base}/conf/pipeline/proteinfold/unsw_katana.config"
     }
+    nci_gadi { // NCI Gadi profile: includes the proteinfold-specific config found under params.custom_config_base
+        includeConfig "${params.custom_config_base}/conf/pipeline/proteinfold/nci_gadi.config"
+    }
 }