Merge pull request #92 from UPHL-BioNGS/update20230717

Update 20230717
UPHL-BioNGS · Jul 18, 2023 · 8de2193
2 parents 986b26d + 8ec00cb
commit 8de2193
Show file tree

Hide file tree

Showing 6 changed files with 28 additions and 31 deletions.
diff --git a/bin/HeatCluster.py b/bin/HeatCluster.py
@@ -11,23 +11,21 @@
 import scipy.cluster.hierarchy as sch
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
-from io import StringIO
 
-# Read the SNP matrix file
-with open("snp_matrix.txt", "r") as infile:
-    lines = infile.readlines()
-numSamples = len(lines) -1 #counts data lines
-
-# Remove 'snp-dists 0.8.2', '_contigs' and '_genomic', & replace commas with tabs
-cleaned_lines = [line.replace('snp-dists 0.8.2\t', '').replace('snp-dists 0.8.2,', '').
-                 replace(",", "\t").replace('_contigs', '').replace('_genomic', '').replace("^\t", '')
-                 for line in lines]
+tabs   = pd.read_csv("snp_matrix.txt", nrows=1, sep='\t').shape[1]
+commas = pd.read_csv("snp_matrix.txt", nrows=1, sep=',').shape[1]
+if tabs > commas:
+    df = pd.read_csv("snp_matrix.txt", sep='\t', index_col=0)
+else:
+    df = pd.read_csv("snp_matrix.txt", sep=',', index_col=0)
 
-# Combine the cleaned lines into a single string instead of a file
-snp_matrix_string = "\n".join(cleaned_lines)
+print("Found ", len(df.columns), " samples in snp_matrix.txt")
 
-# Read the tab-delimited string into a DataFrame
-df = pd.read_csv(StringIO(snp_matrix_string), sep='\t')
+if len(df.columns) <= 2:
+    print("This matrix has too few samples or has been melted. Sorry!")
+    exit(0)
+else:
+    numSamples = len(df.columns)
 
 #Define colormap for heatmap
 cmap = 'Reds_r'
@@ -45,14 +43,10 @@
 sorted_cluster_matrix=sorted_cluster_matrix.reindex(columns=sorted_cluster_matrix.index)
 
 #Change output figure size tuple based on number of samples
-if   (numSamples <= 20): 
+if (numSamples <= 20): 
     figureSize = (10, 8)
-elif (numSamples <= 40): 
-    figureSize = (20, 16)
-elif (numSamples <= 60): 
-    figureSize = (30, 24)
-else: 
-    figureSize = (40, 32)
+else:
+    figureSize = (round(numSamples / 2,0) , round(numSamples / 2.5,0))
 print("\n\nNumber of samples: ", numSamples,"\nFigure size: ", figureSize)
 
 # Compute clusters
@@ -142,4 +136,4 @@
 plt.show()
 plt.close()
 
-print("Saved heatmap as Heatmap.{pdf,png}")
+print("Saved heatmap as SNP_matrix.{pdf,png}")
diff --git a/modules/grandeur.nf b/modules/grandeur.nf
@@ -381,9 +381,9 @@ process snp_matrix_heatmap {
   tuple file(snp_matrix), file(script)
 
   output:
-  path "snp-dists/SNP_matrix*"
-  path "snp-dists/SNP_matrix_mqc.png"                              , emit: for_multiqc
-  path "logs/${task.process}/snp_matrix.${workflow.sessionId}.log" , emit: log_files
+  path "snp-dists/SNP_matrix*", optional : true
+  path "snp-dists/SNP_matrix_mqc.png", optional : true, emit: for_multiqc
+  path "logs/${task.process}/snp_matrix.${workflow.sessionId}.log", emit: log_files
 
   shell:
   '''

diff --git a/modules/mlst.nf b/modules/mlst.nf
@@ -1,7 +1,7 @@
 process mlst {
   tag           "${sample}"
   publishDir    params.outdir, mode: 'copy'
-  container     'staphb/mlst:2.23.0'
+  container     'staphb/mlst:2.23.0-2023-07'
   maxForks      10
   //#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
   //#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'

diff --git a/modules/quast.nf b/modules/quast.nf
@@ -39,8 +39,11 @@ process quast {
 
     if [ -f "quast/!{sample}/report.tsv" ] ; then cp quast/!{sample}/report.tsv quast/!{sample}_quast_report.tsv ; fi
 
-    head -n 1 quast/!{sample}/transposed_report.tsv | awk '{print "sample\\t" $0 }' > quast/!{sample}/transposed_report.tsv.tmp
-    tail -n 1 quast/!{sample}/transposed_report.tsv | awk -v sample=!{sample} '{print sample "\\t" $0}' >> quast/!{sample}/transposed_report.tsv.tmp
-    mv quast/!{sample}/transposed_report.tsv.tmp quast/!{sample}/transposed_report.tsv
+    if [ -f "quast/!{sample}/transposed_report.tsv" ]
+    then
+      head -n 1 quast/!{sample}/transposed_report.tsv | awk '{print "sample\\t" $0 }' > quast/!{sample}/transposed_report.tsv.tmp
+      tail -n 1 quast/!{sample}/transposed_report.tsv | awk -v sample=!{sample} '{print sample "\\t" $0}' >> quast/!{sample}/transposed_report.tsv.tmp
+      mv quast/!{sample}/transposed_report.tsv.tmp quast/!{sample}/transposed_report.tsv
+    fi
   '''
 }
diff --git a/modules/shigatyper.nf b/modules/shigatyper.nf
@@ -2,7 +2,7 @@ process shigatyper {
   tag           "${sample}"
   label         "medcpus"
   publishDir    params.outdir, mode: 'copy'
-  container     'staphb/shigatyper:2.0.3'
+  container     'staphb/shigatyper:2.0.5'
   stageInMode   'copy'
   maxForks      10
   //#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}

diff --git a/nextflow.config b/nextflow.config
@@ -3,7 +3,7 @@ manifest {
   author                          = 'Erin Young'
   homePage                        = 'https://github.com/UPHL-BioNGS/Grandeur'
   mainScript                      = 'grandeur.nf'
-  version                         = '3.2.20230711'
+  version                         = '3.2.20230718'
   defaultBranch                   = 'main'
   description                     = 'Grandeur is short-read de novo assembly pipeline with serotyping.'
 }