Merge branch 'v1.7'
Matiss Ozols committed Oct 20, 2024
2 parents 6a9b878 + 42682d2 commit bf69c4d
Showing 43 changed files with 732 additions and 314 deletions.
2 changes: 1 addition & 1 deletion assets/deploy_scripts/module_exacutables/yascp
@@ -11,7 +11,7 @@ normal=$(tput sgr0)

 if [[ "$QUEUE" == '' ]];
 then
-export QUEUE='long'
+export QUEUE='oversubscribed'
 else
 export QUEUE="$QUEUE"
 fi
4 changes: 2 additions & 2 deletions assets/deploy_scripts/sanger_module_files/1.6.1
@@ -6,7 +6,7 @@ set version [file tail [module-info version [module-info name]]]

 proc ModulesHelp { } {
 global version
-puts stderr "YASCP (Yet Another Single Cell (scRNA) Pieline: https://github.com/wtsi-hgi/yascp) is a nextflow pipeline that QCs the scRNA Cellranger data by removing ambient RNA, deconvoluting donors, assigning celltypes, analysing concordances vs expected genotypes. IMPROVEMENTS: 1) Added support for running cellbender with different parameters. 2) Citeseq, VDJ data integrations using Surat integration, 3) You can now only run only the cellbender step 4) Have added additional 5 doublet detection methods 5) Can provide your own Azimuth and Celltypist references 6) You can now only run the doublet detection step"
+puts stderr "YASCP (Yet Another Single Cell (scRNA) Pieline: https://github.com/wtsi-hgi/yascp) is a nextflow pipeline that QCs the scRNA Cellranger data by removing ambient RNA, deconvoluting donors, assigning celltypes, analysing concordances vs expected genotypes. IMPROVEMENTS: 1) Improved pipeline logic and bug fixes"
 puts stderr ""
 puts stderr "Yascp module has been set to run in multiple modes:"
 puts stderr " *yascp -v or yascp -h :will describe the checkout tag used."
@@ -25,7 +25,7 @@ proc ModulesHelp { } {
 }

 module-whatis "Yascp Version: $version"
-module-whatis "Yascp version $version is a single cell (scRNA) processing pipeline that takes care of donor deconvolution, ambient rna removal, celltype assignment, integration, clustering and cluster assesments and data qc: yascp (https://github.com/wtsi-hgi/yascp)"
+#//module-whatis "Yascp version $version is a single cell (scRNA) processing pipeline that takes care of donor deconvolution, ambient rna removal, celltype assignment, integration, clustering and cluster assesments and data qc: yascp (https://github.com/wtsi-hgi/yascp)"


 set install /software/hgi/pipelines/yascp_versions/yascp_v1.6.1
1 change: 1 addition & 0 deletions assets/fake_fileinput.tsv
@@ -0,0 +1 @@
+experiment_id n_pooled donor_vcf_ids data_path_10x_format
18 changes: 8 additions & 10 deletions bin/0026-filter_outlier_cells.py
@@ -500,10 +500,9 @@ def main():
 generate_plots(subset_ad,cell_qc_column,metadata_columns,metadata_columns_original,of)
 del subset_ad
 adata.uns['cell_outlier_estimator'] = method
-adata.obs[['cell_id', cell_qc_column]].to_csv(
-f'{options.of}-outliers_filtered__{cell_qc_column}.tsv.gz',
+adata.obs[['cell_id', cell_qc_column,cell_qc_column_score]].to_csv(
+f'{options.of}-outliers_filtered__{cell_qc_column}.tsv',
 sep='\t',
-compression=compression_opts,
 index=False,
 header=True
 )
@@ -553,21 +552,20 @@
 )
 # Save the final dataframe
 df_cell_filt_per_exp.to_csv(
-f'{options.of}-cell_filtered_per_experiment__{cell_qc_column}.tsv.gz',
+f'{options.of}-cell_filtered_per_experiment__{cell_qc_column}.tsv',
 sep='\t',
-compression=compression_opts,
 index=False,
 header=True
 )

 generate_plots(adata,cell_qc_column,metadata_columns,metadata_columns_original,options.of)


-adata.write(
-'{}.h5ad'.format(options.of),
-compression='gzip',
-compression_opts=options.anndata_compression_opts
-)
+# adata.write(
+# '{}.h5ad'.format(options.of),
+# compression='gzip',
+# compression_opts=options.anndata_compression_opts
+# )



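Note on this change: pandas only gzips to_csv output when a compression option is passed (or inferred from a .gz suffix), so dropping both the .gz extension and the compression= argument is what turns these per-cell QC outputs into plain TSV files. A minimal standalone sketch of the two variants, using hypothetical file names and values rather than the pipeline's:

import pandas as pd

# Toy stand-in for adata.obs with a per-cell outlier call and score (hypothetical values).
df = pd.DataFrame({
    "cell_id": ["AAACCTG-1", "AAACGGG-1"],
    "outlier": [False, True],
    "outlier_score": [0.12, 0.87],
})

# Old-style output: gzip-compressed TSV.
df.to_csv("outliers.tsv.gz", sep="\t", index=False, header=True, compression="gzip")

# New-style output: plain, uncompressed TSV.
df.to_csv("outliers.tsv", sep="\t", index=False, header=True)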
122 changes: 71 additions & 51 deletions bin/DoubletDecon.R
@@ -9,8 +9,11 @@ parser <- ArgumentParser()

 # specify our desired options
 # by default ArgumentParser will add an help option
-parser$add_argument("-o", "--out", required = TRUE, help="The output directory where results will be saved")
-parser$add_argument("-s", "--seurat_object", required = TRUE, type = "character", help = "A QC, normalized seurat object with classifications/clusters as Idents() saved as an rds object.")
+# parser$add_argument("-o", "--out", required = TRUE, help="The output directory where results will be saved")
+# parser$add_argument("-s", "--seurat_object", required = TRUE, type = "character", help = "A QC, normalized seurat object with classifications/clusters as Idents() saved as an rds object.")
+parser$add_argument("-o", "--out", required = FALSE, default = "DoubletDecon_CRD_CMB13631913", help = "The output directory where results will be saved")
+parser$add_argument("-s", "--seurat_object", required = FALSE, type = "character", default = "CRD_CMB13631913.h5ad", help = "A QC, normalized seurat object with classifications/clusters as Idents() saved as an rds object.")
+
 parser$add_argument("-g", "--num_genes", required = FALSE, type = "integer", default=50, help = "Number of genes to use in \'Improved_Seurat_Pre_Process\' function.")
 parser$add_argument("-r", "--rhop", required = FALSE, type="double", default=0.9, help="rhop to use in DoubletDecon - the number of SD from the mean to identify upper limit to blacklist")
 parser$add_argument("-p", "--species", required = FALSE, type = "character", default="hsa", help = "The species of your sample. Can be scientific species name, KEGG ID, three letter species abbreviation, or NCBI ID.")
@@ -64,7 +67,13 @@ all.genes <- rownames(seurat)
 seurat <- ScaleData(seurat, features = all.genes)
 print('Scaled')
 seurat <- FindVariableFeatures(object = seurat)
-seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat))
+# Check the number of variable features
+num_samples <- ncol(seurat)
+# Determine the number of PCs to use
+npcs_to_use <- ifelse(num_samples > 50, 50, num_samples-1)
+
+# Run PCA with the determined number of PCs
+seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat), npcs = npcs_to_use)
 print('PCA performed')
 seurat <- FindNeighbors(seurat, dims = 1:10)
 print('Neighbors found')
@@ -74,8 +83,17 @@ print(seurat[["pca"]], dims = 1:5, nfeatures = 5)
 # seurat <- Read10X('TMP_DIR')
 # seurat_object = CreateSeuratObject(counts = seurat)
 ## Preprocess ##
-processed <- Improved_Seurat_Pre_Process(seurat, num_genes=args$num_genes, write_files=FALSE)
-
+if (num_samples > 50) {
+# Proceed with the regular pipeline
+processed <- Improved_Seurat_Pre_Process(seurat, num_genes = args$num_genes, write_files = FALSE)
+} else {
+message("Skipping clustering due to insufficient cell count.")
+# You can still preprocess without clustering or simply use the normalized data
+seurat <- NormalizeData(seurat)
+seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = args$num_genes)
+seurat <- ScaleData(seurat)
+processed <- seurat # Assign the preprocessed data
+}
 DeconRNASeq = function(datasets, signatures, proportions=NULL, checksig=FALSE, known.prop = FALSE, use.scale = TRUE, fig = TRUE){

 if (is.null(datasets))
@@ -645,66 +663,68 @@ Main_Doublet_Decon<-function(rawDataFile, groupsFile, filename, location, fullDa
 }

 ## Run Doublet Decon ##
+# Define the rhop values to test
+rhop_values <- c(0.9, 0.64, 0.5, 0.4, 0.3, 0.2, 0.1)
+success <- FALSE

-tryCatch({
-print(paste0('trying rhop: ',args$rhop))
-results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile,
-groupsFile = processed$newGroupsFile,
-filename = "DoubletDecon_results",
-location = paste0(args$out, "/"),
-fullDataFile = NULL,
-removeCC = args$removeCC,
-species = args$species,
-rhop = args$rhop,
-write = TRUE,
-PMF = args$pmf,
-useFull = FALSE,
-heatmap = args$heatmap,
-centroids=args$centroids,
-num_doubs=args$num_doubs,
-only50=args$only50,
-min_uniq=args$min_uniq,
-nCores = args$nCores)
-
-}, error = function(e) {
+# Loop over the rhop values
+for (rhop in rhop_values) {
 tryCatch({
-print(paste0('trying rhop: ',0.64))
-results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile,
+print(paste0('trying rhop: ', rhop))
+results <- Main_Doublet_Decon(
+rawDataFile = processed$newExpressionFile,
 groupsFile = processed$newGroupsFile,
 filename = "DoubletDecon_results",
 location = paste0(args$out, "/"),
 fullDataFile = NULL,
 removeCC = args$removeCC,
 species = args$species,
-rhop = 0.64,
+rhop = rhop,
 write = TRUE,
 PMF = args$pmf,
 useFull = FALSE,
 heatmap = args$heatmap,
-centroids=args$centroids,
-num_doubs=args$num_doubs,
-only50=args$only50,
-min_uniq=args$min_uniq,
-nCores = args$nCores)
+centroids = args$centroids,
+num_doubs = args$num_doubs,
+only50 = args$only50,
+min_uniq = args$min_uniq,
+nCores = args$nCores
+)
+success <- TRUE # If no error occurs, set success to TRUE
+break # Exit the loop as we found a working rhop value
+
+# results <- Main_Doublet_Decon(
+# rawDataFile = processed$newExpressionFile,
+# groupsFile = processed$newGroupsFile,
+# filename = "DoubletDecon_results",
+# location = paste0('DoubletDecon_039312-s14-Z0032-CTCTGTATTGCAGAT', "/"),
+# fullDataFile = NULL,
+# removeCC = FALSE,
+# species = 'hsa',
+# rhop = rhop,
+# write = TRUE,
+# PMF = TRUE,
+# useFull = FALSE,
+# heatmap = FALSE,
+# centroids = FALSE,
+# num_doubs = 10,
+# only50 = FALSE,
+# min_uniq = 4,
+# nCores = 1
+# )
+
 }, error = function(e) {
-print(paste0('trying rhop: ',0.5))
-results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile,
-groupsFile = processed$newGroupsFile,
-filename = "DoubletDecon_results",
-location = paste0('.', "/"),
-fullDataFile = NULL,
-removeCC = FALSE,
-species = args$species,
-rhop = 0.5,
-write = TRUE,
-PMF = args$pmf,
-useFull = FALSE,
-heatmap = args$heatmap,
-nCores = 3)
-}
-)
+print(paste0('Error with rhop: ', rhop))
+# The loop will continue to the next rhop value
+})
 }
-)
+
+if (!success) {
+print("All rhop values failed.")
+quit(status = 0)
+} else {
+print("DoubletDecon ran successfully.")
+}


 doublets <- read.table(paste0(args$out, "/Final_doublets_groups_DoubletDecon_results.txt"))
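The nested tryCatch fallbacks (args$rhop, then 0.64, then 0.5) are replaced above by a single loop that walks a list of rhop values and stops at the first one DoubletDecon accepts, exiting cleanly if none work. The same control-flow pattern, sketched in Python with a hypothetical run_doublet_decon callable standing in for Main_Doublet_Decon:

def run_with_fallbacks(run_doublet_decon, rhop_values=(0.9, 0.64, 0.5, 0.4, 0.3, 0.2, 0.1)):
    """Try progressively smaller rhop values; return the first result that succeeds."""
    for rhop in rhop_values:
        try:
            print(f"trying rhop: {rhop}")
            return rhop, run_doublet_decon(rhop=rhop)
        except Exception as err:  # mirror the R script's catch-all error handler
            print(f"Error with rhop: {rhop} ({err}); trying the next value")
    # The R script prints a message and quits with status 0 here so the pipeline run continues.
    raise RuntimeError("All rhop values failed.")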
12 changes: 11 additions & 1 deletion bin/DoubletFinder.R
@@ -62,7 +62,17 @@ all.genes <- rownames(seurat)
 seurat <- ScaleData(seurat, features = all.genes)
 print('Scaled')
 seurat <- FindVariableFeatures(object = seurat)
-seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat))
+# Check the number of variable features
+num_samples <- ncol(seurat)
+if (num_samples < 50) {
+print("No sufficinet number of cells to detect doublets.")
+quit(status = 0)
+}
+# Determine the number of PCs to use
+npcs_to_use <- ifelse(num_samples > 50, 50, num_samples-1)
+
+# Run PCA with the determined number of PCs
+seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat), npcs = npcs_to_use)
 print('PCA performed')
 seurat <- FindNeighbors(seurat, dims = 1:10)
 print('Neighbors found')
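PCA cannot return more components than there are cells, so the patch caps npcs at ncol(seurat) - 1 and skips doublet detection outright for pools with fewer than 50 cells. A sketch of the same guard in an AnnData/scanpy-style workflow (an assumption for illustration; the pipeline's own script is the R code above):

import scanpy as sc

def safe_pca(adata, max_pcs=50, min_cells=50):
    """Run PCA only when there are enough cells, clamping the component count to n_obs - 1."""
    if adata.n_obs < min_cells:
        print("Not enough cells to detect doublets; skipping this pool.")
        return None
    n_comps = min(max_pcs, adata.n_obs - 1)
    sc.pp.pca(adata, n_comps=n_comps)
    return adata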
5 changes: 4 additions & 1 deletion bin/cohort_report.py
@@ -184,7 +184,10 @@ def Generate_Report(GT_MATCH_CONFIDENT,pan):
 GR_PANEL = GT_MATCH[GT_MATCH['final_panel'] == confident_panel]
 GT_CELLINE = GT_MATCH[GT_MATCH['final_panel'] == 'GT_cell_lines']
 for i,row1 in GT_CELLINE.iterrows():
-celline = row1['donor_gt original'].split('_')[1]
+try:
+celline = row1['donor_gt original'].split('_')[1]
+except:
+celline = row1['donor_gt original']
 if celline in row1['Good_ids expected']:
 GT_CELLINE.loc[i,'Match Expected']=True

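The try/except above guards against donor IDs that contain no underscore, where split('_')[1] raises an IndexError and the full ID should be used instead. The same fallback written as an explicit length check (a sketch with hypothetical IDs, not the pipeline's code):

def cell_line_id(donor_gt: str) -> str:
    """Return the token after the first underscore, or the whole ID when there is none."""
    parts = donor_gt.split("_")
    return parts[1] if len(parts) > 1 else donor_gt

# Hypothetical examples covering both input shapes.
assert cell_line_id("GT_HPSI0114i") == "HPSI0114i"
assert cell_line_id("celline9") == "celline9"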
12 changes: 8 additions & 4 deletions bin/combine_concordance.py
@@ -72,10 +72,14 @@
 Swap_Quant = pd.read_csv(sq,sep='\t')

 Swap_Quant = Swap_Quant.set_index('cell')
-Cell_Concordance = Cell_Concordance.set_index('GT 1')
+try:
+Cell_Concordance = Cell_Concordance.set_index('GT 1')

-Joined_Df = Swap_Quant.join(Cell_Concordance,how='inner')
-Joined_Df['pool id']= name
+Joined_Df = Swap_Quant.join(Cell_Concordance,how='inner')
+Joined_Df['pool id']= name
+except:
+print('Too little cells and cell concordances were not calculated.')
+exit()

 try:
 cell_ambientness=pd.read_csv(f'/lustre/scratch123/hgi/projects/cardinal_analysis/qc/{run}/Donor_Quantification/{name}/ambientness_per_cell_{name}.tsv',sep='\t')
@@ -423,4 +427,4 @@ def scatter(fig, ax):
 fig.tight_layout()
 fig.savefig('subplot_sites_vs_concordance.png')
 fig.clf()
-print('Done')
+print('Done')
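In the first hunk above, pools with too few cells produce a concordance table without the expected 'GT 1' column, so set_index('GT 1') fails and the script now reports this and exits instead of crashing. A sketch of the same guard written as an explicit column check (hypothetical frames, not the pipeline's code):

import sys
import pandas as pd

def join_concordance(swap_quant: pd.DataFrame, cell_concordance: pd.DataFrame, pool_id: str) -> pd.DataFrame:
    """Join per-cell swap metrics with concordance results, bailing out when concordance is missing."""
    if "GT 1" not in cell_concordance.columns:
        print("Too few cells: cell concordances were not calculated.")
        sys.exit(0)
    joined = swap_quant.set_index("cell").join(cell_concordance.set_index("GT 1"), how="inner")
    joined["pool id"] = pool_id
    return joined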
9 changes: 6 additions & 3 deletions bin/combine_doublets.py
@@ -22,8 +22,11 @@
 df1 = pd.read_csv(f1, index_col=0, sep='\t')
 all_combo = pd.concat([all_combo, df1], axis=1)
 all_combo.index.name = 'barcodes'
-all_combo['Scrublet_DropletType'] = all_combo['scrublet__predicted_multiplet']
-all_combo.loc[all_combo['Scrublet_DropletType'] == True, 'Scrublet_DropletType'] = 'doublet'
-all_combo.loc[all_combo['Scrublet_DropletType'] == False, 'Scrublet_DropletType'] = 'singlet'
+try:
+all_combo['Scrublet_DropletType'] = all_combo['scrublet__predicted_multiplet']
+all_combo.loc[all_combo['Scrublet_DropletType'] == True, 'Scrublet_DropletType'] = 'doublet'
+all_combo.loc[all_combo['Scrublet_DropletType'] == False, 'Scrublet_DropletType'] = 'singlet'
+except:
+print('Scrublet wast exacuted')
 all_combo.to_csv('all_doublet_results_combined.tsv',sep='\t')
 print('Done')
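The try/except lets the combined table be written even when Scrublet was not run for a pool and the scrublet__predicted_multiplet column is therefore missing. The same idea with an explicit column check and a boolean-to-label map (a sketch, not the pipeline's code):

import pandas as pd

def add_scrublet_droplet_type(all_combo: pd.DataFrame) -> pd.DataFrame:
    """Translate Scrublet's boolean multiplet call into doublet/singlet labels when present."""
    if "scrublet__predicted_multiplet" in all_combo.columns:
        all_combo["Scrublet_DropletType"] = all_combo["scrublet__predicted_multiplet"].map(
            {True: "doublet", False: "singlet"}
        )
    else:
        print("Scrublet was not executed for this pool.")
    return all_combo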
