Merge branch 'v1.7'
Matiss Ozols committed Oct 20, 2024
2 parents 6a9b878 + 42682d2 commit bf69c4d
Showing 43 changed files with 732 additions and 314 deletions.
2 changes: 1 addition & 1 deletion assets/deploy_scripts/module_exacutables/yascp
@@ -11,7 +11,7 @@ normal=$(tput sgr0)

 if [[ "$QUEUE" == '' ]];
 then
-export QUEUE='long'
+export QUEUE='oversubscribed'
 else
 export QUEUE="$QUEUE"
 fi
4 changes: 2 additions & 2 deletions assets/deploy_scripts/sanger_module_files/1.6.1
@@ -6,7 +6,7 @@ set version [file tail [module-info version [module-info name]]]

 proc ModulesHelp { } {
 global version
-puts stderr "YASCP (Yet Another Single Cell (scRNA) Pieline: https://github.com/wtsi-hgi/yascp) is a nextflow pipeline that QCs the scRNA Cellranger data by removing ambient RNA, deconvoluting donors, assigning celltypes, analysing concordances vs expected genotypes. IMPROVEMENTS: 1) Added support for running cellbender with different parameters. 2) Citeseq, VDJ data integrations using Surat integration, 3) You can now only run only the cellbender step 4) Have added additional 5 doublet detection methods 5) Can provide your own Azimuth and Celltypist references 6) You can now only run the doublet detection step"
+puts stderr "YASCP (Yet Another Single Cell (scRNA) Pieline: https://github.com/wtsi-hgi/yascp) is a nextflow pipeline that QCs the scRNA Cellranger data by removing ambient RNA, deconvoluting donors, assigning celltypes, analysing concordances vs expected genotypes. IMPROVEMENTS: 1) Improved pipeline logic and bug fixes"
 puts stderr ""
 puts stderr "Yascp module has been set to run in multiple modes:"
 puts stderr " *yascp -v or yascp -h :will describe the checkout tag used."
@@ -25,7 +25,7 @@ proc ModulesHelp { } {
 }

 module-whatis "Yascp Version: $version"
-module-whatis "Yascp version $version is a single cell (scRNA) processing pipeline that takes care of donor deconvolution, ambient rna removal, celltype assignment, integration, clustering and cluster assesments and data qc: yascp (https://github.com/wtsi-hgi/yascp)"
+#//module-whatis "Yascp version $version is a single cell (scRNA) processing pipeline that takes care of donor deconvolution, ambient rna removal, celltype assignment, integration, clustering and cluster assesments and data qc: yascp (https://github.com/wtsi-hgi/yascp)"


 set install /software/hgi/pipelines/yascp_versions/yascp_v1.6.1
1 change: 1 addition & 0 deletions assets/fake_fileinput.tsv
@@ -0,0 +1 @@
+experiment_id n_pooled donor_vcf_ids data_path_10x_format
18 changes: 8 additions & 10 deletions bin/0026-filter_outlier_cells.py
@@ -500,10 +500,9 @@ def main():
 generate_plots(subset_ad,cell_qc_column,metadata_columns,metadata_columns_original,of)
 del subset_ad
 adata.uns['cell_outlier_estimator'] = method
-adata.obs[['cell_id', cell_qc_column]].to_csv(
-f'{options.of}-outliers_filtered__{cell_qc_column}.tsv.gz',
+adata.obs[['cell_id', cell_qc_column,cell_qc_column_score]].to_csv(
+f'{options.of}-outliers_filtered__{cell_qc_column}.tsv',
 sep='\t',
-compression=compression_opts,
 index=False,
 header=True
 )
@@ -553,21 +552,20 @@
 )
 # Save the final dataframe
 df_cell_filt_per_exp.to_csv(
-f'{options.of}-cell_filtered_per_experiment__{cell_qc_column}.tsv.gz',
+f'{options.of}-cell_filtered_per_experiment__{cell_qc_column}.tsv',
 sep='\t',
-compression=compression_opts,
 index=False,
 header=True
 )

 generate_plots(adata,cell_qc_column,metadata_columns,metadata_columns_original,options.of)


-adata.write(
-'{}.h5ad'.format(options.of),
-compression='gzip',
-compression_opts=options.anndata_compression_opts
-)
+# adata.write(
+# '{}.h5ad'.format(options.of),
+# compression='gzip',
+# compression_opts=options.anndata_compression_opts
+# )



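Note on this change: pandas only gzips to_csv output when a compression option is passed (or inferred from a .gz suffix), so dropping both the .gz extension and the compression= argument is what turns these per-cell QC outputs into plain TSV files. A minimal standalone sketch of the two variants, using hypothetical file names and values rather than the pipeline's:

import pandas as pd

# Toy stand-in for adata.obs with a per-cell outlier call and score (hypothetical values).
df = pd.DataFrame({
    "cell_id": ["AAACCTG-1", "AAACGGG-1"],
    "outlier": [False, True],
    "outlier_score": [0.12, 0.87],
})

# Old-style output: gzip-compressed TSV.
df.to_csv("outliers.tsv.gz", sep="\t", index=False, header=True, compression="gzip")

# New-style output: plain, uncompressed TSV.
df.to_csv("outliers.tsv", sep="\t", index=False, header=True)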
122 changes: 71 additions & 51 deletions bin/DoubletDecon.R
@@ -9,8 +9,11 @@ parser <- ArgumentParser()

 # specify our desired options
 # by default ArgumentParser will add an help option
-parser$add_argument("-o", "--out", required = TRUE, help="The output directory where results will be saved")
-parser$add_argument("-s", "--seurat_object", required = TRUE, type = "character", help = "A QC, normalized seurat object with classifications/clusters as Idents() saved as an rds object.")
+# parser$add_argument("-o", "--out", required = TRUE, help="The output directory where results will be saved")
+# parser$add_argument("-s", "--seurat_object", required = TRUE, type = "character", help = "A QC, normalized seurat object with classifications/clusters as Idents() saved as an rds object.")
+parser$add_argument("-o", "--out", required = FALSE, default = "DoubletDecon_CRD_CMB13631913", help = "The output directory where results will be saved")
+parser$add_argument("-s", "--seurat_object", required = FALSE, type = "character", default = "CRD_CMB13631913.h5ad", help = "A QC, normalized seurat object with classifications/clusters as Idents() saved as an rds object.")
+
 parser$add_argument("-g", "--num_genes", required = FALSE, type = "integer", default=50, help = "Number of genes to use in \'Improved_Seurat_Pre_Process\' function.")
 parser$add_argument("-r", "--rhop", required = FALSE, type="double", default=0.9, help="rhop to use in DoubletDecon - the number of SD from the mean to identify upper limit to blacklist")
 parser$add_argument("-p", "--species", required = FALSE, type = "character", default="hsa", help = "The species of your sample. Can be scientific species name, KEGG ID, three letter species abbreviation, or NCBI ID.")
@@ -64,7 +67,13 @@ all.genes <- rownames(seurat)
 seurat <- ScaleData(seurat, features = all.genes)
 print('Scaled')
 seurat <- FindVariableFeatures(object = seurat)
-seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat))
+# Check the number of variable features
+num_samples <- ncol(seurat)
+# Determine the number of PCs to use
+npcs_to_use <- ifelse(num_samples > 50, 50, num_samples-1)
+
+# Run PCA with the determined number of PCs
+seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat), npcs = npcs_to_use)
 print('PCA performed')
 seurat <- FindNeighbors(seurat, dims = 1:10)
 print('Neighbors found')
@@ -74,8 +83,17 @@ print(seurat[["pca"]], dims = 1:5, nfeatures = 5)
 # seurat <- Read10X('TMP_DIR')
 # seurat_object = CreateSeuratObject(counts = seurat)
 ## Preprocess ##
-processed <- Improved_Seurat_Pre_Process(seurat, num_genes=args$num_genes, write_files=FALSE)
-
+if (num_samples > 50) {
+# Proceed with the regular pipeline
+processed <- Improved_Seurat_Pre_Process(seurat, num_genes = args$num_genes, write_files = FALSE)
+} else {
+message("Skipping clustering due to insufficient cell count.")
+# You can still preprocess without clustering or simply use the normalized data
+seurat <- NormalizeData(seurat)
+seurat <- FindVariableFeatures(seurat, selection.method = "vst", nfeatures = args$num_genes)
+seurat <- ScaleData(seurat)
+processed <- seurat # Assign the preprocessed data
+}
 DeconRNASeq = function(datasets, signatures, proportions=NULL, checksig=FALSE, known.prop = FALSE, use.scale = TRUE, fig = TRUE){

 if (is.null(datasets))
@@ -645,66 +663,68 @@ Main_Doublet_Decon<-function(rawDataFile, groupsFile, filename, location, fullDa
 }

 ## Run Doublet Decon ##
+# Define the rhop values to test
+rhop_values <- c(0.9, 0.64, 0.5, 0.4, 0.3, 0.2, 0.1)
+success <- FALSE

-tryCatch({
-print(paste0('trying rhop: ',args$rhop))
-results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile,
-groupsFile = processed$newGroupsFile,
-filename = "DoubletDecon_results",
-location = paste0(args$out, "/"),
-fullDataFile = NULL,
-removeCC = args$removeCC,
-species = args$species,
-rhop = args$rhop,
-write = TRUE,
-PMF = args$pmf,
-useFull = FALSE,
-heatmap = args$heatmap,
-centroids=args$centroids,
-num_doubs=args$num_doubs,
-only50=args$only50,
-min_uniq=args$min_uniq,
-nCores = args$nCores)
-
-}, error = function(e) {
+# Loop over the rhop values
+for (rhop in rhop_values) {
 tryCatch({
-print(paste0('trying rhop: ',0.64))
-results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile,
+print(paste0('trying rhop: ', rhop))
+results <- Main_Doublet_Decon(
+rawDataFile = processed$newExpressionFile,
 groupsFile = processed$newGroupsFile,
 filename = "DoubletDecon_results",
 location = paste0(args$out, "/"),
 fullDataFile = NULL,
 removeCC = args$removeCC,
 species = args$species,
-rhop = 0.64,
+rhop = rhop,
 write = TRUE,
 PMF = args$pmf,
 useFull = FALSE,
 heatmap = args$heatmap,
-centroids=args$centroids,
-num_doubs=args$num_doubs,
-only50=args$only50,
-min_uniq=args$min_uniq,
-nCores = args$nCores)
+centroids = args$centroids,
+num_doubs = args$num_doubs,
+only50 = args$only50,
+min_uniq = args$min_uniq,
+nCores = args$nCores
+)
+success <- TRUE # If no error occurs, set success to TRUE
+break # Exit the loop as we found a working rhop value
+
+# results <- Main_Doublet_Decon(
+# rawDataFile = processed$newExpressionFile,
+# groupsFile = processed$newGroupsFile,
+# filename = "DoubletDecon_results",
+# location = paste0('DoubletDecon_039312-s14-Z0032-CTCTGTATTGCAGAT', "/"),
+# fullDataFile = NULL,
+# removeCC = FALSE,
+# species = 'hsa',
+# rhop = rhop,
+# write = TRUE,
+# PMF = TRUE,
+# useFull = FALSE,
+# heatmap = FALSE,
+# centroids = FALSE,
+# num_doubs = 10,
+# only50 = FALSE,
+# min_uniq = 4,
+# nCores = 1
+# )
+
 }, error = function(e) {
-print(paste0('trying rhop: ',0.5))
-results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile,
-groupsFile = processed$newGroupsFile,
-filename = "DoubletDecon_results",
-location = paste0('.', "/"),
-fullDataFile = NULL,
-removeCC = FALSE,
-species = args$species,
-rhop = 0.5,
-write = TRUE,
-PMF = args$pmf,
-useFull = FALSE,
-heatmap = args$heatmap,
-nCores = 3)
-}
-)
+print(paste0('Error with rhop: ', rhop))
+# The loop will continue to the next rhop value
+})
 }
-)
+
+if (!success) {
+print("All rhop values failed.")
+quit(status = 0)
+} else {
+print("DoubletDecon ran successfully.")
+}


 doublets <- read.table(paste0(args$out, "/Final_doublets_groups_DoubletDecon_results.txt"))
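The nested tryCatch fallbacks (args$rhop, then 0.64, then 0.5) are replaced above by a single loop that walks a list of rhop values and stops at the first one DoubletDecon accepts, exiting cleanly if none work. The same control-flow pattern, sketched in Python with a hypothetical run_doublet_decon callable standing in for Main_Doublet_Decon:

def run_with_fallbacks(run_doublet_decon, rhop_values=(0.9, 0.64, 0.5, 0.4, 0.3, 0.2, 0.1)):
    """Try progressively smaller rhop values; return the first result that succeeds."""
    for rhop in rhop_values:
        try:
            print(f"trying rhop: {rhop}")
            return rhop, run_doublet_decon(rhop=rhop)
        except Exception as err:  # mirror the R script's catch-all error handler
            print(f"Error with rhop: {rhop} ({err}); trying the next value")
    # The R script prints a message and quits with status 0 here so the pipeline run continues.
    raise RuntimeError("All rhop values failed.")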
12 changes: 11 additions & 1 deletion bin/DoubletFinder.R
@@ -62,7 +62,17 @@ all.genes <- rownames(seurat)
 seurat <- ScaleData(seurat, features = all.genes)
 print('Scaled')
 seurat <- FindVariableFeatures(object = seurat)
-seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat))
+# Check the number of variable features
+num_samples <- ncol(seurat)
+if (num_samples < 50) {
+print("No sufficinet number of cells to detect doublets.")
+quit(status = 0)
+}
+# Determine the number of PCs to use
+npcs_to_use <- ifelse(num_samples > 50, 50, num_samples-1)
+
+# Run PCA with the determined number of PCs
+seurat <- RunPCA(seurat, features = VariableFeatures(object = seurat), npcs = npcs_to_use)
 print('PCA performed')
 seurat <- FindNeighbors(seurat, dims = 1:10)
 print('Neighbors found')
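PCA cannot return more components than there are cells, so the patch caps npcs at ncol(seurat) - 1 and skips doublet detection outright for pools with fewer than 50 cells. A sketch of the same guard in an AnnData/scanpy-style workflow (an assumption for illustration; the pipeline's own script is the R code above):

import scanpy as sc

def safe_pca(adata, max_pcs=50, min_cells=50):
    """Run PCA only when there are enough cells, clamping the component count to n_obs - 1."""
    if adata.n_obs < min_cells:
        print("Not enough cells to detect doublets; skipping this pool.")
        return None
    n_comps = min(max_pcs, adata.n_obs - 1)
    sc.pp.pca(adata, n_comps=n_comps)
    return adata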
5 changes: 4 additions & 1 deletion bin/cohort_report.py
@@ -184,7 +184,10 @@ def Generate_Report(GT_MATCH_CONFIDENT,pan):
 GR_PANEL = GT_MATCH[GT_MATCH['final_panel'] == confident_panel]
 GT_CELLINE = GT_MATCH[GT_MATCH['final_panel'] == 'GT_cell_lines']
 for i,row1 in GT_CELLINE.iterrows():
-celline = row1['donor_gt original'].split('_')[1]
+try:
+celline = row1['donor_gt original'].split('_')[1]
+except:
+celline = row1['donor_gt original']
 if celline in row1['Good_ids expected']:
 GT_CELLINE.loc[i,'Match Expected']=True

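The try/except above guards against donor IDs that contain no underscore, where split('_')[1] raises an IndexError and the full ID should be used instead. The same fallback written as an explicit length check (a sketch with hypothetical IDs, not the pipeline's code):

def cell_line_id(donor_gt: str) -> str:
    """Return the token after the first underscore, or the whole ID when there is none."""
    parts = donor_gt.split("_")
    return parts[1] if len(parts) > 1 else donor_gt

# Hypothetical examples covering both input shapes.
assert cell_line_id("GT_HPSI0114i") == "HPSI0114i"
assert cell_line_id("celline9") == "celline9"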
12 changes: 8 additions & 4 deletions bin/combine_concordance.py
@@ -72,10 +72,14 @@
 Swap_Quant = pd.read_csv(sq,sep='\t')

 Swap_Quant = Swap_Quant.set_index('cell')
-Cell_Concordance = Cell_Concordance.set_index('GT 1')
+try:
+Cell_Concordance = Cell_Concordance.set_index('GT 1')

-Joined_Df = Swap_Quant.join(Cell_Concordance,how='inner')
-Joined_Df['pool id']= name
+Joined_Df = Swap_Quant.join(Cell_Concordance,how='inner')
+Joined_Df['pool id']= name
+except:
+print('Too little cells and cell concordances were not calculated.')
+exit()

 try:
 cell_ambientness=pd.read_csv(f'/lustre/scratch123/hgi/projects/cardinal_analysis/qc/{run}/Donor_Quantification/{name}/ambientness_per_cell_{name}.tsv',sep='\t')
@@ -423,4 +427,4 @@ def scatter(fig, ax):
 fig.tight_layout()
 fig.savefig('subplot_sites_vs_concordance.png')
 fig.clf()
-print('Done')
+print('Done')
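In the first hunk above, pools with too few cells produce a concordance table without the expected 'GT 1' column, so set_index('GT 1') fails and the script now reports this and exits instead of crashing. A sketch of the same guard written as an explicit column check (hypothetical frames, not the pipeline's code):

import sys
import pandas as pd

def join_concordance(swap_quant: pd.DataFrame, cell_concordance: pd.DataFrame, pool_id: str) -> pd.DataFrame:
    """Join per-cell swap metrics with concordance results, bailing out when concordance is missing."""
    if "GT 1" not in cell_concordance.columns:
        print("Too few cells: cell concordances were not calculated.")
        sys.exit(0)
    joined = swap_quant.set_index("cell").join(cell_concordance.set_index("GT 1"), how="inner")
    joined["pool id"] = pool_id
    return joined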
9 changes: 6 additions & 3 deletions bin/combine_doublets.py
@@ -22,8 +22,11 @@
 df1 = pd.read_csv(f1, index_col=0, sep='\t')
 all_combo = pd.concat([all_combo, df1], axis=1)
 all_combo.index.name = 'barcodes'
-all_combo['Scrublet_DropletType'] = all_combo['scrublet__predicted_multiplet']
-all_combo.loc[all_combo['Scrublet_DropletType'] == True, 'Scrublet_DropletType'] = 'doublet'
-all_combo.loc[all_combo['Scrublet_DropletType'] == False, 'Scrublet_DropletType'] = 'singlet'
+try:
+all_combo['Scrublet_DropletType'] = all_combo['scrublet__predicted_multiplet']
+all_combo.loc[all_combo['Scrublet_DropletType'] == True, 'Scrublet_DropletType'] = 'doublet'
+all_combo.loc[all_combo['Scrublet_DropletType'] == False, 'Scrublet_DropletType'] = 'singlet'
+except:
+print('Scrublet wast exacuted')
 all_combo.to_csv('all_doublet_results_combined.tsv',sep='\t')
 print('Done')
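The try/except lets the combined table be written even when Scrublet was not run for a pool and the scrublet__predicted_multiplet column is therefore missing. The same idea with an explicit column check and a boolean-to-label map (a sketch, not the pipeline's code):

import pandas as pd

def add_scrublet_droplet_type(all_combo: pd.DataFrame) -> pd.DataFrame:
    """Translate Scrublet's boolean multiplet call into doublet/singlet labels when present."""
    if "scrublet__predicted_multiplet" in all_combo.columns:
        all_combo["Scrublet_DropletType"] = all_combo["scrublet__predicted_multiplet"].map(
            {True: "doublet", False: "singlet"}
        )
    else:
        print("Scrublet was not executed for this pool.")
    return all_combo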
