Commit

ensuring all in place

Matiss Ozols committed May 24, 2024
1 parent 9a45fea commit ac9e8c6
Showing 5 changed files with 73 additions and 67 deletions.
2 changes: 1 addition & 1 deletion bin/0035-scanpy_normalize_pca.py
@@ -543,7 +543,7 @@ def scanpy_normalize_and_pca(
copy=False
)


print('---- Writing files ----')
# Keep a record of the different gene scores
if score_genes_df is not None:
adata.uns['df_score_genes'] = score_genes_df_updated
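For context on the hunk above: pandas DataFrames stored in .uns survive the .h5ad round trip, which is what lets the gene-score table travel with the object. A minimal, self-contained sketch (file name hypothetical):

import anndata as ad
import numpy as np
import pandas as pd

adata = ad.AnnData(np.zeros((3, 2), dtype=np.float32))
adata.uns['df_score_genes'] = pd.DataFrame({'gene': ['A', 'B'], 'score': [0.1, 0.2]})
adata.write('example.h5ad')  # hypothetical output name
assert 'df_score_genes' in ad.read_h5ad('example.h5ad').uns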
5 changes: 5 additions & 0 deletions bin/2.integrate.R
@@ -169,6 +169,11 @@ for(f in cite_files){

# Add the vireo donor assignment to the seurat object
this_donor_cells <- donor_cells[donor_cells$sample==sample_id,]

if (dim(this_donor_cells)[1]==0){
# Here we skip non-deconvoluted samples
next
}
sobj@meta.data$donor.vireo <- this_donor_cells[match(paste0(rownames(sobj@meta.data),'_',sample_name),
paste0(this_donor_cells$cell,
'_',this_donor_cells$sample)),]$matched.donor
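The hunk above guards against pools with no vireo assignments before matching donors to cells on a cell_sample key. The same lookup as a minimal pandas sketch (toy frames and column names mirror the R above; all data hypothetical):

import pandas as pd

# toy stand-ins for donor_cells and sobj@meta.data in the R above
donor_cells = pd.DataFrame({'cell': ['AAAC-1', 'GGGT-1'],
                            'sample': ['s1', 's1'],
                            'matched.donor': ['donor0', 'donor1']})
meta = pd.DataFrame(index=['AAAC-1', 'GGGT-1', 'TTTG-1'])
sample_id = sample_name = 's1'

this_donor_cells = donor_cells[donor_cells['sample'] == sample_id]
if not this_donor_cells.empty:  # skip non-deconvoluted samples, as in the R guard
    key = this_donor_cells['cell'] + '_' + this_donor_cells['sample']
    lookup = dict(zip(key, this_donor_cells['matched.donor']))
    # TTTG-1 has no assignment and gets None, like the NA produced by match() in R
    meta['donor.vireo'] = [lookup.get(f'{bc}_{sample_name}') for bc in meta.index]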
17 changes: 10 additions & 7 deletions bin/add_adt.R
@@ -24,7 +24,7 @@ if (future::supportsMulticore()) {
} else {
future::plan(future::multisession)
}
# args=vector(mode='list', length=6); args[[1]]='cellranger700_multi_abb2ba0911f1f4bde982a4c38d9fd6ed'; args[[2]]='raw_feature_bc_matrix'; args[[3]]='filtered_feature_bc_matrix'; args[[4]]='cellranger700_multi_abb2ba0911f1f4bde982a4c38d9fd6ed___sample_QCd_adata.h5ad'
# args=vector(mode='list', length=6); args[[1]]='LDP58'; args[[2]]='raw_feature_bc_matrix'; args[[3]]='filtered_feature_bc_matrix'; args[[4]]='LDP58___sample_QCd_adata.h5ad'

#####
args = commandArgs(trailingOnly=TRUE)
@@ -47,13 +47,13 @@ outdir <- getwd()
# filtered_feature_file = cellranger_filepath = args[2]
#
# filtered_cellranger = '/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/jaguar_yascp/nieks_pipeline/fetch/results_old/cellranger_data/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/per_sample_outs/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/count/sample_filtered_feature_bc_matrix.h5'
# sample_name <- 'cellranger700_multi_74b30caec2cf83c0048bc87946b301e8'
# cellranger_rawfile_path <- 'raw_feature_bc_matrix'
# filtered_cellranger = 'filtered_feature_bc_matrix'
# file_with_qc_applied = 'cellranger700_multi_74b30caec2cf83c0048bc87946b301e8___sample_QCd_adata.h5ad'
sample_name <- 'LDP58'
cellranger_rawfile_path <- 'raw_feature_bc_matrix'
filtered_cellranger = 'filtered_feature_bc_matrix'
file_with_qc_applied = 'LDP58___sample_QCd_adata.h5ad'
sample_name <- args[1]
cellranger_rawfile_path <- args[2]
filtered_cellranger = args[3]
filtered_cellranger = args[3][1]
file_with_qc_applied = args[4]
# cellranger_rawfile_path = '/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/jaguar_yascp/nieks_pipeline/fetch/results_old/cellranger_data/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/multi/count/raw_feature_bc_matrix.h5'

@@ -151,7 +151,9 @@ Convert(
# qced_cells <- readRDS(scpred_file_with_qc)
qced_cells <- LoadH5Seurat(paste('tmp',"h5seurat",sep='.'),assays = "RNA")
qced_barcodes <- gsub('_.*','',colnames(qced_cells))
qced_barcodes <- gsub('-cellranger.*','',colnames(qced_cells))
qced_barcodes <- gsub('-1.*','-1',(qced_barcodes))
qced_barcodes <- gsub('-2.*','-2',(qced_barcodes))
qced_barcodes <- gsub('-cellranger.*','',(qced_barcodes))
qced_cells = RenameCells(qced_cells, new.names = qced_barcodes)

genes <- read.table(paste0(cellranger_rawfile_path,"/features.tsv.gz"), sep = "\t", col.names = c("ENSG_ID","Gene_ID", "FeatureType"))
@@ -183,6 +185,7 @@ Convert(
# }else if(basename(f)=='sample_feature_bc_matrix.h5' || basename(f) == 'sample_filtered_feature_bc_matrix.h5'){
# with cellranger multi the raw file is 3 directories higher than filtered file, then in multi/count
rna = raw <- Read10X(cellranger_rawfile_path)
print(paste0(' RAW data loaded '))
# }else{
# stop(paste('basename(f) should be sample_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5, but was:',basename(f)))
# }
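The new gsub chain above normalizes QC'd cell names back to raw 10x barcodes: strip any '_'-suffix, truncate after the -1/-2 lane tag, then drop '-cellranger...' run tags. The same sequence as a Python regex sketch (example barcode hypothetical; order matters, as in the R):

import re

def normalize_barcode(bc: str) -> str:
    bc = re.sub(r'_.*', '', bc)             # drop '_<sample>' suffix
    bc = re.sub(r'-1.*', '-1', bc)          # truncate after lane tag -1
    bc = re.sub(r'-2.*', '-2', bc)          # ... or -2
    bc = re.sub(r'-cellranger.*', '', bc)   # drop '-cellranger...' run tags
    return bc

print(normalize_barcode('ACGTACGT-1-cellranger700_multi_abc'))  # -> 'ACGTACGT-1'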
86 changes: 29 additions & 57 deletions bin/gather_minimal_dataset.py
@@ -61,8 +61,8 @@
'cell_passes_qc:score':'qc.filter.pass:score',
'cell_passes_qc-per:all_together::exclude':'qc.filter.pass.spikein_exclude',
'cell_passes_qc-per:all_together::exclude:score':'qc.filter.pass.spikein_exclude:score',
'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2':'qc.filter.pass.AZ:L0',
'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score',
'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2':'qc.filter.pass.AZ:L0',
'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score',
'total_counts': 'qc.umi.count.total',
'total_counts_gene_group__mito_transcript': 'qc.umi.count.mt',
'pct_counts_gene_group__mito_transcript': 'qc.umi.perc.mt',
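The corrected keys above ('...L0_Azimuth:predicted...' to '...L0_predicted...') have to match the obs column names exactly, because dict-based renames silently skip missing keys. A toy sketch of how such a map is applied (frame contents hypothetical):

import pandas as pd

COLUMNS_QC_SUBSET = {'total_counts': 'qc.umi.count.total'}  # one entry from the map above
obs = pd.DataFrame({'total_counts': [5124, 2310]})
obs = obs.rename(columns=COLUMNS_QC_SUBSET)
print(obs.columns.tolist())  # ['qc.umi.count.total']; unmatched keys are ignored silently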
@@ -399,8 +399,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
#Cellranger datasets
######################
columns_output = {**COLUMNS_DATASET, **COLUMNS_DECONV, **COLUMNS_QC}
#Unfiltered
compression_opts = 'gzip'
#Reading unfiltered raw cellranger datasets
try:
adata_cellranger_raw = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/raw_feature_bc_matrix")
try:
@@ -417,6 +416,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
except:
print('File already linked')

# Reading filtered cellranger files
try:
adata_cellranger_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix")
try:
@@ -441,24 +441,18 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
df_total_counts['barcodes'] = df_total_counts.index
df_total_counts_cellranger_raw = df_total_counts
df_total_counts_cellranger_raw['dataset']='Cellranger Raw'
wd = os.getcwd()

if df_cellbender is not None and (len(df_cellbender)!=0):
df_cellbender = df_cellbender.reset_index()
df_cellbender = df_cellbender.drop_duplicates(subset=['experiment_id'])
df_cellbender = df_cellbender.set_index('experiment_id')
f=df_cellbender.loc[expid, 'data_path_10x_format']
if (type(f) == str):
print('yes')
f=[f]
for id in f:
print(id)
try:
cell_bender_path = id
# try:
# cell_bender_path = f'{wd}/'+'/'.join(cell_bender_path.split('/')[-6:])
# except:
# cell_bender_path = f"{args.results_dir}/{df_cellbender.loc[expid, 'data_path_10x_format']}"
cellbender_h5 = f"{cell_bender_path}/../cellbender_FPR_{Resolution}_filtered.h5"
ad_lane_filtered = scanpy.read_10x_mtx(cell_bender_path)
print('loaded')
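On the type check above: df.loc[expid, 'data_path_10x_format'] returns a plain string when the index is unique but a Series when expid is duplicated, so wrapping the scalar keeps the loop uniform. A sketch (paths hypothetical):

import pandas as pd

df = pd.DataFrame({'data_path_10x_format': ['/path/a', '/path/b']},
                  index=['exp1', 'exp1'])          # duplicated experiment_id
f = df.loc['exp1', 'data_path_10x_format']         # Series here, str if unique
if isinstance(f, str):
    f = [f]                                        # normalise to an iterable
for cell_bender_path in f:
    print(cell_bender_path)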
@@ -471,30 +465,19 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
try:
path2 = os.path.realpath(cellbender_h5)
os.symlink(path2, f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5")
# Here link also mtx files
except:
print('File already linked')
dfcb = fetch_cellbender_annotation(cell_bender_path, expid,Resolution)
columns_output = {**columns_output, **COLUMNS_CELLBENDER}
else:
ad_lane_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix")
df_cellbender=None
cell_bender_path=None


# Removing zero-count cells from the matrices
zero_count_cells = ad_lane_filtered.obs_names[np.where(ad_lane_filtered.X.sum(axis=1) == 0)[0]]
ad_lane_filtered = ad_lane_filtered[ad_lane_filtered.obs_names.difference(zero_count_cells, sort=False)]


zero_count_cells = adata_cellranger_filtered.obs_names[np.where(adata_cellranger_filtered.X.sum(axis=1) == 0)[0]]
adata_cellranger_filtered = adata_cellranger_filtered[adata_cellranger_filtered.obs_names.difference(zero_count_cells, sort=False)]
# adata_cellranger_filtered=ad_lane_filtered
# scanpy.pp.calculate_qc_metrics(adata_cellranger_filtered, inplace=True)
# df_total_counts = pd.DataFrame(data= adata_cellranger_filtered.obs.sort_values(by=['total_counts'], ascending=False).total_counts)
# df_total_counts['barcodes'] = df_total_counts.index
# df_total_counts['barcode_row_number'] = df_total_counts.reset_index().index + 1
# df_total_counts_cellranger_filtered = df_total_counts
# df_total_counts_cellranger_filtered['dataset'] = 'Cellranger Filtered'

#############
#Cellranger Metrics Datasheet
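The zero-count filter above computes row sums on the matrix and drops empty barcodes while preserving obs order. An equivalent mask-based sketch on a toy AnnData (data hypothetical):

import anndata as ad
import numpy as np
from scipy import sparse

adata = ad.AnnData(sparse.csr_matrix(np.array([[1, 0], [0, 0], [2, 3]], dtype=np.float32)))
keep = np.asarray(adata.X.sum(axis=1)).ravel() > 0
adata = adata[keep].copy()   # the all-zero barcode is dropped
print(adata.n_obs)           # 2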
@@ -513,6 +496,15 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
##########################
# Scrublet
#########################
doublet_data = glob.glob(f'{args.results_dir}/doublets/*.tsv')
doublet_data_combined = pd.DataFrame()
for f1 in doublet_data:
print(f1)
pool_name = f1.split('__')[0].split('/')[-1]
d2 = pd.read_csv(f1,sep='\t')
d2['Exp']=pool_name
doublet_data_combined = pd.concat([doublet_data_combined,d2])

datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
if os.path.isdir(datadir_scrublet):
# Scrublet loading QC
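In the new doublet-gathering loop above, the pool name is recovered from paths like <results>/doublets/<pool>__*.tsv by splitting on '__' after the last '/'. An equivalent basename-based sketch (path hypothetical):

import os

f1 = '/results/doublets/pool1__scrublet__0.8.tsv'   # hypothetical file name
pool_name = os.path.basename(f1).split('__')[0]
print(pool_name)                                    # 'pool1'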
@@ -532,15 +524,9 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
############################################################
# Loading deconvoluted data including unassigned and doublets
###########################################
print(expid)

obsqc,all_QC_lane = fetch_qc_obs_from_anndata(adqc, expid, cell_bender_path = cell_bender_path,Resolution=Resolution)

# try:
# datadir_deconv=f'{args.results_dir}/deconvolution/split_donor_h5ad'
# donor_table = os.path.join(datadir_deconv, expid, "{}.donors.h5ad.tsv".format(expid))
# df_donors = pandas.read_table(donor_table, header=None, names=("experiment_id", "donor_id", "file_path_h5ad"))
# except:

donor_tables=pd.DataFrame([])
for d1 in set(obsqc['donor']):
donor_table={}
@@ -596,11 +582,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
except:
chromium_channel = 'Run_ID not available'




def isNaN(num):
return num!= num
Donors = list(df_donors.donor_id)

try:
@@ -634,7 +615,7 @@ def isNaN(num):
Mean_reads_per_cell = int(metrics['Mean reads per cell'].values[0].replace(',',''))
except:
Mean_reads_per_cell= None
# adata_cellranger_raw.X

f = pd.DataFrame(adata_cellranger_filtered.X.sum(axis=1))
Median_UMI_Counts_per_cellranger= statistics.median(f[f>0][0])
Mean_Reads = statistics.mean(f[f>0][0])
@@ -658,7 +639,6 @@ def isNaN(num):
Number_of_cells = len(set(df1.index))
Total_UMIs_before_10x_filter = np.sum(ad_lane_raw.X) #this may be after the normalisation

# ad_lane_filtered =
Total_UMIs_after_cellbender_filter = np.sum(ad_lane_filtered.X) #This is more 27840
Total_UMIs_after_cellbender = sum(all_QC_lane.obs['total_counts']) #This is less 22817

@@ -694,9 +674,6 @@ def isNaN(num):
all_probs = pd.DataFrame()

Tranche_Pass_Fail='PASS'
Tranche_Failure_Reason=''

# print("** Fraction_Reads_in_Cells : "+Fraction_Reads_in_Cells.strip('%'))
Tranche_Failure_Reason =' '
try:
if (float(Fraction_Reads_in_Cells.strip('%'))<=70):
@@ -713,7 +690,6 @@ def isNaN(num):

for i in df_donors.index:
print(i)
# feeds in the individual assignments here.
Donor_Stats=[]
row = df_donors.loc[i]
print(row)
@@ -727,14 +703,12 @@ def isNaN(num):
path1 = re.sub('.*/results/', 'results/', path1)
Deconvoluted_Donor_Data = anndata.read_h5ad(path1)
Donor_barcodes = Deconvoluted_Donor_Data.obs.index
# print(path1)

#################
#Deconvolution data
#################



# issue with the all_QC_lane is that they are filtered and the unassigned cells are removed - ve can merge them back together and
# Issue with the all_QC_lane is that they are filtered and the unassigned cells are removed - we can merge them back together and
if (row["donor_id"] == 'unassigned'):
Unassigned_donor = len(Deconvoluted_Donor_Data.obs)
elif (row["donor_id"] == 'doublet'):
@@ -765,7 +739,6 @@ def isNaN(num):
data_donor_for_stats['cells passing QC'].append(Donor_cells_passes_qc)

Donor_cell_assignments = azt.loc[Mengled_barcodes_donor] # for this we have to figure out when the cell type is unassigned.
# Donor_cell_assignments = Donor_cell_assignments.rename(COLUMNS_AZIMUTH,axis=1)
Cell_types_detected = len(set(Donor_cell_assignments['Azimuth:predicted.celltype.l2']))
Donor_UMIS_mapped_to_mitochondrial_genes = sum(Donor_qc_files.obs['total_counts_gene_group__mito_transcript'])
Donor_UMIS_mapped_to_ribo_genes = sum(Donor_qc_files.obs['total_counts_gene_group__ribo_protein'])
@@ -855,7 +828,6 @@ def isNaN(num):
data_donor.append(Donor_Stats)

fctr += 1
# print('done')
all_probs = all_probs[~all_probs.index.duplicated(keep='first')]
azt['prob_doublet']=all_probs['prob_doublet']
Donor_df = pd.DataFrame(data_donor)
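The dedup-then-assign step above relies on pandas aligning on the index: once duplicate barcodes are dropped from all_probs (keep='first'), azt picks up prob_doublet row by row, and barcodes missing from all_probs become NaN. A toy sketch (data hypothetical):

import pandas as pd

all_probs = pd.DataFrame({'prob_doublet': [0.1, 0.9, 0.2]},
                         index=['bc1', 'bc1', 'bc2'])   # duplicated barcode
azt = pd.DataFrame(index=['bc1', 'bc2', 'bc3'])
all_probs = all_probs[~all_probs.index.duplicated(keep='first')]
azt['prob_doublet'] = all_probs['prob_doublet']         # aligned on index; bc3 -> NaN
print(azt)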
@@ -973,23 +945,26 @@ def set_argument_parser():
write_h5=False
else:
write_h5=True
# write_h5=False

oufh = open(os.path.join(args.outdir, "files.tsv"), 'w')
oufh.write("experiment_id\tdonor_id\tfilename_h5ad\tfilename_annotation_tsv\n")
df_raw = pandas.read_table(args.input_table, index_col = 'experiment_id')
if (args.cellbender)=='cellranger':
# here we do not use cellbender and go with default cellranger
# Here we do not use cellbender and go with default cellranger
df_cellbender = None
else:
# here we have run the cellbender as par of pipeline.
# Here we have run the cellbender as part of the pipeline.
# cellbender/*/cellbender-epochs_*/cellbender-FPR_0pt01-filtered_10x_mtx
file_path = glob.glob(f'{args.results_dir}/nf-preprocessing/cellbender/*/cellbender-epochs_*/*{args.resolution}*10x_mtx*')
file_path2 = glob.glob(f'{args.results_dir}/nf-preprocessing/cellbender/*/*{args.resolution}*10x_mtx*')
joined_file_paths = file_path+file_path2
df_cellbender = pd.DataFrame(joined_file_paths,columns=['data_path_10x_format'])
df_cellbender['experiment_id']=df_cellbender['data_path_10x_format'].str.split('/').str[-3]
df_cellbender= df_cellbender.set_index('experiment_id')

Resolution = args.resolution

# Load the final QCd dataset
try:
adqc = anndata.read_h5ad(f'{args.results_dir}/merged_h5ad/outlier_filtered_adata.h5ad')
except:
@@ -998,30 +973,27 @@ def set_argument_parser():
except:
d2 = glob.glob(f'{args.results_dir}/*/*/adatanormalized.h5ad')[0]
adqc = anndata.read_h5ad(d2)
adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0]
# adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0]
fctr = 0
data_tranche_all=[]
data_donor_all=[]
count = 1
All_probs_and_celltypes = pd.DataFrame()


Sample_metadata = pd.DataFrame()

adqc.obs['tranche.id']=args.experiment_name
# SETTING TRANCHE NAME
try:
adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'].astype(float,errors='ignore')
adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'].astype(float,errors='ignore')
except:
_='no values associated'
try:
adqc.obs['cell_passes_qc-per:all_together::exclude:score']= adqc.obs['cell_passes_qc-per:all_together::exclude:score'].astype(float,errors='ignore')
except:
_='no values associated'


# Now we calculate all the statistics for each of the pools.
for expid in df_raw.index:
print(expid)
# try:
# expid ='OTARscRNA12924807'
s = adqc.obs['convoluted_samplename'] == expid
ad = adqc[s]
if ad.n_obs == 0:
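The per-pool loop at the end of this file subsets the QC'd AnnData by convoluted_samplename and skips pools with no cells. A minimal sketch of that pattern (toy object; column name taken from the diff):

import anndata as ad
import numpy as np
import pandas as pd

adqc = ad.AnnData(np.zeros((3, 1), dtype=np.float32),
                  obs=pd.DataFrame({'convoluted_samplename': ['p1', 'p1', 'p2']}))
for expid in ['p1', 'p2', 'p3']:
    sub = adqc[adqc.obs['convoluted_samplename'] == expid]
    if sub.n_obs == 0:   # nothing QC'd for this pool; skip it, as in the loop above
        continue
    print(expid, sub.n_obs)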
30 changes: 28 additions & 2 deletions bin/transfer_data.py
@@ -167,6 +167,26 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res
for folder in Folders:
copyfile(f'{folder1}/{folder}/Vireo_plots.pdf', f'{name_dir}/Deconvolution/Vireo_plots_{folder}.pdf')



folder1 = f'{directory}/doublets'
if os.path.isdir(folder1):
print('preparing Doublet folder')
try:
os.mkdir(f'{name_dir}/Doublets___301')
except:
print('dir exists')
files = glob.glob(f'{folder1}/*.tsv')

for file1 in files:
print(file1)
try:
copy(file1, f'{name_dir}/Doublets___301')
except:
print('picked up directory')
continue
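# A sketch of the same copy step without the try/except scaffolding above
# (assumes the os/glob imports and shutil's copy already used in this script):
dest = f'{name_dir}/Doublets___301'
os.makedirs(dest, exist_ok=True)            # tolerates an existing directory
for tsv in glob.glob(f'{folder1}/*.tsv'):
    if os.path.isfile(tsv):                 # skip sub-directories instead of catching them
        copy(tsv, dest)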


folder1 = f'{directory}/celltype/celltypist'
if os.path.isdir(folder1):
print('preparing celltype folder')
@@ -205,15 +225,21 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res
# print('dire exists')
# copyfile(fil1, f'{name_dir}/QC metrics/plot_ecdf-x_log10.var=total_counts.color=experiment_id-adata.png')
files = glob.glob(f'{folder1}/*[!.gz]')
files2 = glob.glob(f'{folder1}/*/*[!.gz]')
files = files + files2
for file1 in files:
print(file1)
try:
copy(file1, f'{name_dir}/Cell-type assignment/azimuth')
except:
print('picked up directory')
continue


try:
copy(f'{directory}/celltype/All_Celltype_Assignments.csv', f'{name_dir}/Cell-type assignment/All_Celltype_Assignments.csv')
except:
print("doesn't exist")


folder1 = f'{directory}/plots/per_celltype_outliers'
if os.path.isdir(folder1):
print('yes')
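One caveat on the azimuth copy block in the hunk above: the glob pattern *[!.gz] excludes any file whose last character is '.', 'g', or 'z', not specifically .gz archives. An explicit suffix filter is more precise (sketch; directory hypothetical):

import glob

folder1 = 'results/celltype/azimuth'   # hypothetical directory
files = [f for f in glob.glob(f'{folder1}/**/*', recursive=True)
         if not f.endswith('.gz')]
print(files)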
