Commit

ensuring all in place

Matiss Ozols committed May 24, 2024
1 parent 9a45fea commit ac9e8c6
Showing 5 changed files with 73 additions and 67 deletions.
2 changes: 1 addition & 1 deletion bin/0035-scanpy_normalize_pca.py
@@ -543,7 +543,7 @@ def scanpy_normalize_and_pca(
copy=False
)


print('---- Writing files ----')
# Keep a record of the different gene scores
if score_genes_df is not None:
adata.uns['df_score_genes'] = score_genes_df_updated
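For context on the hunk above: pandas DataFrames stored in .uns survive the .h5ad round trip, which is what lets the gene-score table travel with the object. A minimal, self-contained sketch (file name hypothetical):

import anndata as ad
import numpy as np
import pandas as pd

adata = ad.AnnData(np.zeros((3, 2), dtype=np.float32))
adata.uns['df_score_genes'] = pd.DataFrame({'gene': ['A', 'B'], 'score': [0.1, 0.2]})
adata.write('example.h5ad')  # hypothetical output name
assert 'df_score_genes' in ad.read_h5ad('example.h5ad').uns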
5 changes: 5 additions & 0 deletions bin/2.integrate.R
@@ -169,6 +169,11 @@ for(f in cite_files){

# Add the vireo donor assignment to the seurat object
this_donor_cells <- donor_cells[donor_cells$sample==sample_id,]

if (dim(this_donor_cells)[1]==0){
# Here we skip non-deconvoluted samples
next
}
sobj@meta.data$donor.vireo <- this_donor_cells[match(paste0(rownames(sobj@meta.data),'_',sample_name),
paste0(this_donor_cells$cell,
'_',this_donor_cells$sample)),]$matched.donor
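The hunk above guards against pools with no vireo assignments before matching donors to cells on a cell_sample key. The same lookup as a minimal pandas sketch (toy frames and column names mirror the R above; all data hypothetical):

import pandas as pd

# toy stand-ins for donor_cells and sobj@meta.data in the R above
donor_cells = pd.DataFrame({'cell': ['AAAC-1', 'GGGT-1'],
                            'sample': ['s1', 's1'],
                            'matched.donor': ['donor0', 'donor1']})
meta = pd.DataFrame(index=['AAAC-1', 'GGGT-1', 'TTTG-1'])
sample_id = sample_name = 's1'

this_donor_cells = donor_cells[donor_cells['sample'] == sample_id]
if not this_donor_cells.empty:  # skip non-deconvoluted samples, as in the R guard
    key = this_donor_cells['cell'] + '_' + this_donor_cells['sample']
    lookup = dict(zip(key, this_donor_cells['matched.donor']))
    # TTTG-1 has no assignment and gets None, like the NA produced by match() in R
    meta['donor.vireo'] = [lookup.get(f'{bc}_{sample_name}') for bc in meta.index]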
17 changes: 10 additions & 7 deletions bin/add_adt.R
@@ -24,7 +24,7 @@ if (future::supportsMulticore()) {
} else {
future::plan(future::multisession)
}
# args=vector(mode='list', length=6); args[[1]]='cellranger700_multi_abb2ba0911f1f4bde982a4c38d9fd6ed'; args[[2]]='raw_feature_bc_matrix'; args[[3]]='filtered_feature_bc_matrix'; args[[4]]='cellranger700_multi_abb2ba0911f1f4bde982a4c38d9fd6ed___sample_QCd_adata.h5ad'
# args=vector(mode='list', length=6); args[[1]]='LDP58'; args[[2]]='raw_feature_bc_matrix'; args[[3]]='filtered_feature_bc_matrix'; args[[4]]='LDP58___sample_QCd_adata.h5ad'

#####
args = commandArgs(trailingOnly=TRUE)
@@ -47,13 +47,13 @@ outdir <- getwd()
# filtered_feature_file = cellranger_filepath = args[2]
#
# filtered_cellranger = '/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/jaguar_yascp/nieks_pipeline/fetch/results_old/cellranger_data/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/per_sample_outs/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/count/sample_filtered_feature_bc_matrix.h5'
# sample_name <- 'cellranger700_multi_74b30caec2cf83c0048bc87946b301e8'
# cellranger_rawfile_path <- 'raw_feature_bc_matrix'
# filtered_cellranger = 'filtered_feature_bc_matrix'
# file_with_qc_applied = 'cellranger700_multi_74b30caec2cf83c0048bc87946b301e8___sample_QCd_adata.h5ad'
sample_name <- 'LDP58'
cellranger_rawfile_path <- 'raw_feature_bc_matrix'
filtered_cellranger = 'filtered_feature_bc_matrix'
file_with_qc_applied = 'LDP58___sample_QCd_adata.h5ad'
sample_name <- args[1]
cellranger_rawfile_path <- args[2]
filtered_cellranger = args[3]
filtered_cellranger = args[3][1]
file_with_qc_applied = args[4]
# cellranger_rawfile_path = '/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/jaguar_yascp/nieks_pipeline/fetch/results_old/cellranger_data/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/multi/count/raw_feature_bc_matrix.h5'

@@ -151,7 +151,9 @@ Convert(
# qced_cells <- readRDS(scpred_file_with_qc)
qced_cells <- LoadH5Seurat(paste('tmp',"h5seurat",sep='.'),assays = "RNA")
qced_barcodes <- gsub('_.*','',colnames(qced_cells))
qced_barcodes <- gsub('-cellranger.*','',colnames(qced_cells))
qced_barcodes <- gsub('-1.*','-1',(qced_barcodes))
qced_barcodes <- gsub('-2.*','-2',(qced_barcodes))
qced_barcodes <- gsub('-cellranger.*','',(qced_barcodes))
qced_cells = RenameCells(qced_cells, new.names = qced_barcodes)

genes <- read.table(paste0(cellranger_rawfile_path,"/features.tsv.gz"), sep = "\t", col.names = c("ENSG_ID","Gene_ID", "FeatureType"))
@@ -183,6 +185,7 @@ Convert(
# }else if(basename(f)=='sample_feature_bc_matrix.h5' || basename(f) == 'sample_filtered_feature_bc_matrix.h5'){
# with cellranger multi the raw file is 3 directories higher than filtered file, then in multi/count
rna = raw <- Read10X(cellranger_rawfile_path)
print(paste0(' RAW data loaded '))
# }else{
# stop(paste('basename(f) should be sample_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5, but was:',basename(f)))
# }
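The new gsub chain above normalizes QC'd cell names back to raw 10x barcodes: strip any '_'-suffix, truncate after the -1/-2 lane tag, then drop '-cellranger...' run tags. The same sequence as a Python regex sketch (example barcode hypothetical; order matters, as in the R):

import re

def normalize_barcode(bc: str) -> str:
    bc = re.sub(r'_.*', '', bc)             # drop '_<sample>' suffix
    bc = re.sub(r'-1.*', '-1', bc)          # truncate after lane tag -1
    bc = re.sub(r'-2.*', '-2', bc)          # ... or -2
    bc = re.sub(r'-cellranger.*', '', bc)   # drop '-cellranger...' run tags
    return bc

print(normalize_barcode('ACGTACGT-1-cellranger700_multi_abc'))  # -> 'ACGTACGT-1'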
86 changes: 29 additions & 57 deletions bin/gather_minimal_dataset.py
@@ -61,8 +61,8 @@
'cell_passes_qc:score':'qc.filter.pass:score',
'cell_passes_qc-per:all_together::exclude':'qc.filter.pass.spikein_exclude',
'cell_passes_qc-per:all_together::exclude:score':'qc.filter.pass.spikein_exclude:score',
'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2':'qc.filter.pass.AZ:L0',
'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score',
'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2':'qc.filter.pass.AZ:L0',
'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score',
'total_counts': 'qc.umi.count.total',
'total_counts_gene_group__mito_transcript': 'qc.umi.count.mt',
'pct_counts_gene_group__mito_transcript': 'qc.umi.perc.mt',
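The corrected keys above ('...L0_Azimuth:predicted...' to '...L0_predicted...') have to match the obs column names exactly, because dict-based renames silently skip missing keys. A toy sketch of how such a map is applied (frame contents hypothetical):

import pandas as pd

COLUMNS_QC_SUBSET = {'total_counts': 'qc.umi.count.total'}  # one entry from the map above
obs = pd.DataFrame({'total_counts': [5124, 2310]})
obs = obs.rename(columns=COLUMNS_QC_SUBSET)
print(obs.columns.tolist())  # ['qc.umi.count.total']; unmatched keys are ignored silently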
@@ -399,8 +399,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
#Cellranger datasets
######################
columns_output = {**COLUMNS_DATASET, **COLUMNS_DECONV, **COLUMNS_QC}
#Unfiltered
compression_opts = 'gzip'
#Reading unfiltered raw cellranger datasets
try:
adata_cellranger_raw = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/raw_feature_bc_matrix")
try:
@@ -417,6 +416,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
except:
print('File already linked')

# Reading filtered cellranger files
try:
adata_cellranger_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix")
try:
@@ -441,24 +441,18 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
df_total_counts['barcodes'] = df_total_counts.index
df_total_counts_cellranger_raw = df_total_counts
df_total_counts_cellranger_raw['dataset']='Cellranger Raw'
wd = os.getcwd()

if df_cellbender is not None and (len(df_cellbender)!=0):
df_cellbender = df_cellbender.reset_index()
df_cellbender = df_cellbender.drop_duplicates(subset=['experiment_id'])
df_cellbender = df_cellbender.set_index('experiment_id')
f=df_cellbender.loc[expid, 'data_path_10x_format']
if (type(f) == str):
print('yes')
f=[f]
for id in f:
print(id)
try:
cell_bender_path = id
# try:
# cell_bender_path = f'{wd}/'+'/'.join(cell_bender_path.split('/')[-6:])
# except:
# cell_bender_path = f"{args.results_dir}/{df_cellbender.loc[expid, 'data_path_10x_format']}"
cellbender_h5 = f"{cell_bender_path}/../cellbender_FPR_{Resolution}_filtered.h5"
ad_lane_filtered = scanpy.read_10x_mtx(cell_bender_path)
print('loaded')
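On the type check above: df.loc[expid, 'data_path_10x_format'] returns a plain string when the index is unique but a Series when expid is duplicated, so wrapping the scalar keeps the loop uniform. A sketch (paths hypothetical):

import pandas as pd

df = pd.DataFrame({'data_path_10x_format': ['/path/a', '/path/b']},
                  index=['exp1', 'exp1'])          # duplicated experiment_id
f = df.loc['exp1', 'data_path_10x_format']         # Series here, str if unique
if isinstance(f, str):
    f = [f]                                        # normalise to an iterable
for cell_bender_path in f:
    print(cell_bender_path)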
@@ -471,30 +465,19 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
try:
path2 = os.path.realpath(cellbender_h5)
os.symlink(path2, f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5")
# Here link also mtx files
except:
print('File already linked')
dfcb = fetch_cellbender_annotation(cell_bender_path, expid,Resolution)
columns_output = {**columns_output, **COLUMNS_CELLBENDER}
else:
ad_lane_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix")
df_cellbender=None
cell_bender_path=None


# Removing zero-count cells from the matrices
zero_count_cells = ad_lane_filtered.obs_names[np.where(ad_lane_filtered.X.sum(axis=1) == 0)[0]]
ad_lane_filtered = ad_lane_filtered[ad_lane_filtered.obs_names.difference(zero_count_cells, sort=False)]


zero_count_cells = adata_cellranger_filtered.obs_names[np.where(adata_cellranger_filtered.X.sum(axis=1) == 0)[0]]
adata_cellranger_filtered = adata_cellranger_filtered[adata_cellranger_filtered.obs_names.difference(zero_count_cells, sort=False)]
# adata_cellranger_filtered=ad_lane_filtered
# scanpy.pp.calculate_qc_metrics(adata_cellranger_filtered, inplace=True)
# df_total_counts = pd.DataFrame(data= adata_cellranger_filtered.obs.sort_values(by=['total_counts'], ascending=False).total_counts)
# df_total_counts['barcodes'] = df_total_counts.index
# df_total_counts['barcode_row_number'] = df_total_counts.reset_index().index + 1
# df_total_counts_cellranger_filtered = df_total_counts
# df_total_counts_cellranger_filtered['dataset'] = 'Cellranger Filtered'

#############
#Cellranger Metrics Datasheet
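The zero-count filter above computes row sums on the matrix and drops empty barcodes while preserving obs order. An equivalent mask-based sketch on a toy AnnData (data hypothetical):

import anndata as ad
import numpy as np
from scipy import sparse

adata = ad.AnnData(sparse.csr_matrix(np.array([[1, 0], [0, 0], [2, 3]], dtype=np.float32)))
keep = np.asarray(adata.X.sum(axis=1)).ravel() > 0
adata = adata[keep].copy()   # the all-zero barcode is dropped
print(adata.n_obs)           # 2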
@@ -513,6 +496,15 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
##########################
# Scrublet
#########################
doublet_data = glob.glob(f'{args.results_dir}/doublets/*.tsv')
doublet_data_combined = pd.DataFrame()
for f1 in doublet_data:
print(f1)
pool_name = f1.split('__')[0].split('/')[-1]
d2 = pd.read_csv(f1,sep='\t')
d2['Exp']=pool_name
doublet_data_combined = pd.concat([doublet_data_combined,d2])

datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
if os.path.isdir(datadir_scrublet):
# Scrublet loading QC
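In the new doublet-gathering loop above, the pool name is recovered from paths like <results>/doublets/<pool>__*.tsv by splitting on '__' after the last '/'. An equivalent basename-based sketch (path hypothetical):

import os

f1 = '/results/doublets/pool1__scrublet__0.8.tsv'   # hypothetical file name
pool_name = os.path.basename(f1).split('__')[0]
print(pool_name)                                    # 'pool1'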
@@ -532,15 +524,9 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
############################################################
# Loading deconvoluted data including unassigned and doublets
###########################################
print(expid)

obsqc,all_QC_lane = fetch_qc_obs_from_anndata(adqc, expid, cell_bender_path = cell_bender_path,Resolution=Resolution)

# try:
# datadir_deconv=f'{args.results_dir}/deconvolution/split_donor_h5ad'
# donor_table = os.path.join(datadir_deconv, expid, "{}.donors.h5ad.tsv".format(expid))
# df_donors = pandas.read_table(donor_table, header=None, names=("experiment_id", "donor_id", "file_path_h5ad"))
# except:

donor_tables=pd.DataFrame([])
for d1 in set(obsqc['donor']):
donor_table={}
@@ -596,11 +582,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
except:
chromium_channel = 'Run_ID not available'




def isNaN(num):
return num!= num
Donors = list(df_donors.donor_id)

try:
@@ -634,7 +615,7 @@ def isNaN(num):
Mean_reads_per_cell = int(metrics['Mean reads per cell'].values[0].replace(',',''))
except:
Mean_reads_per_cell= None
# adata_cellranger_raw.X

f = pd.DataFrame(adata_cellranger_filtered.X.sum(axis=1))
Median_UMI_Counts_per_cellranger= statistics.median(f[f>0][0])
Mean_Reads = statistics.mean(f[f>0][0])
@@ -658,7 +639,6 @@ def isNaN(num):
Number_of_cells = len(set(df1.index))
Total_UMIs_before_10x_filter = np.sum(ad_lane_raw.X) #this may be after the normalisation

# ad_lane_filtered =
Total_UMIs_after_cellbender_filter = np.sum(ad_lane_filtered.X) #This is more 27840
Total_UMIs_after_cellbender = sum(all_QC_lane.obs['total_counts']) #This is less 22817

@@ -694,9 +674,6 @@ def isNaN(num):
all_probs = pd.DataFrame()

Tranche_Pass_Fail='PASS'
Tranche_Failure_Reason=''

# print("** Fraction_Reads_in_Cells : "+Fraction_Reads_in_Cells.strip('%'))
Tranche_Failure_Reason =' '
try:
if (float(Fraction_Reads_in_Cells.strip('%'))<=70):
@@ -713,7 +690,6 @@ def isNaN(num):

for i in df_donors.index:
print(i)
# feeds in the individual assignments here.
Donor_Stats=[]
row = df_donors.loc[i]
print(row)
@@ -727,14 +703,12 @@ def isNaN(num):
path1 = re.sub('.*/results/', 'results/', path1)
Deconvoluted_Donor_Data = anndata.read_h5ad(path1)
Donor_barcodes = Deconvoluted_Donor_Data.obs.index
# print(path1)

#################
#Deconvolution data
#################



# issue with the all_QC_lane is that they are filtered and the unassigned cells are removed - ve can merge them back together and
# Issue with the all_QC_lane is that they are filtered and the unassigned cells are removed - we can merge them back together and
if (row["donor_id"] == 'unassigned'):
Unassigned_donor = len(Deconvoluted_Donor_Data.obs)
elif (row["donor_id"] == 'doublet'):
@@ -765,7 +739,6 @@ def isNaN(num):
data_donor_for_stats['cells passing QC'].append(Donor_cells_passes_qc)

Donor_cell_assignments = azt.loc[Mengled_barcodes_donor] # for this we have to figure out when the cell type is unassigned.
# Donor_cell_assignments = Donor_cell_assignments.rename(COLUMNS_AZIMUTH,axis=1)
Cell_types_detected = len(set(Donor_cell_assignments['Azimuth:predicted.celltype.l2']))
Donor_UMIS_mapped_to_mitochondrial_genes = sum(Donor_qc_files.obs['total_counts_gene_group__mito_transcript'])
Donor_UMIS_mapped_to_ribo_genes = sum(Donor_qc_files.obs['total_counts_gene_group__ribo_protein'])
@@ -855,7 +828,6 @@ def isNaN(num):
data_donor.append(Donor_Stats)

fctr += 1
# print('done')
all_probs = all_probs[~all_probs.index.duplicated(keep='first')]
azt['prob_doublet']=all_probs['prob_doublet']
Donor_df = pd.DataFrame(data_donor)
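The dedup-then-assign step above relies on pandas aligning on the index: once duplicate barcodes are dropped from all_probs (keep='first'), azt picks up prob_doublet row by row, and barcodes missing from all_probs become NaN. A toy sketch (data hypothetical):

import pandas as pd

all_probs = pd.DataFrame({'prob_doublet': [0.1, 0.9, 0.2]},
                         index=['bc1', 'bc1', 'bc2'])   # duplicated barcode
azt = pd.DataFrame(index=['bc1', 'bc2', 'bc3'])
all_probs = all_probs[~all_probs.index.duplicated(keep='first')]
azt['prob_doublet'] = all_probs['prob_doublet']         # aligned on index; bc3 -> NaN
print(azt)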
@@ -973,23 +945,26 @@ def set_argument_parser():
write_h5=False
else:
write_h5=True
# write_h5=False

oufh = open(os.path.join(args.outdir, "files.tsv"), 'w')
oufh.write("experiment_id\tdonor_id\tfilename_h5ad\tfilename_annotation_tsv\n")
df_raw = pandas.read_table(args.input_table, index_col = 'experiment_id')
if (args.cellbender)=='cellranger':
# here we do not use cellbender and go with default cellranger
# Here we do not use cellbender and go with default cellranger
df_cellbender = None
else:
# here we have run the cellbender as par of pipeline.
# Here we have run the cellbender as part of the pipeline.
# cellbender/*/cellbender-epochs_*/cellbender-FPR_0pt01-filtered_10x_mtx
file_path = glob.glob(f'{args.results_dir}/nf-preprocessing/cellbender/*/cellbender-epochs_*/*{args.resolution}*10x_mtx*')
file_path2 = glob.glob(f'{args.results_dir}/nf-preprocessing/cellbender/*/*{args.resolution}*10x_mtx*')
joined_file_paths = file_path+file_path2
df_cellbender = pd.DataFrame(joined_file_paths,columns=['data_path_10x_format'])
df_cellbender['experiment_id']=df_cellbender['data_path_10x_format'].str.split('/').str[-3]
df_cellbender= df_cellbender.set_index('experiment_id')

Resolution = args.resolution

# Load the final QCd dataset
try:
adqc = anndata.read_h5ad(f'{args.results_dir}/merged_h5ad/outlier_filtered_adata.h5ad')
except:
@@ -998,30 +973,27 @@ def set_argument_parser():
except:
d2 = glob.glob(f'{args.results_dir}/*/*/adatanormalized.h5ad')[0]
adqc = anndata.read_h5ad(d2)
adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0]
# adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0]
fctr = 0
data_tranche_all=[]
data_donor_all=[]
count = 1
All_probs_and_celltypes = pd.DataFrame()


Sample_metadata = pd.DataFrame()

adqc.obs['tranche.id']=args.experiment_name
# SETTING TRANCHE NAME
try:
adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'].astype(float,errors='ignore')
adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'].astype(float,errors='ignore')
except:
_='no values associated'
try:
adqc.obs['cell_passes_qc-per:all_together::exclude:score']= adqc.obs['cell_passes_qc-per:all_together::exclude:score'].astype(float,errors='ignore')
except:
_='no values associated'


# Now we calculate all the statistics for each of the pools.
for expid in df_raw.index:
print(expid)
# try:
# expid ='OTARscRNA12924807'
s = adqc.obs['convoluted_samplename'] == expid
ad = adqc[s]
if ad.n_obs == 0:
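The per-pool loop at the end of this file subsets the QC'd AnnData by convoluted_samplename and skips pools with no cells. A minimal sketch of that pattern (toy object; column name taken from the diff):

import anndata as ad
import numpy as np
import pandas as pd

adqc = ad.AnnData(np.zeros((3, 1), dtype=np.float32),
                  obs=pd.DataFrame({'convoluted_samplename': ['p1', 'p1', 'p2']}))
for expid in ['p1', 'p2', 'p3']:
    sub = adqc[adqc.obs['convoluted_samplename'] == expid]
    if sub.n_obs == 0:   # nothing QC'd for this pool; skip it, as in the loop above
        continue
    print(expid, sub.n_obs)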
30 changes: 28 additions & 2 deletions bin/transfer_data.py
@@ -167,6 +167,26 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res
for folder in Folders:
copyfile(f'{folder1}/{folder}/Vireo_plots.pdf', f'{name_dir}/Deconvolution/Vireo_plots_{folder}.pdf')



folder1 = f'{directory}/doublets'
if os.path.isdir(folder1):
print('preparing Doublet folder')
try:
os.mkdir(f'{name_dir}/Doublets___301')
except:
print('dir exists')
files = glob.glob(f'{folder1}/*.tsv')

for file1 in files:
print(file1)
try:
copy(file1, f'{name_dir}/Doublets___301')
except:
print('picked up directory')
continue
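# A sketch of the same copy step without the try/except scaffolding above
# (assumes the os/glob imports and shutil's copy already used in this script):
dest = f'{name_dir}/Doublets___301'
os.makedirs(dest, exist_ok=True)            # tolerates an existing directory
for tsv in glob.glob(f'{folder1}/*.tsv'):
    if os.path.isfile(tsv):                 # skip sub-directories instead of catching them
        copy(tsv, dest)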


folder1 = f'{directory}/celltype/celltypist'
if os.path.isdir(folder1):
print('preparing celltype folder')
@@ -205,15 +225,21 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res
# print('dire exists')
# copyfile(fil1, f'{name_dir}/QC metrics/plot_ecdf-x_log10.var=total_counts.color=experiment_id-adata.png')
files = glob.glob(f'{folder1}/*[!.gz]')
files2 = glob.glob(f'{folder1}/*/*[!.gz]')
files = files + files2
for file1 in files:
print(file1)
try:
copy(file1, f'{name_dir}/Cell-type assignment/azimuth')
except:
print('picked up directory')
continue


try:
copy(f'{directory}/celltype/All_Celltype_Assignments.csv', f'{name_dir}/Cell-type assignment/All_Celltype_Assignments.csv')
except:
print("doesn't exist")


folder1 = f'{directory}/plots/per_celltype_outliers'
if os.path.isdir(folder1):
print('yes')
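One caveat on the azimuth copy block in the hunk above: the glob pattern *[!.gz] excludes any file whose last character is '.', 'g', or 'z', not specifically .gz archives. An explicit suffix filter is more precise (sketch; directory hypothetical):

import glob

folder1 = 'results/celltype/azimuth'   # hypothetical directory
files = [f for f in glob.glob(f'{folder1}/**/*', recursive=True)
         if not f.endswith('.gz')]
print(files)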
