Skip to content

Commit

Permalink
upgrade pandas and python version, simplify summary code, add github …
Browse files Browse the repository at this point in the history
…workflow action
  • Loading branch information
wxicu committed Nov 7, 2023
1 parent aa01844 commit f995c5f
Show file tree
Hide file tree
Showing 15 changed files with 83 additions and 1,118 deletions.
10 changes: 1 addition & 9 deletions bin/demuxem.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,8 @@
if __name__ == '__main__':
output_name = args.outputdir + "/" + args.objectOutDemuxem
# load input rna data
#data = io.read_input(args.rna_matrix_dir, modality="rna")
rna_data = sc.read_10x_mtx(args.rna_matrix_dir)
hashing_data = sc.read_10x_mtx(args.hto_matrix_dir,gex_only=False)
#data.subset_data(modality_subset=['rna'])
#data.concat_data() # in case of multi-organism mixing data
# load input hashing data
#data.update(io.read_input(args.hto_matrix_dir, modality="hashing"))
# Extract rna and hashing data
#rna_data = data.get_data(modality="rna")
#hashing_data = data.get_data(modality="hashing")
filter = ""
if args.filter_demuxem.lower() in ['true', 't', 'yes', 'y', '1']:
filter = True
Expand Down Expand Up @@ -96,7 +88,7 @@
pg.write_output(mudata, output_name + ".out.demuxEM.zarr.zip")
print("\nSummary statistics:")
print("total\t{}".format(rna_data.shape[0]))
for name, value in rna_data.obs["demux_type"].value_counts().iteritems():
for name, value in rna_data.obs["demux_type"].value_counts().items():
print("{}\t{}".format(name, value))
summary = rna_data.obs["demux_type"].value_counts().rename_axis('classification').reset_index(name='counts')
total = ["total", rna_data.shape[0]]
Expand Down
1 change: 0 additions & 1 deletion bin/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
import scanpy as sc
import argparse
import muon as mu

parser = argparse.ArgumentParser(description="Parameters for generating anndata and mudata")
parser.add_argument("--assignment", help="Folder which contains cSV file with demultiplexing assignment", default=None)
Expand Down
80 changes: 36 additions & 44 deletions bin/summary_gene.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#!/usr/bin/env python
import pandas as pd
import os
import scanpy as sc
import argparse
import muon as mu
import numpy as np
import scanpy as sc
import pandas as pd
from mudata import MuData


parser = argparse.ArgumentParser(description="Parameters for summary process")
parser.add_argument("--demuxlet", help="Folder containing output files of Demuxlet", default=None)
Expand All @@ -15,7 +17,6 @@
parser.add_argument("--generate_mudata", help="Generate mudata", action='store_true')
parser.add_argument("--read_rna_mtx", help="10x-Genomics-formatted mtx directory for gene expression", default=None)
parser.add_argument("--read_hto_mtx", help="10x-Genomics-formatted mtx directory for HTO expression", default=None)
#parser.add_argument("--sampleId", help="sampleID if multiple samples are demultiplexed", default=None)

args = parser.parse_args()

Expand All @@ -26,14 +27,12 @@ def demuxlet_summary(demuxlet_res, raw_adata, raw_mudata):
obs_res_dir = [file for file in os.listdir(x) if file.endswith('.best')][0]
obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep='\t')
obs_res = obs_res.iloc[:, [1, 4, 5]]
obs_res['Assignment'] = obs_res['BEST.GUESS'].apply(lambda x: x.split(',')[1] if x.split(',')[0] == x.split(',')[1] else "NA")
obs_res.loc[obs_res['Assignment'] == "NA", 'Assignment'] = "doublet"
obs_res.loc[obs_res['DROPLET.TYPE'] == "AMB", 'Assignment'] = "negative"
obs_res.rename(columns={'BARCODE': 'Barcode'}, inplace=True)
demuxlet_assign = obs_res[['Barcode', 'Assignment']]
demuxlet_assign.index = demuxlet_assign.Barcode
demuxlet_assign = demuxlet_assign.drop(columns=['Barcode'])
demuxlet_assign.rename(columns={'Assignment': os.path.basename(x)}, inplace=True)
obs_res['Assignment'] = np.where(obs_res['BEST.GUESS'].str.split(',').str[0] == obs_res['BEST.GUESS'].str.split(',').str[1],
obs_res['BEST.GUESS'].str.split(',').str[0], "doublet")
obs_res['Assignment'] = np.where(obs_res['DROPLET.TYPE'] == 'AMB', 'negative', obs_res['Assignment'])
obs_res.rename(columns={"BARCODE": "Barcode", "Assignment": os.path.basename(x)}, inplace=True)
obs_res.set_index('Barcode', inplace=True)
demuxlet_assign = obs_res[[os.path.basename(x)]]

if raw_adata is not None:
adata = raw_adata.copy()
Expand All @@ -53,7 +52,7 @@ def demuxlet_summary(demuxlet_res, raw_adata, raw_mudata):
assign.to_csv("genetic_summary/demuxlet_assignment.csv", quoting=False)

classi = assign.copy()
classi[(classi != "negative") & (classi != "doublet")] = "singlet"
classi[~classi.isin(["doublet", "negative"])] = "singlet"
classi.to_csv("genetic_summary/demuxlet_classification.csv", quoting=False)

params = pd.concat(params, axis=1)
Expand All @@ -67,14 +66,12 @@ def freemuxlet_summary(freemuxlet_res, raw_adata, raw_mudata):
obs_res_dir = [file for file in os.listdir(x) if file.endswith('.clust1.samples.gz')][0]
obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep='\t')
obs_res = obs_res.iloc[:, [1, 4, 5]]
obs_res['Assignment'] = obs_res['BEST.GUESS'].apply(lambda x: x.split(',')[1] if x.split(',')[0] == x.split(',')[1] else "NA")
obs_res.loc[obs_res['Assignment'] == "NA", 'Assignment'] = "doublet"
obs_res.loc[obs_res['DROPLET.TYPE'] == "AMB", 'Assignment'] = "negative"
obs_res.rename(columns={'BARCODE': 'Barcode'}, inplace=True)
freemuxlet_assign = obs_res[['Barcode', 'Assignment']]
freemuxlet_assign.index = freemuxlet_assign.Barcode
freemuxlet_assign = freemuxlet_assign.drop(columns=['Barcode'])
freemuxlet_assign.rename(columns={'Assignment': os.path.basename(x)}, inplace=True)
obs_res['Assignment'] = np.where(obs_res['BEST.GUESS'].str.split(',').str[0] == obs_res['BEST.GUESS'].str.split(',').str[1],
obs_res['BEST.GUESS'].str.split(',').str[0], "doublet")
obs_res['Assignment'] = np.where(obs_res['DROPLET.TYPE'] == 'AMB', 'negative', obs_res['Assignment'])
obs_res.rename(columns={"BARCODE": "Barcode", "Assignment": os.path.basename(x)}, inplace=True)
obs_res.set_index('Barcode', inplace=True)
freemuxlet_assign = obs_res[[os.path.basename(x)]]

if raw_adata is not None:
adata = raw_adata.copy()
Expand All @@ -94,7 +91,7 @@ def freemuxlet_summary(freemuxlet_res, raw_adata, raw_mudata):
assign.to_csv("genetic_summary/freemuxlet_assignment.csv", quoting=False)

classi = assign.copy()
classi[(classi != "negative") & (classi != "doublet")] = "singlet"
classi[~classi.isin(["doublet", "negative"])] = "singlet"
classi.to_csv("genetic_summary/freemuxlet_classification.csv", quoting=False)

params = pd.concat(params, axis=1)
Expand All @@ -112,11 +109,9 @@ def souporcell_summary(souporcell_res, raw_adata, raw_mudata):
obs_res = obs_res.iloc[:, 0:3]
obs_res.loc[obs_res['status'] == 'doublet', 'assignment'] = 'doublet'
obs_res.loc[obs_res['status'] == 'unassigned', 'assignment'] = 'negative'
obs_res = obs_res.drop('status', axis=1)
obs_res.rename(columns={'barcode': 'Barcode'}, inplace=True)
obs_res.index = obs_res.Barcode
obs_res = obs_res.drop(columns=['Barcode'])
obs_res.columns = [os.path.basename(x)]
obs_res.rename(columns={'barcode': 'Barcode', 'assignment': os.path.basename(x)}, inplace=True)
obs_res.set_index('Barcode', inplace=True)
obs_res = obs_res[[os.path.basename(x)]]

if raw_adata is not None:
adata = raw_adata.copy()
Expand All @@ -136,7 +131,7 @@ def souporcell_summary(souporcell_res, raw_adata, raw_mudata):
assign.to_csv("genetic_summary/souporcell_assignment.csv", quoting=False)

classi = assign.copy()
classi[(classi != "negative") & (classi != "doublet")] = "singlet"
classi[~classi.isin(["doublet", "negative"])] = "singlet"
classi.to_csv("genetic_summary/souporcell_classification.csv", quoting=False)

params = pd.concat(params, axis=1)
Expand All @@ -152,12 +147,11 @@ def vireo_summary(vireo_res, raw_adata, raw_mudata):
if "donor_ids.tsv" in files:
obs_res_dir = os.path.join(root, "donor_ids.tsv")
obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep='\t')
obs_res = obs_res.iloc[:, [0, 1]]
bs_res = obs_res.iloc[:, [0, 1]]
obs_res[obs_res == "unassigned"] = "negative"
obs_res.rename(columns={'cell': 'Barcode'}, inplace=True)
obs_res.index = obs_res.Barcode
obs_res = obs_res.drop(columns=['Barcode'])
obs_res.columns = [os.path.basename(x)]
obs_res.rename(columns={'cell': 'Barcode', 'donor_id': os.path.basename(x)}, inplace=True)
obs_res.set_index('Barcode', inplace=True)
obs_res = obs_res[[os.path.basename(x)]]

if raw_adata is not None:
adata = raw_adata.copy()
Expand All @@ -177,7 +171,7 @@ def vireo_summary(vireo_res, raw_adata, raw_mudata):
assign.to_csv("genetic_summary/vireo_assignment.csv", quoting=False)

classi = assign.copy()
classi[(classi != "negative") & (classi != "doublet")] = "singlet"
classi[~classi.isin(["doublet", "negative"])] = "singlet"
classi.to_csv("genetic_summary/vireo_classification.csv", quoting=False)

params = pd.concat(params, axis=1)
Expand All @@ -188,17 +182,13 @@ def scsplit_summary(scsplit_res, raw_adata, raw_mudata):
params = []

for x in scsplit_res:
obs_res_dir = ""
for root, dirs, files in os.walk(x):
if "scSplit_result.csv" in files:
obs_res_dir = os.path.join(root, "scSplit_result.csv")
obs_res_dir = next((os.path.join(root, "scSplit_result.csv") for root, dirs, files in os.walk(x) if "scSplit_result.csv" in files),"")
obs_res = pd.read_table(obs_res_dir)
obs_res['Assignment'] = obs_res['Cluster'].str.split('-').str[1]
obs_res['Classification'] = obs_res['Cluster'].str.split('-').str[0]
obs_res = obs_res.drop('Cluster', axis=1)
obs_res.loc[obs_res['Classification'] == 'DBL', 'Assignment'] = 'doublet'
obs_res.index = obs_res.Barcode
obs_res = obs_res.drop(columns=['Barcode', 'Classification'])
obs_res = obs_res.drop(columns=['Cluster', 'Classification'])
obs_res.set_index('Barcode', inplace=True)
obs_res.columns = [os.path.basename(x)]

if raw_adata is not None:
Expand Down Expand Up @@ -236,9 +226,11 @@ def scsplit_summary(scsplit_res, raw_adata, raw_mudata):
adata = sc.read_10x_mtx(args.read_rna_mtx)

if args.generate_mudata is True:
# TODO
os.mkdir("genetic_summary/mudata")
pass
if not os.path.exists("genetic_summary/mudata"):
os.mkdir("genetic_summary/mudata")
rna_data = sc.read_10x_mtx(args.read_rna_mtx)
hto_data = sc.read_10x_mtx(args.read_hto_mtx, gex_only=False)
mudata = MuData({"rna": rna_data, "hto": hto_data })

if args.demuxlet is not None:
demuxlet_res = args.demuxlet.split(':')
Expand Down
Loading

0 comments on commit f995c5f

Please sign in to comment.