Skip to content

Commit

Permalink
updated merging to account for several tiles/pucks per sample
Browse files Browse the repository at this point in the history
  • Loading branch information
sztankatt committed Apr 11, 2022
1 parent 60eddf6 commit 0565c29
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 74 deletions.
158 changes: 84 additions & 74 deletions spacemake/project_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ def __init__(self, file_path, config: ConfigFile = None):

while not failed:
try:
df = pd.read_csv(
self.df = pd.read_csv(
file_path,
index_col=["project_id", "sample_id"],
na_values=["None", "none"],
Expand All @@ -629,53 +629,16 @@ def __init__(self, file_path, config: ConfigFile = None):
raise e
failed=True

if df.empty:
if self.df.empty:
index = pd.MultiIndex(
names=["project_id", "sample_id"], levels=[[], []], codes=[[], []]
)
self.df = pd.DataFrame(
columns=self.project_df_default_values.keys(), index=index
)
else:
# replacing NaN with None
df = df.where(pd.notnull(df), None)

# rename puck_id to puck_barcode_file_id, for backward
# compatibility
df.rename(
columns={"puck_id":"puck_barcode_file_id"},
inplace=True,
)

# convert list values stored as string
df.run_mode = df.run_mode.apply(str_to_list)
df.merged_from = df.merged_from.apply(str_to_list)

# convert R1/R2 to list, if they are stored as string
df.R1 = df.R1.apply(str_to_list)
df.R2 = df.R2.apply(str_to_list)

df.puck_barcode_file_id = df.puck_barcode_file_id.apply(str_to_list)
df.puck_barcode_file = df.puck_barcode_file.apply(str_to_list)

project_list = []
# required if upgrading from pre-longread tree
if not "longreads" in df.columns:
df["longreads"] = None

if not "longread_signature" in df.columns:
df["longread_signature"] = None

# update with new columns, if they exist.
for ix, row in df.iterrows():
s = pd.Series(self.project_df_default_values)
s.update(row)
s.name = row.name
project_list.append(s)

self.df = pd.concat(project_list, axis=1).T
self.df.is_merged = self.df.is_merged.astype(bool)
self.df.index.names = ["project_id", "sample_id"]
# 'fix' the dataframe if there are inconsistencies
self.fix()
else:
index = pd.MultiIndex(
names=["project_id", "sample_id"], levels=[[], []], codes=[[], []]
Expand Down Expand Up @@ -888,6 +851,68 @@ def is_spatial(
else:
return False

def fix(self):
    """Normalise ``self.df`` in place after it has been loaded.

    The following steps are applied, in order:

    * NaN cells are replaced with ``None``.
    * The legacy ``puck_id`` column is renamed to
      ``puck_barcode_file_id`` (backward compatibility).
    * ``run_mode``, ``merged_from``, ``R1``, ``R2``,
      ``puck_barcode_file_id`` and ``puck_barcode_file`` are passed
      through ``str_to_list``.
    * ``longreads`` and ``longread_signature`` columns are added (filled
      with ``None``) when they are absent.
    * Every row is rebuilt on top of ``self.project_df_default_values``
      via ``pd.Series.update``, so the row's values are overlaid on the
      defaults.
    * For rows whose ``puck_barcode_file`` is ``None`` and whose single
      ``puck_barcode_file_id`` differs from the default id, the puck of
      that name is fetched from ``self.config`` and its ``barcodes``
      variable is stored as a one-element list in ``puck_barcode_file``.

    Raises:
        SpacemakeError: if a row has no ``puck_barcode_file`` but more
            than one ``puck_barcode_file_id``.
    """
    # replacing NaN with None
    self.df = self.df.where(pd.notnull(self.df), None)

    # rename puck_id to puck_barcode_file_id, for backward
    # compatibility
    self.df.rename(
        columns={"puck_id": "puck_barcode_file_id"},
        inplace=True,
    )

    # convert list values stored as string
    self.df.run_mode = self.df.run_mode.apply(str_to_list)
    self.df.merged_from = self.df.merged_from.apply(str_to_list)

    # convert R1/R2 to list, if they are stored as string
    self.df.R1 = self.df.R1.apply(str_to_list)
    self.df.R2 = self.df.R2.apply(str_to_list)

    self.df.puck_barcode_file_id = self.df.puck_barcode_file_id.apply(
        str_to_list)
    self.df.puck_barcode_file = self.df.puck_barcode_file.apply(
        str_to_list)

    # required if upgrading from pre-longread tree
    if "longreads" not in self.df.columns:
        self.df["longreads"] = None

    if "longread_signature" not in self.df.columns:
        self.df["longread_signature"] = None

    default_pbf_id = self.project_df_default_values['puck_barcode_file_id']

    # per row updates: for every row, start from a series holding the
    # default values and overlay the values of the row on top of it
    project_list = []
    for _, row in self.df.iterrows():
        s = pd.Series(self.project_df_default_values)

        # update puck barcode file info
        # for samples which have shared barcodes, and this barcode info is
        # stored in a puck, in the config file, before the id was set to
        # the name of the puck, and the puck_barcode_file was set to None.
        # Here we populate the puck_barcode_file into the path to the actual
        # file so that no errors are caused downstream.
        if row['puck_barcode_file'] is None:
            if len(row['puck_barcode_file_id']) > 1:
                raise SpacemakeError('When no barcode file provided, there '
                                     'only should be one id available')

            pbf_id = row['puck_barcode_file_id'][0]
            if pbf_id != default_pbf_id:
                puck = self.config.get_puck(pbf_id)

                row['puck_barcode_file'] = [puck.variables['barcodes']]

        s.update(row)
        s.name = row.name
        project_list.append(s)

    if not project_list:
        # pd.concat raises ValueError on an empty list; with no rows
        # there is nothing to rebuild, so keep the (empty) frame as is
        return

    self.df = pd.concat(project_list, axis=1).T
    self.df.is_merged = self.df.is_merged.astype(bool)
    self.df.index.names = ["project_id", "sample_id"]

def get_puck_barcode_file(
self, project_id: str,
sample_id: str,
Expand All @@ -907,18 +932,7 @@ def get_puck_barcode_file(

# if no puck_barcode_file is provided, it means that barcode
# file has to be fetched from the puck itself
if puck_barcode_files is None:
puck_name = self.get_metadata(
"puck",
project_id=project_id,
sample_id=sample_id
)

puck = self.config.get_puck(puck_name)

return puck.variables['barcodes']
# else we fetch the barcodes from the list
else:
if puck_barcode_files is not None:
for pid, pbf in zip(ids, puck_barcode_files):
if pid == puck_barcode_file_id:
return pbf
Expand Down Expand Up @@ -1284,6 +1298,7 @@ def add_update_sample(
puck_barcode_file = [puck_barcode_file]

# if there are duplicates, raise error
print(puck_barcode_file)
if len(puck_barcode_file) != len(set(puck_barcode_file)):
raise SpacemakeError('Duplicate files provided for '
+ '--puck_barcode_file. \n'
Expand Down Expand Up @@ -1339,6 +1354,7 @@ def add_update_sample(

if puck.has_barcodes:
kwargs['puck_barcode_file_id'] = [puck_name]
kwargs['puck_barcode_file'] = puck.variables['barcodes']

if sample_exists:
new_project = self.df.loc[ix].copy()
Expand Down Expand Up @@ -1583,27 +1599,21 @@ def merge_samples(
# attach the deduced, consistent variable
kwargs[variable] = variable_val[0]

# get puck_barcode_file
if "puck_barcode_file" not in kwargs:
pbf_default = self.project_df_default_values["puck_barcode_file"]
pbf_list = self.df.loc[ix, "puck_barcode_file"].to_list()
# filter out default values
pbf_list = [x for x in pbf_list if x != pbf_default]

# remove duplicates
pbf_list = list(set(pbf_list))

if pbf_list == []:
# if all values are default
kwargs["puck_barcode_file"] = pbf_default
elif len(pbf_list) == 1:
kwargs["puck_barcode_file"] = pbf_list[0]
else:
raise InconsistentVariablesDuringMerge(
variable_name="puck_barcode_file",
variable_value=pbf_list,
ix=ix.to_list(),
)
# get puck_barcode_files
if ("puck_barcode_file" not in kwargs or
"puck_barcode_file_id" not in kwargs):
kwargs['puck_barcode_file_id'] = []
kwargs['puck_barcode_file'] = []

for _, row in self.df.loc[ix].iterrows():
if row['puck_barcode_file'] is None:
continue
else:
for pbf_id, pbf in zip(row['puck_barcode_file_id'], row['puck_barcode_file']):
if (pbf_id not in kwargs['puck_barcode_file_id'] and
pbf not in kwargs['puck_barcode_file']):
kwargs['puck_barcode_file_id'].append(pbf_id)
kwargs['puck_barcode_file'].append(pbf)

# after all checks, log that we are merging
self.logger.info(f"Merging samples {ix_list} together\n")
Expand Down
25 changes: 25 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,20 @@ spacemake projects add_sample --project_id test \
--R2 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
--species mouse

spacemake projects add_sample --project_id test \
--sample_id sc_rnaseq_sample_2 \
--R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
--R2 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
--species mouse \
--barcode_flavor visium

# with one bc file
spacemake projects add_sample --project_id test \
--sample_id one_bc_file \
--R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
--R2 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
--species mouse \
--barcode_flavor visium \
--puck visium

# with two bc files
Expand All @@ -22,5 +30,22 @@ spacemake projects add_sample --project_id test \
--R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
--R2 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
--species mouse \
--barcode_flavor visium \
--puck visium \
--puck_barcode_file spacemake/data/test/test_bc1.csv spacemake/data/test/test_bc2.csv

spacemake projects merge_samples --merged_project_id test \
--merged_sample_id test_merged \
--project_id_list test \
--sample_id_list one_bc_file two_bc_files

# this is expected to fail, as the samples have different barcode_flavor values
spacemake projects merge_samples --merged_project_id test \
--merged_sample_id test_merged_2 \
--project_id_list test \
--sample_id_list sc_rnaseq_sample two_bc_files

spacemake projects merge_samples --merged_project_id test \
--merged_sample_id test_merged_2 \
--project_id_list test \
--sample_id_list sc_rnaseq_sample_2 two_bc_files

0 comments on commit 0565c29

Please sign in to comment.