Skip to content

Commit

Permalink
Rename normalization function, add NON_PDX_SPECIMEN_TYPE list, and fi…
Browse files Browse the repository at this point in the history
…x tests
  • Loading branch information
sivkovic committed Jan 14, 2025
1 parent a60bee2 commit c35e903
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 9 deletions.
2 changes: 1 addition & 1 deletion fixtures/tests/10075_D_single_TN_pair.argos.input.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
"adapter2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
"bwa_output": "s_C_8VK0V7_R001_d.bam",
"request_id": "10075_D",
"specimen_type": "Local Recurrence"
"specimen_type": "Resection"
},
{
"ID": "s_C_8VK0V7_N001_d",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
"adapter2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
"bwa_output": "s_C_8VK0V7_R001_d.bam",
"request_id": "10075_D",
"specimen_type": "Local Recurrence"
"specimen_type": "Resection"
},
"normal": {
"ID": "s_C_8VK0V7_N001_d",
Expand Down
2 changes: 1 addition & 1 deletion fixtures/tests/10075_D_single_TN_pair.argos_bam.input.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
"adapter2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
"bwa_output": "s_C_8VK0V7_R001_d.bam",
"request_id": "10075_D",
"specimen_type": "Local Recurrence"
"specimen_type": "Resection"
},
"normal": {
"ID": "s_C_8VK0V7_N001_d",
Expand Down
4 changes: 2 additions & 2 deletions fixtures/tests/10075_D_single_TN_pair.filemetadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
],
"tumorOrNormal": "Tumor",
"captureInputNg": "110.0",
"sampleClass": "Local Recurrence",
"sampleClass": "Resection",
"collectionYear": "",
"tissueLocation": "",
"dataAnalystName": "",
Expand Down Expand Up @@ -130,7 +130,7 @@
],
"tumorOrNormal": "Tumor",
"captureInputNg": "110.0",
"sampleClass": "Local Recurrence",
"sampleClass": "Resection",
"collectionYear": "",
"tissueLocation": "",
"dataAnalystName": "",
Expand Down
30 changes: 26 additions & 4 deletions runner/operator/argos_operator/v2_1_0/construct_argos_pair.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,21 @@


# Specimen types are compared after normalization (lowercased, with
# non-alphanumeric characters removed), so every entry in these lists must
# itself be lowercase and alphanumeric-only or it can never match.
PDX_SPECIMEN_TYPES = ["pdx", "xenograft", "xenograftderivedcellline"]
NON_PDX_SPECIMEN_TYPES = [
    "biopsy",
    "blood",
    # Was "cellLine": the capital "L" could never equal a lowercased
    # specimen type, so cell-line samples were rejected as invalid.
    "cellline",
    "cfdna",
    "fingernails",
    "nonpdx",
    "normal",
    "organoid",
    "other",
    "rapidautopsy",
    "resection",
    "saliva",
    "tumor",
]

# TODO: generalize
def load_references():
Expand All @@ -25,7 +40,12 @@ def calculate_abra_ram_size(grouping_dict):
return


def normalize_specimen_type(specimen_type):
def normalize_igo_text_field(specimen_type):
# Flatten text data from the Genomics Core
# to allow robust exact text matching.
#
# Allow variance in case and ignore non
# alphanumeric characters (FYI).
# Convert to lowercase
s = specimen_type.lower()
# Remove special characters and extra spaces
Expand All @@ -35,7 +55,7 @@ def normalize_specimen_type(specimen_type):

# TODO: This is ARGOS-formatted, note the confusing IDs
def format_sample(data):
specimen_type = normalize_specimen_type(data["specimen_type"])
specimen_type = normalize_igo_text_field(data["specimen_type"])
sample = dict()
sample["ID"] = data["SM"] # TODO: change someday
sample["CN"] = data["CN"]
Expand All @@ -52,14 +72,16 @@ def format_sample(data):
sample["adapter2"] = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
sample["bwa_output"] = sample["ID"] + ".bam"
sample["request_id"] = data["request_id"]
sample["specimen_type"] = data["specimen_type"]
sample["specimen_type"] = specimen_type

if specimen_type in PDX_SPECIMEN_TYPES:
r1 = "zR1"
r2 = "zR2"
else:
elif specimen_type in NON_PDX_SPECIMEN_TYPES:
r1 = "R1"
r2 = "R2"
else:
raise Exception(f"Invalid Specimen Type: {specimen_type}")

for i in data["R1"]:
if i:
Expand Down

0 comments on commit c35e903

Please sign in to comment.