Rename normalization function, add NON_PDX_SPECIMEN_TYPES list, and fix tests
mskcc · Jan 14, 2025 · c35e903 · c35e903
1 parent a60bee2
commit c35e903
Show file tree

Hide file tree

Showing 5 changed files with 31 additions and 9 deletions.
diff --git a/fixtures/tests/10075_D_single_TN_pair.argos.input.json b/fixtures/tests/10075_D_single_TN_pair.argos.input.json
@@ -34,7 +34,7 @@
                 "adapter2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
                 "bwa_output": "s_C_8VK0V7_R001_d.bam",
                 "request_id": "10075_D",
-                "specimen_type": "Local Recurrence"
+                "specimen_type": "Resection"
             },
             {
                 "ID": "s_C_8VK0V7_N001_d",

diff --git a/fixtures/tests/10075_D_single_TN_pair.argos_1_7_0.input.json b/fixtures/tests/10075_D_single_TN_pair.argos_1_7_0.input.json
@@ -30,7 +30,7 @@
       "adapter2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
       "bwa_output": "s_C_8VK0V7_R001_d.bam",
       "request_id": "10075_D",
-      "specimen_type": "Local Recurrence"
+      "specimen_type": "Resection"
     },
     "normal": {
       "ID": "s_C_8VK0V7_N001_d",

diff --git a/fixtures/tests/10075_D_single_TN_pair.argos_bam.input.json b/fixtures/tests/10075_D_single_TN_pair.argos_bam.input.json
@@ -108,7 +108,7 @@
             "adapter2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT",
             "bwa_output": "s_C_8VK0V7_R001_d.bam",
             "request_id": "10075_D",
-            "specimen_type": "Local Recurrence"
+            "specimen_type": "Resection"
         },
         "normal": {
             "ID": "s_C_8VK0V7_N001_d",

diff --git a/fixtures/tests/10075_D_single_TN_pair.filemetadata.json b/fixtures/tests/10075_D_single_TN_pair.filemetadata.json
@@ -56,7 +56,7 @@
         ],
         "tumorOrNormal": "Tumor",
         "captureInputNg": "110.0",
-        "sampleClass": "Local Recurrence",
+        "sampleClass": "Resection",
         "collectionYear": "",
         "tissueLocation": "",
         "dataAnalystName": "",
@@ -130,7 +130,7 @@
         ],
         "tumorOrNormal": "Tumor",
         "captureInputNg": "110.0",
-        "sampleClass": "Local Recurrence",
+        "sampleClass": "Resection",
         "collectionYear": "",
         "tissueLocation": "",
         "dataAnalystName": "",

diff --git a/runner/operator/argos_operator/v2_1_0/construct_argos_pair.py b/runner/operator/argos_operator/v2_1_0/construct_argos_pair.py
@@ -13,6 +13,21 @@
 
 
 PDX_SPECIMEN_TYPES = ["pdx", "xenograft", "xenograftderivedcellline"]
+NON_PDX_SPECIMEN_TYPES = [
+    "biopsy",
+    "blood",
+    "cellline",  # must be lowercase: matched against the lowercased specimen type
+    "cfdna",
+    "fingernails",
+    "nonpdx",
+    "normal",
+    "organoid",
+    "other",
+    "rapidautopsy",
+    "resection",
+    "saliva",
+    "tumor",
+]
 
 # TODO: generalize
 def load_references():
@@ -25,7 +40,12 @@ def calculate_abra_ram_size(grouping_dict):
     return
 
 
-def normalize_specimen_type(specimen_type):
+def normalize_igo_text_field(specimen_type):
+    # Flatten text data from the Genomics Core
+    # to allow robust exact text matching.
+    #
+    # Allow variance in case and ignore non
+    # alphanumeric characters (FYI).
     # Convert to lowercase
     s = specimen_type.lower()
     # Remove special characters and extra spaces
@@ -35,7 +55,7 @@ def normalize_specimen_type(specimen_type):
 
 # TODO: This is ARGOS-formatted, note the confusing IDs
 def format_sample(data):
-    specimen_type = normalize_specimen_type(data["specimen_type"])
+    specimen_type = normalize_igo_text_field(data["specimen_type"])
     sample = dict()
     sample["ID"] = data["SM"]  # TODO: change someday
     sample["CN"] = data["CN"]
@@ -52,14 +72,16 @@ def format_sample(data):
     sample["adapter2"] = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
     sample["bwa_output"] = sample["ID"] + ".bam"
     sample["request_id"] = data["request_id"]
-    sample["specimen_type"] = data["specimen_type"]
+    sample["specimen_type"] = specimen_type
 
     if specimen_type in PDX_SPECIMEN_TYPES:
         r1 = "zR1"
         r2 = "zR2"
-    else:
+    elif specimen_type in NON_PDX_SPECIMEN_TYPES:
         r1 = "R1"
         r2 = "R2"
+    else:
+        raise Exception(f"Invalid Specimen Type: {specimen_type}")
 
     for i in data["R1"]:
         if i: