
[GEN-1516, GEN-1555, GEN-1576] make table update cohort specific #158

Merged · 43 commits · Oct 31, 2024 · Changes from 32 commits
- `20fd5f9` make table update cohort specific and allow saving files to staging (danlu1, Oct 1, 2024)
- `80bcfec` utilize Staging projects and tables so no custom table creation scrip… (danlu1, Oct 2, 2024)
- `d27734a` remove unwanted line in config file (danlu1, Oct 2, 2024)
- `b7281cd` update docstrings (danlu1, Oct 2, 2024)
- `f049c3f` Update utilities.py (danlu1, Oct 2, 2024)
- `7eb0d36` Update utilities.py (danlu1, Oct 2, 2024)
- `936c578` reformat code (danlu1, Oct 2, 2024)
- `1057ac7` Merge branch 'GEN-1516-table_update_cohort_specific' of github.com:Sa… (danlu1, Oct 2, 2024)
- `993c066` remove changes in utilities (danlu1, Oct 3, 2024)
- `01398af` add missing parameter (danlu1, Oct 3, 2024)
- `b270a4b` add type hint and update docstring (danlu1, Oct 3, 2024)
- `a57182b` add new parameters to nextflow script (danlu1, Oct 3, 2024)
- `f40edfa` Update README.md (danlu1, Oct 3, 2024)
- `39fe512` add NA list to asDataFrame function in download_synapse_table (danlu1, Oct 8, 2024)
- `19afbf5` remove dry-run (danlu1, Oct 14, 2024)
- `8966b3b` update synapseclient version (danlu1, Oct 14, 2024)
- `705617b` add return type hint (danlu1, Oct 14, 2024)
- `57e796f` add test cases (danlu1, Oct 14, 2024)
- `ee37d5a` remove unused values (danlu1, Oct 14, 2024)
- `deca7e7` add cohort parameter to update_data_table (danlu1, Oct 18, 2024)
- `7825ebc` upgrade python version for synapseclient 4.6 (danlu1, Oct 18, 2024)
- `992f506` bump Python version to 3.11 (danlu1, Oct 23, 2024)
- `158296a` fix conflict to main.nf (danlu1, Oct 23, 2024)
- `2df2190` pulled changes from develop and add cohort param to update_data_updat… (danlu1, Oct 23, 2024)
- `9521330` add ddefault value description to download_synapse_table (danlu1, Oct 23, 2024)
- `e2651f4` bump checkout action version (danlu1, Oct 23, 2024)
- `70a3a0e` Update build-docker-images.yml (danlu1, Oct 24, 2024)
- `fe717c6` test to create docker image (danlu1, Oct 24, 2024)
- `e53df57` add table_update docker to nextflow pipeline (danlu1, Oct 24, 2024)
- `b89e608` re-organize and add test cases for download_synapse_table (danlu1, Oct 24, 2024)
- `7d2a000` add contributing instructions (danlu1, Oct 24, 2024)
- `f711e29` add arg names to download_synapse_table (danlu1, Oct 24, 2024)
- `19bcc84` add instructions for nextflow testing (danlu1, Oct 24, 2024)
- `8c20d90` Update CONTRIBUTING.md (danlu1, Oct 24, 2024)
- `a7ff3ab` Update CONTRIBUTING.md (danlu1, Oct 24, 2024)
- `3e11436` reformat code (danlu1, Oct 24, 2024)
- `f0e9cb8` Merge branch 'gen-1516-table_update_cohort_specific' of github.com:Sa… (danlu1, Oct 24, 2024)
- `c3aa727` add test cases (danlu1, Oct 25, 2024)
- `961d47c` add function to remove unwanted backslash in Ca Directed Drugs table (danlu1, Oct 28, 2024)
- `edd22ea` reformat tests/test_utilities.py (danlu1, Oct 28, 2024)
- `665b77a` Update CONTRIBUTING.md (danlu1, Oct 29, 2024)
- `7ed8bc1` make remove_backslash column specific (danlu1, Oct 30, 2024)
- `71b8ff7` Merge branch 'gen-1516-table_update_cohort_specific' of github.com:Sa… (danlu1, Oct 30, 2024)
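Two of the commits above (`961d47c`, `7ed8bc1`) add a column-specific helper that strips unwanted backslashes from the Ca Directed Drugs table. The actual implementation is not shown in this diff view; the following is a hypothetical sketch — the name `remove_backslash` comes from the commit messages, but the signature and behavior are assumptions:

```python
import pandas as pd


def remove_backslash(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Strip backslashes from the given string columns only (column specific).

    Hypothetical sketch; the real helper in scripts/table_updates may differ.
    """
    df = df.copy()
    for col in columns:
        # Only touch string values; leave NaN/None and non-strings untouched.
        df[col] = df[col].map(
            lambda v: v.replace("\\", "") if isinstance(v, str) else v
        )
    return df
```

Making the helper column specific (the change in `7ed8bc1`) avoids accidentally rewriting backslashes in columns where they are meaningful.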
2 changes: 1 addition & 1 deletion .github/workflows/build-docker-images.yml
@@ -23,7 +23,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 2

23 changes: 23 additions & 0 deletions CONTRIBUTING.md
@@ -1,5 +1,28 @@
# genie-bpc-pipeline: Contributing Guidelines

## Getting started
1. [Clone the repository](https://help.github.com/articles/cloning-a-repository/) to your local machine so you can begin making changes.
2. On your local machine make sure you have the latest version of the `develop` branch:

```
git checkout develop
git pull origin develop
```
3. Create a feature branch off the `develop` branch and work on it. The branch should be named the same as the JIRA issue you are working on, in **lowercase** (e.g., `gen-1234-{feature-here}`). Make sure the branch name is as informative as possible.
```
git checkout develop
git checkout -b gen-1234-{feature-here}
```
4. Once you have made your additions or changes, make sure you write tests and run the [comparison scripts](https://github.com/Sage-Bionetworks/Genie_processing/blob/create_generalized_comparison_script/utility_scripts/compare_between_two_synapse_entities.py) to ensure changes are expected.
5. At this point you have only created the branch locally; you need to push it to your fork on GitHub.

```
git add <your-file>
git commit -m "your commit message"
git push --set-upstream origin gen-1234-{feature-here}
```
6. Create a pull request from the feature branch to the `develop` branch. A GitHub Action will be triggered to build a Docker image for the branch; you can check it [here](https://github.com/Sage-Bionetworks/genie-bpc-pipeline/pkgs/container/genie-bpc-pipeline).

## Nextflow Pipeline contribution

Here is how to contribute to the Nextflow workflow of the genie-bpc-pipeline.
9 changes: 5 additions & 4 deletions main.nf
@@ -93,15 +93,16 @@ workflow BPC_PIPELINE {
ch_comment = Channel.value(params.comment)

if (params.step == "update_potential_phi_fields_table") {
update_potential_phi_fields_table(ch_comment, params.production)
// validate_data.out.view()
} else if (params.step == "genie_bpc_pipeline"){
update_potential_phi_fields_table(ch_comment, params.production)// validate_data.out.view()
} else if (params.step == "update_data_table") {
update_data_table("default", ch_cohort, ch_comment, params.production)
} else if (params.step == "genie_bpc_pipeline"){
update_potential_phi_fields_table(ch_comment, params.production)
run_quac_upload_report_error(update_potential_phi_fields_table.out, ch_cohort)
run_quac_upload_report_warning(run_quac_upload_report_error.out, ch_cohort, params.production)
merge_and_uncode_rca_uploads(run_quac_upload_report_warning.out, ch_cohort, params.production)
// remove_patients_from_merged(merge_and_uncode_rca_uploads.out, ch_cohort, params.production)
update_data_table(merge_and_uncode_rca_uploads.out, ch_comment, params.production)
update_data_table(merge_and_uncode_rca_uploads.out, ch_cohort, ch_comment, params.production)
update_date_tracking_table(update_data_table.out, ch_cohort, ch_comment, params.production)
run_quac_table_report(update_date_tracking_table.out, ch_cohort, params.production)
run_quac_comparison_report(run_quac_table_report.out, ch_cohort, params.production)
10 changes: 5 additions & 5 deletions modules/update_data_table.nf
@@ -2,13 +2,14 @@
Update Synapse tables with merged and uncoded data.
*/
process update_data_table {
container "$params.table_updates_docker"

container 'sagebionetworks/genie-bpc-pipeline-table-updates'
secret 'SYNAPSE_AUTH_TOKEN'
debug true

input:
val previous
val cohort
val comment
val production

@@ -19,13 +20,12 @@ process update_data_table {
if (production) {
"""
cd /root/scripts/
python update_data_table.py -p /root/scripts/config.json -m "$comment" primary
python update_data_table.py -p /root/scripts/config.json -c $cohort -m "$comment" primary -pd
"""
}
else {
} else {
"""
cd /root/scripts/
python update_data_table.py -p /root/scripts/config.json -m "$comment" primary -d
python update_data_table.py -p /root/scripts/config.json -c $cohort -m "$comment" primary
"""
}
}
1 change: 1 addition & 0 deletions nextflow.config
@@ -53,6 +53,7 @@ profiles {
params {
// docker image parameters, see nextflow_schema.json for details
references_docker = "sagebionetworks/genie-bpc-pipeline-references"
table_updates_docker = "sagebionetworks/genie-bpc-pipeline-table-updates"
}
}
}
7 changes: 6 additions & 1 deletion nextflow_schema.json
@@ -50,13 +50,18 @@
"description": "Available BPC steps",
"enum": [
"update_potential_phi_fields_table",
"genie_bpc_pipeline"
"genie_bpc_pipeline",
"update_data_table"
]
},
"references_docker":{
"type": "string",
"description": "Name of docker to use in processes in scripts/references"
},
"table_updates_docker":{
"type": "string",
"description": "Name of docker to use in processes in scripts/table_updates"
},
"schema_ignore_params": {
"type": "string",
"description": "Put parameters to ignore for validation here separated by comma",
2 changes: 1 addition & 1 deletion scripts/table_updates/Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.8
FROM python:3.11

WORKDIR /root/scripts

12 changes: 10 additions & 2 deletions scripts/table_updates/README.md
@@ -39,6 +39,14 @@ Usage

### Update the Synapse Tables with data
#### Primary Case Tables
python update_data_table.py -m [version_comment] primary
#### IRR Case Tables
##### 1. dry-run (Save output to local)
python update_data_table.py -c [cohort_name] -m [version_comment] primary -d

##### 2. production (Save output to production projects)
python update_data_table.py -c [cohort_name] -m [version_comment] primary -pd

##### 3. staging (Save output to staging projects)
python update_data_table.py -c [cohort_name] -m [version_comment] primary

#### IRR Case Tables (Deprecated)
python update_data_table.py -m [version_comment] irr
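The README commands above imply a command-line interface with `-p` (config), `-c` (cohort), and `-m` (comment) options, plus `-d` (dry-run) and `-pd` (production) switches, with staging as the default target. A minimal `argparse` sketch of such an interface — the flag names are taken from the commands shown, but the actual parser in `update_data_table.py` may differ:

```python
import argparse


def build_parser() -> argparse.ArgumentParser:
    """Hypothetical parser mirroring the flags shown in the README."""
    parser = argparse.ArgumentParser(description="Update Synapse tables")
    parser.add_argument("-p", "--config", help="Path to config.json")
    parser.add_argument("-c", "--cohort", help="Cohort to update")
    parser.add_argument("-m", "--message", help="Version comment for the update")
    parser.add_argument("-d", "--dry_run", action="store_true",
                        help="Save output to local files")
    parser.add_argument("-pd", "--production", action="store_true",
                        help="Save output to production projects "
                             "(staging is the default)")
    parser.add_argument("table_type", choices=["primary", "irr"],
                        help="Which set of case tables to update")
    return parser
```

For example, `-c NSCLC -m "monthly update" primary -pd` (cohort and comment values are illustrative) would select the production path, while omitting both `-d` and `-pd` writes to staging.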
2 changes: 1 addition & 1 deletion scripts/table_updates/requirements.txt
@@ -1 +1 @@
synapseclient[pandas] == 2.7.2
synapseclient[pandas] == 4.6.0
67 changes: 67 additions & 0 deletions scripts/table_updates/tests/test_utilities.py
@@ -0,0 +1,67 @@
from unittest.mock import MagicMock, create_autospec, patch

import numpy as np
import pandas as pd
import pytest
import synapseclient
from synapseclient import Schema, Table
from table_updates import utilities


@pytest.fixture(scope="session")
def syn():
return create_autospec(synapseclient.Synapse)

@pytest.fixture(scope="session")
def table_schema():
schema = synapseclient.table.Schema(
name="test_table",
parent="syn123",
column_names=["col1", "col2"],
column_types=["STRING", "INTEGER"],
)
return schema


@pytest.mark.parametrize(
"query_return_df,select,query,expected_df",
[
(pd.DataFrame({'col1': ['value1', 'value2']}), "col1", "SELECT col1 from syn123456",pd.DataFrame({'col1': ['value1', 'value2']})),
(pd.DataFrame({'col1': ['value1', 'value2'],'col2': [1, 2]}), "col1,col2", "SELECT col1,col2 from syn123456",pd.DataFrame({'col1': ['value1', 'value2'],'col2': [1, 2]})),
(pd.DataFrame({'col1': ["NA", "value1", "None"],'col2': [1, 2, 3]}),"*","SELECT * from syn123456",pd.DataFrame({'col1': [np.nan, "value1", "None"],'col2': [1, 2, 3]})),
(pd.DataFrame(columns = ["col1", "col2"]),"*","SELECT * from syn123456",pd.DataFrame(columns = ["col1", "col2"])),
],
ids = ["selected_single_column","selected_multiple_column","pull_table_with_na_values_all_columns","pull_empty_table_all_columns"],
)
def test_download_synapse_table_default_condition(syn, table_schema, query_return_df, select, query, expected_df):
syn.tableQuery = MagicMock(return_value = Table(table_schema, query_return_df))
result = utilities.download_synapse_table(syn, "syn123456", select)

# validate
syn.tableQuery.assert_called_once_with(query)
pd.testing.assert_frame_equal(result, expected_df)

@pytest.mark.parametrize(
"query_return_df,condition,query,expected_df",
[
(pd.DataFrame({'col1': ['value1'],'col2': [1]}), "col1 = 'value1'", "SELECT * from syn123456 WHERE col1 = 'value1'",pd.DataFrame({'col1': ['value1'],'col2': [1]})),
(pd.DataFrame({'col1': ["NA", "value1", "None"],'col2': [1, 1, 1]}), "col2 = 1","SELECT * from syn123456 WHERE col2 = 1",pd.DataFrame({'col1': [np.nan, "value1", "None"],'col2': [1, 1, 1]})),
],
ids = ["selected_row_all_columns","pull_table_with_na_values_all_columns"],
)
def test_download_synapse_table_with_condition(syn, table_schema, query_return_df, condition, query,expected_df):
syn.tableQuery = MagicMock(return_value = Table(table_schema, query_return_df))
result = utilities.download_synapse_table(syn, "syn123456", condition = condition)

# validate
syn.tableQuery.assert_called_once_with(query)
pd.testing.assert_frame_equal(result, expected_df)

def test_download_empty_synapse_table_with_condition(syn, table_schema):
syn.tableQuery = MagicMock(return_value = Table(table_schema, pd.DataFrame(columns = ["col1", "col2"])))
result = utilities.download_synapse_table(syn, "syn123456", condition = "col2 = 1")

# validate
syn.tableQuery.assert_called_once_with("SELECT * from syn123456 WHERE col2 = 1")
pd.testing.assert_frame_equal(result, pd.DataFrame(columns = ["col1", "col2"]))
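The tests above pin down the expected behavior of `utilities.download_synapse_table`: build a `SELECT {select} from {table_id}` query, append a `WHERE` clause only when a condition is given, and convert literal `"NA"` cells to NaN while leaving strings like `"None"` intact. A minimal sketch consistent with those assertions — the real function lives in `scripts/table_updates/utilities.py` and may differ (for example, in the exact NA list passed to `asDataFrame`):

```python
from typing import Any

import pandas as pd


def download_synapse_table(syn: Any, table_id: str, select: str = "*",
                           condition: str = "") -> pd.DataFrame:
    """Query a Synapse table and return the result as a DataFrame.

    Sketch based on the test expectations; `syn` is a synapseclient.Synapse
    instance (or a mock exposing a compatible tableQuery interface).
    """
    query = f"SELECT {select} from {table_id}"
    if condition:
        query += f" WHERE {condition}"
    results = syn.tableQuery(query)
    # Treat literal "NA" cells as missing, but keep strings such as "None"
    # (assumes asDataFrame accepts pandas-style NA keyword arguments, as
    # suggested by commit 39fe512).
    return results.asDataFrame(na_values=["NA"], keep_default_na=False)
```

Defaulting `select` to `*` and `condition` to the empty string matches the two parametrized test groups: the first exercises column selection with no `WHERE` clause, the second fixes `select` at `*` and varies only the condition.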
