Skip to content

Commit

Permalink
[GEN-1485] Modularize update potential PHI fields (#155)
Browse files Browse the repository at this point in the history
* initial commit

* add missing commas

* update production default

* remove short flag

* remove comma

* add auth, logging

* correct dry-run, adjust activity pos

* update tests in readme
  • Loading branch information
rxu17 authored Oct 23, 2024
1 parent 3a18ae8 commit d8a8888
Show file tree
Hide file tree
Showing 10 changed files with 422 additions and 92 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* @Sage-Bionetworks/genie_admins
67 changes: 67 additions & 0 deletions .github/workflows/build-docker-images.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Builds the Docker image for scripts/references and pushes it to the GitHub
# Container Registry (ghcr.io) whenever files under scripts/references (or this
# workflow file) change on develop or on a GEN*/gen* branch. It can also be
# started manually via workflow_dispatch.
name: Build and Push Docker Images

on:
  push:
    branches: [develop, 'GEN*', 'gen*']
    paths:
      - 'scripts/references/**'
      - '.github/workflows/build-docker-images.yml'
  workflow_dispatch:

jobs:
  build_references_docker:
    runs-on: ubuntu-latest
    env:
      REGISTRY: ghcr.io
      IMAGE_NAME: sage-bionetworks/genie-bpc-pipeline
    permissions:
      contents: read
      # Required to push the image (and its build cache) to ghcr.io
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          # Depth 2 so that HEAD^ is available for the change check below
          fetch-depth: 2

      - name: Setup Docker buildx
        uses: docker/setup-buildx-action@v3

      - name: Fetch the default branch (develop) for comparison
        run: git fetch origin develop:refs/remotes/origin/develop --depth=1

      - name: Check for Changes in scripts/references
        id: check_changes
        run: |
          if [ "${GITHUB_REF_NAME}" = "develop" ]; then
            # On develop itself origin/develop == HEAD, so a diff against
            # origin/develop is always empty; compare with the previous commit.
            DIFF_BASE="HEAD^"
          elif git merge-base --is-ancestor origin/develop HEAD; then
            DIFF_BASE="origin/develop"
          else
            # No usable merge base: fall back to a root commit of the fetched
            # history (take a single one in case several are reported).
            DIFF_BASE=$(git rev-list --max-parents=0 HEAD | tail -n 1)
          fi
          # Compare changes between DIFF_BASE and HEAD
          if git diff --name-only "$DIFF_BASE" -- scripts/references | grep -q .; then
            echo "CHANGED=true" >> "$GITHUB_ENV"
          else
            echo "CHANGED=false" >> "$GITHUB_ENV"
          fi

      - name: Log in to GitHub Container Registry
        if: env.CHANGED == 'true'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and Push Docker Image for scripts/references
        if: env.CHANGED == 'true'
        uses: docker/build-push-action@v5
        with:
          context: scripts/references
          push: true
          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:references-${{ github.ref_name }}
          # Read and write the build cache under the same "-cache" tag. The inline
          # exporter embeds the cache in the image itself and only supports
          # mode=min, so it never populated the "-cache" ref that cache-from reads.
          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:references-${{ github.ref_name }}-cache
          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:references-${{ github.ref_name }}-cache,mode=max

36 changes: 24 additions & 12 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ params.comment = 'NSCLC public release update'
params.production = false
params.schema_ignore_params = ""
params.help = false
params.step = "update_potential_phi_fields_table"

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -44,9 +45,12 @@ NfcoreSchema.validateParameters(workflow, params, log)
if (params.cohort == null) { exit 1, 'cohort parameter not specified!' }
if (params.comment == null) { exit 1, 'comment parameter not specified!' }
if (params.production == null) { exit 1, 'production parameter not specified!' }
if (params.step == null) { exit 1, 'step parameter not specified!' }


// Print parameter summary log to screen
log.info NfcoreSchema.paramsSummaryLog(workflow, params)
log.info "Running step: ${params.step}"

// Print message for production mode vs test mode
if (params.production) {
Expand All @@ -66,6 +70,7 @@ else {
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

include { update_potential_phi_fields_table } from './modules/update_potential_phi_fields_table'
include { run_quac_upload_report_error } from './modules/run_quac_upload_report_error'
include { run_quac_upload_report_warning } from './modules/run_quac_upload_report_warning'
include { merge_and_uncode_rca_uploads } from './modules/merge_and_uncode_rca_uploads'
Expand All @@ -77,7 +82,6 @@ include { run_quac_comparison_report } from './modules/run_quac_comparison_repor
include { create_masking_report } from './modules/create_masking_report'
include { update_case_count_table } from './modules/update_case_count_table'
include { run_clinical_release } from './modules/run_clinical_release'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN WORKFLOW
Expand All @@ -87,17 +91,25 @@ include { run_clinical_release } from './modules/run_clinical_release'
// Main workflow: wraps params.cohort and params.comment in value channels and
// dispatches on params.step to decide which processes are run.
workflow BPC_PIPELINE {
ch_cohort = Channel.value(params.cohort)
ch_comment = Channel.value(params.comment)

// NOTE(review): the process calls below are repeated inside the
// "genie_bpc_pipeline" branch further down. They look like the pre-change
// (unconditional) version of the chain from this diff hunk — confirm they are
// not meant to coexist with the step dispatch.
run_quac_upload_report_error(ch_cohort)
run_quac_upload_report_warning(run_quac_upload_report_error.out, ch_cohort, params.production)
merge_and_uncode_rca_uploads(run_quac_upload_report_warning.out, ch_cohort, params.production)
// remove_patients_from_merged(merge_and_uncode_rca_uploads.out, ch_cohort, params.production)
update_data_table(merge_and_uncode_rca_uploads.out, ch_comment, params.production)
update_date_tracking_table(update_data_table.out, ch_cohort, ch_comment, params.production)
run_quac_table_report(update_date_tracking_table.out, ch_cohort, params.production)
run_quac_comparison_report(run_quac_table_report.out, ch_cohort, params.production)
create_masking_report(run_quac_comparison_report.out, ch_cohort, params.production)
update_case_count_table(create_masking_report.out, ch_comment, params.production)

// Step dispatch: run only the potential PHI fields table update ...
if (params.step == "update_potential_phi_fields_table") {
update_potential_phi_fields_table(ch_comment, params.production)
// validate_data.out.view()
// ... or the full chain, where each process receives the previous process's
// output as its first argument so the steps are chained one after another.
} else if (params.step == "genie_bpc_pipeline"){
update_potential_phi_fields_table(ch_comment, params.production)
run_quac_upload_report_error(update_potential_phi_fields_table.out, ch_cohort)
run_quac_upload_report_warning(run_quac_upload_report_error.out, ch_cohort, params.production)
merge_and_uncode_rca_uploads(run_quac_upload_report_warning.out, ch_cohort, params.production)
// remove_patients_from_merged(merge_and_uncode_rca_uploads.out, ch_cohort, params.production)
update_data_table(merge_and_uncode_rca_uploads.out, ch_comment, params.production)
update_date_tracking_table(update_data_table.out, ch_cohort, ch_comment, params.production)
run_quac_table_report(update_date_tracking_table.out, ch_cohort, params.production)
run_quac_comparison_report(run_quac_table_report.out, ch_cohort, params.production)
create_masking_report(run_quac_comparison_report.out, ch_cohort, params.production)
update_case_count_table(create_masking_report.out, ch_comment, params.production)
// Any other value of params.step aborts the run with exit status 1.
} else {
exit 1, 'step not supported'
}
}

/*
Expand Down
30 changes: 30 additions & 0 deletions modules/update_potential_phi_fields_table.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
Updates the potential PHI fields table with any new variables to redact
*/
process update_potential_phi_fields_table {

    // Image is configurable through params.references_docker
    container "$params.references_docker"
    // Exposes the SYNAPSE_AUTH_TOKEN secret to the task environment
    secret 'SYNAPSE_AUTH_TOKEN'
    // Echo the task's stdout to the Nextflow console
    debug true

    input:
    val comment     // value passed to the script's -c option
    val production  // when true, the --production flag is added

    output:
    stdout

    script:
    // The comment is free text and may contain spaces, so it is wrapped in
    // double quotes; unquoted, the shell would split it into several arguments
    // and -c would only receive the first word.
    if (production) {
        """
        cd /usr/local/src/myscripts/
        Rscript update_potential_phi_fields_table.R -c "$comment" --production
        """
    }
    else {
        """
        cd /usr/local/src/myscripts/
        Rscript update_potential_phi_fields_table.R -c "$comment"
        """
    }
}
8 changes: 8 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ manifest {
profiles {
aws_prod {
process {
withName: update_potential_phi_fields_table {
memory = 32.GB
cpus = 8
}
withName: run_workflow_case_selection {
memory = 32.GB
cpus = 8
Expand Down Expand Up @@ -46,5 +50,9 @@ profiles {
cpus = 8
}
}
params {
// docker image parameters, see nextflow_schema.json for details
references_docker = "sagebionetworks/genie-bpc-pipeline-references"
}
}
}
13 changes: 13 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,19 @@
false
]
},
"step": {
"type": "string",
"default": "update_potential_phi_fields_table",
"description": "Available BPC steps",
"enum": [
"update_potential_phi_fields_table",
"genie_bpc_pipeline"
]
},
"references_docker":{
"type": "string",
"description": "Name of docker to use in processes in scripts/references"
},
"schema_ignore_params": {
"type": "string",
"description": "Put parameters to ignore for validation here separated by comma",
Expand Down
35 changes: 24 additions & 11 deletions scripts/references/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,21 +1,34 @@
FROM r-base:4.0.0
FROM rstudio/r-base:4.0-bullseye

# Set working directory
WORKDIR /usr/local/src/myscripts

# Set environment variable for renv version
ENV RENV_VERSION 0.14.0

RUN rm /etc/apt/apt.conf.d/default
RUN apt-get update -y
RUN apt-get install -y dpkg-dev zlib1g-dev libssl-dev libffi-dev
# procps is required for nextflow tower
RUN apt-get install -y curl libcurl4-openssl-dev procps
RUN R -e "install.packages('synapser', repos=c('http://ran.synapse.org', 'http://cran.fhcrc.org'))"
# Update apt-get and install system dependencies (only install required)
RUN apt-get update -y && \
apt-get install -y --no-install-recommends \
dpkg-dev zlib1g-dev libssl-dev libffi-dev \
libcurl4-openssl-dev curl procps && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ENV PYTHON /usr/local/lib/R/site-library/PythonEmbedInR/bin/python3.6
# Install R packages including remotes and renv
RUN R -e "install.packages('remotes', repos = 'https://cloud.r-project.org')" && \
R -e "remotes::install_github('rstudio/renv', ref = '${RENV_VERSION}')" || true

RUN R -e "install.packages('remotes', repos = c(CRAN = 'https://cloud.r-project.org'))"
RUN R -e "remotes::install_github('rstudio/renv@${RENV_VERSION}')"
# Install synapser with specific version
RUN R -e "remotes::install_version('synapser', version = '0.11.7', repos = c('http://ran.synapse.org', 'http://cran.fhcrc.org'))"

COPY . .
# Set Python environment variable for R
ENV PYTHON /usr/local/lib/R/site-library/PythonEmbedInR/bin/python3.6

# Copy only renv.lock first to leverage docker cache for dependencies
COPY renv.lock renv.lock

# Restore R environment with renv
RUN R -e "renv::restore()"

# Copy the local project files into the container
COPY . .
57 changes: 48 additions & 9 deletions scripts/references/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,22 +45,26 @@ Usage: update_potential_phi_fields_table.R [options]
Options:
-f SYNID_FILE_SOR, --synid_file_sor=SYNID_FILE_SOR
Synapse ID of Scope of Release file (default: syn22294851)
-t SYNID_TABLE_RED, --synid_table_red=SYNID_TABLE_RED
Synapse ID of table listing variables to redact (default: syn23281483)
-a AUTH, --auth=AUTH
path to .synapseConfig or Synapse PAT (default: standard login precedence)
-d , --dry_run
Whether to dry-run or not.
--production
Whether to run in production mode (uses production project) or not (runs in staging mode and uses staging project).
-h, --help
Show this help message and exit
-c, --comment
Comment for new table snapshot version. This must be unique and is tied to the cohort run.
```

Example run:
Example run in staging mode, using `version3.0.1` as the version comment
for the updated potential PHI fields table:
```
Rscript update_potential_phi_fields_table.R
Rscript update_potential_phi_fields_table.R -c "version3.0.1"
```

## Usage: updating the cBioPortal mapping table
Expand Down Expand Up @@ -94,7 +98,7 @@ Options:

Example run:
```
Rscript update_potential_phi_fields_table.R -v
Rscript update_cbio_mapping.R -v
```

## Usage: updating upload tracking table
Expand Down Expand Up @@ -127,3 +131,38 @@ Example run:
```
Rscript update_date_tracking_table.R -c CRC -d 2022-03-31 -s 'round x update to crc'
```

## Running tests
There are unit tests under `scripts/references/tests`.

1. Pull and run the Docker image associated with this module from [here](https://github.com/Sage-Bionetworks/genie-bpc-pipeline/pkgs/container/genie-bpc-pipeline) on your EC2 instance or local machine.

```bash
docker run -d --name <nickname_for_container> <image_name> /bin/bash -c "while true; do sleep 1; done"
```

2. Do anything you need to do to the container (e.g: copy current local changes)

```bash
docker cp ./. <nickname_for_container>:/usr/local/src/myscripts
```

3. Open a bash session inside the running container

```bash
docker exec -it <nickname_for_container> /bin/bash
```

4. Install the `mockery` and `testthat` packages:

```bash
R -e "remotes::install_cran('mockery')"
R -e "remotes::install_cran('testthat')"
```

5. Run the following in an R session:

```R
library(testthat)
test_dir("/usr/local/src/myscripts/tests")
```
Loading

0 comments on commit d8a8888

Please sign in to comment.