Skip to content

Commit

Permalink
Finish date of initial diagnosis validations
Browse files Browse the repository at this point in the history
  • Loading branch information
kzollove committed Jan 24, 2024
1 parent b43459e commit 11278a9
Show file tree
Hide file tree
Showing 19 changed files with 271 additions and 137 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
OncologyWG.Rproj
DATA_00.csv
.DS_Store
errorReportSql.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ library(jsonlite)
#' @export
#'
#' @examples
getOncAnalysisQueries <- function() {
read.csv(file.path('./inst/csv/onc_analysis_queries.csv'))
# Read the master query-definition table (one row per validation query,
# including query_id and category columns used by getQueryTableName()).
# NOTE(review): path is relative — assumes the working directory is the
# package/project root; confirm against how callers invoke this.
getOncQueries <- function() {
  read.csv(file.path('./inst/csv/onc_queries.csv'))
}


Expand Down Expand Up @@ -51,7 +51,7 @@ getQueryText <- function(queryNumber) {
#'
#' @examples
getExistingQueryNumbers <- function(connectionDetails, scratchDatabaseSchema) {
# REad table names from scratchDatabase
# Read table names from scratchDatabase
# There is no way to translate this operation between dialects, so we will need one implementation per supported database

sql <- render('SHOW TABLES FROM @scratchDatabaseSchema LIKE \'onc_val_*\'',
Expand All @@ -63,6 +63,39 @@ getExistingQueryNumbers <- function(connectionDetails, scratchDatabaseSchema) {
}


#' Get analysis ids already stored in the results table
#'
#' Queries the \code{onc_validation_results} table for the distinct set of
#' \code{analysis_id} values that have already been executed and persisted,
#' so callers can skip re-running them.
#'
#' @param connectionDetails A DatabaseConnector connectionDetails object
#'   describing how to connect to the database.
#' @param resultsDatabaseSchema The schema containing the
#'   \code{onc_validation_results} table.
#'
#' @return The distinct \code{ANALYSIS_ID} values found (empty if the table
#'   has no rows).
#' @export
#'

getExistingAnalysisNumbers <- function(connectionDetails, resultsDatabaseSchema) {
  sql <- render('SELECT DISTINCT analysis_id FROM @resultsDatabaseSchema.onc_validation_results',
                resultsDatabaseSchema = resultsDatabaseSchema)
  conn <- DatabaseConnector::connect(connectionDetails)
  # Guarantee the connection is released even if the query errors.
  on.exit(DatabaseConnector::disconnect(conn))
  DatabaseConnector::querySql(conn, sql = sql)$ANALYSIS_ID
}


#' Build the scratch-table name for a validation query
#'
#' @param queries A data.frame of query definitions, as returned by
#'   \code{getOncQueries()}; must contain \code{query_id} and
#'   \code{category} columns.
#' @param queryNumber The \code{query_id} of the query of interest.
#'
#' @return A character scalar of the form
#'   \code{"onc_val_<category>_<queryNumber>"}, with the category
#'   lower-cased and spaces replaced by underscores.
#'

getQueryTableName <- function(queries, queryNumber) {
  # Compare against the column directly rather than the one-column
  # data.frame (queries['query_id'] == queryNumber produced a logical
  # matrix that was then used as an index — it worked, but obscurely).
  category <- queries$category[queries$query_id == queryNumber]
  formattedCategory <- tolower(gsub(" ", "_", category))
  paste("onc_val", formattedCategory, queryNumber, sep = "_")
}


# DDL Handlers ------------------------------------------------------------

# Create composite analysis table
Expand All @@ -77,9 +110,9 @@ getExistingQueryNumbers <- function(connectionDetails, scratchDatabaseSchema) {
#' @examples
#'

createAnalysisTable <- function(connectionDetails, resultsDatabaseSchema, createTable = TRUE) {
createQueryTable <- function(connectionDetails, resultsDatabaseSchema, createTable = TRUE) {
if (isTRUE(createTable)) {
fp <- file.path('.', 'inst', 'sql', 'onc_validation_analysis_ddl.sql')
fp <- file.path('.', 'inst', 'sql', 'onc_validation_query_ddl.sql')
sql <- readChar(fp, file.info(fp)$size)
rendered <- render(sql, resultsDatabaseSchema = resultsDatabaseSchema)
renderedTranslated <- translate(rendered, targetDialect = connectionDetails$dbms)
Expand Down Expand Up @@ -120,6 +153,20 @@ createResultsTable <- function(connectionDetails, resultsDatabaseSchema, overwri

# Execution Handlers ------------------------------------------------------

#' Title
#'
#' @param connectionDetails
#' @param analysisNumber
#' @param cdmDatabaseSchema
#' @param vocabDatabaseSchema
#' @param scratchDatabaseSchema
#' @param resultsDatabaseSchema
#'
#' @return
#' @export
#'

# TODO this appends even if analysis_id already exists... it should replace existing
executeAnalysis <- function(connectionDetails,
analysisNumber,
cdmDatabaseSchema = cdmDatabaseSchema,
Expand All @@ -131,11 +178,30 @@ executeAnalysis <- function(connectionDetails,

composites <- fromJSON(composites_path)$composite_analyses

requisiteAnalysisNumbers <- composites$composite_analyses[composites['analysis_id'] == analysisNumber][[1]]

existingAnalysisNumbers <- getExistingAnalysisNumbers(connectionDetails, resultsDatabaseSchema)

missingAnalyses <- requisiteAnalysisNumbers[which(!requisiteAnalysisNumbers %in% existingAnalysisNumbers)]

if (length(missingAnalyses)) {
message("Executing required analyses")
lapply(missingAnalyses, function(x) {
executeAnalysis(connectionDetails,
analysisNumber = x,
cdmDatabaseSchema = cdmDatabaseSchema,
vocabDatabaseSchema = vocabDatabaseSchema,
scratchDatabaseSchema = scratchDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema)
})
}
message("All requisite analyses executed.")

requisiteQueryNumbers <- composites$queries[composites['analysis_id'] == analysisNumber][[1]]

existingQueryNumbers <- getExistingQueryNumbers(connectionDetails, scratchDatabaseSchema)

missingQueries <- which(!requisiteQueryNumbers %in% existingQueryNumbers)
missingQueries <- requisiteQueryNumbers[which(!requisiteQueryNumbers %in% existingQueryNumbers)]

if (length(missingQueries)) {
message("Executing required queries")
Expand All @@ -151,10 +217,11 @@ executeAnalysis <- function(connectionDetails,


renderedAnalysisText <- render(getAnalysisText(analysisNumber),
scratchDatabaseSchema = scratchDatabaseSchema)
scratchDatabaseSchema = scratchDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema)

renderedInsertAnalysisText <- render(getAnalysisText('analysisInsert'),
insertSchema = scratchDatabaseSchema,
insertSchema = resultsDatabaseSchema,
analysisText = renderedAnalysisText)

translatedRenderedInsertAnalysisText <- translate(renderedInsertAnalysisText, targetDialect = connectionDetails$dbms)
Expand Down Expand Up @@ -184,9 +251,9 @@ executeQuery <- function(connectionDetails,
vocabDatabaseSchema = vocabDatabaseSchema,
scratchDatabaseSchema = scratchDatabaseSchema) {

queries <- getOncAnalysisQueries()
queries <- getOncQueries()

queryTableName <- paste("onc_val", queries$category[queries['granular_analysis_id'] == queryNumber], queryNumber, sep ='_')
queryTableName <- getQueryTableName(queries, queryNumber)


renderedQueryText <- render(getQueryText(queryNumber),
Expand Down Expand Up @@ -232,22 +299,22 @@ scratchDatabaseSchema <- resultsDatabaseSchema <- 'ctsi.kzollo'



createAnalysisTable(connectionDetails, resultsDatabaseSchema)
createQueryTable(connectionDetails, resultsDatabaseSchema)

createResultsTable(connectionDetails, resultsDatabaseSchema)
createResultsTable(connectionDetails, resultsDatabaseSchema, overwrite = TRUE)


oncAnalysisQueriesCsv <- getOncAnalysisQueries()
oncAnalysisQueriesCsv <- oncAnalysisQueriesCsv[, -c(2, 3)]
oncQueriesCsv <- getOncQueries()
oncQueriesCsv <- oncQueriesCsv[, -c(2, 3)]

conn <- DatabaseConnector::connect(connectionDetails)

# Populate the onc_validation_query table with the query definitions read from the CSV above
DatabaseConnector::insertTable(
connection = connection,
connection = conn,
databaseSchema = resultsDatabaseSchema,
tableName = "ONC_VALIDATION_ANALYSIS",
data = oncAnalysisQueriesCsv,
tableName = "onc_validation_query",
data = oncQueriesCsv,
dropTableIfExists = FALSE,
createTable = FALSE,
tempTable = FALSE
Expand All @@ -258,7 +325,7 @@ DatabaseConnector::disconnect(conn)

# User asks: How many cancer diagnosis records are in my data?

executeAnalysis(connectionDetails, analysisNumber = 2,
executeAnalysis(connectionDetails, analysisNumber = 1001,
cdmDatabaseSchema = cdmDatabaseSchema,
vocabDatabaseSchema = vocabDatabaseSchema,
scratchDatabaseSchema = scratchDatabaseSchema,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"granular_analysis_id","distribution","distributed_field","analysis_name","stratum_1_name","stratum_2_name","stratum_3_name","stratum_4_name","stratum_5_name","is_default","category"
"query_id","distribution","distributed_field","query_name","stratum_1_name","stratum_2_name","stratum_3_name","stratum_4_name","stratum_5_name","is_default","category"
"1","0",,"Person_ids of persons with cancer diagnosis",,,,,,"1","General"
"2","0",,"Condition_occurrence_ids of cancer diagnoses",,,,,,"1","General"
"10","0",,"Measurement_ids of date of initial diagnosis modifier records",,,,,,"1","Date of Initial Diagnosis"
Expand Down
20 changes: 14 additions & 6 deletions validationScripts/inst/description.MD
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
Trying to figure out how to manage analyses/queries..
# Oncology Validation Framework: Analyses and Queries

I'm toying with two styles: 1) Achilles, verbatim. 2) Based off Achilles for storage, but growing increasingly different
## Overview
The analyses in the Oncology Validation Framework are meant to be extensible. Practically, this means that OncologyWG developers should be able to create these analyses to support the quality control needs of specific studies and study authors. This document outlines how analyses and queries are managed in this directory. Further, it explains how analyses are built from queries and demonstrates how to do this with examples.

Benefits of Achilles are that the analyses could all be added back into Achilles. This is a big deal as then they could integrate with Atlas or DQD and be exported to Ares where they could have distinct visualizations.
## Definitions

Downside of Achilles is that it is a little restrictive and may be a bit off the mark from what I understand we are trying to do.
Definitions for the major terms to know

Benefits of the (increasingly) ad hoc approach is that we can make the queries as granular as we would like and then combine them together to get answers to different questions.
### Analyses

Also, keeping them granular makes it possible (easier) to track the records of interest (e.g. mal-formed measurement records)
### Queries

## Directory Structure

### csv

### json

### sql
11 changes: 9 additions & 2 deletions validationScripts/inst/json/onc_analysis_composite.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,23 @@
},
{
"analysis_id": "15",
"analysis_name": "Number of date of initial diagnosis modifier records that come from tumor registry data source",
"analysis_name": "Number of date of initial diagnosis modifier records by data source",
"analysis_type": "count",
"queries": [15],
"composite_analyses": []
},
{
"analysis_id": "150",
"analysis_name": "Number of date of initial diagnosis modifier records that come from tumor registry data source",
"analysis_type": "count",
"queries": [],
"composite_analyses": [15]
},
{
"analysis_id": "1001",
"analysis_name": "Number of poorly-formed date of initial diagnosis modifier records",
"analysis_type": "derived",
"queries": [2, 12, 13, 14],
"queries": [12, 13, 14],
"composite_analyses": []
},
{
Expand Down
5 changes: 2 additions & 3 deletions validationScripts/inst/sql/composite_analyses/10.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,5 @@

select 10 as analysis_id,
cast(null as varchar(255)) as stratum_1, cast(null as varchar(255)) as stratum_2, cast(null as varchar(255)) as stratum_3, cast(null as varchar(255)) as stratum_4, cast(null as varchar(255)) as stratum_5,
COUNT_BIG(distinct measurement_id) as count_value
FROM @cdmDatabaseSchema.measurement m
WHERE m.measurement_concept_id = 734306 -- Initial Diagnosis
COUNT_BIG(*) as count_value
FROM @scratchDatabaseSchema.onc_val_date_of_initial_diagnosis_10
20 changes: 19 additions & 1 deletion validationScripts/inst/sql/composite_analyses/1001.sql
Original file line number Diff line number Diff line change
@@ -1 +1,19 @@
-- 1001 Number of poorly-formed date of initial diagnosis modifier records
-- 1001 Number of poorly-formed date of initial diagnosis modifier records

-- Count distinct measurement_ids flagged as poorly formed by queries 12,
-- 13 and 14. UNION ALL (not UNION) is fine here because the outer
-- COUNT_BIG uses DISTINCT, and it avoids a redundant de-duplication pass.
select 1001 as analysis_id,
cast(null as varchar(255)) as stratum_1, cast(null as varchar(255)) as stratum_2, cast(null as varchar(255)) as stratum_3, cast(null as varchar(255)) as stratum_4, cast(null as varchar(255)) as stratum_5,
COUNT_BIG(DISTINCT measurement_id) as count_value
FROM (
  SELECT measurement_id
  FROM @scratchDatabaseSchema.onc_val_date_of_initial_diagnosis_12

  UNION ALL

  SELECT measurement_id
  FROM @scratchDatabaseSchema.onc_val_date_of_initial_diagnosis_13

  UNION ALL

  SELECT measurement_id
  FROM @scratchDatabaseSchema.onc_val_date_of_initial_diagnosis_14
) poorly_formed -- derived tables require an alias on SQL Server (COUNT_BIG implies this dialect) and most targets
23 changes: 22 additions & 1 deletion validationScripts/inst/sql/composite_analyses/1002.sql
Original file line number Diff line number Diff line change
@@ -1 +1,22 @@
-- 1002 Proportion of poorly-formed date of initial diagnosis modifier records
-- 1002 Proportion of poorly-formed date of initial diagnosis modifier records

-- Proportion of poorly-formed date of initial diagnosis modifier records.
-- stratum_1 = proportion (NULL when the denominator is 0), stratum_2 =
-- numerator, stratum_3 = denominator; count_value = 1 iff any poorly-formed
-- records exist (sign of the numerator).
select 1002 as analysis_id,
CASE WHEN num_cancer_diagnoses != 0 THEN
  cast(cast(1.0*num_poorly_formed as float)/CAST(num_cancer_diagnoses as float) as varchar(255))
ELSE
  cast(null as varchar(255)) END as stratum_1,
cast(num_poorly_formed as varchar(255)) as stratum_2,
cast(num_cancer_diagnoses as varchar(255)) as stratum_3,
cast(null as varchar(255)) as stratum_4, cast(null as varchar(255)) as stratum_5,
sign(num_poorly_formed) as count_value
FROM (
  SELECT (
    -- NOTE(review): scalar subquery — assumes at most one results row per
    -- analysis_id; confirm the results table enforces that.
    SELECT count_value
    FROM @resultsDatabaseSchema.onc_validation_results
    WHERE analysis_id = 1001
  ) AS num_poorly_formed, (
    SELECT count_value
    FROM @resultsDatabaseSchema.onc_validation_results
    WHERE analysis_id = 2
  ) AS num_cancer_diagnoses
) counts -- derived tables require an alias on SQL Server and most targets
5 changes: 2 additions & 3 deletions validationScripts/inst/sql/composite_analyses/11.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,5 @@

select 11 as analysis_id,
cast(null as varchar(255)) as stratum_1, cast(null as varchar(255)) as stratum_2, cast(null as varchar(255)) as stratum_3, cast(null as varchar(255)) as stratum_4, cast(null as varchar(255)) as stratum_5,
COUNT_BIG(distinct person_id) as count_value
FROM @cdmDatabaseSchema.measurement m
WHERE m.measurement_concept_id = 734306 -- Initial Diagnosis
COUNT_BIG(*) as count_value
FROM @scratchDatabaseSchema.onc_val_date_of_initial_diagnosis_11
Loading

0 comments on commit 11278a9

Please sign in to comment.