opensafely · ZoeMZou · Jan 7, 2025 · Jan 7, 2025 · Jan 7, 2025 · Jan 15, 2025
diff --git a/analysis/active_analyses.R b/analysis/active_analyses.R
@@ -65,14 +65,14 @@ core_covars <- c(
 
 ## Define project-specific covariates (specific to respiratory project) ----
 project_covars <- c(
-  "cov_bin_history_pneumonia_snomed", "cov_bin_history_asthma_snomed", 
-  "cov_bin_history_pulmonary_fibrosis_snomed", "cov_bin_all_stroke"
+  "cov_bin_history_pneumonia", "cov_bin_history_asthma", 
+  "cov_bin_history_pulmonary_fibrosis", "cov_bin_stroke_isch"
 )
 # Combine covariates into a single vector ----
 all_covars <- c(core_covars, project_covars)
 
 ## Combine covariates into a single string for analysis ----
-preex_FALSE_covars <- paste0(all_covars[!all_covars %in% c("cov_bin_history_asthma_snomed", "cov_bin_history_copd")], collapse = ";")
+preex_FALSE_covars <- paste0(all_covars[!all_covars %in% c("cov_bin_history_asthma", "cov_bin_history_copd")], collapse = ";")
 all_covars <- paste0(c(core_covars, project_covars), collapse = ";")
 
 # Specify cohorts ----

diff --git a/analysis/create_project_actions.R b/analysis/create_project_actions.R
@@ -94,13 +94,34 @@ preprocess_data <- function(cohort){
         describe_venn = glue("output/describe_venn_{cohort}.txt")
       ),
       highly_sensitive = list(
-        cohort = glue("output/input_{cohort}.rds"),
+        cohort = glue("output/input_{cohort}_0.rds"),
         venn = glue("output/venn_{cohort}.rds")
       )
     )
   )
 }
 
+# Create function to data cleaning -------------------------------------------
+
+data_cleaning <- function(cohort){
+  splice(
+    comment(glue("Data cleaning - {cohort}")),
+    action(
+      name = glue("data_cleaning_{cohort}"),
+      run = glue("r:latest analysis/data_cleaning/data_cleaning.R"),
+      arguments = c(cohort),
+      needs = list("vax_eligibility_inputs",glue("preprocess_data_{cohort}")),
+      moderately_sensitive = list(
+        consort = glue("output/consort_{cohort}.csv"),
+        consort_midpoint6 = glue("output/consort_{cohort}_midpoint6.csv")
+      ),
+      highly_sensitive = list(
+        cohort = glue("output/input_{cohort}.rds")
+      )
+    )
+  )
+}
+
 # Define and combine all actions into a list of actions ------------------------------0
 
 actions_list <- splice(
@@ -153,8 +174,16 @@ actions_list <- splice(
                   function(x) preprocess_data(cohort = x)), 
            recursive = FALSE
     )
-  )
-)
+  ),
+
+  ## Preprocess data -----------------------------------------------------------
+
+  splice(
+    unlist(lapply(cohorts, 
+                  function(x) data_cleaning(cohort = x)), 
+           recursive = FALSE
+    )
+  ))
 
 
 # Combine actions into project list --------------------------------------------

diff --git a/analysis/data_cleaning/data_cleaning.R b/analysis/data_cleaning/data_cleaning.R
@@ -0,0 +1,109 @@
+# Load libraries ---------------------------------------------------------------
+print('Load libraries')
+
+library(dplyr)
+library(tictoc)
+library(readr)
+library(tidyr)
+library(stringr)
+library(ggplot2)
+library(jsonlite)
+library(here)
+library(arrow)
+
+# Specify redaction threshold --------------------------------------------------
+print('Specify redaction threshold')
+
+threshold <- 6
+
+# Source common functions ------------------------------------------------------
+print('Source common functions')
+
+source("analysis/utility.R")
+source("analysis/data_cleaning/fn-qa.R")
+source("analysis/data_cleaning/fn-inex.R")
+source("analysis/data_cleaning/fn-ref.R")
+# Specify command arguments ----------------------------------------------------
+print('Specify command arguments')
+
+args <- commandArgs(trailingOnly=TRUE)
+
+if(length(args)==0){
+  cohort <- "vax"
+} else {
+  cohort <- args[[1]]
+}
+
+# Load json file containing vax study dates ------------------------------------
+print('Load json file containing vax study dates')
+
+study_dates <- fromJSON("output/study_dates.json")
+
+# Specify relevant dates -------------------------------------------------------
+print('Specify relevant dates')
+
+vax_start_date <- as.Date(study_dates$vax1_earliest, format="%Y-%m-%d")
+mixed_vax_threshold <- as.Date("2021-05-07")
+start_date_delta <- as.Date(study_dates$delta_date, format="%Y-%m-%d")
+
+# Load cohort data -------------------------------------------------------------
+print('Load cohort data')
+
+input <- read_rds(file.path("output", paste0("input_",cohort,"_0.rds")))
+print(paste0(cohort,  " cohort: ", nrow(input), " rows in the input file"))
+
+# Set reference levels for factors----------------------------------------------
+
+print('Call reference function')
+
+input <- ref(input)$input
+
+# Quality assurance ------------------------------------------------------------
+
+print('Call quality assurance function')
+qa_results <- qa(input, study_dates)
+input <- qa_results$input
+consort <- qa_results$consort
+
+# Inclusion criteria -----------------------------------------------------------
+
+print('Call inclusion criteria function')
+inex_results <- inex(input, consort, cohort, vax_start_date, mixed_vax_threshold, start_date_delta)
+input <- inex_results$input
+consort <- inex_results$consort
+
+# Save consort data after Inclusion criteria
+print('Saving consort data after Inclusion criteria')
+
+consort$N <- as.numeric(consort$N)
+consort$removed <- dplyr::lag(consort$N, default = dplyr::first(consort$N)) - consort$N
+
+write.csv(consort, file = paste0("output/consort_", cohort, ".csv"), row.names = FALSE)
+
+# Perform redaction
+print('Performing redaction')
+
+consort$removed <- NULL
+consort$N_midpoint6 <- roundmid_any(consort$N, to=threshold)
+consort$removed_derived <- dplyr::lag(consort$N_midpoint6, default = dplyr::first(consort$N_midpoint6)) - consort$N_midpoint6
+consort$N <- NULL
+
+# Save rounded consort data
+print('Saving rounded consort data after Inclusion criteria')
+
+write.csv(consort, file = paste0("output/consort_", cohort, "_midpoint6.csv"), row.names = FALSE)
+
+# Save the dataset 
+print('Saving dataset after Inclusion criteria')
+
+input <- input[, c("patient_id", "index_date",
+                  colnames(input)[grepl("end_date_", colnames(input))],
+                  colnames(input)[grepl("sub_", colnames(input))],
+                  colnames(input)[grepl("exp_", colnames(input))],
+                  colnames(input)[grepl("out_", colnames(input))],
+                  colnames(input)[grepl("cov_", colnames(input))],
+                  colnames(input)[grepl("cens_", colnames(input))],
+                  colnames(input)[grepl("vax_date_", colnames(input))],
+                  colnames(input)[grepl("vax_cat_", colnames(input))])]
+
+saveRDS(input, file = paste0("output/input_", cohort, ".rds"), compress = TRUE)
diff --git a/analysis/data_cleaning/fn-inex.R b/analysis/data_cleaning/fn-inex.R
@@ -0,0 +1,142 @@
+# Function to apply inclusion criteria
+
+inex <- function(input, consort, cohort, vax_start_date, mixed_vax_threshold, start_date_delta) {
+
+  print('Inclusion criteria: Alive at index')
+
+  input <- subset(input, input$inex_bin_alive == TRUE) # Subset input if alive at index.
+  consort[nrow(consort)+1,] <- c("Inclusion criteria: Alive at index",
+                                 nrow(input))
+
+  print('Inclusion criteria: Known age 18 or over at index')
+
+  input <- subset(input, input$cov_num_age >= 18) # Subset input if age between 18 and 110 at index.
+  consort[nrow(consort)+1,] <- c("Inclusion criteria: Known age 18 or over at index",
+                                 nrow(input))
+
+  print('Inclusion criteria: Known age 110 or under at index')
+
+  input <- subset(input, input$cov_num_age <= 110) # Subset input if age between 18 and 110 on 01/06/2021.
+  consort[nrow(consort)+1,] <- c("Inclusion criteria: Known age 110 or under at index",
+                                 nrow(input))
+
+  print('Inclusion criteria: Known sex at index')
+
+  input <- input[!is.na(input$cov_cat_sex),] # removes NAs, if any
+  input$cov_cat_sex <- relevel(input$cov_cat_sex, ref = "Female")
+  consort[nrow(consort)+1,] <- c("Inclusion criteria: Known sex at index",
+                                 nrow(input))
+
+  print('Inclusion criteria: Known IMD at index')
+
+  input <- input[!is.na(input$cov_cat_imd),] # removes NAs, if any
+  input$cov_cat_imd <- ordered(input$cov_cat_imd, 
+                               levels = c("1 (most deprived)","2","3","4","5 (least deprived)"))
+  consort[nrow(consort)+1,] <- c("Inclusion criteria: Known IMD at index",
+                                 nrow(input))
+
+  print('Inclusion criteria: Continuous registration with the same practice for at least six months up to and including the index date')
+
+  input <- subset(input, input$inex_bin_6m_reg == TRUE)
+  consort[nrow(consort)+1,] <- c("Inclusion criteria: Continuous registration with the same practice for at least six months up to and including the index date",
+                                 nrow(input))
+
+  print('Inclusion criteria: Known region at index')
+
+  input <- input %>% mutate(cov_cat_region = as.character(cov_cat_region)) %>%
+    filter(cov_cat_region != "Missing")%>%
+    mutate(cov_cat_region = as.factor(cov_cat_region))
+  input$cov_cat_region <- relevel(input$cov_cat_region, ref = "East")
+  consort[nrow(consort)+1,] <- c("Inclusion criteria: Known region at index",
+                                 nrow(input))
+# Apply cohort specific inclusion criteria -------------------------------------
+  print('Apply cohort specific inclusion criteria')
+
+  if (cohort == "vax") {
+    print('Inclusion criteria: Record of two vaccination doses prior to the study end date')
+
+    input$vax_gap <- input$vax_date_covid_2 - input$vax_date_covid_1 #Determine the vaccination gap in days : gap is NA if any vaccine date is missing
+    input <- input[!is.na(input$vax_gap),] # Subset the fully vaccinated group
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Record of two vaccination doses prior to the study end date",
+                                    nrow(input))
+
+    print('Inclusion criteria: Did not receive a vaccination prior to 08-12-2020 (i.e., the start of the vaccination program)')
+
+    input <- subset(input, input$vax_date_covid_1 >= vax_start_date & input$vax_date_covid_2 >= vax_start_date)
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not receive a vaccination prior to 08-12-2020 (i.e., the start of the vaccination program)",
+                                    nrow(input))
+
+    print('Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination')
+
+    input <- subset(input, input$vax_gap >= 0) # Keep those with positive vaccination gap
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination",
+                                    nrow(input))
+
+    print('Inclusion criteria: Did not recieve a second dose vaccination less than three weeks after their first dose')
+
+    input <- subset(input, input$vax_gap >= 21) # Keep those with at least 3 weeks vaccination gap
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination",
+                                    nrow(input))
+
+    print('Inclusion criteria: Did not recieve a mixed vaccine products before 07-05-2021')
+
+    input <- input %>%
+      mutate(
+        AZ_date = as.numeric(ifelse(vax_date_AstraZeneca_1 < mixed_vax_threshold, 1,
+                          ifelse(vax_date_AstraZeneca_2 < mixed_vax_threshold, 1,
+                                 ifelse(vax_date_AstraZeneca_3 < mixed_vax_threshold, 1, 0)))),
+        Moderna_date = as.numeric(ifelse(vax_date_Moderna_1 < mixed_vax_threshold, 1,
+                              ifelse(vax_date_Moderna_2 < mixed_vax_threshold, 1,
+                                     ifelse(vax_date_Moderna_3 < mixed_vax_threshold, 1, 0)))),
+        Pfizer_date = as.numeric(ifelse(vax_date_Pfizer_1 < mixed_vax_threshold, 1,
+                              ifelse(vax_date_Pfizer_2 < mixed_vax_threshold, 1,
+                                     ifelse(vax_date_Pfizer_3 < mixed_vax_threshold, 1, 0))))
+      ) %>%
+      rowwise() %>%
+      mutate(vax_mixed = sum(c_across(c(AZ_date, Moderna_date, Pfizer_date)), na.rm = TRUE)) %>%
+      ungroup() %>%
+      dplyr::filter(vax_mixed < 2)
+
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a mixed vaccine products before 07-05-2021",
+                                    nrow(input))
+
+    print('Inclusion criteria: Index date is before cohort end date')
+
+    # Will remove anyone who is not fully vaccinated by the cohort end date
+
+    input <- input %>% filter (!is.na(index_date) & index_date <= end_date_exposure & index_date >= start_date_delta)
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Index date is before cohort end date",
+                                    nrow(input))
+
+  } else if (cohort == "unvax"){
+
+    print('Inclusion criteria: Does not have a record of one or more vaccination prior index date')
+
+    # i.e. Have a record of a first vaccination prior to index date
+    # (no more vax 2 and 3 variables available in this dataset)
+    # a.Determine the vaccination status on index start date
+
+    input$prior_vax1 <- ifelse(input$vax_date_covid_1 <= input$index_date, 1,0)
+    input$prior_vax1[is.na(input$prior_vax1)] <- 0
+    input <- subset(input, input$prior_vax1 == 0) # Exclude people with prior vaccination
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Does not have a record of one or more vaccination prior index date",
+                                    nrow(input))
+
+    print('Inclusion criteria: Not missing JCVI group')
+
+    input <- subset(input, vax_cat_jcvi_group == "01" | vax_cat_jcvi_group == "02" | vax_cat_jcvi_group == "03" | vax_cat_jcvi_group == "04" |
+                        vax_cat_jcvi_group == "05" | vax_cat_jcvi_group == "06" | vax_cat_jcvi_group == "07" | vax_cat_jcvi_group == "08" |
+                        vax_cat_jcvi_group == "09" | vax_cat_jcvi_group == "10" | vax_cat_jcvi_group == "11" | vax_cat_jcvi_group == "12")
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Not missing JCVI group",
+                                    nrow(input))
+
+    print('Inclusion criteria: Index date is not before cohort end date - will remove anyone whose eligibility date + 84 days is after study end date (only those with unknown JCVI group)')
+
+    input <- input %>% filter (!is.na(index_date) & index_date <= end_date_exposure & index_date >= start_date_delta)
+    consort[nrow(consort)+1,] <- c("Inclusion criteria: Index date is not before cohort end date - will remove anyone whose eligibility date + 84 days is after study end date (only those with unknown JCVI group)",
+                                    nrow(input))
+
+  }
+
+  return(list(input = input, consort = consort))
+}
diff --git a/analysis/data_cleaning/fn-qa.R b/analysis/data_cleaning/fn-qa.R
@@ -0,0 +1,55 @@
+# Function to apply quality assurance
+qa <- function(input, study_dates) {
+
+  # Specify consort table --------------------------------------------------------
+
+  print('Specify consort table')
+
+  consort <- data.frame(Description = "Input", 
+                        N = nrow(input),
+                        stringsAsFactors = FALSE)
+
+  print('Quality assurance: Year of birth is after year of death or patient only has year of death')
+
+  input <- input[!((input$qa_num_birth_year > (format(input$cens_date_death, format="%Y")) & 
+                      is.na(input$qa_num_birth_year)== FALSE & is.na(input$cens_date_death) == FALSE) | 
+                     (is.na(input$qa_num_birth_year)== TRUE & is.na(input$cens_date_death) == FALSE)),]
+  consort[nrow(consort)+1,] <- c("Quality assurance: Year of birth is after year of death or patient only has year of death",
+                                 nrow(input))
+
+  print('Quality assurance: Year of birth is before 1793 or year of birth exceeds current date')
+
+  input <- input[!((input$qa_num_birth_year < 1793 | 
+                      (input$qa_num_birth_year >format(Sys.Date(),"%Y"))) & 
+                     is.na(input$qa_num_birth_year) == FALSE),]
+  consort[nrow(consort)+1,] <- c("Quality assurance: Year of birth is before 1793 or year of birth exceeds current date",
+                                 nrow(input))
+
+  print('Quality assurance: Date of death is invalid (on or before 1/1/1900 or after current date)')
+
+  input <- input[!((input$cens_date_death <= as.Date(study_dates$earliest_expec) | 
+                      input$cens_date_death > format(Sys.Date(),"%Y-%m-%d")) & is.na(input$cens_date_death) == FALSE),]
+  consort[nrow(consort)+1,] <- c("Quality assurance: Date of death is invalid (on or before 1/1/1900 or after current date)",
+                                 nrow(input))
+
+  print('Quality assurance: Pregnancy/birth codes for men')
+
+  input <- input[!(input$qa_bin_pregnancy == TRUE & input$cov_cat_sex=="Male") | is.na(input$cov_cat_sex),]
+  consort[nrow(consort)+1,] <- c("Quality assurance: Pregnancy/birth codes for men",
+                                 nrow(input))
+
+  print('Quality assurance: HRT or COCP meds for men')
+
+  input <- input[!(input$cov_cat_sex=="Male" & input$qa_bin_hrtcocp==TRUE) | is.na(input$cov_cat_sex),]
+  consort[nrow(consort)+1,] <- c("Quality assurance: HRT or COCP meds for men",
+                                 nrow(input))
+
+  print('Quality assurance: Prostate cancer codes for women')
+
+  input <- input[!(input$qa_bin_prostate_cancer == TRUE & 
+                     input$cov_cat_sex=="Female") | is.na(input$cov_cat_sex),]
+  consort[nrow(consort)+1,] <- c("Quality assurance: Prostate cancer codes for women",
+                                 nrow(input))
+
+  return(list(input = input, consort = consort))
+}