generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
448 additions
and
368 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
# Load libraries --------------------------------------------------------------- | ||
print('Load libraries') | ||
|
||
library(dplyr) | ||
library(tictoc) | ||
library(readr) | ||
library(tidyr) | ||
library(stringr) | ||
library(ggplot2) | ||
library(jsonlite) | ||
library(here) | ||
library(arrow) | ||
|
||
# Specify redaction threshold -------------------------------------------------- | ||
print('Specify redaction threshold') | ||
|
||
threshold <- 6 | ||
|
||
# Source common functions ------------------------------------------------------ | ||
print('Source common functions') | ||
|
||
source("analysis/utility.R") | ||
source("analysis/data_cleaning/fn-qa.R") | ||
source("analysis/data_cleaning/fn-inex.R") | ||
# Specify command arguments ---------------------------------------------------- | ||
print('Specify command arguments') | ||
|
||
args <- commandArgs(trailingOnly=TRUE) | ||
|
||
if(length(args)==0){ | ||
cohort <- "prevax" | ||
} else { | ||
cohort <- args[[1]] | ||
} | ||
|
||
# Load json file containing vax study dates ------------------------------------ | ||
print('Load json file containing vax study dates') | ||
|
||
study_dates <- fromJSON("output/study_dates.json") | ||
|
||
# Specify relevant dates ------------------------------------------------------- | ||
print('Specify relevant dates') | ||
|
||
vax_start_date <- as.Date(study_dates$vax1_earliest, format="%Y-%m-%d") | ||
mixed_vax_threshold <- as.Date("2021-05-07") | ||
start_date_delta <- as.Date(study_dates$delta_date, format="%Y-%m-%d") | ||
end_date_delta <- as.Date(study_dates$omicron_date, format="%Y-%m-%d") | ||
|
||
# Load cohort data ------------------------------------------------------------- | ||
print('Load cohort data') | ||
|
||
input <- read_rds(file.path("output", paste0("input_",cohort,".rds"))) | ||
print(paste0(cohort, " cohort: ", nrow(input), " rows in the input file")) | ||
|
||
# Handle missing values -------------------------------------------------------- | ||
print('Handle missing values') | ||
|
||
input$cov_cat_smoking_status <- replace(input$cov_cat_smoking_status, | ||
is.na(input$cov_cat_smoking_status), | ||
"M") | ||
|
||
input <- input %>% | ||
mutate(cov_cat_region = as.character(cov_cat_region)) %>% | ||
mutate(cov_cat_region = replace_na(cov_cat_region, "Missing")) %>% | ||
mutate(cov_cat_region = as.factor(cov_cat_region)) | ||
|
||
# Set reference levels for factors --------------------------------------------- | ||
print('Set reference levels for factors') | ||
|
||
cat_factors <- colnames(input)[grepl("_cat_",colnames(input))] | ||
input[,cat_factors] <- lapply(input[,cat_factors], function(x) factor(x, ordered = FALSE)) | ||
|
||
# Set reference level for variable: sub_cat_covid19_hospital ------------------- | ||
print('Set reference level for variable: sub_cat_covid19_hospital') | ||
|
||
input$sub_cat_covid19_hospital <- ordered(input$sub_cat_covid19_hospital, | ||
levels = c("non_hospitalised", | ||
"hospitalised", | ||
"no_infection")) | ||
|
||
# Set reference level for variable: cov_cat_ethnicity -------------------------- | ||
print('Set reference level for variable: cov_cat_ethnicity') | ||
|
||
levels(input$cov_cat_ethnicity) <- list("Missing" = "0", "White" = "1", | ||
"Mixed" = "2", "South Asian" = "3", | ||
"Black" = "4", "Other" = "5") | ||
|
||
input$cov_cat_ethnicity <- ordered(input$cov_cat_ethnicity, | ||
levels = c("White","Mixed", | ||
"South Asian","Black", | ||
"Other","Missing")) | ||
|
||
# Set reference level for variable: cov_cat_imd ------------------------------- | ||
print('Set reference level for variable: cov_cat_imd') | ||
|
||
input$cov_cat_imd <- ordered(input$cov_cat_imd, | ||
levels = c("1 (most deprived)","2","3","4","5 (least deprived)")) | ||
|
||
# Set reference level for variable: cov_cat_region ----------------------------- | ||
print('Set reference level for variable: cov_cat_region') | ||
|
||
input$cov_cat_region <- relevel(input$cov_cat_region, ref = "East") | ||
|
||
# Set reference level for variable: cov_cat_smoking_status --------------------- | ||
print('Set reference level for variable: cov_cat_smoking_status') | ||
|
||
levels(input$cov_cat_smoking_status) <- list("Ever smoker" = "E", "Missing" = "M", "Never smoker" = "N", "Current smoker" = "S") | ||
|
||
input$cov_cat_smoking_status <- ordered(input$cov_cat_smoking_status, levels = c("Never smoker","Ever smoker","Current smoker","Missing")) | ||
|
||
# Set reference level for variable: cov_cat_sex -------------------------------- | ||
print('Set reference level for variable: cov_cat_sex') | ||
|
||
levels(input$cov_cat_sex) <- list("Female" = "female", "Male" = "male", "Unknown" = "unknown") | ||
|
||
# # Replace NA values with "Unknown" to ensure patients with missing sex | ||
# # are not evaluated entirely to NA during quality assurance steps | ||
|
||
input$cov_cat_sex <- replace(input$cov_cat_sex, is.na(input$cov_cat_sex), "Unknown") | ||
|
||
|
||
input$cov_cat_sex <- factor(input$cov_cat_sex, | ||
levels = c("Female", "Male", "Unknown")) | ||
input$cov_cat_sex <- relevel(input$cov_cat_sex, ref = "Female") | ||
|
||
|
||
# Set reference level for variable: vax_cat_jcvi_group ------------------------- | ||
print('Set reference level for variable: vax_cat_jcvi_group') | ||
|
||
input$vax_cat_jcvi_group <- ordered(input$vax_cat_jcvi_group, | ||
levels = c("12","11","10", | ||
"09","08","07", | ||
"06","05","04", | ||
"03","02","01","99")) | ||
|
||
# Set reference level for binaries --------------------------------------------- | ||
print('Set reference level for binaries') | ||
|
||
bin_factors <- colnames(input)[grepl("cov_bin_",colnames(input))] | ||
|
||
input[,bin_factors] <- lapply(input[,bin_factors], | ||
function(x) factor(x, levels = c("FALSE","TRUE"))) | ||
|
||
# Specify consort table -------------------------------------------------------- | ||
print('Specify consort table') | ||
|
||
consort <- data.frame(Description = "Input", | ||
N = nrow(input), | ||
stringsAsFactors = FALSE) | ||
|
||
# Quality assurance ------------------------------------------------------------ | ||
|
||
print('Call quality assurance function') | ||
qa_results <- qa(input, study_dates, consort, cohort, threshold) | ||
input <- qa_results$input | ||
consort <- qa_results$consort | ||
|
||
# Inclusion criteria ----------------------------------------------------------- | ||
|
||
print('Call inclusion criteria function') | ||
inex(input, consort, cohort, threshold) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
inex <- function(input, consort, cohort, threshold) { | ||
print('Load libraries') | ||
|
||
library(dplyr) | ||
library(tictoc) | ||
library(readr) | ||
library(tidyr) | ||
library(stringr) | ||
library(ggplot2) | ||
library(jsonlite) | ||
library(here) | ||
library(arrow) | ||
|
||
print('Source common functions') | ||
source("analysis/utility.R") | ||
|
||
print('Inclusion criteria: Alive at index') | ||
|
||
input <- subset(input, input$inex_bin_alive == TRUE) # Subset input if alive at index. | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Alive at index", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Known age 18 or over at index') | ||
|
||
input <- subset(input, input$cov_num_age >= 18) # Subset input if age between 18 and 110 at index. | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known age 18 or over at index", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Known age 110 or under at index') | ||
|
||
input <- subset(input, input$cov_num_age <= 110) # Subset input if age between 18 and 110 on 01/06/2021. | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known age 110 or under at index", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Known sex at index') | ||
|
||
input <- input[!is.na(input$cov_cat_sex),] # removes NAs, if any | ||
input$cov_cat_sex <- relevel(input$cov_cat_sex, ref = "Female") | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known sex at index", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Known IMD at index') | ||
|
||
input <- input[!is.na(input$cov_cat_imd),] # removes NAs, if any | ||
input$cov_cat_imd <- ordered(input$cov_cat_imd, | ||
levels = c("1 (most deprived)","2","3","4","5 (least deprived)")) | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known IMD at index", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Continuous registration with the same practice for at least six months up to and including the index date') | ||
|
||
input <- subset(input, input$inex_bin_6m_reg == TRUE) | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Continuous registration with the same practice for at least six months up to and including the index date", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Known region at index') | ||
|
||
input <- input %>% mutate(cov_cat_region = as.character(cov_cat_region)) %>% | ||
filter(cov_cat_region != "Missing")%>% | ||
mutate(cov_cat_region = as.factor(cov_cat_region)) | ||
input$cov_cat_region <- relevel(input$cov_cat_region, ref = "East") | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known region at index", | ||
nrow(input)) | ||
# Apply cohort specific inclusion criteria ------------------------------------- | ||
print('Apply cohort specific inclusion criteria') | ||
|
||
if (cohort == "vax") { | ||
print('Inclusion criteria: Record of two vaccination doses prior to the study end date') | ||
|
||
input$vax_gap <- input$vax_date_covid_2 - input$vax_date_covid_1 #Determine the vaccination gap in days : gap is NA if any vaccine date is missing | ||
input <- input[!is.na(input$vax_gap),] # Subset the fully vaccinated group | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Record of two vaccination doses prior to the study end date", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Did not receive a vaccination prior to 08-12-2020 (i.e., the start of the vaccination program)') | ||
|
||
input <- subset(input, input$vax_date_covid_1 >= vax_start_date & input$vax_date_covid_2 >= vax_start_date) | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not receive a vaccination prior to 08-12-2020 (i.e., the start of the vaccination program)", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination') | ||
|
||
input <- subset(input, input$vax_gap >= 0) # Keep those with positive vaccination gap | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Did not recieve a second dose vaccination less than three weeks after their first dose') | ||
|
||
input <- subset(input, input$vax_gap >= 21) # Keep those with at least 3 weeks vaccination gap | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Did not recieve a mixed vaccine products before 07-05-2021') | ||
|
||
# Trick to run the mixed vaccine code on dummy data with limited levels -> To ensure that the levels are the same in vax_cat_product variables | ||
|
||
input <- input %>% | ||
mutate(AZ_date = ifelse(vax_date_AstraZeneca_1 < mixed_vax_threshold, 1, | ||
ifelse(vax_date_AstraZeneca_2 < mixed_vax_threshold, 1, | ||
ifelse(vax_date_AstraZeneca_3 < mixed_vax_threshold, 1, 0)))) %>% | ||
mutate(Moderna_date = ifelse(vax_date_Moderna_1 < mixed_vax_threshold, 1, | ||
ifelse(vax_date_Moderna_2 < mixed_vax_threshold, 1, | ||
ifelse(vax_date_Moderna_3 < mixed_vax_threshold, 1, 0)))) %>% | ||
mutate(Pfizer_date = ifelse(vax_date_Pfizer_1 < mixed_vax_threshold, 1, | ||
ifelse(vax_date_Pfizer_2 < mixed_vax_threshold, 1, | ||
ifelse(vax_date_Pfizer_3 < mixed_vax_threshold, 1, 0)))) %>% | ||
rowwise() %>% | ||
mutate(vax_mixed = sum(across(c("AZ_date", "Moderna_date", "Pfizer_date")), na.rm = T)) %>% | ||
dplyr::filter(vax_mixed < 2) | ||
|
||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a mixed vaccine products before 07-05-2021", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Index date is before cohort end date') | ||
|
||
# Will remove anyone who is not fully vaccinated by the cohort end date | ||
|
||
input <- input %>% filter (!is.na(index_date) & index_date <= end_date_exposure & index_date >= start_date_delta) | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Index date is before cohort end date", | ||
nrow(input)) | ||
|
||
} else if (cohort == "unvax"){ | ||
|
||
print('Inclusion criteria: Does not have a record of one or more vaccination prior index date') | ||
|
||
# i.e. Have a record of a first vaccination prior to index date | ||
# (no more vax 2 and 3 variables available in this dataset) | ||
# a.Determine the vaccination status on index start date | ||
|
||
input$prior_vax1 <- ifelse(input$vax_date_covid_1 <= input$index_date, 1,0) | ||
input$prior_vax1[is.na(input$prior_vax1)] <- 0 | ||
input <- subset(input, input$prior_vax1 == 0) # Exclude people with prior vaccination | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Does not have a record of one or more vaccination prior index date", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Not missing JCVI group') | ||
|
||
input <- subset(input, vax_cat_jcvi_group == "01" | vax_cat_jcvi_group == "02" | vax_cat_jcvi_group == "03" | vax_cat_jcvi_group == "04" | | ||
vax_cat_jcvi_group == "05" | vax_cat_jcvi_group == "06" | vax_cat_jcvi_group == "07" | vax_cat_jcvi_group == "08" | | ||
vax_cat_jcvi_group == "09" | vax_cat_jcvi_group == "10" | vax_cat_jcvi_group == "11" | vax_cat_jcvi_group == "12") | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Not missing JCVI group", | ||
nrow(input)) | ||
|
||
print('Inclusion criteria: Index date is not before cohort end date - will remove anyone whose eligibility date + 84 days is after study end date (only those with unknown JCVI group)') | ||
|
||
input <- input %>% filter (!is.na(index_date) & index_date <= end_date_exposure & index_date >= start_date_delta) | ||
consort[nrow(consort)+1,] <- c("Inclusion criteria: Index date is not before cohort end date - will remove anyone whose eligibility date + 84 days is after study end date (only those with unknown JCVI group)", | ||
nrow(input)) | ||
|
||
} | ||
# Save consort data after Inclusion criteria | ||
print('Saving consort data after Inclusion criteria') | ||
|
||
consort$N <- as.numeric(consort$N) | ||
consort$removed <- dplyr::lag(consort$N, default = dplyr::first(consort$N)) - consort$N | ||
|
||
write.csv(consort, file = paste0("output/consort_", cohort, "_inex.csv"), row.names = FALSE) | ||
|
||
# Perform redaction | ||
print('Performing redaction') | ||
|
||
consort$removed <- NULL | ||
consort$N_midpoint6 <- roundmid_any(consort$N, to=threshold) | ||
consort$removed_derived <- dplyr::lag(consort$N_midpoint6, default = dplyr::first(consort$N_midpoint6)) - consort$N_midpoint6 | ||
consort$N <- NULL | ||
|
||
# Save rounded consort data | ||
print('Saving rounded consort data after Inclusion criteria') | ||
|
||
write.csv(consort, file = paste0("output/consort_", cohort, "_inex_midpoint6.csv"), row.names = FALSE) | ||
|
||
# Save the dataset after Inclusion criteria | ||
print('Saving dataset after Inclusion criteria') | ||
|
||
input <- input[, c("patient_id", "cens_date_death", "index_date", | ||
colnames(input)[grepl("end_date_", colnames(input))], | ||
colnames(input)[grepl("sub_", colnames(input))], | ||
colnames(input)[grepl("exp_", colnames(input))], | ||
colnames(input)[grepl("out_", colnames(input))], | ||
colnames(input)[grepl("cov_", colnames(input))], | ||
colnames(input)[grepl("cens_", colnames(input))], | ||
colnames(input)[grepl("vax_date_", colnames(input))], | ||
colnames(input)[grepl("vax_cat_", colnames(input))])] | ||
|
||
saveRDS(input, file = paste0("output/input_", cohort, "_inex.rds"), compress = TRUE) | ||
|
||
return(NULL) | ||
} |
Oops, something went wrong.