Skip to content

Commit

Permalink
Update functions for data cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
ZoeMZou committed Jan 28, 2025
1 parent c69e1a3 commit 5f9811c
Show file tree
Hide file tree
Showing 4 changed files with 448 additions and 368 deletions.
161 changes: 161 additions & 0 deletions analysis/data_cleaning/data_cleaning.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Load libraries ---------------------------------------------------------------
print('Load libraries')

library(dplyr)
library(tictoc)
library(readr)
library(tidyr)
library(stringr)
library(ggplot2)
library(jsonlite)
library(here)
library(arrow)

# Specify redaction threshold --------------------------------------------------
print('Specify redaction threshold')

threshold <- 6

# Source common functions ------------------------------------------------------
print('Source common functions')

source("analysis/utility.R")
source("analysis/data_cleaning/fn-qa.R")
source("analysis/data_cleaning/fn-inex.R")
# Specify command arguments ----------------------------------------------------
print('Specify command arguments')

args <- commandArgs(trailingOnly=TRUE)

if(length(args)==0){
cohort <- "prevax"
} else {
cohort <- args[[1]]
}

# Load json file containing vax study dates ------------------------------------
print('Load json file containing vax study dates')

study_dates <- fromJSON("output/study_dates.json")

# Specify relevant dates -------------------------------------------------------
print('Specify relevant dates')

vax_start_date <- as.Date(study_dates$vax1_earliest, format="%Y-%m-%d")
mixed_vax_threshold <- as.Date("2021-05-07")
start_date_delta <- as.Date(study_dates$delta_date, format="%Y-%m-%d")
end_date_delta <- as.Date(study_dates$omicron_date, format="%Y-%m-%d")

# Load cohort data -------------------------------------------------------------
print('Load cohort data')

input <- read_rds(file.path("output", paste0("input_",cohort,".rds")))
print(paste0(cohort, " cohort: ", nrow(input), " rows in the input file"))

# Handle missing values --------------------------------------------------------
print('Handle missing values')

input$cov_cat_smoking_status <- replace(input$cov_cat_smoking_status,
is.na(input$cov_cat_smoking_status),
"M")

input <- input %>%
mutate(cov_cat_region = as.character(cov_cat_region)) %>%
mutate(cov_cat_region = replace_na(cov_cat_region, "Missing")) %>%
mutate(cov_cat_region = as.factor(cov_cat_region))

# Set reference levels for factors ---------------------------------------------
print('Set reference levels for factors')

cat_factors <- colnames(input)[grepl("_cat_",colnames(input))]
input[,cat_factors] <- lapply(input[,cat_factors], function(x) factor(x, ordered = FALSE))

# Set reference level for variable: sub_cat_covid19_hospital -------------------
print('Set reference level for variable: sub_cat_covid19_hospital')

input$sub_cat_covid19_hospital <- ordered(input$sub_cat_covid19_hospital,
levels = c("non_hospitalised",
"hospitalised",
"no_infection"))

# Set reference level for variable: cov_cat_ethnicity --------------------------
print('Set reference level for variable: cov_cat_ethnicity')

levels(input$cov_cat_ethnicity) <- list("Missing" = "0", "White" = "1",
"Mixed" = "2", "South Asian" = "3",
"Black" = "4", "Other" = "5")

input$cov_cat_ethnicity <- ordered(input$cov_cat_ethnicity,
levels = c("White","Mixed",
"South Asian","Black",
"Other","Missing"))

# Set reference level for variable: cov_cat_imd -------------------------------
print('Set reference level for variable: cov_cat_imd')

input$cov_cat_imd <- ordered(input$cov_cat_imd,
levels = c("1 (most deprived)","2","3","4","5 (least deprived)"))

# Set reference level for variable: cov_cat_region -----------------------------
print('Set reference level for variable: cov_cat_region')

input$cov_cat_region <- relevel(input$cov_cat_region, ref = "East")

# Set reference level for variable: cov_cat_smoking_status ---------------------
print('Set reference level for variable: cov_cat_smoking_status')

levels(input$cov_cat_smoking_status) <- list("Ever smoker" = "E", "Missing" = "M", "Never smoker" = "N", "Current smoker" = "S")

input$cov_cat_smoking_status <- ordered(input$cov_cat_smoking_status, levels = c("Never smoker","Ever smoker","Current smoker","Missing"))

# Set reference level for variable: cov_cat_sex --------------------------------
print('Set reference level for variable: cov_cat_sex')

levels(input$cov_cat_sex) <- list("Female" = "female", "Male" = "male", "Unknown" = "unknown")

# # Replace NA values with "Unknown" to ensure patients with missing sex
# # are not evaluated entirely to NA during quality assurance steps

input$cov_cat_sex <- replace(input$cov_cat_sex, is.na(input$cov_cat_sex), "Unknown")


input$cov_cat_sex <- factor(input$cov_cat_sex,
levels = c("Female", "Male", "Unknown"))
input$cov_cat_sex <- relevel(input$cov_cat_sex, ref = "Female")


# Set reference level for variable: vax_cat_jcvi_group -------------------------
print('Set reference level for variable: vax_cat_jcvi_group')

input$vax_cat_jcvi_group <- ordered(input$vax_cat_jcvi_group,
levels = c("12","11","10",
"09","08","07",
"06","05","04",
"03","02","01","99"))

# Set reference level for binaries ---------------------------------------------
print('Set reference level for binaries')

bin_factors <- colnames(input)[grepl("cov_bin_",colnames(input))]

input[,bin_factors] <- lapply(input[,bin_factors],
function(x) factor(x, levels = c("FALSE","TRUE")))

# Specify consort table --------------------------------------------------------
print('Specify consort table')

consort <- data.frame(Description = "Input",
N = nrow(input),
stringsAsFactors = FALSE)

# Quality assurance ------------------------------------------------------------

print('Call quality assurance function')
qa_results <- qa(input, study_dates, consort, cohort, threshold)
input <- qa_results$input
consort <- qa_results$consort

# Inclusion criteria -----------------------------------------------------------

print('Call inclusion criteria function')
inex(input, consort, cohort, threshold)
188 changes: 188 additions & 0 deletions analysis/data_cleaning/fn-inex.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
inex <- function(input, consort, cohort, threshold) {
print('Load libraries')

library(dplyr)
library(tictoc)
library(readr)
library(tidyr)
library(stringr)
library(ggplot2)
library(jsonlite)
library(here)
library(arrow)

print('Source common functions')
source("analysis/utility.R")

print('Inclusion criteria: Alive at index')

input <- subset(input, input$inex_bin_alive == TRUE) # Subset input if alive at index.
consort[nrow(consort)+1,] <- c("Inclusion criteria: Alive at index",
nrow(input))

print('Inclusion criteria: Known age 18 or over at index')

input <- subset(input, input$cov_num_age >= 18) # Subset input if age between 18 and 110 at index.
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known age 18 or over at index",
nrow(input))

print('Inclusion criteria: Known age 110 or under at index')

input <- subset(input, input$cov_num_age <= 110) # Subset input if age between 18 and 110 on 01/06/2021.
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known age 110 or under at index",
nrow(input))

print('Inclusion criteria: Known sex at index')

input <- input[!is.na(input$cov_cat_sex),] # removes NAs, if any
input$cov_cat_sex <- relevel(input$cov_cat_sex, ref = "Female")
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known sex at index",
nrow(input))

print('Inclusion criteria: Known IMD at index')

input <- input[!is.na(input$cov_cat_imd),] # removes NAs, if any
input$cov_cat_imd <- ordered(input$cov_cat_imd,
levels = c("1 (most deprived)","2","3","4","5 (least deprived)"))
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known IMD at index",
nrow(input))

print('Inclusion criteria: Continuous registration with the same practice for at least six months up to and including the index date')

input <- subset(input, input$inex_bin_6m_reg == TRUE)
consort[nrow(consort)+1,] <- c("Inclusion criteria: Continuous registration with the same practice for at least six months up to and including the index date",
nrow(input))

print('Inclusion criteria: Known region at index')

input <- input %>% mutate(cov_cat_region = as.character(cov_cat_region)) %>%
filter(cov_cat_region != "Missing")%>%
mutate(cov_cat_region = as.factor(cov_cat_region))
input$cov_cat_region <- relevel(input$cov_cat_region, ref = "East")
consort[nrow(consort)+1,] <- c("Inclusion criteria: Known region at index",
nrow(input))
# Apply cohort specific inclusion criteria -------------------------------------
print('Apply cohort specific inclusion criteria')

if (cohort == "vax") {
print('Inclusion criteria: Record of two vaccination doses prior to the study end date')

input$vax_gap <- input$vax_date_covid_2 - input$vax_date_covid_1 #Determine the vaccination gap in days : gap is NA if any vaccine date is missing
input <- input[!is.na(input$vax_gap),] # Subset the fully vaccinated group
consort[nrow(consort)+1,] <- c("Inclusion criteria: Record of two vaccination doses prior to the study end date",
nrow(input))

print('Inclusion criteria: Did not receive a vaccination prior to 08-12-2020 (i.e., the start of the vaccination program)')

input <- subset(input, input$vax_date_covid_1 >= vax_start_date & input$vax_date_covid_2 >= vax_start_date)
consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not receive a vaccination prior to 08-12-2020 (i.e., the start of the vaccination program)",
nrow(input))

print('Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination')

input <- subset(input, input$vax_gap >= 0) # Keep those with positive vaccination gap
consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination",
nrow(input))

print('Inclusion criteria: Did not recieve a second dose vaccination less than three weeks after their first dose')

input <- subset(input, input$vax_gap >= 21) # Keep those with at least 3 weeks vaccination gap
consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a second dose vaccination before their first dose vaccination",
nrow(input))

print('Inclusion criteria: Did not recieve a mixed vaccine products before 07-05-2021')

# Trick to run the mixed vaccine code on dummy data with limited levels -> To ensure that the levels are the same in vax_cat_product variables

input <- input %>%
mutate(AZ_date = ifelse(vax_date_AstraZeneca_1 < mixed_vax_threshold, 1,
ifelse(vax_date_AstraZeneca_2 < mixed_vax_threshold, 1,
ifelse(vax_date_AstraZeneca_3 < mixed_vax_threshold, 1, 0)))) %>%
mutate(Moderna_date = ifelse(vax_date_Moderna_1 < mixed_vax_threshold, 1,
ifelse(vax_date_Moderna_2 < mixed_vax_threshold, 1,
ifelse(vax_date_Moderna_3 < mixed_vax_threshold, 1, 0)))) %>%
mutate(Pfizer_date = ifelse(vax_date_Pfizer_1 < mixed_vax_threshold, 1,
ifelse(vax_date_Pfizer_2 < mixed_vax_threshold, 1,
ifelse(vax_date_Pfizer_3 < mixed_vax_threshold, 1, 0)))) %>%
rowwise() %>%
mutate(vax_mixed = sum(across(c("AZ_date", "Moderna_date", "Pfizer_date")), na.rm = T)) %>%
dplyr::filter(vax_mixed < 2)

consort[nrow(consort)+1,] <- c("Inclusion criteria: Did not recieve a mixed vaccine products before 07-05-2021",
nrow(input))

print('Inclusion criteria: Index date is before cohort end date')

# Will remove anyone who is not fully vaccinated by the cohort end date

input <- input %>% filter (!is.na(index_date) & index_date <= end_date_exposure & index_date >= start_date_delta)
consort[nrow(consort)+1,] <- c("Inclusion criteria: Index date is before cohort end date",
nrow(input))

} else if (cohort == "unvax"){

print('Inclusion criteria: Does not have a record of one or more vaccination prior index date')

# i.e. Have a record of a first vaccination prior to index date
# (no more vax 2 and 3 variables available in this dataset)
# a.Determine the vaccination status on index start date

input$prior_vax1 <- ifelse(input$vax_date_covid_1 <= input$index_date, 1,0)
input$prior_vax1[is.na(input$prior_vax1)] <- 0
input <- subset(input, input$prior_vax1 == 0) # Exclude people with prior vaccination
consort[nrow(consort)+1,] <- c("Inclusion criteria: Does not have a record of one or more vaccination prior index date",
nrow(input))

print('Inclusion criteria: Not missing JCVI group')

input <- subset(input, vax_cat_jcvi_group == "01" | vax_cat_jcvi_group == "02" | vax_cat_jcvi_group == "03" | vax_cat_jcvi_group == "04" |
vax_cat_jcvi_group == "05" | vax_cat_jcvi_group == "06" | vax_cat_jcvi_group == "07" | vax_cat_jcvi_group == "08" |
vax_cat_jcvi_group == "09" | vax_cat_jcvi_group == "10" | vax_cat_jcvi_group == "11" | vax_cat_jcvi_group == "12")
consort[nrow(consort)+1,] <- c("Inclusion criteria: Not missing JCVI group",
nrow(input))

print('Inclusion criteria: Index date is not before cohort end date - will remove anyone whose eligibility date + 84 days is after study end date (only those with unknown JCVI group)')

input <- input %>% filter (!is.na(index_date) & index_date <= end_date_exposure & index_date >= start_date_delta)
consort[nrow(consort)+1,] <- c("Inclusion criteria: Index date is not before cohort end date - will remove anyone whose eligibility date + 84 days is after study end date (only those with unknown JCVI group)",
nrow(input))

}
# Save consort data after Inclusion criteria
print('Saving consort data after Inclusion criteria')

consort$N <- as.numeric(consort$N)
consort$removed <- dplyr::lag(consort$N, default = dplyr::first(consort$N)) - consort$N

write.csv(consort, file = paste0("output/consort_", cohort, "_inex.csv"), row.names = FALSE)

# Perform redaction
print('Performing redaction')

consort$removed <- NULL
consort$N_midpoint6 <- roundmid_any(consort$N, to=threshold)
consort$removed_derived <- dplyr::lag(consort$N_midpoint6, default = dplyr::first(consort$N_midpoint6)) - consort$N_midpoint6
consort$N <- NULL

# Save rounded consort data
print('Saving rounded consort data after Inclusion criteria')

write.csv(consort, file = paste0("output/consort_", cohort, "_inex_midpoint6.csv"), row.names = FALSE)

# Save the dataset after Inclusion criteria
print('Saving dataset after Inclusion criteria')

input <- input[, c("patient_id", "cens_date_death", "index_date",
colnames(input)[grepl("end_date_", colnames(input))],
colnames(input)[grepl("sub_", colnames(input))],
colnames(input)[grepl("exp_", colnames(input))],
colnames(input)[grepl("out_", colnames(input))],
colnames(input)[grepl("cov_", colnames(input))],
colnames(input)[grepl("cens_", colnames(input))],
colnames(input)[grepl("vax_date_", colnames(input))],
colnames(input)[grepl("vax_cat_", colnames(input))])]

saveRDS(input, file = paste0("output/input_", cohort, "_inex.rds"), compress = TRUE)

return(NULL)
}
Loading

0 comments on commit 5f9811c

Please sign in to comment.