opensafely · wjchulme · Oct 24, 2024 · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/analysis/dataset_definition_snapshot.py b/analysis/dataset_definition_snapshot.py
@@ -23,7 +23,8 @@
   practice_registrations, 
   vaccinations, 
   clinical_events, 
-  ons_deaths
+  ons_deaths,
+  addresses,
 )
 # import codelists
 from codelists import *
@@ -62,6 +63,7 @@
 dataset.age = patients.age_on(snapshot_date)
 dataset.region = registered_patients.practice_nuts1_region_name
 dataset.stp = registered_patients.practice_stp
+dataset.imd = addresses.for_patient_on(snapshot_date).imd_rounded
 
 
 

diff --git a/analysis/dataset_definition_varying.py b/analysis/dataset_definition_varying.py
@@ -19,7 +19,8 @@
   practice_registrations, 
   vaccinations, 
   clinical_events, 
-  ons_deaths
+  ons_deaths,
+  addresses,
 )
 # import codelists
 from codelists import *
@@ -57,7 +58,7 @@
     setattr(dataset, f"deregistered_{i}_date", registration.end_date)
     setattr(dataset, f"region_{i}", registration.practice_nuts1_region_name)
     setattr(dataset, f"stp_{i}", registration.practice_stp)
-
+    setattr(dataset, f"imd_{i}", addresses.for_patient_on(current_vax.date).imd_rounded)
 
     previous_vax_date = current_vax.date
 
diff --git a/analysis/dummydata_fixed.R b/analysis/dummydata_fixed.R
@@ -39,8 +39,8 @@ known_variables <- c(
 
 
 # function to convert ethnicity 16 group into 5 group
-ethnicity_16_to_5 <- function(x){
-  x1 <- fct_relabel(x, ~str_extract(.x, ".*(?= - )")) # pick up everything before " - "
+ethnicity_16_to_5 <- function(x) {
+  x1 <- fct_relabel(x, ~ str_extract(.x, ".*(?= - )")) # pick up everything before " - "
   x2 <- fct_recode(x1, `Chinese or Other Ethnic Groups` = "Other Ethnic Groups")
   return(x2)
 }

diff --git a/analysis/dummydata_varying.R b/analysis/dummydata_varying.R
@@ -173,6 +173,11 @@ sim_list_varying_i <- function(i) {
       ~ as.integer(rnorm(n = ..n, mean = 60, sd = 14)),
       needs = vax_variable
     ),
+   "imd_{i}" := bn_node(
+     ~ as.integer(plyr::round_any(runif(n=..n, 100, 32000), 100)),
+     missing = ~ 0.05,
+     needs = vax_variable
+   ),
     "registered_{i}" := bn_node(
       ~ rbernoulli(n = ..n, p = 0.99),
       needs = vax_variable

diff --git a/analysis/process.R b/analysis/process.R
@@ -48,7 +48,7 @@ data_processed_fixed %>%
   select(
     patient_id,
     sex,
-    #ethnicity...
+    # ethnicity...
     death_date
   ) %>%
   write_rds(fs::path(output_dir, "data_fixed.rds"), compress = "gz")
@@ -83,7 +83,9 @@ data_vax <-
     matches("ageband_\\d+"),
     matches("region_\\d+"),
     matches("stp_\\d+"),
-    #... more clinical characteristics here
+    matches("imd_\\d+"),
+    matches("imd_quintile_\\d+"),
+    # ... more clinical characteristics here
   ) %>%
   pivot_longer(
     cols = -patient_id,
@@ -95,7 +97,7 @@ data_vax <-
   mutate(
     !!!standardise_characteristics
   ) %>%
-    rename(
+  rename(
     vax_date = covid_vax,
     vax_type = covid_vax_type,
   ) %>%

diff --git a/analysis/report_history.R b/analysis/report_history.R
@@ -132,7 +132,7 @@ summary_stratified <-
   data_vax %>%
   group_by(
     vax_index, vax_type, vax_week,
-    sex, ageband, region
+    sex, ageband, region, imd_quintile
   ) %>%
   summarise(
     n = ceiling_any(n(), 100)
@@ -213,10 +213,12 @@ plot_vax_dates <- function(rows, cols) {
 }
 
 plot_vax_dates(ageband, vax_dosenumber)
+plot_vax_dates(imd_quintile, vax_dosenumber)
 plot_vax_dates(region, vax_dosenumber)
 plot_vax_dates(sex, vax_dosenumber)
 plot_vax_dates(vax_dosenumber, all)
 plot_vax_dates(ageband, all)
+plot_vax_dates(imd_quintile, all)
 plot_vax_dates(region, all)
 plot_vax_dates(sex, all)
 plot_vax_dates(all, all2)
@@ -292,6 +294,7 @@ plot_vax_intervals <- function(rows, cols) {
 }
 
 plot_vax_intervals(ageband, vax_dosenumber)
+plot_vax_intervals(imd_quintile, vax_dosenumber)
 plot_vax_intervals(region, vax_dosenumber)
 plot_vax_intervals(sex, vax_dosenumber)
 plot_vax_intervals(vax_dosenumber, all)

diff --git a/analysis/report_snapshot.R b/analysis/report_snapshot.R
@@ -16,14 +16,14 @@ library("arrow")
 # Import custom functions
 source(here("analysis", "utility.R"))
 
-args <- commandArgs(trailingOnly=TRUE)
+args <- commandArgs(trailingOnly = TRUE)
 
-if(length(args)==0){
+if (length(args) == 0) {
   # use for interactive testing
-  #removeobjects <- FALSE
+  # removeobjects <- FALSE
   snapshot_date <- as.Date("2023-09-30")
 } else {
-  #removeobjects <- TRUE
+  # removeobjects <- TRUE
   snapshot_date <- as.Date(args[[1]])
 }
 
@@ -94,7 +94,6 @@ data_snapshot <-
 ## output plots of date of last dose by type and other characteristics ----
 
 plot_date_of_last_dose <- function(rows) {
-
   summary_by <- data_snapshot %>%
     group_by(last_vax_type, last_vax_week) %>%
     group_by({{ rows }}, .add = TRUE) %>%
@@ -159,6 +158,7 @@ plot_date_of_last_dose <- function(rows) {
 
 ## --VARIABLES--
 plot_date_of_last_dose(ageband)
+plot_date_of_last_dose(imd_quintile)
 plot_date_of_last_dose(region)
 plot_date_of_last_dose(sex)
 plot_date_of_last_dose(all)
@@ -169,31 +169,29 @@ plot_date_of_last_dose(all)
 ## output plots of dose count by type and other characteristics ----
 
 plot_vax_count <- function(rows) {
-
   summary_by <- data_snapshot %>%
     group_by(vax_count, {{ rows }}) %>%
     summarise(
       n = ceiling_any(n(), 10),
     ) %>%
-    group_by( {{ rows }}) %>%
+    group_by({{ rows }}) %>%
     mutate(
       row_total = sum(n),
       prop = n / row_total,
-
     )
 
   temp_plot <-
     ggplot(summary_by) +
     geom_bar(
       aes(x = prop, y = {{ rows }}, width = row_total, fill = as.character(vax_count)),
-      stat= "identity", position = "fill"
-      #position = position_stack(reverse = TRUE),
+      stat = "identity", position = "fill"
+      # position = position_stack(reverse = TRUE),
     ) +
     facet_grid(
-      rows = vars({{rows}}),
+      rows = vars({{ rows }}),
       scales = "free_y",
       space = "free_y"
-    )+
+    ) +
     labs(
       x = "%",
       y = NULL,
@@ -215,7 +213,7 @@ plot_vax_count <- function(rows) {
       axis.ticks.x = element_line(),
       strip.text = element_blank(),
       legend.position = "bottom"
-    )+
+    ) +
     NULL
 
   print(temp_plot)
@@ -228,8 +226,7 @@ plot_vax_count <- function(rows) {
 
 ## --VARIABLES--
 plot_vax_count(ageband)
+plot_vax_count(imd_quintile)
 plot_vax_count(region)
 plot_vax_count(sex)
 plot_vax_count(all)
-
-
diff --git a/analysis/utility.R b/analysis/utility.R
@@ -15,7 +15,7 @@
 # - start_date is when we start the observational period proper, at the start of the mass vax programme
 # - end_date is when we stopthe observation period. This may be extended as the study progresses
 study_dates <-
-  jsonlite::read_json(path=here("lib", "dates.json")) %>%
+  jsonlite::read_json(path = here("lib", "dates.json")) %>%
   map(as.Date)
 
 # make these available in the global environment
@@ -45,7 +45,7 @@ nthmin <- function(x, n = 1) {
 
 # overwrite splice function to avoid deprecation warnings
 splice <- function(...) {
-  list_flatten(lst(...), name_spec ="{inner}", name_repair = "check_unique")
+  list_flatten(lst(...), name_spec = "{inner}", name_repair = "check_unique")
 }
 
 
@@ -95,36 +95,38 @@ vax_shortname_lookup <- c(
 # template for standardising characteristics that are extracted multiple times
 # using this in mutate like this: `mutate(!!!standardise_characteristics)`
 standardise_characteristics <-
-
-
-
-rlang::quos(
-
-  ## --VARIABLES--
-  ageband = cut(
-    age,
-    breaks = c(-Inf, 18, 40, 55, 65, 75, Inf),
-    labels = c("under 18", "18-39", "40-54", "55-64", "65-74", "75+"),
-    right = FALSE
-  ),
-  region = fct_collapse(
-    region,
-    `East of England` = "East",
-    `London` = "London",
-    `Midlands` = c("West Midlands", "East Midlands"),
-    `North East and Yorkshire` = c("Yorkshire and The Humber", "North East"),
-    `North West` = "North West",
-    `South East` = "South East",
-    `South West` = "South West"
+  rlang::quos(
+
+    ## --VARIABLES--
+    ageband = cut(
+      age,
+      breaks = c(-Inf, 18, 40, 55, 65, 75, Inf),
+      labels = c("under 18", "18-39", "40-54", "55-64", "65-74", "75+"),
+      right = FALSE
+    ),
+    region = fct_collapse(
+      region,
+      `East of England` = "East",
+      `London` = "London",
+      `Midlands` = c("West Midlands", "East Midlands"),
+      `North East and Yorkshire` = c("Yorkshire and The Humber", "North East"),
+      `North West` = "North West",
+      `South East` = "South East",
+      `South West` = "South West"
+    ),
+    imd_quintile = cut(
+      imd,
+      breaks = c(0, 32844 * (1:5) / 5),
+      labels = c("1 (most deprived)", "2", "3", "4", "5 (least deprived)"),
+      include.lowest = TRUE,
+      right = FALSE
+    )
   )
-)
 
 
 # Import dummy data if running locally, or real data if running on the server
-import_extract <- function(custom_file_path, ehrql_file_path){
-
-  if(Sys.getenv("OPENSAFELY_BACKEND") %in% c("", "expectations")){
-
+import_extract <- function(custom_file_path, ehrql_file_path) {
+  if (Sys.getenv("OPENSAFELY_BACKEND") %in% c("", "expectations")) {
     # ideally in future this will check column existence and types from metadata,
     # rather than from a ehrql-generated dummy data
 
@@ -135,32 +137,37 @@ import_extract <- function(custom_file_path, ehrql_file_path){
 
     data_custom_dummy <- read_feather(custom_file_path)
 
-    not_in_ehrql <- names(data_custom_dummy)[!( names(data_custom_dummy) %in% names(data_ehrql_dummy) )]
-    not_in_custom  <- names(data_ehrql_dummy)[!( names(data_ehrql_dummy) %in% names(data_custom_dummy) )]
+    not_in_ehrql <- names(data_custom_dummy)[!(names(data_custom_dummy) %in% names(data_ehrql_dummy))]
+    not_in_custom <- names(data_ehrql_dummy)[!(names(data_ehrql_dummy) %in% names(data_custom_dummy))]
 
 
-    if(length(not_in_custom)!=0) stop(
-      paste(
-        "These variables are in ehrql but not in custom: ",
-        paste(not_in_custom, collapse=", ")
+    if (length(not_in_custom) != 0) {
+      stop(
+        paste(
+          "These variables are in ehrql but not in custom: ",
+          paste(not_in_custom, collapse = ", ")
+        )
       )
-    )
-
-    if(length(not_in_ehrql)!=0) stop(
-      paste(
-        "These variables are in custom but not in ehrql: ",
-        paste(not_in_ehrql, collapse=", ")
+    }
+
+    if (length(not_in_ehrql) != 0) {
+      stop(
+        paste(
+          "These variables are in custom but not in ehrql: ",
+          paste(not_in_ehrql, collapse = ", ")
+        )
       )
-    )
+    }
 
     # reorder columns
-    data_ehrql_dummy <- data_ehrql_dummy[,names(data_custom_dummy)]
+    data_ehrql_dummy <- data_ehrql_dummy[, names(data_custom_dummy)]
 
     unmatched_types <- cbind(
-      map_chr(data_ehrql_dummy, ~paste(class(.), collapse=", ")),
-      map_chr(data_custom_dummy, ~paste(class(.), collapse=", "))
-    )[ (map_chr(data_ehrql_dummy, ~paste(class(.), collapse=", ")) != map_chr(data_custom_dummy, ~paste(class(.), collapse=", ")) ), ] %>%
-      as.data.frame() %>% rownames_to_column()
+      map_chr(data_ehrql_dummy, ~ paste(class(.), collapse = ", ")),
+      map_chr(data_custom_dummy, ~ paste(class(.), collapse = ", "))
+    )[(map_chr(data_ehrql_dummy, ~ paste(class(.), collapse = ", ")) != map_chr(data_custom_dummy, ~ paste(class(.), collapse = ", "))), ] %>%
+      as.data.frame() %>%
+      rownames_to_column()
 
 
     # if(nrow(unmatched_types)>0) stop(
@@ -172,11 +179,8 @@ import_extract <- function(custom_file_path, ehrql_file_path){
     data_extract <- data_custom_dummy
   } else {
     data_extract <- read_feather(ehrql_file_path) %>%
-      #because date types are not returned consistently by ehrql
-      mutate(across(ends_with("_date"),  as.Date))
+      # because date types are not returned consistently by ehrql
+      mutate(across(ends_with("_date"), as.Date))
   }
   data_extract
 }
-
-
-
diff --git a/lib/dummydata/dummyinput_varying.arrow b/lib/dummydata/dummyinput_varying.arrow