Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First draft of IMD variable #3

Merged
merged 4 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion analysis/dataset_definition_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
practice_registrations,
vaccinations,
clinical_events,
ons_deaths
ons_deaths,
addresses,
)
# import codelists
from codelists import *
Expand Down Expand Up @@ -62,6 +63,7 @@
dataset.age = patients.age_on(snapshot_date)
dataset.region = registered_patients.practice_nuts1_region_name
dataset.stp = registered_patients.practice_stp
dataset.imd = addresses.for_patient_on(snapshot_date).imd_rounded



Expand Down
5 changes: 3 additions & 2 deletions analysis/dataset_definition_varying.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
practice_registrations,
vaccinations,
clinical_events,
ons_deaths
ons_deaths,
addresses,
)
# import codelists
from codelists import *
Expand Down Expand Up @@ -57,7 +58,7 @@
setattr(dataset, f"deregistered_{i}_date", registration.end_date)
setattr(dataset, f"region_{i}", registration.practice_nuts1_region_name)
setattr(dataset, f"stp_{i}", registration.practice_stp)

setattr(dataset, f"imd_{i}", addresses.for_patient_on(current_vax.date).imd_rounded)

previous_vax_date = current_vax.date

4 changes: 2 additions & 2 deletions analysis/dummydata_fixed.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ known_variables <- c(


# function to convert ethnicity 16 group into 5 group
ethnicity_16_to_5 <- function(x){
x1 <- fct_relabel(x, ~str_extract(.x, ".*(?= - )")) # pick up everything before " - "
ethnicity_16_to_5 <- function(x) {
x1 <- fct_relabel(x, ~ str_extract(.x, ".*(?= - )")) # pick up everything before " - "
x2 <- fct_recode(x1, `Chinese or Other Ethnic Groups` = "Other Ethnic Groups")
return(x2)
}
Expand Down
5 changes: 5 additions & 0 deletions analysis/dummydata_varying.R
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,11 @@ sim_list_varying_i <- function(i) {
~ as.integer(rnorm(n = ..n, mean = 60, sd = 14)),
needs = vax_variable
),
"imd_{i}" := bn_node(
~ as.integer(plyr::round_any(runif(n=..n, 100, 32000), 100)),
missing = ~ 0.05,
needs = vax_variable
),
"registered_{i}" := bn_node(
~ rbernoulli(n = ..n, p = 0.99),
needs = vax_variable
Expand Down
8 changes: 5 additions & 3 deletions analysis/process.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ data_processed_fixed %>%
select(
patient_id,
sex,
#ethnicity...
# ethnicity...
death_date
) %>%
write_rds(fs::path(output_dir, "data_fixed.rds"), compress = "gz")
Expand Down Expand Up @@ -83,7 +83,9 @@ data_vax <-
matches("ageband_\\d+"),
matches("region_\\d+"),
matches("stp_\\d+"),
#... more clinical characteristics here
matches("imd_\\d+"),
matches("imd_quintile_\\d+"),
# ... more clinical characteristics here
) %>%
pivot_longer(
cols = -patient_id,
Expand All @@ -95,7 +97,7 @@ data_vax <-
mutate(
!!!standardise_characteristics
) %>%
rename(
rename(
vax_date = covid_vax,
vax_type = covid_vax_type,
) %>%
Expand Down
5 changes: 4 additions & 1 deletion analysis/report_history.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ summary_stratified <-
data_vax %>%
group_by(
vax_index, vax_type, vax_week,
sex, ageband, region
sex, ageband, region, imd_quintile
) %>%
summarise(
n = ceiling_any(n(), 100)
Expand Down Expand Up @@ -213,10 +213,12 @@ plot_vax_dates <- function(rows, cols) {
}

plot_vax_dates(ageband, vax_dosenumber)
plot_vax_dates(imd_quintile, vax_dosenumber)
plot_vax_dates(region, vax_dosenumber)
plot_vax_dates(sex, vax_dosenumber)
plot_vax_dates(vax_dosenumber, all)
plot_vax_dates(ageband, all)
plot_vax_dates(imd_quintile, all)
plot_vax_dates(region, all)
plot_vax_dates(sex, all)
plot_vax_dates(all, all2)
Expand Down Expand Up @@ -292,6 +294,7 @@ plot_vax_intervals <- function(rows, cols) {
}

plot_vax_intervals(ageband, vax_dosenumber)
plot_vax_intervals(imd_quintile, vax_dosenumber)
plot_vax_intervals(region, vax_dosenumber)
plot_vax_intervals(sex, vax_dosenumber)
plot_vax_intervals(vax_dosenumber, all)
Expand Down
27 changes: 12 additions & 15 deletions analysis/report_snapshot.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ library("arrow")
# Import custom functions
source(here("analysis", "utility.R"))

args <- commandArgs(trailingOnly=TRUE)
args <- commandArgs(trailingOnly = TRUE)

if(length(args)==0){
if (length(args) == 0) {
# use for interactive testing
#removeobjects <- FALSE
# removeobjects <- FALSE
snapshot_date <- as.Date("2023-09-30")
} else {
#removeobjects <- TRUE
# removeobjects <- TRUE
snapshot_date <- as.Date(args[[1]])
}

Expand Down Expand Up @@ -94,7 +94,6 @@ data_snapshot <-
## output plots of date of last dose by type and other characteristics ----

plot_date_of_last_dose <- function(rows) {

summary_by <- data_snapshot %>%
group_by(last_vax_type, last_vax_week) %>%
group_by({{ rows }}, .add = TRUE) %>%
Expand Down Expand Up @@ -159,6 +158,7 @@ plot_date_of_last_dose <- function(rows) {

## --VARIABLES--
plot_date_of_last_dose(ageband)
plot_date_of_last_dose(imd_quintile)
plot_date_of_last_dose(region)
plot_date_of_last_dose(sex)
plot_date_of_last_dose(all)
Expand All @@ -169,31 +169,29 @@ plot_date_of_last_dose(all)
## output plots of dose count by type and other characteristics ----

plot_vax_count <- function(rows) {

summary_by <- data_snapshot %>%
group_by(vax_count, {{ rows }}) %>%
summarise(
n = ceiling_any(n(), 10),
) %>%
group_by( {{ rows }}) %>%
group_by({{ rows }}) %>%
mutate(
row_total = sum(n),
prop = n / row_total,

)

temp_plot <-
ggplot(summary_by) +
geom_bar(
aes(x = prop, y = {{ rows }}, width = row_total, fill = as.character(vax_count)),
stat= "identity", position = "fill"
#position = position_stack(reverse = TRUE),
stat = "identity", position = "fill"
# position = position_stack(reverse = TRUE),
) +
facet_grid(
rows = vars({{rows}}),
rows = vars({{ rows }}),
scales = "free_y",
space = "free_y"
)+
) +
labs(
x = "%",
y = NULL,
Expand All @@ -215,7 +213,7 @@ plot_vax_count <- function(rows) {
axis.ticks.x = element_line(),
strip.text = element_blank(),
legend.position = "bottom"
)+
) +
NULL

print(temp_plot)
Expand All @@ -228,8 +226,7 @@ plot_vax_count <- function(rows) {

## --VARIABLES--
plot_vax_count(ageband)
plot_vax_count(imd_quintile)
plot_vax_count(region)
plot_vax_count(sex)
plot_vax_count(all)


106 changes: 55 additions & 51 deletions analysis/utility.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# - start_date is when we start the observational period proper, at the start of the mass vax programme
# - end_date is when we stopthe observation period. This may be extended as the study progresses
study_dates <-
jsonlite::read_json(path=here("lib", "dates.json")) %>%
jsonlite::read_json(path = here("lib", "dates.json")) %>%
map(as.Date)

# make these available in the global environment
Expand Down Expand Up @@ -45,7 +45,7 @@ nthmin <- function(x, n = 1) {

# overwrite splice function to avoid deprecation warnings
splice <- function(...) {
list_flatten(lst(...), name_spec ="{inner}", name_repair = "check_unique")
list_flatten(lst(...), name_spec = "{inner}", name_repair = "check_unique")
}


Expand Down Expand Up @@ -95,36 +95,38 @@ vax_shortname_lookup <- c(
# template for standardising characteristics that are extracted multiple times
# using this in mutate like this: `mutate(!!!standardise_characteristics)`
standardise_characteristics <-



rlang::quos(

## --VARIABLES--
ageband = cut(
age,
breaks = c(-Inf, 18, 40, 55, 65, 75, Inf),
labels = c("under 18", "18-39", "40-54", "55-64", "65-74", "75+"),
right = FALSE
),
region = fct_collapse(
region,
`East of England` = "East",
`London` = "London",
`Midlands` = c("West Midlands", "East Midlands"),
`North East and Yorkshire` = c("Yorkshire and The Humber", "North East"),
`North West` = "North West",
`South East` = "South East",
`South West` = "South West"
rlang::quos(

## --VARIABLES--
ageband = cut(
age,
breaks = c(-Inf, 18, 40, 55, 65, 75, Inf),
labels = c("under 18", "18-39", "40-54", "55-64", "65-74", "75+"),
right = FALSE
),
region = fct_collapse(
region,
`East of England` = "East",
`London` = "London",
`Midlands` = c("West Midlands", "East Midlands"),
`North East and Yorkshire` = c("Yorkshire and The Humber", "North East"),
`North West` = "North West",
`South East` = "South East",
`South West` = "South West"
),
imd_quintile = cut(
imd,
breaks = c(0, 32844 * (1:5) / 5),
labels = c("1 (most deprived)", "2", "3", "4", "5 (least deprived)"),
include.lowest = TRUE,
right = FALSE
)
)
)


# Import dummy data if running locally, or real data if running on the server
import_extract <- function(custom_file_path, ehrql_file_path){

if(Sys.getenv("OPENSAFELY_BACKEND") %in% c("", "expectations")){

import_extract <- function(custom_file_path, ehrql_file_path) {
if (Sys.getenv("OPENSAFELY_BACKEND") %in% c("", "expectations")) {
# ideally in future this will check column existence and types from metadata,
# rather than from a ehrql-generated dummy data

Expand All @@ -135,32 +137,37 @@ import_extract <- function(custom_file_path, ehrql_file_path){

data_custom_dummy <- read_feather(custom_file_path)

not_in_ehrql <- names(data_custom_dummy)[!( names(data_custom_dummy) %in% names(data_ehrql_dummy) )]
not_in_custom <- names(data_ehrql_dummy)[!( names(data_ehrql_dummy) %in% names(data_custom_dummy) )]
not_in_ehrql <- names(data_custom_dummy)[!(names(data_custom_dummy) %in% names(data_ehrql_dummy))]
not_in_custom <- names(data_ehrql_dummy)[!(names(data_ehrql_dummy) %in% names(data_custom_dummy))]


if(length(not_in_custom)!=0) stop(
paste(
"These variables are in ehrql but not in custom: ",
paste(not_in_custom, collapse=", ")
if (length(not_in_custom) != 0) {
stop(
paste(
"These variables are in ehrql but not in custom: ",
paste(not_in_custom, collapse = ", ")
)
)
)

if(length(not_in_ehrql)!=0) stop(
paste(
"These variables are in custom but not in ehrql: ",
paste(not_in_ehrql, collapse=", ")
}

if (length(not_in_ehrql) != 0) {
stop(
paste(
"These variables are in custom but not in ehrql: ",
paste(not_in_ehrql, collapse = ", ")
)
)
)
}

# reorder columns
data_ehrql_dummy <- data_ehrql_dummy[,names(data_custom_dummy)]
data_ehrql_dummy <- data_ehrql_dummy[, names(data_custom_dummy)]

unmatched_types <- cbind(
map_chr(data_ehrql_dummy, ~paste(class(.), collapse=", ")),
map_chr(data_custom_dummy, ~paste(class(.), collapse=", "))
)[ (map_chr(data_ehrql_dummy, ~paste(class(.), collapse=", ")) != map_chr(data_custom_dummy, ~paste(class(.), collapse=", ")) ), ] %>%
as.data.frame() %>% rownames_to_column()
map_chr(data_ehrql_dummy, ~ paste(class(.), collapse = ", ")),
map_chr(data_custom_dummy, ~ paste(class(.), collapse = ", "))
)[(map_chr(data_ehrql_dummy, ~ paste(class(.), collapse = ", ")) != map_chr(data_custom_dummy, ~ paste(class(.), collapse = ", "))), ] %>%
as.data.frame() %>%
rownames_to_column()


# if(nrow(unmatched_types)>0) stop(
Expand All @@ -172,11 +179,8 @@ import_extract <- function(custom_file_path, ehrql_file_path){
data_extract <- data_custom_dummy
} else {
data_extract <- read_feather(ehrql_file_path) %>%
#because date types are not returned consistently by ehrql
mutate(across(ends_with("_date"), as.Date))
# because date types are not returned consistently by ehrql
mutate(across(ends_with("_date"), as.Date))
}
data_extract
}



Binary file modified lib/dummydata/dummyinput_varying.arrow
Binary file not shown.