From 7997c6c367323f541047e1c39fa40de63eef0e54 Mon Sep 17 00:00:00 2001
From: XuedaShen <110584221+XuedaShen@users.noreply.github.com>
Date: Mon, 9 Dec 2024 14:36:06 -0800
Subject: [PATCH 1/2] `epi_slide()` intro
Moving over minor edits iterated with @dajmcdon on a previous PR.
This one is clean: only committed the qmd file.
---
slides/day1-afternoon.qmd | 1189 +++++++++++++++++++------------------
1 file changed, 599 insertions(+), 590 deletions(-)
diff --git a/slides/day1-afternoon.qmd b/slides/day1-afternoon.qmd
index 37c7133..24b8641 100644
--- a/slides/day1-afternoon.qmd
+++ b/slides/day1-afternoon.qmd
@@ -27,6 +27,8 @@ theme_set(theme_bw())
1. Basic Nowcasting using `{epiprocess}`
+1. Nowcasting with One Variable
+
1. Nowcasting with Two Variables
1. Case Study - Nowcasting Cases Using %CLI
@@ -694,6 +696,7 @@ attr(edf, "metadata")
1. Correlating signals across location or time
1. Computing growth rates
1. Detecting and removing outliers
+1. Calculating summaries with rolling windows
1. Dealing with revisions
## Features - Correlations at different lags
@@ -818,7 +821,7 @@ edfg |>
```{r outlier-ex}
#| echo: true
#| message: false
-edfo <- filter(edf, geo_value %in% c("ut", "ca")) |>
+edfo <- filter(edf, geo_value %in% c("ca", "ut")) |>
select(geo_value, time_value, case_rate) |>
as_epi_df() |>
group_by(geo_value) |>
@@ -862,6 +865,127 @@ edfo |>
```
+## Features -- sliding a computation on an `epi_df`
+
+* It is often useful to compute rolling summaries of signals.
+
+* These depend on the reference time, and are computed separately over geographies (and other groups).
+
+* For example, a trailing average can smooth out daily variation.
+
+* In `epiprocess`, this is achieved by `epi_slide()`.
+
+```{r epi-slide-example-call}
+#| echo: true
+#| eval: false
+
+epi_slide(
+ .x,
+ .f,
+ ...,
+ .window_size = NULL,
+ .align = c("right", "center", "left"),
+ .ref_time_values = NULL,
+ .new_col_name = NULL,
+ .all_rows = FALSE
+)
+```
+
+For example, we can use `epi_slide()` to compute a trailing 7-day average.
+
+## Features -- sliding a computation on an `epi_df`
+
+* The simplest way to use `epi_slide()` is via tidy evaluation.
+
+* For a grouped `epi_df`, `epi_slide()` applies the computation to groups [separately]{.primary}.
+
+```{r, grouped-df-to-slide}
+#| echo: false
+
+cases_edf <- cases_df |>
+ group_by(geo_value) |>
+ as_epi_df()
+
+```
+
+```{r epi-slide-tidy-eval}
+#| echo: true
+
+cases_7dav <- epi_slide(
+ .x = cases_edf,
+ cases_7dav = mean(raw_cases, na.rm = TRUE),
+ .window_size = 7,
+ .align = "right"
+)
+```
+
+
+```{r epi-slide-tidy-eval-display-result}
+#| echo: false
+
+cases_7dav |>
+ arrange(geo_value, time_value) |>
+ group_by(geo_value) |>
+ slice_min(time_value, n = 2) |>
+ as_tibble() |>
+ print()
+
+```
+
+
+
+## Features -- sliding a computation on an `epi_df`
+
+`epi_slide` also accepts custom functions of a certain form.
+
+```{r epi-slide-custom-function-mandatory-form}
+#| eval: false
+#| echo: true
+
+custom_function <- function(x, g, t, ...) {
+
+ # Function body
+
+}
+```
+
+* `x`: a data frame with all the columns of the original object, [except]{.primary} the grouping variables.
+* `g`: a one-row tibble with the values of the grouping variables for the given group.
+* `t`: the `.ref_time_value` of the current window.
+* `...`: additional arguments.
+
+
+## Features -- sliding a computation on an `epi_df`
+
+```{r slide-apply-custom-function}
+#| echo: true
+#| code-line-numbers: "|9-17"
+
+mean_by_hand <- function(x, g, t, ...) {
+ data.frame(cases_7dav = mean(x$raw_cases, na.rm = TRUE))
+}
+
+cases_mean_custom_f <- epi_slide(
+ .x = cases_edf,
+ .f = mean_by_hand,
+ .window_size = 7,
+ .align = "right"
+)
+
+```
+
+
+
+```{r epi-slide-custom-function-display-result}
+cases_mean_custom_f |>
+ arrange(geo_value, time_value) |>
+ slice_min(time_value, n = 2) |>
+ as_tibble() |>
+ print()
+```
+
+
+
## `epi_archive`: Collection of `epi_df`s
* full version history of a data set
@@ -873,41 +997,117 @@ edfo |>
Epidemiology data gets revised frequently.
-* We may want to use the data [as it looked in the past]{.primary}
+* We may want to use the data [as it looked in the past]{.primary}
* or we may want to examine [the history of revisions]{.primary}.
:::
## `epi_archive`: Collection of `epi_df`s
-Subset of daily COVID-19 doctor visits (Optum) and cases (JHU CSSE) from 6 states in `archive` format:
+Subset of daily COVID-19 doctor visits (Optum) and cases (JHU CSSE) from all U.S. states in `archive` format:
```{r archive-ex}
#| echo: true
-archive_cases_dv_subset
+archive_cases_dv_subset_all_states |> head()
```
+
+## Features -- sliding computation over `epi_df`s
+
+* We can apply a computation over different snapshots in an `epi_archive`.
+
+```{r}
+#| echo: true
+#| eval: false
+
+epix_slide(
+ .x,
+ .f,
+ ...,
+ .before = Inf,
+ .versions = NULL,
+ .new_col_name = NULL,
+ .all_versions = FALSE
+)
+
+```
+
+This functionality is very helpful for version-aware forecasting. We will return to it with a concrete example.
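+
+As a small preview, here is a sketch of the tidy-evaluation idiom (not evaluated; it assumes the `case_rate_7d_av` column of the built-in archive):
+
+```{r epix-slide-sketch}
+#| echo: true
+#| eval: false
+
+# Recompute a mean case rate from each monthly snapshot, using only
+# the data that would have been available as of that version.
+archive_cases_dv_subset |>
+  group_by(geo_value) |>
+  epix_slide(
+    mean_rate = mean(case_rate_7d_av, na.rm = TRUE),
+    .versions = seq(as.Date("2020-08-01"), as.Date("2021-01-01"), by = "1 month")
+  )
+```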
-## Revision patterns
+
+## Features -- summarize revision behavior
+
+* `revision_summary()` is a helper function that summarizes the revision behavior of an `epi_archive`.
+
+```{r epiprocess-revision-summary-demo}
+#| echo: true
+#| eval: true
+
+revision_data <- revision_summary(
+ archive_cases_dv_subset,
+ case_rate_7d_av,
+ drop_nas = TRUE,
+ print_inform = FALSE, # NOT the default, to save space
+ min_waiting_period = as.difftime(60, units = "days"),
+ within_latest = 0.2,
+ quick_revision = as.difftime(3, units = "days"),
+ few_revisions = 3,
+ abs_spread_threshold = NULL,
+ rel_spread_threshold = 0.1,
+ compactify_tol = .Machine$double.eps^0.5,
+ should_compactify = TRUE
+)
+```
+
+## Features -- summarize revision behavior
+
+```{r epiprocess-revision-summary-results}
+#| echo: true
+
+head(revision_data)
+```
+
+
+
+## Visualize revision patterns
+
+```{r create-snapshots-of-data}
+#| echo: false
+#| eval: true
+
+versions <- seq(as.Date("2020-07-01"), as.Date("2021-11-30"), by = "1 month")
+max_version <- max(versions)
+
+snapshots <- map(versions, \(v) {
+ epix_as_of(archive_cases_dv_subset_all_states, v) |>
+ mutate(version = v) |>
+ filter(geo_value %in% c("ca", "ut"))
+ }) |>
+ bind_rows() |>
+ mutate(latest = version == max_version)
+
+```
+
```{r plot-revision-patterns}
#| fig-width: 7
-# ggplot(snapshots |> filter(!latest),
-# aes(x = time_value, y = percent_cli)) +
-# geom_line(aes(color = factor(version))) +
-# geom_vline(aes(color = factor(version), xintercept = version), lty = 3) +
-# facet_wrap(~ geo_value, scales = "free_y", nrow = 1) +
-# scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
-# labs(x = "", y = "% of doctor's visits with\n Covid-like illness") +
-# scale_color_viridis_d(option = "B", end = .8) +
-# theme(legend.position = "none") +
-# geom_line(data = snapshots |> filter(latest),
-# aes(x = time_value, y = percent_cli),
-# inherit.aes = FALSE, color = "black")
-```
-
-## Finalized data
-
- * Counts are revised as time proceeds
+ggplot(snapshots |> filter(!latest),
+ aes(x = time_value, y = percent_cli)) +
+ geom_line(aes(color = factor(version))) +
+ geom_vline(aes(color = factor(version), xintercept = version), lty = 3) +
+ facet_wrap(~ geo_value, scales = "free_y", nrow = 1) +
+ scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
+ labs(x = "", y = "% of doctor's visits with\n Covid-like illness") +
+ scale_color_viridis_d(option = "B", end = .8) +
+ theme(legend.position = "none") +
+ geom_line(data = snapshots |> filter(latest),
+ aes(x = time_value, y = percent_cli),
+ inherit.aes = FALSE, color = "black")
+```
+
+
+## Types of prediction
+
+* Counts are revised as time proceeds
* Want to know the [final]{.primary} value
* Often not available until weeks/months later
@@ -924,6 +1124,8 @@ archive_cases_dv_subset
Nowcasting
: At time $t$, predict the final value for time $t$
+
+
# Basic Nowcasting in the Epiverse
@@ -964,12 +1166,50 @@ archive_cases_dv_subset
:::
-* We may also have access to $p$ other time series
-$x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots, p$ which may be subject to revisions.
+* We also have access to $p$ other time series
+$x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots,p$
+* All may be subject to revisions.
-## Case study: NCHS mortality
+
+## Aside on nowcasting
+
+* To some Epis, "nowcasting" can be equated with "estimating the time-varying instantaneous reproduction number, $R_t$".
+
+* Ex. using the number of reported COVID-19 cases in British Columbia between Jan. 2020 and Apr. 15, 2023.
+
+
+```{r rtestim}
+#| fig-width: 9
+#| fig-height: 3
+#| out-height: "400px"
+#| label: nowcasting
+library(rtestim)
+source(here::here("_code/bccovid.R"))
+
+p1 <- bccovid |>
+ ggplot(aes(date, cases)) +
+ geom_line(colour = primary) +
+ geom_vline(xintercept = ymd("2023-04-15"), colour = secondary,
+ linewidth = 2) +
+ labs(y = "BC Covid-19 cases", x = "Date") +
+ scale_y_continuous(expand = expansion(c(0, NA)))
+bc_rt <- estimate_rt(bccovid$cases, x = bccovid$date,
+ lambda = c(1e6, 1e5))
+p2 <- plot(confband(bc_rt, lambda = 1e5)) +
+ coord_cartesian(ylim = c(0.5, 2)) +
+ scale_y_continuous(expand = expansion(0))
+cowplot::plot_grid(p1, p2)
+```
+
+* Our group built [`{rtestim}`](https://dajmcdon.github.io/rtestim) for doing this nonparametrically.
+
+* We may come back to this later...
+
+# Nowcasting with One Variable
+
+## Nowcasting with a simple ratio: NCHS mortality
* In this example, we'll demonstrate the concept of nowcasting using [**NCHS mortality data**]{.primary}.
(the number of weekly new deaths with confirmed or presumed COVID-19, per 100,000 population).
@@ -979,65 +1219,47 @@ $x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots, p$ which may be subject to revisions.
## Fetch versioned data
Let's fetch versioned mortality data from the API (`pub_covidcast`) for CA (`geo_values = "ca"`) and the signal of interest (`deaths_covid_incidence_num`) over early 2024.
-
```{r mortality-archive-construct}
#| echo: true
-
# Fetch the versioned NCHS mortality data (weekly)
-nchs_archive <- pub_covidcast(
+mortality_archive <- pub_covidcast(
source = "nchs-mortality",
signals = "deaths_covid_incidence_num",
geo_type = "state",
time_type = "week",
- geo_values = c("ca", "ut"),
- time_values = epirange(202001, 202440),
+ geo_values = "ca", # California (CA)
+ time_values = epirange(202401, 202413),
issues = "*"
) |>
select(geo_value, time_value, version = issue, mortality = value) |>
as_epi_archive(compactify = TRUE)
+# Set the start and end days for the analysis
+# corresponding to the weeks entered in time_values
+start_time = as.Date("2023-12-31")
+end_time = as.Date("2024-03-24")
```
+## Latency in reporting - Minimum lag
-## Analysis of versioning behavior
-
-Recall, we need to watch out for:
-
-* [**Latency**]{.primary}: the time difference between the date of occurrence and the date of the initial report.
-* [**Backfill**]{.primary}: how data for a given date is updated after the initial report.
-
-`revision_summary()` provides a summary of both aspects.
-
-```{r inspect-revision_summary}
-#| echo: true
-revision_data = revision_summary(nchs_archive, mortality, print_inform = FALSE)
-```
-
-
-## Versioning analysis -- latency
-
-* [**Question:**]{.primary} What is the latency of NCHS data?
+* A quick inspection reveals that mortality rates are systematically 7 days latent ([**fixed lag**]{.primary}).
-```{r nchs-latency-via-revision-summary}
+```{r inspect-latency-dplyr-way}
#| echo: true
+mortality_revision_inspect = mortality_archive$DT |> mutate(version_time_diff = version - time_value)
-revision_data |> select(geo_value, time_value, min_lag) |> slice_sample(n = 10)
+# Look at the first revision for each week
+mortality_revision_inspect |> group_by(time_value) |> slice(1) |> head()
```
-* We randomly sampled some dates to check if there is a consistent latency pattern.
-* Understanding latency prevents us from using data that we shouldn't have access to.
+* Use `revision_summary()` from `epiprocess` to generate basic statistics about the revision behavior for the dataset.
-## Versioning analysis -- backfill
-
-* [**Question:**]{.primary} How long does it take for the reported value to be close to the finalized value?
-
-```{r revision-summary-time-near-latest-demo}
-revision_data |> select(geo_value, time_value, time_near_latest) |> slice_sample(n = 10)
+```{r revision-summary-ex}
+#| eval: false
+revision_summary(mortality_archive, print_inform = TRUE)
```
-* It generally takes at least 4 weeks for the reported value to be within 20\% of the finalized value.
-* We can change the threshold of percentage difference by specifying the `within_latest` argument of `revision_summary()`.
-## Versioning analysis - backfill
+## Latency in reporting - Finalized value attainment
* [**Question:**]{.primary} When is the [**finalized value**]{.primary} first attained for each date? Would we have access to any in real-time?
* How fast are the final values attained & what's the pattern for these times, if any?
@@ -1071,55 +1293,35 @@ check_when_finalized <- function(epi_archive, start_date = NULL, end_date = NULL
```{r check-when-finalized-run}
#| echo: false
-res <- check_when_finalized(nchs_archive,
- start_date = min(nchs_archive$DT$time_value),
- end_date = max(nchs_archive$DT$time_value))
-
+res <- check_when_finalized(mortality_archive, start_date = start_time, end_date = end_time)
head(res)
```
-Here is a look at its quantiles:
+And here's a numerical summary:
```{r summary-diff}
#| echo: false
summary(as.numeric(res$diff))
```
-* [**Conclusion**]{.primary}: The revision behavior is pretty long-tailed. Value reported 4 weeks later is reasonably close to the finalized value.
-
+* [**Conclusion**]{.primary}: Tends to take a long time & varies. Even for this relatively short time period... it goes as low as 84 days or as high as 294 days. Yikes.
+* So if we were doing this in real-time, then we wouldn't have access to the finalized data.
-## Revision pattern visualization
+## Comparison of final vs. multiple revisions
This shows the finalized rates in comparison to [**multiple revisions**]{.primary} to see how the data changes over time:
-```{r mortality-by-accessed-weeks-later}
+```{r mortality-by-revision-date}
#| echo: false
-ref_dates <- unique(nchs_archive$DT$time_value)
-offsets = seq(1, 7) * 7
-max_version <- nchs_archive$versions_end
-
-get_val_asof <- function(time_val, archive, offsets) {
-
- as_of_dates <- pmin(time_val + offsets, max_version)
-
- result <- map(as_of_dates, function(x) {
-
- qd <- archive |>
- epix_as_of(x) |>
- filter(time_value == time_val) |>
- select(geo_value, time_value, mortality) |>
- mutate(lag = x - time_val)
- }) |>
- list_rbind()
-
- return(result)
+# Visualize the mortality values for different revisions
+revision_dates <- seq(min(mortality_archive$DT$version), end_time, by = "1 week")
-
-}
+# Create a data frame for each version and label them by revision date
+mortality_revisions <- map_dfr(revision_dates, function(date) {
+ epix_as_of(mortality_archive, date) |>
+ mutate(revision_date = date) # Add a column for the revision date
+})
-value_at_lags <- map(ref_dates, get_val_asof,
- archive <- nchs_archive, offsets <- offsets) |>
- list_rbind()
-
-values_final <- epix_as_of(nchs_archive, max(nchs_archive$versions_end))
+# Extract the latest/finalized version
+mortality_latest <- epix_as_of(mortality_archive, max(mortality_archive$DT$version))
```
@@ -1128,503 +1330,327 @@ values_final <- epix_as_of(nchs_archive, max(nchs_archive$versions_end))
#| fig-width: 9
#| fig-height: 4
#| out-height: "500px"
-ggplot(value_at_lags, aes(x = time_value, y = mortality)) +
- geom_line(aes(color = factor(lag))) +
- facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
- scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
- labs(x = "", y = "Weekly new COVID deaths") +
- # scale_color_viridis_d(option="D", end=0.8) +
- theme(legend.position = "none") +
- geom_line(data = values_final, aes(x = time_value, y = mortality),
- inherit.aes = FALSE, color = "black")
-
+ggplot() +
+ geom_line(data = mortality_latest, aes(x = time_value, y = mortality), color = "black", size = 1) +
+ geom_line(data = mortality_revisions, aes(x = time_value, y = mortality, color = as.factor(revision_date))) +
+ geom_point(data = mortality_revisions, aes(x = time_value, y = mortality, color = as.factor(revision_date)), size = 2) +
+ theme_gray() +
+ labs(color = "Revision Date", y = "Mortality", x = "Date") +
+ ggtitle("COVID-19 Mortality in CA: Finalized (black) vs Various Revisions")
```
-## Revision pattern visualization
+## Comparison of final vs. one revision
-```{r nchs-plot-different-ver}
-#| echo: false
-#| eval: true
-
-versions = seq.Date(as.Date("2021-01-01"), nchs_archive$versions_end, by = "1 month")
-nchs_snapshots = map(versions, function(v) {
- epix_as_of(nchs_archive, v) |>
- mutate(version = v)}) |>
- bind_rows() |>
- mutate(latest = version == nchs_archive$versions_end)
-```
-
-```{r nchs-plot-val-different-ver}
-#| echo: false
-#| fig-width: 9
-#| fig-height: 4
-#| out-height: "500px"
+The figure below compares the finalized rates (in black) to [**one revision**]{.primary} (in yellow) from March 3, 2024.
-ggplot(nchs_snapshots |> filter(!latest),
- aes(x = time_value, y = mortality)) +
- geom_line(aes(color = factor(version))) +
- geom_vline(aes(color = factor(version), xintercept = version), lty = 3) +
- facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
- scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
- labs(x = "", y = "Weekly covid deaths") +
- scale_color_viridis_d(option = "B", end = .8) +
- theme(legend.position = "none") +
- geom_line(data = nchs_snapshots |> filter(latest),
- aes(x = time_value, y = mortality),
- inherit.aes = FALSE, color = "black")
-```
-
-## Aside: Do we need a burn-in/training set?
-
-* Typical stat-ML practice suggests a train, validation, test split.
-* But our exploratory analysis covered all available data; is that fine?
-
-
-* Generally, for exploratory analysis, it is fine to skip the train/test split.
-  + These analyses do not involve model fitting, so we have little risk of an overly optimistic performance evaluation (no overfitting to the test data).
-* However, for a [**pseudo-prospective analysis**]{.primary}, the best practice is to do a train/test split.
-  + In such an analysis, one would be fitting and validating many models; a train/test split provides more rigorous control of overfitting to the test set.
-
-## Ratio nowcaster: jumping from provisional to finalized value
-
-* Recall, the goal of a nowcast at date $t$ is to project the [*finalized value*]{.primary} of $y_t,$ given the information available on date $t$.
-* A very simple nowcaster is the ratio between finalized and provisional value.
-
-
-
-How can we sensibly estimate this quantity?
-
-
-
-::: {.fragment .fade-in}
+```{r one-revision-final-plot}
+#| out-height: "400px"
+as_of_date = as.Date("2024-03-03")
-* At nowcast date $t,$ we would have received reports with versions up to and including $t.$
-* We need to build training samples, which
- + correctly aligns finalized value against provisional value
- + uses features that would have been available at test time
- + have enough data to ensure sensible estimation results
-:::
+ggplot() +
+ geom_line(data = mortality_latest,
+ aes(x = time_value, y = mortality, color = "Finalized"),
+ size = 1) +
+ geom_line(data = mortality_revisions |> filter(revision_date == as_of_date),
+ aes(x = time_value, y = mortality, color = "Revision"),
+ size = 1) +
+ ylab("Mortality") +
+ xlab("Date") +
+ scale_color_manual(values = c("Finalized" = "black", "Revision" = "#FFB300")) +
+ labs(color = "Mortality Type")
-```{r nchs-ca-only}
-#| echo: false
-#| eval: true
-nchs_archive <- pub_covidcast(
- source = "nchs-mortality",
- signals = "deaths_covid_incidence_num",
- geo_type = "state",
- time_type = "week",
- geo_values = "ca",
- time_values = epirange(202001, 202440),
- issues = "*"
-) |>
- select(geo_value, time_value, version = issue, mortality = value) |>
- as_epi_archive(compactify = TRUE)
```
+The real-time data is biased downwards (systematically below the true value). That is, the signal tends to get scaled up with future revisions.
-## Ratio nowcaster: building training samples
+## Calculate one ratio: Provisional vs. finalized data
+
-* At nowcast date $t,$ we would have received reports with versions up to and including $t.$
-* We need to build training samples, which
- + correctly aligns finalized value against provisional value
- + uses features that would have been available at test time
- + have enough samples to ensure sensible estimation results
+Suppose that the day is March 10, 2024. Then, because the data is 7 days latent, we can compute the ratio between provisional and finalized data for [**March 3, 2024**]{.primary}.
-::: {.fragment .fade-in}
-* Build training samples by treating dates prior to date $t$ as actual nowcast dates.
- + What is the provisional data on that date?
-  + Have we received the finalized value for that date?
-:::
+```{r one-ratio-calc}
+#| echo: true
+as_of_date = as.Date("2024-03-10"); fixed_lag = 7
-## Ratio nowcaster: building training samples
+# Load the finalized mortality data for CA
+ca_finalized <- mortality_latest |>
+ filter(time_value == (as_of_date - fixed_lag)) |>
+ dplyr::select(mortality)
-* At an earlier nowcast date $t_0,$ we define
-  + [**Provisional value**]{.primary} as the reported value of $Y_{s_0}$ with version $t_0.$ Here $s_0$ is the largest occurrence date among all values reported up until $t_0.$
-  + [**Finalized value**]{.primary} as the (potentially unobserved) finalized value of $Y_{s_0}.$
-    - We only know in *hindsight* when the reported value of $Y_{s_0}$ is finalized -- we need an approximation.
+# Load the provisional mortality data for the same week
+mortality_old = epix_as_of(mortality_archive, as_of_date)
-::: {.fragment .fade-in}
-```{r revision-summary-time-near-latest-show-again}
-#| echo: false
-#| eval: true
+ca_provisional <- mortality_old |>
+ filter(time_value == (as_of_date - fixed_lag)) |>
+ dplyr::select(mortality)
-revision_data |> select(geo_value, time_value, time_near_latest) |> slice_sample(n = 5)
-quantile(revision_data$time_near_latest)
+# Calculate ratio between provisional and finalized cases for the week of interest
+ratio <- ca_provisional$mortality / ca_finalized$mortality
+ratio
```
-:::
-
-::: {.fragment .fade-in}
-We can say data reported 49 days after the occurrence date is good enough to be considered finalized.
-:::
-
-## Ratio nowcaster: test time feature
-* At test time $t,$ take the provisional value to be $Y_s,$ where $s$ is the largest occurrence date among all values reported up until time $t.$
+[**Conclusion**]{.primary}: The real-time rate is well below the finalized value for this week (26 vs. 72 here).
-## Nowcasting at a single date: building training samples
+[**Question**]{.primary}: Can we generalize this over many days?
-::: {.fragment .fade-in}
-* Searching for provisional values, at previous hypothetical nowcast dates.
-
-```{r one-date-look-for-provisional}
+## Calculating the ratio using multiple dates
+Let's move from calculating the ratio using one day to using multiple days, with the goal of nowcasting Feb. 18, which has a [**provisional value**]{.primary} of 23
+```{r provisional-val-feb18}
#| echo: true
+as_of_date = as.Date("2024-02-25")
-nowcast_date <- as.Date("2022-01-02"); window_length = 180
-
-initial_data <- nchs_archive$DT |>
- group_by(geo_value, time_value) |>
- filter(version == min(version)) |>
- rename(initial_val = mortality) |>
- select(geo_value, time_value, initial_val)
-
+provisional <- epix_as_of(mortality_archive, as_of_date) |>
+ filter(time_value == as_of_date - 7) |>
+ pull(mortality)
+provisional
```
-:::
-
-
-::: {.fragment .fade-in}
-* Searching for finalized values, at previous hypothetical nowcast dates.
+
+and a [**finalized value**]{.primary} of 104
-```{r one-date-look-for-final}
+```{r finalized-val-feb18}
#| echo: true
-#|
-finalized_data <- epix_as_of(nchs_archive, nowcast_date) |>
- filter(time_value >= nowcast_date - 49 - window_length & time_value <= nowcast_date - 49) |>
- rename(finalized_val = mortality) |>
- select(geo_value, time_value, finalized_val)
+finalized <- mortality_latest |>
+ filter(time_value == as_of_date - 7) |>
+ pull(mortality)
+finalized
```
-:::
+## Calculating the ratio using multiple dates
+First, let's download the real-time rates for CA and compare them to their finalized versions.
-::: {.fragment .fade-in}
-* After searching for both provisional and finalized values, we merge them together and estimate the ratio.
-```{r one-date-combine-provsional-and-final}
+```{r real-time-mortality}
#| echo: true
-
-ratio <- finalized_data |>
- inner_join(initial_data, by = c("geo_value", "time_value")) |>
- mutate(ratio = finalized_val / initial_val) |>
- pull(ratio) |>
- mean(na.rm = TRUE)
+dates <- seq(start_time, (as_of_date - 7), by = "day")
+mortality_real_time <- function(date) {
+ epix_as_of(mortality_archive, date + 7) |>
+ filter(time_value == date)
+}
+mortality_real_time_df <- map_dfr(dates, mortality_real_time)
+head(mortality_real_time_df)
+```
+
+## Calculating the ratio using multiple dates
+Now, let's plot the real-time vs the finalized mortality rates:
+```{r real-time-vs-finalized}
+ggplot() +
+ geom_line(data = mortality_latest |> filter(time_value <= (as_of_date - 7)),
+ aes(x = time_value, y = mortality, color = "Finalized")) +
+ geom_line(data = mortality_real_time_df,
+ aes(x = time_value, y = mortality, color = "Real-Time")) +
+ ylab("Mortality") +
+ xlab("Date") +
+ scale_color_manual(values = c("Finalized" = "black", "Real-Time" = "red")) +
+ labs(color = "Mortality Type") # Adds the legend title
+
```
-:::
+* [**Takeaways**]{.primary}: The real-time counts are biased [**well below**]{.primary} the finalized counts.
+* Systematic underreporting tends to lessen over time (the gap between the lines decreases).
-## Nowcasting at a single date: test feature and actual nowcast
+## Realistic limitation of nowcasting - Finalized data
+* Recall that real-time access to finalized data is limited as finalized values can take months to report (e.g., Jan. 7 is finalized 294 days later).
+* To nowcast accurately, we must rely on the [**best available approximation of finalized data**]{.primary} at the time of estimation (Feb. 25).
-```{r one-date-test-feat and nowcast}
+```{r finalized-data-as-of-feb25}
#| echo: true
-
-last_avail <- epix_as_of(nchs_archive, nowcast_date) |>
- slice_max(time_value) |>
- pull(mortality)
-
-nowcast <- last_avail * ratio
-nowcast
+mortality_as_of_feb25 <- epix_as_of(mortality_archive, as_of_date)
+head(mortality_as_of_feb25)
```
+## Ratio calculation & summary
-## Nowcasting for multiple dates
-
-* All previous manipulations should really be seen as a template for all nowcast dates.
-* The template computation should be applied over all nowcast dates, [**but we must respect data versioning**]{.primary}!
-* `epix_slide()` is designed just for this! It behaves similarly to `epi_slide()`.
-* Key exception: `epix_slide()` is version aware: the sliding computation at any reference time $t$ is performed on [**data that would have been available as of $t$**]{.primary}.
-
-
-
-## Nowcasting for multiple dates via `epix_slide()`
-
-We begin by templatizing our previous operations.
-
-```{r nowcaster-to-slide}
+We then use these "finalized" and real-time values to compute the mean ratio:
+```{r ratio-calc-summary}
#| echo: true
-
-nowcaster = function(x, g, t, wl=180, appx=49) {
-
-
- initial_data = x$DT |>
- group_by(geo_value, time_value) |>
- filter(version == min(version)) |>
- filter(time_value >= t - wl - appx & time_value <= t - appx) |>
- rename(initial_val = mortality) |>
- select(geo_value, time_value, initial_val)
-
- finalized_data = x$DT |>
- group_by(geo_value, time_value) |>
- filter(version == max(version)) |>
- filter(time_value >= t - wl - appx & time_value <= t - appx) |>
- rename(finalized_val = mortality) |>
- select(geo_value, time_value, finalized_val)
-
- ratio = finalized_data |>
- inner_join(initial_data, by = c("geo_value", "time_value")) |>
- mutate(ratio = finalized_val / initial_val) |>
- pull(ratio) |>
- median(na.rm=TRUE)
-
- last_avail = epix_as_of(x, t) |>
- slice_max(time_value) |>
- pull(mortality)
-
- res = tibble(geo_value = x$geo_value, target_date = t, nowcast = last_avail * ratio)
-
- return(res)
-
-}
-
+# exclude date we're nowcasting for
+mortality_real_time_df = mortality_real_time_df |> filter(time_value != "2024-02-18")
+mortality_as_of_feb25 = mortality_as_of_feb25 |> filter(time_value != "2024-02-18")
+ratio_real_time_to_feb25 <- mortality_real_time_df$mortality / mortality_as_of_feb25$mortality
+summary(ratio_real_time_to_feb25)
```
+On average, the real-time rates are ~25.7% of the finalized values.
-## Nowcasting for multiple dates via `epix_slide()`
-
-```{r epix-slide-extract-nowcast-date}
+```{r boxplot-ratio}
#| echo: false
-#| eval: true
-
-all_nowcast_dates = nchs_archive$DT |>
- filter(time_value >= as.Date("2022-01-01")) |>
- distinct(time_value) |>
- pull(time_value)
-```
-
-
-```{r nowcasts-slided}
-#| echo: true
-#| eval: true
-
-nowcasts = nchs_archive |>
- group_by(geo_value) |>
- epix_slide(
- nowcaster,
- .before=Inf,
- .versions = all_nowcast_dates,
- .all_versions = TRUE
-)
+ratio_df <- data.frame(ratio_real_time_to_feb25, mean_ratio = mean(ratio_real_time_to_feb25))
+
+# Create the boxplot with mean marked as a bold cross
+ggplot(ratio_df, aes(y = ratio_real_time_to_feb25)) +
+ geom_boxplot(fill = "lightblue", color = "black") +
+ geom_point(
+ aes(x = 0, y = mean_ratio), # Place point at the mean
+ shape = 4, # Cross shape
+ size = 7, # Size of the cross
+ color = "darkblue", # Color of the cross
+ stroke = 2 # Boldness of the cross
+ ) +
+ labs(
+ title = "Distribution of Real-Time to Finalized Mortality Ratios",
+ y = "Real-Time to Finalized Ratio"
+ ) +
+ theme_minimal() + # Minimal theme for clean look
+ theme(
+ plot.title = element_text(hjust = 0.5), # Center the title
+ axis.text.x = element_blank()
+ ) +
+  coord_cartesian(ylim = c(0.2, 0.3)) # Limit y-axis between 0.2 and 0.3 to zoom in
```
+This tells us the distribution is right-skewed (mean > median), so we should opt for the median.
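+
+As a quick check of that claim (a sketch; values as reported on the previous slides):
+
+```{r mean-vs-median-ratio-check}
+#| echo: true
+#| eval: false
+
+# The mean (~0.257) exceeds the median (0.250), indicating right skew
+c(mean = mean(ratio_real_time_to_feb25),
+  median = median(ratio_real_time_to_feb25))
+```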
+## Nowcasting on Feb. 25
-## Details of `epix_slide()`
-
-
-```{r epix-slide-demo-call-allversions-FALSE}
+* Since the [**median ratio**]{.primary} between real-time and finalized values is [**0.250**]{.primary} (i.e., real-time values are typically 25% of the finalized values), the nowcast is
+```{r nowcast-feb25-point}
#| echo: true
-#| eval: false
-
-epix_slide(
- .x,
- .f,
- ...,
- .before = Inf,
- .versions = NULL,
- .new_col_name = NULL,
- .all_versions = FALSE
-)
+# Now we can nowcast properly:
+nowcast <- provisional * 1 / median(ratio_real_time_to_feb25)
+nowcast
```
+* To get the accompanying 95% prediction interval, calculate the 2.5th and 97.5th percentiles:
-* `.f` in `epix_slide()` can be specified with the same form of custom function as `epi_slide()`.
-
-```{r epix-slide-form-of-custom-function}
+```{r nowcast-feb25-lower}
#| echo: true
-#| eval: false
+percentile_97.5 <- quantile(ratio_real_time_to_feb25, 0.975) |> unname()
-function(x, g, t) {
- # function body
-}
+(lower_PI <- provisional * 1 / percentile_97.5)
```
-* The mandatory arguments of `.f` take different forms depending on the value of `.all_versions`.
-
-
-## Details of `epix_slide()`
-
-```{r epix-slide-demo-call-allversions-FALSE-again}
+```{r nowcast-feb25-upper}
#| echo: true
-#| eval: false
-#| code-line-numbers: "|8"
-
-epix_slide(
- .x,
- .f,
- ...,
- .before = Inf,
- .versions = NULL,
- .new_col_name = NULL,
- .all_versions = FALSE
-)
+percentile_2.5 <- quantile(ratio_real_time_to_feb25, 0.025) |> unname()
+(upper_PI <- provisional * 1 / percentile_2.5)
```
-::: {.fragment .fade-in}
-* When `.all_versions = FALSE`, `epix_slide()` essentially iterates the templatized computation over snapshots.
-* Said differently, when `.all_versions = FALSE`, data accessed at any sliding iteration [**only involves a single version**]{.primary}.
-:::
+* So, the [**nowcast is 92**]{.primary} with 95% PI: [61, 140], which is much closer to the [**finalized value of 104**]{.primary} than the [**provisional value of 23**]{.primary}.
-::: {.fragment .fade-in}
-* Hence:
- + `x`: an `epi_df` with same column names as archive's `DT`, minus the `version` column.
-  + `g`: a one-row tibble containing the values of the grouping variables of the associated group.
- + `t`: the `ref_time_value` of the current window.
- + `...`: additional arguments.
-:::
+## Summary of three main steps
+So the main steps for this type of fixed-lag nowcasting are...
-## Details of `epix_slide()`
+1. Obtain the [**provisional value**]{.primary} for the target.
-```{r epix-slide-demo-call-allversions-TRUE}
-#| echo: true
-#| eval: false
-#| code-line-numbers: "|8"
+2. Estimate the ratio using the [**real-time**]{.primary} and [**"finalized"**]{.primary} data (for all previous dates that follow a consistent pattern in reporting).
-epix_slide(
- .x,
- .f,
- ...,
- .before = Inf,
- .versions = NULL,
- .new_col_name = NULL,
- .all_versions = TRUE
-)
-```
+3. Profit.
-::: {.fragment .fade-in}
-* When `.all_versions = TRUE`, data accessed at any sliding iteration involves versions [**up to and including .ref_time_value**]{.primary}.
-:::
-
-::: {.fragment .fade-in}
-* Hence:
- + `x`: an `epi_archive`, with version up to and including `.ref_time_value`.
-  + `g`: a one-row tibble containing the values of the grouping variables of the associated group.
- + `t`: the `ref_time_value` of the current window.
- + `...`: additional arguments.
-:::
-
-
-## Details of `epix_slide()`
-
-```{r nowcasts-slide-demo-only, eval=FALSE}
+```{r ratio-summary-steps}
#| echo: true
#| eval: false
-#| code-line-numbers: "|7,13,17"
-
-nowcasts <- nchs_archive |>
- group_by(geo_value) |>
- epix_slide(
- nowcaster,
- .before=Inf,
- .versions = all_nowcast_dates,
- .all_versions = TRUE
-)
+#| code-fold: true
+#| code-summary: "Expand for the accompanying code"
+# Today
+as_of_date = as.Date("2024-02-25")
-nowcaster <- function(x, g, t, wl=180, appx=49) {
-
+# 1. Obtain the provisional value
+provisional <- epix_as_of(mortality_archive, as_of_date) |>
+ filter(time_value == as_of_date - 7) |>
+ pull(mortality)
+provisional
- initial_data = x$DT |>
- # Function body, omitted
+# 2. Estimate the ratio
+mortality_real_time_df <- map_dfr(dates, mortality_real_time) |> filter(time_value != "2024-02-18") # Real-time
+mortality_as_of_feb25 <- epix_as_of(mortality_archive, as_of_date) |> filter(time_value != "2024-02-18") # "Finalized"
+ratio_real_time_to_feb25 <- mortality_real_time_df$mortality / mortality_as_of_feb25$mortality
- finalized_data = x$DT |>
- # Function body, omitted
-
-}
+# 3. Profit.
+(nowcast <- provisional * 1 / median(ratio_real_time_to_feb25))
+(upper_PI <- provisional * 1 / quantile(ratio_real_time_to_feb25, 0.025))
+(lower_PI <- provisional * 1 / quantile(ratio_real_time_to_feb25, 0.975))
```
+## Nowcasting mortality for multiple dates
-
-## Visualize nowcasts
-
-
-We are now finally able to compare nowcasts against first available reports:
-
-```{r nowcast-subsetting}
-#| echo: false
-
-
-intial_val_extracter <- function(x, g, t, wl=180, appx=49) {
+* [**Define Nowcast Function**]{.primary}:
+ * [**Input**]{.primary}: Takes in the dates to nowcast and the fixed lag
+ * [**Output**]{.primary}: The nowcasted mortality rates based on the ratio of real-time to finalized data.
+```{r nowcasting-multipl-dates-fun}
+#| echo: true
+#| code-fold: true
+nowcast_function <- function(nowcast_date, fixed_lag) {
+ as_of_date = nowcast_date + fixed_lag
- last_avail = epix_as_of(x, t) |>
- slice_max(time_value) |>
+ # 1. Obtain the provisional value for the target.
+ provisional <- epix_as_of(mortality_archive, as_of_date) |>
+ filter(time_value == as_of_date - fixed_lag) |>
pull(mortality)
- res = tibble(geo_value = x$geo_value, target_date = t, value = last_avail)
+  # 2. Estimate the ratio multiplier using
+  # real-time
+  dates_seq <- seq(start_time, (nowcast_date - fixed_lag), by = "week")
+  real_time_df <- map_dfr(dates_seq, mortality_real_time) # renamed to avoid shadowing the helper function
+
+ # and "finalized" data
+ finalized <- epix_as_of(mortality_archive, as_of_date) |> filter(time_value >= start_time & time_value <= (nowcast_date - fixed_lag))
+
+  ratios <- real_time_df$mortality / finalized$mortality
+
+ # Remove infinite or NaN ratios (i.e., keep only finite values)
+ median_ratio <- median(ratios[is.finite(ratios)])
- return(res)
+  # 3. Profit.
+ nowcast <- provisional * (1 / median_ratio)
+ upper_PI <- provisional * (1 / quantile(ratios[is.finite(ratios)], 0.025))
+ lower_PI <- provisional * (1 / quantile(ratios[is.finite(ratios)], 0.975))
+ # Return a dataframe with the nowcast and date
+ tibble(
+ time_value = nowcast_date,
+ nowcast = nowcast,
+ lower_PI = lower_PI,
+ upper_PI = upper_PI
+ )
}
-
-
-provisional_val = epix_slide(nchs_archive, intial_val_extracter, .versions = all_nowcast_dates, .all_versions = TRUE)
-
-finalized_val = nchs_archive$DT |>
- filter(time_value >= as.Date("2022-01-01")) |>
- group_by(geo_value, time_value) |>
- filter(version == max(version))
-
```
+## Map nowcast over multiple dates
+* We can use `map2()` to apply the function to a series of weeks (e.g., Jan. 28 to Mar. 24).
+* Returns a [**dataframe**]{.primary} with nowcasted results.
-```{r nowcast-fun-plot-results}
-#| echo: false
-
-ggplot() +
- geom_line(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized")) +
- geom_point(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) +
- geom_line(data = provisional_val, aes(x = target_date, y = value, color = "Provisional")) +
- geom_point(data = provisional_val, aes(x = target_date, y = value, color = "Provisional"), shape = 16) +
- geom_line(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast")) +
- geom_point(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast"), shape = 16) +
- # geom_ribbon(data = nowcast_results_df,
- # aes(x = time_value, ymin = lower_PI, ymax = upper_PI),
- # alpha = 0.2, fill = "blue") +
- ylab("Mortality") +
- xlab("Date") +
- scale_color_manual(name = "",
- values = c("Nowcast" = "#71c5e8", "Provisional" = "#FF6900", "Finalized" = "black")) +
- theme(legend.position = "bottom")
+```{r map-nowcast-fun-multiple-dates}
+#| echo: true
+# Apply Nowcast Function Over Multiple Dates
+nowcast_dates <- seq(as.Date("2024-01-28"), as.Date("2024-03-24"), by = "week")
+fixed_lag <- 7
+nowcast_results_df <- map2(nowcast_dates, fixed_lag, nowcast_function) |> list_rbind()
```
-* The real-time counts tend to be biased below the finalized counts. Nowcasted values tend to provide a much better approximation of the truth (at least for these dates).
-
-
-## Smoothing nowcasts
-
-* Nowcasts are quite volatile, reflecting the provisional counts are far from complete.
-* We can use a trailing average to smooth them.
-
-```{r smooth-nowcasts-epi-slide}
-#| echo: true
-
-smoothed_nowcasts <- epi_slide(
- nowcasts |> as_epi_df(),
- smoothed_nowcasts = mean(nowcast, na.rm = TRUE),
- .window_size = as.difftime(3, units = "weeks")
-)
+Let's smooth with a rolling trailing mean (window size 4) & see the results:
+```{r smooth-print-res}
+# Smooth results: Apply a rolling mean to the nowcast and bounds with a window size of 4
+library(zoo)
+nowcast_results_df <- nowcast_results_df |>
+ mutate(across(.cols = -time_value,
+ .fns = ~ zoo::rollapply(.x, width = 4, FUN = mean, partial = TRUE, align = "right"),
+ .names = "{.col}"))
+nowcast_results_df
```
-```{r nowcast-smoothed-vis}
+## Visualize nowcast, real-time, and finalized values
+Finally, we can compare these nowcast results to the real-time and finalized values:
+```{r nowcast-fun-plot-results}
#| echo: false
-
-cbPalette = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442",
- "#0072B2", "#D55E00", "#CC79A7")
-
ggplot() +
- geom_line(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized")) +
- geom_point(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) +
- geom_line(data = smoothed_nowcasts, aes(x = time_value, y = smoothed_nowcasts, color = "Smoothed")) +
- geom_point(data = smoothed_nowcasts, aes(x = time_value, y = smoothed_nowcasts, color = "Smoothed"), shape = 16) +
- geom_line(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast")) +
- geom_point(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast"), shape = 16) +
- # geom_ribbon(data = nowcast_results_df,
- # aes(x = time_value, ymin = lower_PI, ymax = upper_PI),
- # alpha = 0.2, fill = "blue") +
+ geom_line(data = nowcast_results_df, aes(x = time_value, y = nowcast, color = "Nowcast")) +
+ geom_point(data = nowcast_results_df, aes(x = time_value, y = nowcast, color = "Nowcast"), shape = 16) +
+ geom_line(data = map_dfr(nowcast_dates, mortality_real_time), aes(x = time_value, y = mortality, color = "Real-Time")) +
+ geom_point(data = map_dfr(nowcast_dates, mortality_real_time), aes(x = time_value, y = mortality, color = "Real-Time"), shape = 16) +
+ geom_line(data = mortality_latest |> filter(time_value %in% nowcast_dates), aes(x = time_value, y = mortality, color = "Finalized")) +
+ geom_point(data = mortality_latest |> filter(time_value %in% nowcast_dates), aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) +
+ geom_ribbon(data = nowcast_results_df,
+ aes(x = time_value, ymin = lower_PI, ymax = upper_PI),
+ alpha = 0.2, fill = "blue") +
ylab("Mortality") +
xlab("Date") +
- scale_color_manual(name = "", values = c("Nowcast" = "#71c5e8", "Smoothed" = "#97D700", "Finalized" = "black")) +
- theme(legend.position = "bottom")
+ scale_color_manual(values = c("Nowcast" = "blue", "Real-Time" = "red", "Finalized" = "black")) +
+ labs(color = "Mortality Type") # Adds the legend title
```
-
-
+The real-time counts tend to be biased below the finalized counts. Nowcasted values tend to provide a much better approximation of the truth (at least for these dates).
## Evaluation using MAE
@@ -1632,6 +1658,7 @@ ggplot() +
* Then, for $y_{t}$ over times $t = 1, \dots, N$, we may compute error metrics like the mean absolute error (MAE).
+
* MAE measures the average absolute difference between the nowcast and finalized values.
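+
+In symbols, writing $\hat{y}_t$ for the nowcast and $y_t$ for the finalized value:
+
+$$\mathrm{MAE} = \frac{1}{N} \sum_{t=1}^{N} \lvert \hat{y}_t - y_t \rvert$$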
@@ -1645,33 +1672,50 @@ Let's numerically evaluate our point nowcasts for the provisional values of a ti
-
```{r mae-code}
-#| echo: false
+#| echo: true
# Step 1: Join the mortality data with nowcast data
-mae_data <- finalized_val |>
- select(-version) |>
- inner_join(nowcasts |> select(-version), by = join_by("geo_value", "time_value" == "target_date")) |>
- inner_join(initial_data, by = c("geo_value", "time_value")) |>
- inner_join(smoothed_nowcasts |> select(-version, -nowcast), by = c("geo_value", "time_value"))
+mae_data <- mortality_latest |>
+ filter(time_value %in% nowcast_dates) |>
+ left_join(nowcast_results_df, by = "time_value") |>
+ left_join(map_dfr(nowcast_dates, mortality_real_time) |> rename(real_time = mortality), by = c("geo_value", "time_value"))
# Step 2: Calculate the absolute error between actual and nowcasted values
mae_data <- mae_data |>
- mutate(raw_nc_res = abs(mortality - nowcast),
- prov_res = abs(mortality - initial_val),
- smoothed_res = abs(mortality - smoothed_nowcasts))
+ mutate(nc_abs_error = abs(mortality - nowcast),
+ rt_abs_error = abs(mortality - real_time))
# Step 3: Compute the MAE (mean of absolute errors)
mae_value <- mae_data |>
- group_by(geo_value) |>
- summarise(`Smoothed MAE` = mean(smoothed_res, na.rm = TRUE),
- `Unsmoothed nowcast MAE` = mean(raw_nc_res, na.rm = TRUE),
- `Provisional value MAE` = mean(prov_res, na.rm = TRUE)) |>
- select(-geo_value)
-
+ summarise(nc_MAE = mean(nc_abs_error),
+ rt_MAE = mean(rt_abs_error))
knitr::kable(mae_value)
```
+## Evaluation using MAE
+
+Finally, we may visualize the distribution of errors across time:
+```{r abs-error-plot}
+ggplot(mae_data) +
+ # Nowcast Absolute Error
+ geom_point(aes(x = time_value, y = nc_abs_error), color = "blue", size = 2) +
+ geom_line(aes(x = time_value, y = nc_abs_error), color = "blue") +
+ # Horizontal line for Nowcast Mean Absolute Error
+ geom_hline(aes(yintercept = mean(nc_abs_error), color = "Nowcast MAE"), linetype = "dashed") +
+
+ # Real-Time Absolute Error
+ geom_point(aes(x = time_value, y = rt_abs_error), color = "red", size = 2) +
+ geom_line(aes(x = time_value, y = rt_abs_error), color = "red") +
+ # Horizontal line for Real-Time Mean Absolute Error
+ geom_hline(aes(yintercept = mean(rt_abs_error), color = "Real-Time MAE"), linetype = "dashed") +
+
+ # Customize the x and y axes labels, legend & add title
+ xlab("Time") + ylab("Absolute Error") +
+ scale_color_manual(values = c("Real-Time MAE" = "red", "Nowcast MAE" = "blue")) +
+ labs(color = "Mean Absolute Error")
+```
+
+We can see that the absolute errors are almost always lower for nowcasting.
# Nowcasting with Two Variables
@@ -1939,7 +1983,8 @@ So visualizing can provide an important perspective that is missed from a simple
* [**Sliding Window Approach**]{.primary}: Predictions are based on [data up to the current time]{.primary}, ensuring no future information influences the nowcast.
* [**Evaluation**]{.primary}: Predictions are compared with actual admissions using numerical and visual perspectives.
-# Case Study - Nowcasting Cases Using %CLI
+
+# Case Study - Nowcasting Cases Using %CLI
## Goal of this case study
@@ -1952,7 +1997,7 @@ So visualizing can provide an important perspective that is missed from a simple
## Summary of main steps
-The workflow is similar to the previous example where we nowcasted using two variables, only more involved.
+The workflow is similar to the previous example where we nowcasted using two variables, only more involved.
The main steps are...
1. [**Fetch Data**]{.primary}: Retrieve %CLI and COVID-19 case data (by specimen collection date) for MA.
@@ -1961,7 +2006,7 @@ The main steps are...
3. [**Model & Prediction**]{.primary}: Fit a linear model to predict cases based on %CLI, trained on a 30-day rolling window.
-4. [**Nowcast Execution**]{.primary}: Use `epix_slide` to nowcast the cases dynamically.
+4. [**Nowcast Execution**]{.primary}: Use `epix_slide()` to nowcast the cases dynamically.
5. [**Visualization**]{.primary}: Plot actual vs. nowcasted cases with confidence intervals to assess model accuracy.
@@ -1999,11 +2044,11 @@ Brief summary of this data:
## Main steps to construct the `epi_archive`
1. [**Load necessary Libraries**]{.primary}: Such as `tidyverse`, `readxl`, `epiprocess`.
-2. [**Process Each Date's Data**]{.primary}:
+2. [**Process Each Date's Data**]{.primary}:
* A function we'll make (`process_covid_data`) downloads and processes daily COVID-19 data from the MA gov Excel files on their website.
* The data is cleaned and formatted with columns: `geo_value`, `time_value`, `version`, and values.
3. [**Handle Missing Data**]{.primary}: Checks if a date's data is available (handle 404 errors).
-4. [**Create `epi_archive`**]{.primary}:
+4. [**Create `epi_archive`**]{.primary}:
* Combine processed data into a tibble.
* Convert the tibble to an `epi_archive` object using `as_epi_archive()`.
@@ -2021,25 +2066,25 @@ library(epiprocess)
# Function to download and process each Excel file for a given date
process_covid_data <- function(Date) {
# Generate the URL for the given date
- url <- paste0("https://www.mass.gov/doc/covid-19-raw-data-", tolower(gsub("-0", "-", format(Date, "%B-%d-%Y"))), "/download")
+ url <- paste0("https://www.mass.gov/doc/covid-19-raw-data-", tolower(gsub("-0", "-", format(Date, "%B-%d-%Y"))), "/download")
# Applies gsub("-0", "-", ...) to replace any occurrence of -0 (such as in "April-01") with just - (resulting in "April-1").
-
+
# Check if the URL exists (handle the 404 error by skipping that date)
response <- GET(url)
-
+
if (status_code(response) != 200) {
return(NULL) # Skip if URL doesn't exist (404)
}
-
+
# Define the destination file path for the Excel file
file_path <- tempfile(fileext = ".xlsx")
-
+
# Download the Excel file
GET(url, write_disk(file_path, overwrite = TRUE))
-
+
# Read the relevant sheet from the Excel file
data <- read_excel(file_path, sheet = "CasesByDate (Test Date)")
-
+
# Process the data: rename columns and convert Date
data <- data |>
rename(
@@ -2049,7 +2094,7 @@ process_covid_data <- function(Date) {
Case_Average_7day = `7-day confirmed case average`
) |>
mutate(Date = as.Date(Date)) # Convert to Date class
-
+
# Create a tibble with the required columns for the epi_archive
tib <- tibble(
geo_value = "ma", # Massachusetts (geo_value)
@@ -2057,12 +2102,12 @@ process_covid_data <- function(Date) {
version = Date, # The extracted version date
case_rate_7d_av = data$Case_Average_7day # 7-day average case value
)
-
+
return(tib)
}
```
-## Fetch Data - Code breakdown
+## Fetch Data - Code breakdown
* The purpose of this function is to download and process each Excel file as of a given date.
* [**URL Creation**]{.primary}: Dynamically generates the URL based on the date, removing leading zeros in day values (e.g., "April-01" → "April-1").
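+
+For instance, the formatting step for April 1, 2021 works like this (a quick sketch, assuming an English locale):
+
+```{r url-date-format-demo}
+#| echo: true
+#| eval: false
+
+Date <- as.Date("2021-04-01")
+# "April-01-2021" becomes "april-1-2021"
+tolower(gsub("-0", "-", format(Date, "%B-%d-%Y")))
+```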
@@ -2084,20 +2129,20 @@ process_covid_data <- function(Date) {
process_data_for_date_range <- function(start_date, end_date) {
# Generate a sequence of dates between start_date and end_date
date_sequence <- seq(as.Date(start_date), as.Date(end_date), by = "day")
-
+
# Process data for each date and combine results
covid_data_list <- lapply(date_sequence, function(Date) {
process_covid_data(Date) # Skip over dates with no data (NULLs will be ignored)
})
-
+
# Combine all non-null individual tibbles into one data frame
combined_data <- bind_rows(covid_data_list[!sapply(covid_data_list, is.null)])
-
+
# Convert the combined data into an epi_archive object
if (nrow(combined_data) > 0) {
epi_archive_data <- combined_data |>
as_epi_archive(compactify = FALSE)
-
+
return(epi_archive_data)
} else {
message("No valid data available for the given date range.")
@@ -2129,7 +2174,7 @@ y <- process_data_for_date_range("2021-01-10", "2021-12-01") # Raw .xlsx data i
y
```
-* Alternatively, you may run the following to load `y` that was previously saved as an RDS file:
+* Alternatively, you may run the following to load `y` that was previously saved as an RDS file:
```{r load-ma-case-data}
#| echo: true
#| eval: FALSE
@@ -2146,7 +2191,7 @@ y <- readRDS("_data/ma_case_archive.rds")
```{r fetch-outpatient-cli}
#| echo: true
-# Step 1: Fetch Versioned Data
+# Step 1: Fetch Versioned Data
x <- pub_covidcast(
source = "doctor-visits",
signals = "smoothed_adj_cli",
@@ -2192,15 +2237,15 @@ lm_mod_pred <- function(data, ...) {
# Make predictions
predictions = predict(model,
newdata = data |>
- fill(percent_cli, .direction = "down") |>
+ fill(percent_cli, .direction = "down") |>
filter(time_value == max(time_value)),
interval = "prediction", level = 0.9)
-
+
# Pull off real-time value for later comparison to the nowcast value
real_time_val = data |> filter(time_value == max(time_value)) |> pull(case_rate_7d_av)
-
+
# Could clip predictions and bounds at 0
- return(data.frame(predictions, actual_nowcast_date = max(data$time_value), real_time_val = real_time_val))
+ return(data.frame(predictions, actual_nowcast_date = max(data$time_value), real_time_val = real_time_val))
}
```
@@ -2213,8 +2258,8 @@ lm_mod_pred <- function(data, ...) {
```{r nowcasting-epix-slide-cases}
#| echo: true
# Define the reference time points (to give the training/test split)
-targeted_nowcast_dates <- seq(as.Date("2021-04-01"), as.Date("2021-11-01"), by = "1 month")
-ref_time_values = targeted_nowcast_dates + 1 # + 1 because the case data is 1 day latent.
+targeted_nowcast_dates <- seq(as.Date("2021-04-01"), as.Date("2021-11-01"), by = "1 month")
+ref_time_values = targeted_nowcast_dates + 1 # + 1 because the case data is 1 day latent.
# Determine this from revision_summary(y)
# Use epix_slide to perform the nowcasting with a training-test split
@@ -2240,7 +2285,7 @@ Merge the nowcast results with the latest data for more direct comparison:
```{r join-with-actual-cases}
#| echo: true
x_latest <- epix_as_of(archive, max(archive$DT$version)) |>
- select(-percent_cli)
+ select(-percent_cli)
res <- nowcast_res |> left_join(x_latest, by = join_by(geo_value, time_value))
@@ -2269,7 +2314,7 @@ ggplot(res, aes(x = time_value)) +
# Adjust colors
scale_color_manual(values = c("Finalized Cases (7-dav)" = "black",
"Nowcast" = "#1f78b4",
- "Real Time" = "darkred")) +
+ "Real Time" = "darkred")) +
scale_fill_manual(values = c("Pred. Interval" = "#a6cee3")) + # Light blue
# Improve the theme
theme_minimal() +
@@ -2289,12 +2334,12 @@ ggplot(res, aes(x = time_value)) +
```{r mae-code-cases}
#| echo: true
# Calculate the absolute error between actual and nowcasted COVID-19 cases
-mae_data_cases <- res |>
+mae_data_cases <- res |>
mutate(nc_abs_error = abs(case_rate_7d_av - fit), # Nowcast vs Finalized cases (7-day average)
rt_abs_error = abs(case_rate_7d_av - real_time_val)) # Real-Time vs Finalized cases
# Compute the MAE (mean of absolute errors)
-mae_value_cases <- mae_data_cases |>
+mae_value_cases <- mae_data_cases |>
summarise(nc_MAE = mean(nc_abs_error),
rt_MAE = mean(rt_abs_error))
knitr::kable(mae_value_cases)
@@ -2306,18 +2351,18 @@ knitr::kable(mae_value_cases)
```{r abs-error-plot-cases}
# Plot the absolute errors for Nowcast and Real-Time vs Finalized COVID-19 cases
ggplot(mae_data_cases) +
- # Nowcast Absolute Error
+ # Nowcast Absolute Error
geom_point(aes(x = time_value, y = nc_abs_error), color = "#1f78b4", size = 2) +
geom_line(aes(x = time_value, y = nc_abs_error), color = "#1f78b4") +
# Horizontal line for Nowcast Mean Absolute Error
geom_hline(aes(yintercept = mean(nc_abs_error), color = "Nowcast MAE"), linetype = "dashed") +
-
- # Real-Time Absolute Error
+
+ # Real-Time Absolute Error
geom_point(aes(x = time_value, y = rt_abs_error), color = "darkred", size = 2) +
geom_line(aes(x = time_value, y = rt_abs_error), color = "darkred") +
# Horizontal line for Real-Time Mean Absolute Error
geom_hline(aes(yintercept = mean(rt_abs_error), color = "Real-Time MAE"), linetype = "dashed") +
-
+
# Customize the x and y axes labels, legend & add title
xlab("Date") + ylab("Absolute Error") +
scale_color_manual(values = c("Real-Time MAE" = "darkred", "Nowcast MAE" = "#1f78b4")) +
@@ -2347,39 +2392,3 @@ Main Steps:
Overall, nowcasting, based on the linear model, provided a closer approximation of true cases compared to the real-time values.
-
-
-## Aside on nowcasting
-
-* To some Epis, "nowcasting" can be equated with "estimate the time-varying instantaneous reproduction number, $R_t$"
-
-* Ex. using the number of reported COVID-19 cases in British Columbia between Jan. 2020 and Apr. 15, 2023.
-
-
-```{r rtestim}
-#| fig-width: 9
-#| fig-height: 3
-#| out-height: "400px"
-#| label: nowcasting
-library(rtestim)
-source(here::here("_code/bccovid.R"))
-
-p1 <- bccovid|>
- ggplot(aes(date, cases)) +
- geom_line(colour = primary) +
- geom_vline(xintercept = ymd("2023-04-15"), colour = secondary,
- linewidth = 2) +
- labs(y = "BC Covid-19 cases", x = "Date") +
- scale_y_continuous(expand = expansion(c(0, NA)))
-bc_rt <- estimate_rt(bccovid$cases, x = bccovid$date,
- lambda = c(1e6, 1e5))
-p2 <- plot(confband(bc_rt, lambda = 1e5)) +
- coord_cartesian(ylim = c(0.5, 2)) +
- scale_y_continuous(expand = expansion(0))
-cowplot::plot_grid(p1, p2)
-```
-
-* Group built [`{rtestim}`](https://dajmcdon.github.io/rtestim) doing for this nonparametrically.
-
-* We may come back to this later...
-
From 0ea5dddb6973d2ea6e167b8837a0a179952be8c6 Mon Sep 17 00:00:00 2001
From: XuedaShen <110584221+XuedaShen@users.noreply.github.com>
Date: Mon, 9 Dec 2024 14:45:24 -0800
Subject: [PATCH 2/2] Fix section on ratio nowcaster to be in sync with `main`
---
slides/day1-afternoon.qmd | 942 +++++++++++++++++++++++---------------
1 file changed, 567 insertions(+), 375 deletions(-)
diff --git a/slides/day1-afternoon.qmd b/slides/day1-afternoon.qmd
index 24b8641..16fdbbc 100644
--- a/slides/day1-afternoon.qmd
+++ b/slides/day1-afternoon.qmd
@@ -1166,50 +1166,12 @@ ggplot(snapshots |> filter(!latest),
:::
-* We also have access to $p$ other time series
-$x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots,p$
+* We may also have access to $p$ other time series
+$x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots, p$ which may be subject to revisions.
-* All may be subject to revisions.
-
-## Aside on nowcasting
-
-* To some Epis, "nowcasting" can be equated with "estimate the time-varying instantaneous reproduction number, $R_t$"
-
-* Ex. using the number of reported COVID-19 cases in British Columbia between Jan. 2020 and Apr. 15, 2023.
-
-
-```{r rtestim}
-#| fig-width: 9
-#| fig-height: 3
-#| out-height: "400px"
-#| label: nowcasting
-library(rtestim)
-source(here::here("_code/bccovid.R"))
-
-p1 <- bccovid|>
- ggplot(aes(date, cases)) +
- geom_line(colour = primary) +
- geom_vline(xintercept = ymd("2023-04-15"), colour = secondary,
- linewidth = 2) +
- labs(y = "BC Covid-19 cases", x = "Date") +
- scale_y_continuous(expand = expansion(c(0, NA)))
-bc_rt <- estimate_rt(bccovid$cases, x = bccovid$date,
- lambda = c(1e6, 1e5))
-p2 <- plot(confband(bc_rt, lambda = 1e5)) +
- coord_cartesian(ylim = c(0.5, 2)) +
- scale_y_continuous(expand = expansion(0))
-cowplot::plot_grid(p1, p2)
-```
-
-* Group built [`{rtestim}`](https://dajmcdon.github.io/rtestim) doing for this nonparametrically.
-
-* We may come back to this later...
-
-# Nowcasting with One Variable
-
-## Nowcasting simple ratio Ex: NCHS mortality
+## Case study: NCHS mortality
* In this example, we'll demonstrate the concept of nowcasting using [**NCHS mortality data**]{.primary}
(the number of weekly new deaths with confirmed or presumed COVID-19).
@@ -1219,47 +1181,65 @@ cowplot::plot_grid(p1, p2)
## Fetch versioned data
Let's fetch versioned mortality data from the API (`pub_covidcast`) for CA and UT (`geo_values = c("ca", "ut")`) and the signal of interest (`deaths_covid_incidence_num`) from 2020 through 2024.
+
```{r mortality-archive-construct}
#| echo: true
+
# Fetch the versioned NCHS mortality data (weekly)
-mortality_archive <- pub_covidcast(
+nchs_archive <- pub_covidcast(
source = "nchs-mortality",
signals = "deaths_covid_incidence_num",
geo_type = "state",
time_type = "week",
- geo_values = "ca", # California (CA)
- time_values = epirange(202401, 202413),
+ geo_values = c("ca", "ut"),
+ time_values = epirange(202001, 202440),
issues = "*"
) |>
select(geo_value, time_value, version = issue, mortality = value) |>
as_epi_archive(compactify = TRUE)
-# Set the start and end days for the analysis
-# corresponding to the weeks entered in time_values
-start_time = as.Date("2023-12-31")
-end_time = as.Date("2024-03-24")
```
-## Latency in reporting - Minimum lag
-* A quick inspection reveals that mortality rates are systematically 7 days latent ([**fixed lag**]{.primary}).
+## Analysis of versioning behavior
-```{r inspect-latency-dplyr-way}
+Recall, we need to watch out for:
+
+* [**Latency**]{.primary}: the time difference between the date of occurrence and the date of the initial report.
+* [**Backfill**]{.primary}: how data for a given date is updated after the initial report.
+
+`revision_summary()` provides a summary of both aspects.
+
+```{r inspect-revision_summary}
#| echo: true
-mortality_revision_inspect = mortality_archive$DT |> mutate(version_time_diff = version - time_value)
+revision_data = revision_summary(nchs_archive, mortality, print_inform = FALSE)
+```
+
-# Look at the first revision for each week
-mortality_revision_inspect |> group_by(time_value) |> slice(1) |> head()
+## Versioning analysis -- latency
+
+* [**Question:**]{.primary} What is the latency of NCHS data?
+
+```{r nchs-latency-via-revision-summary}
+#| echo: true
+
+revision_data |> select(geo_value, time_value, min_lag) |> slice_sample(n = 10)
```
-* Use `revision_summary()` from `epiprocess` to generate basic statistics about the revision behavior for the dataset.
+* We randomly sampled some dates to check if there is a consistent latency pattern.
+* Understanding latency prevents us from using data that we shouldn't have access to.
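+
+A fuller check (a sketch): tabulate `min_lag` over all dates rather than relying on a random sample.
+
+```{r nchs-latency-tabulate}
+#| echo: true
+#| eval: false
+
+# How often does each minimum lag occur, per geography?
+revision_data |> count(geo_value, min_lag)
+```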
-```{r revision-summary-ex}
-#| eval: false
-revision_summary(mortality_archive, print_inform = TRUE)
+## Versioning analysis -- backfill
+
+* [**Question:**]{.primary} How long does it take for the reported value to be close to the finalized value?
+
+```{r revision-summary-time-near-latest-demo}
+revision_data |> select(geo_value, time_value, time_near_latest) |> slice_sample(n = 10)
```
+* It generally takes at least 4 weeks for the reported value to come within 20\% of the finalized value.
+* We can change this percentage threshold via the `within_latest` argument of `revision_summary()`.
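+
+For instance, a sketch with an assumed stricter threshold of 10\%:
+
+```{r revision-summary-within-latest-sketch}
+#| echo: true
+#| eval: false
+
+# Require reports to be within 10% (rather than 20%) of the finalized value
+revision_summary(nchs_archive, mortality, within_latest = 0.1, print_inform = FALSE)
+```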
-## Latency in reporting - Finalized value attainment
+## Versioning analysis - backfill
* [**Question:**]{.primary} When is the [**finalized value**]{.primary} first attained for each date? Would we have access to any in real-time?
* How fast are the final values attained & what's the pattern for these times, if any?
@@ -1293,35 +1273,55 @@ check_when_finalized <- function(epi_archive, start_date = NULL, end_date = NULL
```{r check-when-finalized-run}
#| echo: false
-res <- check_when_finalized(mortality_archive, start_date = start_time, end_date = end_time)
+res <- check_when_finalized(nchs_archive,
+ start_date = min(nchs_archive$DT$time_value),
+ end_date = max(nchs_archive$DT$time_value))
+
head(res)
```
-And here's a numerical summary:
+Here is a look at its quantiles:
```{r summary-diff}
#| echo: false
summary(as.numeric(res$diff))
```
-* [**Conclusion**]{.primary}: Tends to take a long time & varies. Even for this relatively small time period... Goes as low as 84 days or as high as 294 days. Yikes.
-* So if we were doing this in real-time, then we wouldn't have access to the finalized data.
+* [**Conclusion**]{.primary}: The revision behavior is pretty long-tailed. The value reported 4 weeks later is reasonably close to the finalized value.
+
-## Comparison of final vs. multiple revisions
+## Revision pattern visualization
This shows the finalized rates in comparison to [**multiple revisions**]{.primary} to see how the data changes over time:
-```{r mortality-by-revision-date}
+```{r mortality-by-accessed-weeks-later}
#| echo: false
-# Visualize the mortality values for different revisions
-revision_dates <- seq(min(mortality_archive$DT$version), end_time, by = "1 week")
+ref_dates <- unique(nchs_archive$DT$time_value)
+offsets <- seq(1, 7) * 7 # look 1 to 7 weeks after each occurrence date
+max_version <- nchs_archive$versions_end
-# Create a data frame for each version and label them by revision date
-mortality_revisions <- map_dfr(revision_dates, function(date) {
- epix_as_of(mortality_archive, date) |>
- mutate(revision_date = date) # Add a column for the revision date
-})
+# For one occurrence date, pull the reported value as seen at several lags
+get_val_asof <- function(time_val, archive, offsets) {
+  as_of_dates <- pmin(time_val + offsets, max_version)
+  map(as_of_dates, function(x) {
+    archive |>
+      epix_as_of(x) |>
+      filter(time_value == time_val) |>
+      select(geo_value, time_value, mortality) |>
+      mutate(lag = x - time_val)
+  }) |>
+    list_rbind()
+}
-# Extract the latest/finalized version
-mortality_latest <- epix_as_of(mortality_archive, max(mortality_archive$DT$version))
+value_at_lags <- map(ref_dates, get_val_asof,
+                     archive = nchs_archive, offsets = offsets) |>
+  list_rbind()
+
+values_final <- epix_as_of(nchs_archive, nchs_archive$versions_end)
```
@@ -1330,327 +1330,503 @@ mortality_latest <- epix_as_of(mortality_archive, max(mortality_archive$DT$versi
#| fig-width: 9
#| fig-height: 4
#| out-height: "500px"
-ggplot() +
- geom_line(data = mortality_latest, aes(x = time_value, y = mortality), color = "black", size = 1) +
- geom_line(data = mortality_revisions, aes(x = time_value, y = mortality, color = as.factor(revision_date))) +
- geom_point(data = mortality_revisions, aes(x = time_value, y = mortality, color = as.factor(revision_date)), size = 2) +
- theme_gray() +
- labs(color = "Revision Date", y = "Mortality", x = "Date") +
- ggtitle("COVID-19 Mortality in CA: Finalized (black) vs Various Revisions")
+ggplot(value_at_lags, aes(x = time_value, y = mortality)) +
+ geom_line(aes(color = factor(lag))) +
+ facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
+ scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
+ labs(x = "", y = "Weekly new COVID deaths") +
+ # scale_color_viridis_d(option="D", end=0.8) +
+ theme(legend.position = "none") +
+ geom_line(data = values_final, aes(x = time_value, y = mortality),
+ inherit.aes = FALSE, color = "black")
+
```
-## Comparison of final vs. one revision
+## Revision pattern visualization
-The below figure compares the finalized rates (in black) to [**one revision**]{.primary} (in yellow) from March 3, 2024.
+```{r nchs-plot-different-ver}
+#| echo: false
+#| eval: true
+
+versions = seq.Date(as.Date("2021-01-01"), nchs_archive$versions_end, by = "1 month")
+nchs_snapshots = map(versions, function(v) {
+ epix_as_of(nchs_archive, v) |>
+ mutate(version = v)}) |>
+ bind_rows() |>
+ mutate(latest = version == nchs_archive$versions_end)
+```
-```{r one-revision-final-plot}
-#| out-height: "400px"
-as_of_date = as.Date("2024-03-03")
+```{r nchs-plot-val-different-ver}
+#| echo: false
+#| fig-width: 9
+#| fig-height: 4
+#| out-height: "500px"
+
+ggplot(nchs_snapshots |> filter(!latest),
+ aes(x = time_value, y = mortality)) +
+ geom_line(aes(color = factor(version))) +
+ geom_vline(aes(color = factor(version), xintercept = version), lty = 3) +
+ facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
+ scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
+  labs(x = "", y = "Weekly COVID deaths") +
+ scale_color_viridis_d(option = "B", end = .8) +
+ theme(legend.position = "none") +
+ geom_line(data = nchs_snapshots |> filter(latest),
+ aes(x = time_value, y = mortality),
+ inherit.aes = FALSE, color = "black")
+```
+
+## Aside: Do we need a burn-in/training set?
+
+* Typical stat-ML practice suggests a train/validation/test split.
+* But our exploratory analysis covered all available data. Is that fine?
-ggplot() +
- geom_line(data = mortality_latest,
- aes(x = time_value, y = mortality, color = "Finalized"),
- size = 1) +
- geom_line(data = mortality_revisions |> filter(revision_date == as_of_date),
- aes(x = time_value, y = mortality, color = "Revision"),
- size = 1) +
- ylab("Mortality") +
- xlab("Date") +
- scale_color_manual(values = c("Finalized" = "black", "Revision" = "#FFB300")) +
- labs(color = "Mortality Type")
+* Generally, for exploratory analysis, it is fine to skip the train/test split.
+  + These analyses do not involve model fitting, so there is little risk of an overly optimistic performance evaluation (no overfitting to test data).
+* However, for a [**pseudo-prospective analysis**]{.primary}, the best practice is to do a train/test split.
+  + In such an analysis, one fits and validates many models; a train/test split provides more rigorous control of overfitting to the test set.
+
+## Ratio nowcaster: jumping from provisional to finalized value
+
+* Recall, the goal of a nowcast at date $t$ is to project the [*finalized value*]{.primary} of $y_t,$ given the information available on date $t$.
+* A very simple nowcaster scales the provisional value by the estimated ratio between finalized and provisional values.
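+
+In symbols, the idea (a sketch): writing $y_s^{(t)}$ for the value at occurrence date $s$ as reported in version $t,$ the nowcast scales the latest provisional report by an estimated ratio $\hat{r}$:
+
+$$\hat{y}_s = y_s^{(t)} \cdot \hat{r}, \qquad \hat{r} \approx \frac{\text{finalized value}}{\text{provisional value}}.$$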
+
+
+
+How can we sensibly estimate this quantity?
+
+
+
+::: {.fragment .fade-in}
+
+* At nowcast date $t,$ we would have received reports with versions up to and including $t.$
+* We need to build training samples that
+  + correctly align finalized values against provisional values,
+  + use features that would have been available at test time,
+  + have enough data to ensure sensible estimation results.
+:::
+
+```{r nchs-ca-only}
+#| echo: false
+#| eval: true
+nchs_archive <- pub_covidcast(
+ source = "nchs-mortality",
+ signals = "deaths_covid_incidence_num",
+ geo_type = "state",
+ time_type = "week",
+ geo_values = "ca",
+ time_values = epirange(202001, 202440),
+ issues = "*"
+) |>
+ select(geo_value, time_value, version = issue, mortality = value) |>
+ as_epi_archive(compactify = TRUE)
```
-The real-time data is biased downwards (systematically below the true value). That is, the signal tends to get scaled up with future revisions.
-## Calculate one ratio: Provisional vs. finalized data
-
+## Ratio nowcaster: building training samples
-Suppose that the day is March 10, 2024. Then, because the data is 7 days latent, we can compute the ratio between provisional and finalized data for [**March 3, 2024**]{.primary}.
+* At nowcast date $t,$ we would have received reports with versions up to and including $t.$
+* We need to build training samples that
+  + correctly align finalized values against provisional values,
+  + use features that would have been available at test time,
+  + have enough samples to ensure sensible estimation results.
-```{r one-ratio-calc}
-#| echo: true
-as_of_date = as.Date("2024-03-10"); fixed_lag = 7
+::: {.fragment .fade-in}
+* Build training samples by treating dates prior to date $t$ as actual nowcast dates.
+  + What was the provisional value on that date?
+  + Have we received the finalized value for that date?
+:::
-# Load the finalized mortality data for CA
-ca_finalized <- mortality_latest |>
- filter(time_value == (as_of_date - fixed_lag)) |>
- dplyr::select(mortality)
+## Ratio nowcaster: building training samples
-# Load the provisional mortality data for the same week
-mortality_old = epix_as_of(mortality_archive, as_of_date)
+* At an earlier nowcast date $t_0,$ we define
+  + [**Provisional value**]{.primary}: the reported value of $Y_{s_0}$ with version $t_0,$ where $s_0$ is the largest occurrence date among all values reported up until $t_0.$
+  + [**Finalized value**]{.primary}: the (potentially unobserved) finalized value of $Y_{s_0}.$
+    - We only know in *hindsight* when the reported value of $Y_{s_0}$ is finalized -- we need an approximation.
-ca_provisional <- mortality_old |>
- filter(time_value == (as_of_date - fixed_lag)) |>
- dplyr::select(mortality)
+::: {.fragment .fade-in}
+```{r revision-summary-time-near-latest-show-again}
+#| echo: false
+#| eval: true
-# Calculate ratio between provisional and finalized cases for the week of interest
-ratio <- ca_provisional$mortality / ca_finalized$mortality
-ratio
+revision_data |> select(geo_value, time_value, time_near_latest) |> slice_sample(n = 5)
+quantile(revision_data$time_near_latest)
```
+:::
-[**Conclusion**]{.primary}: The real-time rate is well below the finalized for this time (26 vs 72 here).
+::: {.fragment .fade-in}
+We can say data reported 49 days after the occurrence date is good enough to be considered finalized.
+:::
-[**Question**]{.primary}: Can we generalize this over many days?
+## Ratio nowcaster: test-time feature
-## Calculating the ratio using multiple dates
-Let's move from calculating the ratio by using one day to multiple days with the goal to use it to nowcast for Feb. 18, which has a [**provisional value**]{.primary} of 23
-```{r provisional-val-feb18}
+* At test time $t,$ take the provisional value to be $Y_s,$ where $s$ is the largest occurrence date among all values reported up until time $t.$
+
+## Nowcasting at a single date: building training samples
+
+::: {.fragment .fade-in}
+* Searching for provisional values at previous hypothetical nowcast dates.
+
+```{r one-date-look-for-provisional}
#| echo: true
-as_of_date = as.Date("2024-02-25")
-provisional <- epix_as_of(mortality_archive, as_of_date) |>
- filter(time_value == as_of_date - 7) |>
- pull(mortality)
-provisional
+nowcast_date <- as.Date("2022-01-02"); window_length <- 180
+
+# First-reported (provisional) value for each occurrence date
+initial_data <- nchs_archive$DT |>
+  group_by(geo_value, time_value) |>
+  filter(version == min(version)) |>
+  rename(initial_val = mortality) |>
+  select(geo_value, time_value, initial_val)
+
```
-
-and a [**finalized value**]{.primary} of 104
+:::
+
-```{r finalized-val-feb18}
+::: {.fragment .fade-in}
+* Searching for finalized values at previous hypothetical nowcast dates.
+
+```{r one-date-look-for-final}
#| echo: true
-finalized <- mortality_latest |>
- filter(time_value == as_of_date - 7) |>
- pull(mortality)
-finalized
+
+# Values at least 49 days old (as of the nowcast date) are treated as finalized
+finalized_data <- epix_as_of(nchs_archive, nowcast_date) |>
+ filter(time_value >= nowcast_date - 49 - window_length & time_value <= nowcast_date - 49) |>
+ rename(finalized_val = mortality) |>
+ select(geo_value, time_value, finalized_val)
```
+:::
-## Calculating the ratio using multiple dates
-First, let's download the real-time rates for CA, and compare them to their finalized version.
-```{r real-time-mortality}
+::: {.fragment .fade-in}
+* After searching for both provisional and finalized values, we merge them together and estimate the ratio.
+```{r one-date-combine-provsional-and-final}
#| echo: true
-dates <- seq(start_time, (as_of_date - 7), by = "day")
-mortality_real_time <- function(date) {
- epix_as_of(mortality_archive, date + 7) |>
- filter(time_value == date)
-}
-mortality_real_time_df <- map_dfr(dates, mortality_real_time)
-head(mortality_real_time_df)
-```
-
-## Calculating the ratio using multiple dates
-Now, let's plot the real-time vs the finalized mortality rates:
-```{r real-time-vs-finalized}
-ggplot() +
- geom_line(data = mortality_latest |> filter(time_value <= (as_of_date - 7)),
- aes(x = time_value, y = mortality, color = "Finalized")) +
- geom_line(data = mortality_real_time_df,
- aes(x = time_value, y = mortality, color = "Real-Time")) +
- ylab("Mortality") +
- xlab("Date") +
- scale_color_manual(values = c("Finalized" = "black", "Real-Time" = "red")) +
- labs(color = "Mortality Type") # Adds the legend title
-
+
+ratio <- finalized_data |>
+ inner_join(initial_data, by = c("geo_value", "time_value")) |>
+ mutate(ratio = finalized_val / initial_val) |>
+ pull(ratio) |>
+ mean(na.rm = TRUE)
```
-* [**Takeaways**]{.primary}: The real-time counts are biased [**well below**]{.primary} the finalized counts.
-* Systematic underreporting tends to lessen over time (the gap between the lines decreases).
+:::
-## Realistic limitation of nowcasting - Finalized data
-* Recall that real-time access to finalized data is limited as finalized values can take months to report (e.g., Jan. 7 is finalized 294 days later).
-* To nowcast accurately, we must rely on the [**best available approximation of finalized data**]{.primary} at the time of estimation (Feb. 25).
+## Nowcasting at a single date: test feature and actual nowcast
-```{r finalized-data-as-of-feb25}
+```{r one-date-test-feat-and-nowcast}
#| echo: true
-mortality_as_of_feb25 <- epix_as_of(mortality_archive, as_of_date)
-head(mortality_as_of_feb25)
+
+last_avail <- epix_as_of(nchs_archive, nowcast_date) |>
+ slice_max(time_value) |>
+ pull(mortality)
+
+nowcast <- last_avail * ratio
+nowcast
```
-## Ratio calculation & summary
-We then use these "finalized" and real-time values to compute the mean ratio:
-```{r ratio-calc-summary}
+## Nowcasting for multiple dates
+
+* All previous manipulations should really be seen as a template for all nowcast dates.
+* The template computation should be applied over all nowcast dates, [**but we must respect data versioning**]{.primary}!
+* `epix_slide()` is designed just for this! It behaves similarly to `epi_slide()`.
+* Key exception: `epix_slide()` is version aware: the sliding computation at any reference time $t$ is performed on [**data that would have been available as of $t$**]{.primary}.
+
+
+
+## Nowcasting for multiple dates via `epix_slide()`
+
+We begin by templatizing our previous operations.
+
+```{r nowcaster-to-slide}
#| echo: true
-# exclude date we're nowcasting for
-mortality_real_time_df = mortality_real_time_df |> filter(time_value != "2024-02-18")
-mortality_as_of_feb25 = mortality_as_of_feb25 |> filter(time_value != "2024-02-18")
-ratio_real_time_to_feb25 <- mortality_real_time_df$mortality / mortality_as_of_feb25$mortality
-summary(ratio_real_time_to_feb25)
+
+nowcaster <- function(x, g, t, wl = 180, appx = 49) {
+
+  # First-reported (provisional) values within the training window
+  initial_data <- x$DT |>
+    group_by(geo_value, time_value) |>
+    filter(version == min(version)) |>
+    filter(time_value >= t - wl - appx & time_value <= t - appx) |>
+    rename(initial_val = mortality) |>
+    select(geo_value, time_value, initial_val)
+
+  # Latest available values, old enough (>= appx days) to treat as finalized
+  finalized_data <- x$DT |>
+    group_by(geo_value, time_value) |>
+    filter(version == max(version)) |>
+    filter(time_value >= t - wl - appx & time_value <= t - appx) |>
+    rename(finalized_val = mortality) |>
+    select(geo_value, time_value, finalized_val)
+
+  # Estimate the finalized-to-provisional ratio over the training window
+  ratio <- finalized_data |>
+    inner_join(initial_data, by = c("geo_value", "time_value")) |>
+    mutate(ratio = finalized_val / initial_val) |>
+    pull(ratio) |>
+    median(na.rm = TRUE)
+
+  # Most recent provisional value available as of t
+  last_avail <- epix_as_of(x, t) |>
+    slice_max(time_value) |>
+    pull(mortality)
+
+  tibble(geo_value = unique(x$DT$geo_value), target_date = t,
+         nowcast = last_avail * ratio)
+}
+
```
-On average, the real-time rates are ~25.7% of the finalized.
-```{r boxplot-ratio}
+## Nowcasting for multiple dates via `epix_slide()`
+
+```{r epix-slide-extract-nowcast-date}
#| echo: false
-ratio_df <- data.frame(ratio_real_time_to_feb25, mean_ratio = mean(ratio_real_time_to_feb25))
-
-# Create the boxplot with mean marked as a bold cross
-ggplot(ratio_df, aes(y = ratio_real_time_to_feb25)) +
- geom_boxplot(fill = "lightblue", color = "black") +
- geom_point(
- aes(x = 0, y = mean_ratio), # Place point at the mean
- shape = 4, # Cross shape
- size = 7, # Size of the cross
- color = "darkblue", # Color of the cross
- stroke = 2 # Boldness of the cross
- ) +
- labs(
- title = "Distribution of Real-Time to Finalized Mortality Ratios",
- y = "Real-Time to Finalized Ratio"
- ) +
- theme_minimal() + # Minimal theme for clean look
- theme(
- plot.title = element_text(hjust = 0.5), # Center the title
- axis.text.x = element_blank()
- ) +
- coord_cartesian(ylim = c(0.2, 0.3)) # Limit y-axis between 0 and 0.5 to zoom in
+#| eval: true
+
+all_nowcast_dates = nchs_archive$DT |>
+ filter(time_value >= as.Date("2022-01-01")) |>
+ distinct(time_value) |>
+ pull(time_value)
+```
+
+
+```{r nowcasts-slided}
+#| echo: true
+#| eval: true
+
+nowcasts <- nchs_archive |>
+  group_by(geo_value) |>
+  epix_slide(
+    nowcaster,
+    .before = Inf,
+    .versions = all_nowcast_dates,
+    .all_versions = TRUE
+  )
```
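+
+A quick sanity check (a sketch, left unevaluated):
+
+```{r nowcasts-peek}
+#| echo: true
+#| eval: false
+
+# One nowcast per geography and nowcast date
+head(nowcasts)
+```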
-Tells us the distribution is right-skewed (mean > median) and so we should opt for the median.
-## Nowcasting on Feb. 25
-* Since the [**median ratio**]{.primary} between real-time and finalized values is [**0.250**]{.primary} (i.e., real-time values are typically 25% of the finalized), then the nowcast is
-```{r nowcast-feb25-point}
+## Details of `epix_slide()`
+
+
+```{r epix-slide-demo-call-allversions-FALSE}
#| echo: true
-# Now we can nowcast properly:
-nowcast <- provisional *
- 1 / median(ratio_real_time_to_feb25)
-nowcast
+#| eval: false
+
+epix_slide(
+ .x,
+ .f,
+ ...,
+ .before = Inf,
+ .versions = NULL,
+ .new_col_name = NULL,
+ .all_versions = FALSE
+)
```
-* To get the accompanying 95% prediction interval, calculate the 2.5th and 97.5th percentiles:
-```{r nowcast-feb25-lower}
+* `.f` in `epix_slide()` can be specified with the same form of custom function as `epi_slide()`.
+
+```{r epix-slide-form-of-custom-function}
#| echo: true
-percentile_97.5 <- quantile(ratio_real_time_to_feb25, 0.975) |> unname()
+#| eval: false
-(lower_PI <- provisional * 1 / percentile_97.5)
+function(x, g, t) {
+ # function body
+}
```
-```{r nowcast-feb25-upper}
+* The mandatory arguments of `.f` take different forms depending on the value of `.all_versions`.
+
+
+## Details of `epix_slide()`
+
+```{r epix-slide-demo-call-allversions-FALSE-again}
#| echo: true
-percentile_2.5 <- quantile(ratio_real_time_to_feb25, 0.025) |> unname()
-(upper_PI <- provisional * 1 / percentile_2.5)
+#| eval: false
+#| code-line-numbers: "|8"
+
+epix_slide(
+ .x,
+ .f,
+ ...,
+ .before = Inf,
+ .versions = NULL,
+ .new_col_name = NULL,
+ .all_versions = FALSE
+)
```
-* So, the [**nowcast is 92**]{.primary} with 95% PI: [61, 140], which is much closer to the [**finalized value of 104**]{.primary} than the [**provisional value of 23**]{.primary}.
+::: {.fragment .fade-in}
+* When `.all_versions = FALSE`, `epix_slide()` essentially iterates the templatized computation over snapshots.
+* Said differently, when `.all_versions = FALSE`, data accessed at any sliding iteration [**only involves a single version**]{.primary}.
+:::
+
+::: {.fragment .fade-in}
+* Hence:
+  + `x`: an `epi_df` with the same column names as the archive's `DT`, minus the `version` column.
+  + `g`: a one-row tibble containing the values of the grouping variables for the associated group.
+  + `t`: the `.ref_time_value` of the current window.
+ + `...`: additional arguments.
+:::
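+
+A minimal sketch (hypothetical example, reusing `all_nowcast_dates` from earlier): each iteration sees a plain snapshot, so ordinary `epi_df` operations apply.
+
+```{r epix-slide-snapshot-sketch}
+#| echo: true
+#| eval: false
+
+# `x` is the snapshot epi_df as of each version in .versions
+snapshot_peaks <- nchs_archive |>
+  epix_slide(
+    function(x, g, t) tibble(peak_mortality = max(x$mortality, na.rm = TRUE)),
+    .versions = all_nowcast_dates
+  )
+```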
+
+## Details of `epix_slide()`
-## Summary of three main steps
-So the main steps for this type of fixed lag nowcasting are...
+```{r epix-slide-demo-call-allversions-TRUE}
+#| echo: true
+#| eval: false
+#| code-line-numbers: "|8"
-1. Obtain the [**provisional value**]{.primary} for the target.
+epix_slide(
+ .x,
+ .f,
+ ...,
+ .before = Inf,
+ .versions = NULL,
+ .new_col_name = NULL,
+ .all_versions = TRUE
+)
+```
-2. Estimate the ratio using the [**real-time**]{.primary} and [**"finalized"**]{.primary} data (for all previous dates that follow a consistent pattern in reporting).
+::: {.fragment .fade-in}
+* When `.all_versions = TRUE`, data accessed at any sliding iteration involves versions [**up to and including `.ref_time_value`**]{.primary}.
+:::
+
+::: {.fragment .fade-in}
+* Hence:
+  + `x`: an `epi_archive`, with versions up to and including `.ref_time_value`.
+  + `g`: a one-row tibble containing the values of the grouping variables for the associated group.
+  + `t`: the `.ref_time_value` of the current window.
+ + `...`: additional arguments.
+:::
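+
+Analogously (a hypothetical sketch): each iteration receives a filtered `epi_archive`, so we can inspect all of its versions.
+
+```{r epix-slide-archive-sketch}
+#| echo: true
+#| eval: false
+
+# `x` is an epi_archive with versions up to each .ref_time_value
+versions_seen <- nchs_archive |>
+  epix_slide(
+    function(x, g, t) tibble(n_versions = length(unique(x$DT$version))),
+    .versions = all_nowcast_dates,
+    .all_versions = TRUE
+  )
+```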
-3. Profit.
-```{r ratio-summary-steps}
+## Details of `epix_slide()`
+
+```{r nowcasts-slide-demo-only}
#| echo: true
#| eval: false
-#| code-fold: true
-#| code-summary: "Expand for the accompanying code"
-# Today
-as_of_date = as.Date("2024-02-25")
+#| code-line-numbers: "|7,13,17"
-# 1. Obtain the provisional value
-provisional <- epix_as_of(mortality_archive, as_of_date) |>
- filter(time_value == as_of_date - 7) |>
- pull(mortality)
-provisional
+nowcasts <- nchs_archive |>
+  group_by(geo_value) |>
+  epix_slide(
+    nowcaster,
+    .before = Inf,
+    .versions = all_nowcast_dates,
+    .all_versions = TRUE
+  )
+
+nowcaster <- function(x, g, t, wl = 180, appx = 49) {
+
-# 2. Estimate the ratio
-mortality_real_time_df <- map_dfr(dates, mortality_real_time) |> filter(time_value != "2024-02-18") # Real-time
-mortality_as_of_feb25 <- epix_as_of(mortality_archive, as_of_date) |> filter(time_value != "2024-02-18") # "Finalized"
+ initial_data = x$DT |>
+ # Function body, omitted
-ratio_real_time_to_feb25 <- mortality_real_time_df$mortality / mortality_as_of_feb25$mortality
-# 3. Profit.
-(nowcast <- provisional * 1 / median(ratio_real_time_to_feb25))
+ finalized_data = x$DT |>
+ # Function body, omitted
+
+}
-(upper_PI <- provisional * 1 / quantile(ratio_real_time_to_feb25, 0.025))
-(lower_PI <- provisional * 1 / quantile(ratio_real_time_to_feb25, 0.975))
```
-## Nowcasting mortality for multiple dates
-* [**Define Nowcast Function**]{.primary}:
- * [**Input**]{.primary}: Takes in the dates to nowcast and the fixed lag
- * [**Output**]{.primary}: The nowcasted mortality rates based on the ratio of real-time to finalized data.
-```{r nowcasting-multipl-dates-fun}
-#| echo: true
-#| code-fold: true
-nowcast_function <- function(nowcast_date, fixed_lag) {
- as_of_date = nowcast_date + fixed_lag
+
+## Visualize nowcasts
+
+
+We are now finally able to compare nowcasts against the first available reports:
+
+```{r nowcast-subsetting}
+#| echo: false
+
+
+initial_val_extracter <- function(x, g, t, wl = 180, appx = 49) {
- # 1. Obtain the provisional value for the target.
- provisional <- epix_as_of(mortality_archive, as_of_date) |>
- filter(time_value == as_of_date - fixed_lag) |>
+  # Most recent provisional value available as of t
+  last_avail <- epix_as_of(x, t) |>
+ slice_max(time_value) |>
pull(mortality)
- #2. Estimate the ratio multiplier using
- # real-time
- dates_seq <- seq(start_time, (nowcast_date - fixed_lag), by = "week")
- mortality_real_time <- map_dfr(dates_seq, mortality_real_time)
+  res <- tibble(geo_value = unique(x$DT$geo_value), target_date = t, value = last_avail)
- # and "finalized" data
- finalized <- epix_as_of(mortality_archive, as_of_date) |> filter(time_value >= start_time & time_value <= (nowcast_date - fixed_lag))
+ return(res)
- ratios <- mortality_real_time$mortality / finalized$mortality
-
- # Remove infinite or NaN ratios (i.e., keep only finite values)
- median_ratio <- median(ratios[is.finite(ratios)])
-
- #3. Profit.
- nowcast <- provisional * (1 / median_ratio)
- upper_PI <- provisional * (1 / quantile(ratios[is.finite(ratios)], 0.025))
- lower_PI <- provisional * (1 / quantile(ratios[is.finite(ratios)], 0.975))
-
- # Return a dataframe with the nowcast and date
- tibble(
- time_value = nowcast_date,
- nowcast = nowcast,
- lower_PI = lower_PI,
- upper_PI = upper_PI
- )
}
+
+
+provisional_val <- epix_slide(nchs_archive, initial_val_extracter,
+                              .versions = all_nowcast_dates, .all_versions = TRUE)
+
+finalized_val = nchs_archive$DT |>
+ filter(time_value >= as.Date("2022-01-01")) |>
+ group_by(geo_value, time_value) |>
+ filter(version == max(version))
+
```
-## Map nowcast over multiple dates
-* We can use `map2()` to apply the function to a series of weeks (e.g., Jan. 28 to Mar. 24).
-* Returns a [**dataframe**]{.primary} with nowcasted results.
-```{r map-nowcast-fun-multiple-dates}
-#| echo: true
-# Apply Nowcast Function Over Multiple Dates
-nowcast_dates <- seq(as.Date("2024-01-28"), as.Date("2024-03-24"), by = "week")
-fixed_lag <- 7
-nowcast_results_df <- map2(nowcast_dates, fixed_lag, nowcast_function) |> list_rbind()
+```{r nowcast-fun-plot-results}
+#| echo: false
+
+ggplot() +
+ geom_line(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized")) +
+ geom_point(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) +
+ geom_line(data = provisional_val, aes(x = target_date, y = value, color = "Provisional")) +
+ geom_point(data = provisional_val, aes(x = target_date, y = value, color = "Provisional"), shape = 16) +
+ geom_line(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast")) +
+ geom_point(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast"), shape = 16) +
+ ylab("Mortality") +
+ xlab("Date") +
+ scale_color_manual(name = "",
+ values = c("Nowcast" = "#71c5e8", "Provisional" = "#FF6900", "Finalized" = "black")) +
+ theme(legend.position = "bottom")
```
-Let's smooth with a rolling trailing mean (window size 4) & see the results:
+* The real-time counts tend to be biased below the finalized counts. Nowcasted values tend to provide a much better approximation of the truth (at least for these dates).
+
+
+## Smoothing nowcasts
+
+* Nowcasts are quite volatile, reflecting that the provisional counts are far from complete.
+* We can use a trailing average to smooth them.
+
+```{r smooth-nowcasts-epi-slide}
+#| echo: true
+
+smoothed_nowcasts <- epi_slide(
+ nowcasts |> as_epi_df(),
+ smoothed_nowcasts = mean(nowcast, na.rm = TRUE),
+ .window_size = as.difftime(3, units = "weeks")
+)
-```{r smooth-print-res}
-# Smooth results: Apply rolling median to nowcast and bounds with a window size of 4
-library(zoo)
-nowcast_results_df <- nowcast_results_df |>
- mutate(across(.cols = -time_value,
- .fns = ~ zoo::rollapply(.x, width = 4, FUN = mean, partial = TRUE, align = "right"),
- .names = "{.col}"))
-nowcast_results_df
```
-## Visualize nowcast, real-time, and finalized values
-Finally, we can compare these nowcast results to the real-time and finalized values:
-```{r nowcast-fun-plot-results}
+```{r nowcast-smoothed-vis}
#| echo: false
+
ggplot() +
- geom_line(data = nowcast_results_df, aes(x = time_value, y = nowcast, color = "Nowcast")) +
- geom_point(data = nowcast_results_df, aes(x = time_value, y = nowcast, color = "Nowcast"), shape = 16) +
- geom_line(data = map_dfr(nowcast_dates, mortality_real_time), aes(x = time_value, y = mortality, color = "Real-Time")) +
- geom_point(data = map_dfr(nowcast_dates, mortality_real_time), aes(x = time_value, y = mortality, color = "Real-Time"), shape = 16) +
- geom_line(data = mortality_latest |> filter(time_value %in% nowcast_dates), aes(x = time_value, y = mortality, color = "Finalized")) +
- geom_point(data = mortality_latest |> filter(time_value %in% nowcast_dates), aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) +
- geom_ribbon(data = nowcast_results_df,
- aes(x = time_value, ymin = lower_PI, ymax = upper_PI),
- alpha = 0.2, fill = "blue") +
+ geom_line(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized")) +
+ geom_point(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) +
+ geom_line(data = smoothed_nowcasts, aes(x = time_value, y = smoothed_nowcasts, color = "Smoothed")) +
+ geom_point(data = smoothed_nowcasts, aes(x = time_value, y = smoothed_nowcasts, color = "Smoothed"), shape = 16) +
+ geom_line(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast")) +
+ geom_point(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast"), shape = 16) +
ylab("Mortality") +
xlab("Date") +
- scale_color_manual(values = c("Nowcast" = "blue", "Real-Time" = "red", "Finalized" = "black")) +
- labs(color = "Mortality Type") # Adds the legend title
+ scale_color_manual(name = "", values = c("Nowcast" = "#71c5e8", "Smoothed" = "#97D700", "Finalized" = "black")) +
+ theme(legend.position = "bottom")
```
-The real-time counts tend to be biased below the finalized counts. Nowcasted values tend to provide a much better approximation of the truth (at least for these dates).
+
+
## Evaluation using MAE
@@ -1658,7 +1834,6 @@ The real-time counts tend to be biased below the finalized counts. Nowcasted val
* Then, for $y_{t}$ over times $t = 1, \dots, N$, we may compute error metrics like mean absolute error (MAE).
-
* MAE measures the average absolute difference between the nowcast and finalized values.
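+
+Concretely, with finalized values $y_t$ and nowcasts $\hat{y}_t$ over $t = 1, \dots, N$:
+
+$$\mathrm{MAE} = \frac{1}{N} \sum_{t=1}^{N} \left| \hat{y}_t - y_t \right|$$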
@@ -1672,50 +1847,33 @@ Let's numerically evaluate our point nowcasts for the provisional values of a ti
+
```{r mae-code}
-#| echo: true
+#| echo: false
# Step 1: Join the mortality data with nowcast data
-mae_data <- mortality_latest |>
- filter(time_value %in% nowcast_dates) |>
- left_join(nowcast_results_df, by = "time_value") |>
- left_join(map_dfr(nowcast_dates, mortality_real_time) |> rename(real_time = mortality), by = c("geo_value", "time_value"))
+mae_data <- finalized_val |>
+ select(-version) |>
+ inner_join(nowcasts |> select(-version), by = join_by("geo_value", "time_value" == "target_date")) |>
+ inner_join(initial_data, by = c("geo_value", "time_value")) |>
+ inner_join(smoothed_nowcasts |> select(-version, -nowcast), by = c("geo_value", "time_value"))
# Step 2: Calculate the absolute error between actual and nowcasted values
mae_data <- mae_data |>
- mutate(nc_abs_error = abs(mortality - nowcast),
- rt_abs_error = abs(mortality - real_time))
+ mutate(raw_nc_res = abs(mortality - nowcast),
+ prov_res = abs(mortality - initial_val),
+ smoothed_res = abs(mortality - smoothed_nowcasts))
# Step 3: Compute the MAE (mean of absolute errors)
mae_value <- mae_data |>
- summarise(nc_MAE = mean(nc_abs_error),
- rt_MAE = mean(rt_abs_error))
-knitr::kable(mae_value)
-```
-
-## Evaluation using MAE
-
-Finally, we may visualize the distribution of errors across time:
-```{r abs-error-plot}
-ggplot(mae_data) +
- # Nowcast Absolute Error
- geom_point(aes(x = time_value, y = nc_abs_error), color = "blue", size = 2) +
- geom_line(aes(x = time_value, y = nc_abs_error), color = "blue") +
- # Horizontal line for Nowcast Mean Absolute Error
- geom_hline(aes(yintercept = mean(nc_abs_error), color = "Nowcast MAE"), linetype = "dashed") +
-
- # Real-Time Absolute Error
- geom_point(aes(x = time_value, y = rt_abs_error), color = "red", size = 2) +
- geom_line(aes(x = time_value, y = rt_abs_error), color = "red") +
- # Horizontal line for Real-Time Mean Absolute Error
- geom_hline(aes(yintercept = mean(rt_abs_error), color = "Real-Time MAE"), linetype = "dashed") +
+ group_by(geo_value) |>
+ summarise(`Smoothed MAE` = mean(smoothed_res, na.rm = TRUE),
+ `Unsmoothed nowcast MAE` = mean(raw_nc_res, na.rm = TRUE),
+ `Provisional value MAE` = mean(prov_res, na.rm = TRUE)) |>
+ select(-geo_value)
- # Customize the x and y axes labels, legend & add title
- xlab("Time") + ylab("Absolute Error") +
- scale_color_manual(values = c("Real-Time MAE" = "red", "Nowcast MAE" = "blue")) +
- labs(color = "Mean Absolute Error")
+knitr::kable(mae_value)
```
-We can see that the absolute errors are almost always lower for nowcasting.
# Nowcasting with Two Variables
@@ -1983,8 +2141,7 @@ So visualizing can provide an important perspective that is missed from a simple
* [**Sliding Window Approach**]{.primary}: Predictions are based on [data up to the current time]{.primary}, ensuring no future information influences the nowcast.
* [**Evaluation**]{.primary}: Predictions are compared with actual admissions using numerical and visual perspectives.
-
-# Case Study - Nowcasting Cases Using %CLI
+# Case Study - Nowcasting Cases Using %CLI
## Goal of this case study
@@ -1997,7 +2154,7 @@ So visualizing can provide an important perspective that is missed from a simple
## Summary of main steps
-The workflow is similar to the previous example where we nowcasted using two variables, only more involved.
+The workflow is similar to the previous example where we nowcasted using two variables, only more involved.
The main steps are...
1. [**Fetch Data**]{.primary}: Retrieve %CLI and COVID-19 case data (by specimen collection date) for MA.
@@ -2006,7 +2163,7 @@ The main steps are...
3. [**Model & Prediction**]{.primary}: Fit a linear model to predict cases based on %CLI, trained on a 30-day rolling window.
-4. [**Nowcast Execution**]{.primary}: Use `epix_slide()` to nowcast the cases dynamically.
+4. [**Nowcast Execution**]{.primary}: Use `epix_slide()` to nowcast the cases dynamically.
5. [**Visualization**]{.primary}: Plot actual vs. nowcasted cases with confidence intervals to assess model accuracy.
@@ -2044,11 +2201,11 @@ Brief summary of this data:
## Main steps to construct the `epi_archive`
1. [**Load necessary Libraries**]{.primary}: Such as `tidyverse`, `readxl`, `epiprocess`.
-2. [**Process Each Date's Data**]{.primary}:
+2. [**Process Each Date's Data**]{.primary}:
* A function we'll make (`process_covid_data`) downloads and processes daily COVID-19 data from the MA gov Excel files on their website.
* The data is cleaned and formatted with columns: `geo_value`, `time_value`, `version`, and values.
3. [**Handle Missing Data**]{.primary}: Checks if a date's data is available (handle 404 errors).
-4. [**Create `epi_archive`**]{.primary}:
+4. [**Create `epi_archive`**]{.primary}:
* Combine processed data into a tibble.
* Convert the tibble to an `epi_archive` object using `as_epi_archive()`.
@@ -2066,25 +2223,25 @@ library(epiprocess)
# Function to download and process each Excel file for a given date
process_covid_data <- function(Date) {
# Generate the URL for the given date
- url <- paste0("https://www.mass.gov/doc/covid-19-raw-data-", tolower(gsub("-0", "-", format(Date, "%B-%d-%Y"))), "/download")
+ url <- paste0("https://www.mass.gov/doc/covid-19-raw-data-", tolower(gsub("-0", "-", format(Date, "%B-%d-%Y"))), "/download")
# Applies gsub("-0", "-", ...) to replace any occurrence of -0 (such as in "April-01") with just - (resulting in "April-1").
-
+
# Check if the URL exists (handle the 404 error by skipping that date)
response <- GET(url)
-
+
if (status_code(response) != 200) {
return(NULL) # Skip if URL doesn't exist (404)
}
-
+
# Define the destination file path for the Excel file
file_path <- tempfile(fileext = ".xlsx")
-
+
# Download the Excel file
GET(url, write_disk(file_path, overwrite = TRUE))
-
+
# Read the relevant sheet from the Excel file
data <- read_excel(file_path, sheet = "CasesByDate (Test Date)")
-
+
# Process the data: rename columns and convert Date
data <- data |>
rename(
@@ -2094,7 +2251,7 @@ process_covid_data <- function(Date) {
Case_Average_7day = `7-day confirmed case average`
) |>
mutate(Date = as.Date(Date)) # Convert to Date class
-
+
# Create a tibble with the required columns for the epi_archive
tib <- tibble(
geo_value = "ma", # Massachusetts (geo_value)
@@ -2102,12 +2259,12 @@ process_covid_data <- function(Date) {
version = Date, # The extracted version date
case_rate_7d_av = data$Case_Average_7day # 7-day average case value
)
-
+
return(tib)
}
```
-## Fetch Data - Code breakdown
+## Fetch Data - Code breakdown
* The purpose of this function is to download and process each Excel file as of a date.
* [**URL Creation**]{.primary}: Dynamically generates the URL based on the date, removing leading zeros in day values (e.g., "April-01" → "April-1").
@@ -2129,20 +2286,20 @@ process_covid_data <- function(Date) {
process_data_for_date_range <- function(start_date, end_date) {
# Generate a sequence of dates between start_date and end_date
date_sequence <- seq(as.Date(start_date), as.Date(end_date), by = "day")
-
+
# Process data for each date and combine results
covid_data_list <- lapply(date_sequence, function(Date) {
process_covid_data(Date) # Skip over dates with no data (NULLs will be ignored)
})
-
+
# Combine all non-null individual tibbles into one data frame
combined_data <- bind_rows(covid_data_list[!sapply(covid_data_list, is.null)])
-
+
# Convert the combined data into an epi_archive object
if (nrow(combined_data) > 0) {
epi_archive_data <- combined_data |>
as_epi_archive(compactify = FALSE)
-
+
return(epi_archive_data)
} else {
message("No valid data available for the given date range.")
@@ -2174,7 +2331,7 @@ y <- process_data_for_date_range("2021-01-10", "2021-12-01") # Raw .xlsx data i
y
```
-* Alternatively, you may run the following to load `y` that was previously saved as an RDS file:
+* Alternatively, you may run the following to load `y` that was previously saved as an RDS file:
```{r load-ma-case-data}
#| echo: true
#| eval: FALSE
@@ -2191,7 +2348,7 @@ y <- readRDS("_data/ma_case_archive.rds")
```{r fetch-outpatient-cli}
#| echo: true
-# Step 1: Fetch Versioned Data
+# Step 1: Fetch Versioned Data
x <- pub_covidcast(
source = "doctor-visits",
signals = "smoothed_adj_cli",
@@ -2237,15 +2394,15 @@ lm_mod_pred <- function(data, ...) {
# Make predictions
predictions = predict(model,
newdata = data |>
- fill(percent_cli, .direction = "down") |>
+ fill(percent_cli, .direction = "down") |>
filter(time_value == max(time_value)),
interval = "prediction", level = 0.9)
-
+
# Pull off real-time value for later comparison to the nowcast value
real_time_val = data |> filter(time_value == max(time_value)) |> pull(case_rate_7d_av)
-
+
# Could clip predictions and bounds at 0
- return(data.frame(predictions, actual_nowcast_date = max(data$time_value), real_time_val = real_time_val))
+ return(data.frame(predictions, actual_nowcast_date = max(data$time_value), real_time_val = real_time_val))
}
```
@@ -2258,8 +2415,8 @@ lm_mod_pred <- function(data, ...) {
```{r nowcasting-epix-slide-cases}
#| echo: true
# Define the reference time points (to give the training/test split)
-targeted_nowcast_dates <- seq(as.Date("2021-04-01"), as.Date("2021-11-01"), by = "1 month")
-ref_time_values = targeted_nowcast_dates + 1 # + 1 because the case data is 1 day latent.
+targeted_nowcast_dates <- seq(as.Date("2021-04-01"), as.Date("2021-11-01"), by = "1 month")
+ref_time_values = targeted_nowcast_dates + 1 # + 1 because the case data is 1 day latent.
# Determine this from revision_summary(y)
# Use epix_slide to perform the nowcasting with a training-test split
@@ -2285,7 +2442,7 @@ Merge the nowcast results with the latest data for more direct comparison:
```{r join-with-actual-cases}
#| echo: true
x_latest <- epix_as_of(archive, max(archive$DT$version)) |>
- select(-percent_cli)
+ select(-percent_cli)
res <- nowcast_res |> left_join(x_latest, by = join_by(geo_value, time_value))
@@ -2314,7 +2471,7 @@ ggplot(res, aes(x = time_value)) +
# Adjust colors
scale_color_manual(values = c("Finalized Cases (7-dav)" = "black",
"Nowcast" = "#1f78b4",
- "Real Time" = "darkred")) +
+ "Real Time" = "darkred")) +
scale_fill_manual(values = c("Pred. Interval" = "#a6cee3")) + # Light blue
# Improve the theme
theme_minimal() +
@@ -2334,12 +2491,12 @@ ggplot(res, aes(x = time_value)) +
```{r mae-code-cases}
#| echo: true
# Calculate the absolute error between actual and nowcasted COVID-19 cases
-mae_data_cases <- res |>
+mae_data_cases <- res |>
mutate(nc_abs_error = abs(case_rate_7d_av - fit), # Nowcast vs Finalized cases (7-day average)
rt_abs_error = abs(case_rate_7d_av - real_time_val)) # Real-Time vs Finalized cases
# Compute the MAE (mean of absolute errors)
-mae_value_cases <- mae_data_cases |>
+mae_value_cases <- mae_data_cases |>
summarise(nc_MAE = mean(nc_abs_error),
rt_MAE = mean(rt_abs_error))
knitr::kable(mae_value_cases)
@@ -2351,18 +2508,18 @@ knitr::kable(mae_value_cases)
```{r abs-error-plot-cases}
# Plot the absolute errors for Nowcast and Real-Time vs Finalized COVID-19 cases
ggplot(mae_data_cases) +
- # Nowcast Absolute Error
+ # Nowcast Absolute Error
geom_point(aes(x = time_value, y = nc_abs_error), color = "#1f78b4", size = 2) +
geom_line(aes(x = time_value, y = nc_abs_error), color = "#1f78b4") +
# Horizontal line for Nowcast Mean Absolute Error
geom_hline(aes(yintercept = mean(nc_abs_error), color = "Nowcast MAE"), linetype = "dashed") +
-
- # Real-Time Absolute Error
+
+ # Real-Time Absolute Error
geom_point(aes(x = time_value, y = rt_abs_error), color = "darkred", size = 2) +
geom_line(aes(x = time_value, y = rt_abs_error), color = "darkred") +
# Horizontal line for Real-Time Mean Absolute Error
geom_hline(aes(yintercept = mean(rt_abs_error), color = "Real-Time MAE"), linetype = "dashed") +
-
+
# Customize the x and y axes labels, legend & add title
xlab("Date") + ylab("Absolute Error") +
scale_color_manual(values = c("Real-Time MAE" = "darkred", "Nowcast MAE" = "#1f78b4")) +
@@ -2392,3 +2549,38 @@ Main Steps:
Overall, nowcasting, based on the linear model, provided a closer approximation of true cases compared to the real-time values.
+
+
+## Aside on nowcasting
+
+* To some Epis, "nowcasting" can be equated with "estimate the time-varying instantaneous reproduction number, $R_t$"
+
+* Ex. using the number of reported COVID-19 cases in British Columbia between Jan. 2020 and Apr. 15, 2023.
+
+
+```{r rtestim}
+#| fig-width: 9
+#| fig-height: 3
+#| out-height: "400px"
+#| label: nowcasting
+library(rtestim)
+source(here::here("_code/bccovid.R"))
+
+p1 <- bccovid |>
+ ggplot(aes(date, cases)) +
+ geom_line(colour = primary) +
+ geom_vline(xintercept = ymd("2023-04-15"), colour = secondary,
+ linewidth = 2) +
+ labs(y = "BC Covid-19 cases", x = "Date") +
+ scale_y_continuous(expand = expansion(c(0, NA)))
+bc_rt <- estimate_rt(bccovid$cases, x = bccovid$date,
+ lambda = c(1e6, 1e5))
+p2 <- plot(confband(bc_rt, lambda = 1e5)) +
+ coord_cartesian(ylim = c(0.5, 2)) +
+ scale_y_continuous(expand = expansion(0))
+cowplot::plot_grid(p1, p2)
+```
+
+* Our group built [`{rtestim}`](https://dajmcdon.github.io/rtestim) to do this nonparametrically.
+
+* We may come back to this later...
\ No newline at end of file