From 7997c6c367323f541047e1c39fa40de63eef0e54 Mon Sep 17 00:00:00 2001 From: XuedaShen <110584221+XuedaShen@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:36:06 -0800 Subject: [PATCH 1/2] `epi_slide()` intro Moving over minor edits iterated with @dajmcdon on a previous PR. This one is clean: only commited the qmd file. --- slides/day1-afternoon.qmd | 1189 +++++++++++++++++++------------------ 1 file changed, 599 insertions(+), 590 deletions(-) diff --git a/slides/day1-afternoon.qmd b/slides/day1-afternoon.qmd index 37c7133..24b8641 100644 --- a/slides/day1-afternoon.qmd +++ b/slides/day1-afternoon.qmd @@ -27,6 +27,8 @@ theme_set(theme_bw()) 1. Basic Nowcasting using `{epiprocess}` +1. Nowcasting with One Variable + 1. Nowcasting with Two Variables 1. Case Study - Nowcasting Cases Using %CLI @@ -694,6 +696,7 @@ attr(edf, "metadata") 1. Correlating signals across location or time 1. Computing growth rates 1. Detecting and removing outliers +1. Calculating summaries with rolling windows 1. Dealing with revisions ## Features - Correlations at different lags @@ -818,7 +821,7 @@ edfg |> ```{r outlier-ex} #| echo: true #| message: false -edfo <- filter(edf, geo_value %in% c("ut", "ca")) |> +edfo <- filter(edf, geo_value %in% c("ca", "ut")) |> select(geo_value, time_value, case_rate) |> as_epi_df() |> group_by(geo_value) |> @@ -862,6 +865,127 @@ edfo |> ``` +## Features -- sliding a computation on an `epi_df` + +* It is often useful to compute rolling summaries of signals. + +* These depend on the reference time, and are computed separately over geographies (and other groups). + +* For example, a trailing average can smooth out daily variation. + +* In `epiprocess`, this is achieved by `epi_slide()`. + +```{r epi-slide-example-call} +#| echo: true +#| eval: false + +epi_slide( + .x, + .f, + ..., + .window_size = NULL, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .new_col_name = NULL, + .all_rows = FALSE +) +``` + +For example, we can use `epi_slide()` to compute a trailing 7-day average. + +## Features -- sliding a computation on an `epi_df` + +* The simplest way to use `epi_slide` is tidy evaluation. + +* For a grouped `epi_df`, `epi_slide()` applies the computation to groups [separately]{.primary}. + +```{r, grouped-df-to-slide} +#| echo: false + +cases_edf <- cases_df |> + group_by(geo_value) |> + as_epi_df() + +``` + +```{r epi-slide-tidy-eval} +#| echo: true + +cases_7dav <- epi_slide( + .x = cases_edf, + cases_7dav = mean(raw_cases, na.rm = TRUE), + .window_size = 7, + .align = "right" +) +``` + + +```{r epi-slide-tidy-eval-display-result} +#| echo: false + +cases_7dav |> + arrange(geo_value, time_value) |> + group_by(geo_value) |> + slice_min(time_value, n = 2) |> + as_tibble() |> + print() + +``` + + + +## Features -- sliding a computation on an `epi_df` + +`epi_slide` also accepts custom functions of a certain form. + +```{r epi-slide-custom-fucntion-mandatory-form} +#| eval: false +#| echo: true + +custom_function <- function(x, g, t, ...) { + + # Function body + +} +``` + +* `x`: the data frame with all the columns with original object [except]{.primary} groupping vars. +* `g`: the one-row tibble with values of gropping vars of the given group. +* `t`: the `.ref_time_value` of the current window. +* `...`: additional arguments. + + +## Features -- sliding a computation on an `epi_df` + +```{r slide-apply-custom-function} +#| echo: true +#| code-line-numbers: "|9-17" + +mean_by_hand <- function(x, g, t, ...) 
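  # x: data in the current window (grouping columns removed);
  # g: one-row tibble of grouping values; t: the window's .ref_time_value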
{ + data.frame(cases_7dav = mean(x$raw_cases, na.rm = TRUE)) +} + +cases_mean_custom_f = epi_slide( + .x = cases_edf, + .f = mean_by_hand, + .window_size = 7, + .align = "right" +) + +``` + + + +```{r epi-slide-custom-function-display-result} +cases_mean_custom_f |> + arrange(geo_value, time_value) |> + slice_min(time_value, n = 2) |> + as_tibble() |> + print() +``` + + + ## `epi_archive`: Collection of `epi_df`s * full version history of a data set @@ -873,41 +997,117 @@ edfo |> Epidemiology data gets revised frequently. -* We may want to use the data [as it looked in the past]{.primary} +* We may want to use the data [as it looked in the past].{.primary} * or we may want to examine [the history of revisions]{.primary}. ::: ## `epi_archive`: Collection of `epi_df`s -Subset of daily COVID-19 doctor visits (Optum) and cases (JHU CSSE) from 6 states in `archive` format: +Subset of daily COVID-19 doctor visits (Optum) and cases (JHU CSSE) from all U.S. states in `archive` format: ```{r archive-ex} #| echo: true -archive_cases_dv_subset +archive_cases_dv_subset_all_states |> head() ``` + +## Features -- sliding computation over `epi_df`s + +* We can apply a computation over different snapshots in an `epi_archive`. + +```{r} +#| echo: true +#| eval: false + +epix_slide( + .x, + .f, + ..., + .before = Inf, + .versions = NULL, + .new_col_name = NULL, + .all_versions = FALSE +) + +``` + +This functionality is very helpful in version aware forecasting. We will return with a concrete example. -## Revision patterns + +## Features -- summarize revision behavior + +* `revision_summary()` is a helper function that summarizes revision behavior of an `epix_archive`. + +```{r epiprocess-revision-summary-demo} +#| echo: true +#| eval: true + +revision_data <- revision_summary( + archive_cases_dv_subset, + case_rate_7d_av, + drop_nas = TRUE, + print_inform = FALSE, # NOT the default, to save space + min_waiting_period = as.difftime(60, units = "days"), + within_latest = 0.2, + quick_revision = as.difftime(3, units = "days"), + few_revisions = 3, + abs_spread_threshold = NULL, + rel_spread_threshold = 0.1, + compactify_tol = .Machine$double.eps^0.5, + should_compactify = TRUE +) +``` + +## Features -- summarize revision behavior + +```{r epiprocess-revision-summary-results} +#| echo: true + +head(revision_data) +``` + + + +## Visualize revision patterns + +```{r create-snapshots-of-data} +#| echo: false +#| eval: true + +versions <- seq(as.Date("2020-07-01"), as.Date("2021-11-30"), by = "1 month") +max_version <- max(versions) + +snapshots <- map(versions, \(v) { + epix_as_of(archive_cases_dv_subset_all_states, v) |> + mutate(version = v) |> + filter(geo_value %in% c("ca", "ut")) + }) |> + bind_rows() |> + mutate(latest = version == max_version) + +``` + ```{r plot-revision-patterns} #| fig-width: 7 -# ggplot(snapshots |> filter(!latest), -# aes(x = time_value, y = percent_cli)) + -# geom_line(aes(color = factor(version))) + -# geom_vline(aes(color = factor(version), xintercept = version), lty = 3) + -# facet_wrap(~ geo_value, scales = "free_y", nrow = 1) + -# scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + -# labs(x = "", y = "% of doctor's visits with\n Covid-like illness") + -# scale_color_viridis_d(option = "B", end = .8) + -# theme(legend.position = "none") + -# geom_line(data = snapshots |> filter(latest), -# aes(x = time_value, y = percent_cli), -# inherit.aes = FALSE, color = "black") -``` - -## Finalized data - - * Counts are revised as time proceeds +ggplot(snapshots |> 
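  # keep only the non-final snapshots here; the finalized series
  # is overlaid in black by the last geom_line() below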
filter(!latest), + aes(x = time_value, y = percent_cli)) + + geom_line(aes(color = factor(version))) + + geom_vline(aes(color = factor(version), xintercept = version), lty = 3) + + facet_wrap(~ geo_value, scales = "free_y", nrow = 1) + + scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + + labs(x = "", y = "% of doctor's visits with\n Covid-like illness") + + scale_color_viridis_d(option = "B", end = .8) + + theme(legend.position = "none") + + geom_line(data = snapshots |> filter(latest), + aes(x = time_value, y = percent_cli), + inherit.aes = FALSE, color = "black") +``` + + +## Types of prediction + +* Counts are revised as time proceeds * Want to know the [final]{.primary} value * Often not available until weeks/months later @@ -924,6 +1124,8 @@ archive_cases_dv_subset Nowcasting : At time $t$, predict the final value for time $t$ + + # Basic Nowcasting in the Epiverse @@ -964,12 +1166,50 @@ archive_cases_dv_subset ::: -* We may also have access to $p$ other time series -$x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots, p$ which may be subject to revisions. +* We also have access to $p$ other time series +$x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots,p$ +* All may be subject to revisions. -## Case study: NCHS mortality + +## Aside on nowcasting + +* To some Epis, "nowcasting" can be equated with "estimate the time-varying instantaneous reproduction number, $R_t$" + +* Ex. using the number of reported COVID-19 cases in British Columbia between Jan. 2020 and Apr. 15, 2023. + + +```{r rtestim} +#| fig-width: 9 +#| fig-height: 3 +#| out-height: "400px" +#| label: nowcasting +library(rtestim) +source(here::here("_code/bccovid.R")) + +p1 <- bccovid|> + ggplot(aes(date, cases)) + + geom_line(colour = primary) + + geom_vline(xintercept = ymd("2023-04-15"), colour = secondary, + linewidth = 2) + + labs(y = "BC Covid-19 cases", x = "Date") + + scale_y_continuous(expand = expansion(c(0, NA))) +bc_rt <- estimate_rt(bccovid$cases, x = bccovid$date, + lambda = c(1e6, 1e5)) +p2 <- plot(confband(bc_rt, lambda = 1e5)) + + coord_cartesian(ylim = c(0.5, 2)) + + scale_y_continuous(expand = expansion(0)) +cowplot::plot_grid(p1, p2) +``` + +* Group built [`{rtestim}`](https://dajmcdon.github.io/rtestim) doing for this nonparametrically. + +* We may come back to this later... + +# Nowcasting with One Variable + +## Nowcasting simple ratio Ex: NCHS mortality * In this example, we'll demonstrate the concept of nowcasting using [**NHCS mortality data**]{.primary}. (the number of weekly new deaths with confirmed or presumed COVID-19, per 100,000 population). @@ -979,65 +1219,47 @@ $x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots, p$ which may be subject to revisions. ## Fetch versioned data Let's fetch versioned mortality data from the API (`pub_covidcast`) for CA (`geo_values = "ca"`) and the signal of interest (`deaths_covid_incidence_num`) over early 2024. 
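Note that `issues = "*"` requests every available version (issue) of each observation; that full revision history is what `as_epi_archive()` stores. Once the archive is built, any single reporting snapshot can be recovered with `epix_as_of()`; here is a minimal sketch, assuming the `mortality_archive` object created in the next chunk:

```{r epix-as-of-sketch}
#| echo: true
#| eval: false
# View the data exactly as it would have appeared on a given day
epix_as_of(mortality_archive, as.Date("2024-02-25"))
```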
- ```{r mortality-archive-construct} #| echo: true - # Fetch the versioned NCHS mortality data (weekly) -nchs_archive <- pub_covidcast( +mortality_archive <- pub_covidcast( source = "nchs-mortality", signals = "deaths_covid_incidence_num", geo_type = "state", time_type = "week", - geo_values = c("ca", "ut"), - time_values = epirange(202001, 202440), + geo_values = "ca", # California (CA) + time_values = epirange(202401, 202413), issues = "*" ) |> select(geo_value, time_value, version = issue, mortality = value) |> as_epi_archive(compactify = TRUE) +# Set the start and end days for the analysis +# corresponding to the weeks entered in time_values +start_time = as.Date("2023-12-31") +end_time = as.Date("2024-03-24") ``` +## Latency in reporting - Minimum lag -## Analysis of versioning behavior - -Recall, we need to watch out for: - -* [**Latency**]{.primary} the time difference between date of occurrence and date of the initial report -* [**Backfill**]{.primary} how data for a given date is updated after initial report. - -`revision_summary()` provides a sumamry to both aspects. - -```{r inspect-revision_summary} -#| echo: true -revision_data = revision_summary(nchs_archive, mortality, print_inform = FALSE) -``` - - -## Versioning analysis -- latency - -* [**Question:**]{.primary} What is the latency of NCHS data? +* A quick inspection reveals that mortality rates are systematically 7 days latent ([**fixed lag**]{.primary}). -```{r nchs-latency-via-revision-summary} +```{r inspect-latency-dplyr-way} #| echo: true +mortality_revision_inspect = mortality_archive$DT |> mutate(version_time_diff = version - time_value) -revision_data |> select(geo_value, time_value, min_lag) |> slice_sample(n = 10) +# Look at the first revision for each week +mortality_revision_inspect |> group_by(time_value) |> slice(1) |> head() ``` -* We randomly sampled some dates to check if there is a consistent latency pattern. -* Understanding latency prevents us from using data that we shouldn't have access to. +* Use `revision_summary()` from `epiprocess` to generate basic statistics about the revision behavior for the dataset. -## Versioning analysis -- backfill - -* [**Question:**]{.primary} How long does it take for the reported value to be close to the finalized value? - -```{r revision-summary-time-near-latest-demo} -revision_data |> select(geo_value, time_value, time_near_latest) |> slice_sample(n = 10) +```{r revision-summary-ex} +#| eval: false +revision_summary(mortality_archive, print_inform = TRUE) ``` -* It generally takes at least 4 weeks for reported value to be within 20\% of the finalized value. -* We can change the threshold of percentage difference by specifying `within_latest` argument of `revision_summary()`. -## Versioning analysis - backfill +## Latency in reporting - Finalized value attainment * [**Question:**]{.primary} When is the [**finalized value**]{.primary} first attained for each date? Would we have access to any in real-time? * How fast are the final values attained & what's the pattern for these times, if any? 
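The helper `check_when_finalized()` used below is defined off-slide, so its body does not appear in this diff. Here is a minimal sketch of one way it could be written (a hypothetical reconstruction, assuming the archive's `DT` has columns `geo_value`, `time_value`, `version`, and `mortality`):

```{r check-when-finalized-sketch}
#| echo: true
#| eval: false
# Hypothetical reconstruction; the actual helper is defined off-slide.
# For each date, find the first version whose report matches the finalized value.
check_when_finalized <- function(epi_archive, start_date = NULL, end_date = NULL) {
  # (assumes both dates are supplied, as in the call below)
  finalized <- epix_as_of(epi_archive, epi_archive$versions_end) |>
    rename(final_value = mortality)
  epi_archive$DT |>
    as_tibble() |>
    filter(time_value >= start_date, time_value <= end_date) |>
    inner_join(finalized, by = c("geo_value", "time_value")) |>
    filter(mortality == final_value) |>
    group_by(geo_value, time_value) |>
    summarise(first_final_version = min(version), .groups = "drop") |>
    mutate(diff = first_final_version - time_value)
}
```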
@@ -1071,55 +1293,35 @@ check_when_finalized <- function(epi_archive, start_date = NULL, end_date = NULL ```{r check-when-finalized-run} #| echo: false -res <- check_when_finalized(nchs_archive, - start_date = min(nchs_archive$DT$time_value), - end_date = max(nchs_archive$DT$time_value)) - +res <- check_when_finalized(mortality_archive, start_date = start_time, end_date = end_time) head(res) ``` -Here is a look at its quantiles: +And here's a numerical summary: ```{r summary-diff} #| echo: false summary(as.numeric(res$diff)) ``` -* [**Conclusion**]{.primary}: The revision behavior is pretty long-tailed. Value reported 4 weeks later is reasonably close to the finalized value. - +* [**Conclusion**]{.primary}: Tends to take a long time & varies. Even for this relatively small time period... Goes as low as 84 days or as high as 294 days. Yikes. +* So if we were doing this in real-time, then we wouldn't have access to the finalized data. -## Revision pattern visualization +## Comparison of final vs. multiple revisions This shows the finalized rates in comparison to [**multiple revisions**]{.primary} to see how the data changes over time: -```{r mortality-by-accessed-weeks-later} +```{r mortality-by-revision-date} #| echo: false -ref_dates <- unique(nchs_archive$DT$time_value) -offsets = seq(1, 7) * 7 -max_version <- nchs_archive$versions_end - -get_val_asof <- function(time_val, archive, offsets) { - - as_of_dates <- pmin(time_val + offsets, max_version) - - result <- map(as_of_dates, function(x) { - - qd <- archive |> - epix_as_of(x) |> - filter(time_value == time_val) |> - select(geo_value, time_value, mortality) |> - mutate(lag = x - time_val) - }) |> - list_rbind() - - return(result) +# Visualize the mortality values for different revisions +revision_dates <- seq(min(mortality_archive$DT$version), end_time, by = "1 week") - -} +# Create a data frame for each version and label them by revision date +mortality_revisions <- map_dfr(revision_dates, function(date) { + epix_as_of(mortality_archive, date) |> + mutate(revision_date = date) # Add a column for the revision date +}) -value_at_lags <- map(ref_dates, get_val_asof, - archive <- nchs_archive, offsets <- offsets) |> - list_rbind() - -values_final <- epix_as_of(nchs_archive, max(nchs_archive$versions_end)) +# Extract the latest/finalized version +mortality_latest <- epix_as_of(mortality_archive, max(mortality_archive$DT$version)) ``` @@ -1128,503 +1330,327 @@ values_final <- epix_as_of(nchs_archive, max(nchs_archive$versions_end)) #| fig-width: 9 #| fig-height: 4 #| out-height: "500px" -ggplot(value_at_lags, aes(x = time_value, y = mortality)) + - geom_line(aes(color = factor(lag))) + - facet_wrap(~ geo_value, scales = "free_y", ncol = 1) + - scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + - labs(x = "", y = "Weekly new COVID deaths") + - # scale_color_viridis_d(option="D", end=0.8) + - theme(legend.position = "none") + - geom_line(data = values_final, aes(x = time_value, y = mortality), - inherit.aes = FALSE, color = "black") - +ggplot() + + geom_line(data = mortality_latest, aes(x = time_value, y = mortality), color = "black", size = 1) + + geom_line(data = mortality_revisions, aes(x = time_value, y = mortality, color = as.factor(revision_date))) + + geom_point(data = mortality_revisions, aes(x = time_value, y = mortality, color = as.factor(revision_date)), size = 2) + + theme_gray() + + labs(color = "Revision Date", y = "Mortality", x = "Date") + + ggtitle("COVID-19 Mortality in CA: Finalized (black) vs Various Revisions") 
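  # black = finalized series; colors = the values as reported on each revision date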
``` -## Revision pattern visualization +## Comparison of final vs. one revision -```{r nchs-plot-different-ver} -#| echo: false -#| eval: true - -versions = seq.Date(as.Date("2021-01-01"), nchs_archive$versions_end, by = "1 month") -nchs_snapshots = map(versions, function(v) { - epix_as_of(nchs_archive, v) |> - mutate(version = v)}) |> - bind_rows() |> - mutate(latest = version == nchs_archive$versions_end) -``` - -```{r nchs-plot-val-different-ver} -#| echo: false -#| fig-width: 9 -#| fig-height: 4 -#| out-height: "500px" +The below figure compares the finalized rates (in black) to [**one revision**]{.primary} (in yellow) from March 3, 2024. -ggplot(nchs_snapshots |> filter(!latest), - aes(x = time_value, y = mortality)) + - geom_line(aes(color = factor(version))) + - geom_vline(aes(color = factor(version), xintercept = version), lty = 3) + - facet_wrap(~ geo_value, scales = "free_y", ncol = 1) + - scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + - labs(x = "", y = "Weekly covid deaths") + - scale_color_viridis_d(option = "B", end = .8) + - theme(legend.position = "none") + - geom_line(data = nchs_snapshots |> filter(latest), - aes(x = time_value, y = mortality), - inherit.aes = FALSE, color = "black") -``` - -## Aside: Do we need a burn-in/training set? - -* Typical stat-ML practise suggests a train, validation, test split. -* But our exploratory analysis covered all available data, is that fine? - - -* Generally, for exploratory analysis, it is fine to not do train/test split. - + These analyses do not involve model fitting, we have little risk of an overly optimistic performance evaluation (no overfitting on test data). -* However, for a [**psuedo-prospective analysis**]{.primary}, the best practise is to do a train/test split. - + In such analysis, one would be fitting and validating many models, a train/test split provides a more rigorous control on overfitting to test set. - -## Ratio nowcaster: jumping from provisional to finalized value - -* Recall, the goal of nowcast at date $t$ is to use project the [*finalized value*]{.primary} of $y_t,$ given the information available on date $t$. -* A very simple nowcaster is the ratio between finalized and provisional value. - -
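In symbols (a sketch of the estimator the next slides construct): writing $y_t^{(t)}$ for the provisional value of date $t$ as reported at time $t$, and $y_t^{\text{final}}$ for its finalized value, the idea is to nowcast via

$$
\hat{y}_t^{\text{final}} \;=\; \hat{r} \cdot y_t^{(t)},
\qquad \text{where} \qquad
r \;=\; \frac{y_t^{\text{final}}}{y_t^{(t)}}.
$$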

How can we sensibly estimate this ratio?

- -::: {.fragment .fade-in} +```{r one-revision-final-plot} +#| out-height: "400px" +as_of_date = as.Date("2024-03-03") -* At nowcast date $t,$ would have recieved reports with versions up to and including $t.$ -* We need to build training samples, which - + correctly aligns finalized value against provisional value - + uses features that would have been available at test time - + have enough data to ensure sensible estimation results -::: +ggplot() + + geom_line(data = mortality_latest, + aes(x = time_value, y = mortality, color = "Finalized"), + size = 1) + + geom_line(data = mortality_revisions |> filter(revision_date == as_of_date), + aes(x = time_value, y = mortality, color = "Revision"), + size = 1) + + ylab("Mortality") + + xlab("Date") + + scale_color_manual(values = c("Finalized" = "black", "Revision" = "#FFB300")) + + labs(color = "Mortality Type") -```{r nchs-ca-only} -#| echo: false -#| eval: true -nchs_archive <- pub_covidcast( - source = "nchs-mortality", - signals = "deaths_covid_incidence_num", - geo_type = "state", - time_type = "week", - geo_values = "ca", - time_values = epirange(202001, 202440), - issues = "*" -) |> - select(geo_value, time_value, version = issue, mortality = value) |> - as_epi_archive(compactify = TRUE) ``` +The real-time data is biased downwards (systematically below the true value). That is, the signal tends to get scaled up with future revisions. -## Ratio nowcaster: building training samples +## Calculate one ratio: Provisional vs. finalized data + -* At nowcast date $t,$ would have recieved reports with versions up to and including $t.$ -* We need to build training samples, which - + correctly aligns finalized value against provisional value - + uses features that would have been available at test time - + have enough samples to ensure sensible estimation results +Suppose that the day is March 10, 2024. Then, because the data is 7 days latent, we can compute the ratio between provisional and finalized data for [**March 3, 2024**]{.primary}. -::: {.fragment .fade-in} -* Build training samples by treating dates prior to date $t$ as actual nowcast dates. - + What is the provisional data on that date? - + Have we recieved finalized value for that date? -::: +```{r one-ratio-calc} +#| echo: true +as_of_date = as.Date("2024-03-10"); fixed_lag = 7 -## Ratio nowcaster: building training samples +# Load the finalized mortality data for CA +ca_finalized <- mortality_latest |> + filter(time_value == (as_of_date - fixed_lag)) |> + dplyr::select(mortality) -* At an earlier nowcast date $t_0,$ we define - + [**Provisional value**]{.primary} as the reported value of $Y_s$ with version $t_0.$ Here $s_0$ is the largest occurence date among all values reported up until $t_0.$ - + [**Finalized value**]{.primary} as the (potentially unobserved) finalized value of $Y_{s_0}.$ - - We only know in *hindsight* when reported value of $Y_{s_0}$ is finalized -- need an approximation. 
+# Load the provisional mortality data for the same week +mortality_old = epix_as_of(mortality_archive, as_of_date) -::: {.fragment .fade-in} -```{r revision-summary-time-near-latest-show-again} -#| echo: false -#| eval: true +ca_provisional <- mortality_old |> + filter(time_value == (as_of_date - fixed_lag)) |> + dplyr::select(mortality) -revision_data |> select(geo_value, time_value, time_near_latest) |> slice_sample(n = 5) -quantile(revision_data$time_near_latest) +# Calculate ratio between provisional and finalized cases for the week of interest +ratio <- ca_provisional$mortality / ca_finalized$mortality +ratio ``` -::: - -::: {.fragment .fade-in} -We can say data reported 49 days after occurrence date is good enough to be considered finalized. -::: - -## Ratio nowcaster: test time feature -* At test time $t,$ take provisional value to be $Y_s,$ where $s$ is the largest occureence date among all values reported up until time $t.$ +[**Conclusion**]{.primary}: The real-time rate is well below the finalized for this time (26 vs 72 here). -## Nowcasting at a single date: building training samples +[**Question**]{.primary}: Can we generalize this over many days? -::: {.fragment .fade-in} -* Searching for provisional values, at previous hypothetical nowcast dates. - -```{r one-date-look-for-provisional} +## Calculating the ratio using multiple dates +Let's move from calculating the ratio by using one day to multiple days with the goal to use it to nowcast for Feb. 18, which has a [**provisional value**]{.primary} of 23 +```{r provisional-val-feb18} #| echo: true +as_of_date = as.Date("2024-02-25") -nowcast_date <- as.Date("2022-01-02"); window_length = 180 - -initial_data <- nchs_archive$DT |> - group_by(geo_value, time_value) |> - filter(version == min(version)) |> - rename(initial_val = mortality) |> - select(geo_value, time_value, initial_val) - +provisional <- epix_as_of(mortality_archive, as_of_date) |> + filter(time_value == as_of_date - 7) |> + pull(mortality) +provisional ``` -::: - - -::: {.fragment .fade-in} -* Searching for finalized values, at previous hypothetical nowcast dates. +
+and a [**finalized value**]{.primary} of 104 -```{r one-date-look-for-final} +```{r finalized-val-feb18} #| echo: true -#| -finalized_data <- epix_as_of(nchs_archive, nowcast_date) |> - filter(time_value >= nowcast_date - 49 - window_length & time_value <= nowcast_date - 49) |> - rename(finalized_val = mortality) |> - select(geo_value, time_value, finalized_val) +finalized <- mortality_latest |> + filter(time_value == as_of_date - 7) |> + pull(mortality) +finalized ``` -::: +## Calculating the ratio using multiple dates +First, let's download the real-time rates for CA, and compare them to their finalized version. -::: {.fragment .fade-in} -* After searching for both provisional and finalized values, we merge them together and estimate the ratio. -```{r one-date-combine-provsional-and-final} +```{r real-time-mortality} #| echo: true - -ratio <- finalized_data |> - inner_join(initial_data, by = c("geo_value", "time_value")) |> - mutate(ratio = finalized_val / initial_val) |> - pull(ratio) |> - mean(na.rm = TRUE) +dates <- seq(start_time, (as_of_date - 7), by = "day") +mortality_real_time <- function(date) { + epix_as_of(mortality_archive, date + 7) |> + filter(time_value == date) +} +mortality_real_time_df <- map_dfr(dates, mortality_real_time) +head(mortality_real_time_df) +``` + +## Calculating the ratio using multiple dates +Now, let's plot the real-time vs the finalized mortality rates: +```{r real-time-vs-finalized} +ggplot() + + geom_line(data = mortality_latest |> filter(time_value <= (as_of_date - 7)), + aes(x = time_value, y = mortality, color = "Finalized")) + + geom_line(data = mortality_real_time_df, + aes(x = time_value, y = mortality, color = "Real-Time")) + + ylab("Mortality") + + xlab("Date") + + scale_color_manual(values = c("Finalized" = "black", "Real-Time" = "red")) + + labs(color = "Mortality Type") # Adds the legend title + ``` -::: +* [**Takeaways**]{.primary}: The real-time counts are biased [**well below**]{.primary} the finalized counts. +* Systematic underreporting tends to lessen over time (the gap between the lines decreases). -## Nowcasting at a single date: test feature and actual nowcast +## Realistic limitation of nowcasting - Finalized data +* Recall that real-time access to finalized data is limited as finalized values can take months to report (e.g., Jan. 7 is finalized 294 days later). +* To nowcast accurately, we must rely on the [**best available approximation of finalized data**]{.primary} at the time of estimation (Feb. 25). -```{r one-date-test-feat and nowcast} +```{r finalized-data-as-of-feb25} #| echo: true - -last_avail <- epix_as_of(nchs_archive, nowcast_date) |> - slice_max(time_value) |> - pull(mortality) - -nowcast <- last_avail * ratio -nowcast +mortality_as_of_feb25 <- epix_as_of(mortality_archive, as_of_date) +head(mortality_as_of_feb25) ``` +## Ratio calculation & summary -## Nowcasting for multiple dates - -* All previous manipulations should really be seen as a template for all nowcast dates. -* The template computation sould be applied over all nowcast dates, [**but we must respect data versioning**]{.primary}! -* `epix_slide()` is designed just for this! It behaves similarly to `epi_slide`. -* Key exception: `epix_slide()` is version aware: the sliding computation at any reference time $t$ is performed on [**data that would have been available as of t**]{.primary}. - - - -## Nowcasting for multiple dates via `epix_slide()` - -We begin by templatizing our previous operations. 
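As a reminder of the contract the template must satisfy (the same `(x, g, t, ...)` form as for `epi_slide()`, except that with `.all_versions = TRUE` the first argument is an archive; details on the following slides):

```{r nowcaster-contract-sketch}
#| echo: true
#| eval: false
# Contract for the sliding computation below (sketch):
# x : an `epi_archive` containing only versions up to and including t
# g : a one-row tibble with the grouping values (here, geo_value)
# t : the nowcast date (the window's .ref_time_value)
```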
- -```{r nowcaster-to-slide} +We then use these "finalized" and real-time values to compute the mean ratio: +```{r ratio-calc-summary} #| echo: true - -nowcaster = function(x, g, t, wl=180, appx=49) { - - - initial_data = x$DT |> - group_by(geo_value, time_value) |> - filter(version == min(version)) |> - filter(time_value >= t - wl - appx & time_value <= t - appx) |> - rename(initial_val = mortality) |> - select(geo_value, time_value, initial_val) - - finalized_data = x$DT |> - group_by(geo_value, time_value) |> - filter(version == max(version)) |> - filter(time_value >= t - wl - appx & time_value <= t - appx) |> - rename(finalized_val = mortality) |> - select(geo_value, time_value, finalized_val) - - ratio = finalized_data |> - inner_join(initial_data, by = c("geo_value", "time_value")) |> - mutate(ratio = finalized_val / initial_val) |> - pull(ratio) |> - median(na.rm=TRUE) - - last_avail = epix_as_of(x, t) |> - slice_max(time_value) |> - pull(mortality) - - res = tibble(geo_value = x$geo_value, target_date = t, nowcast = last_avail * ratio) - - return(res) - -} - +# exclude date we're nowcasting for +mortality_real_time_df = mortality_real_time_df |> filter(time_value != "2024-02-18") +mortality_as_of_feb25 = mortality_as_of_feb25 |> filter(time_value != "2024-02-18") +ratio_real_time_to_feb25 <- mortality_real_time_df$mortality / mortality_as_of_feb25$mortality +summary(ratio_real_time_to_feb25) ``` +On average, the real-time rates are ~25.7% of the finalized. -## Nowcasting for multiple dates via `epix_slide()` - -```{r epix-slide-extract-nowcast-date} +```{r boxplot-ratio} #| echo: false -#| eval: true - -all_nowcast_dates = nchs_archive$DT |> - filter(time_value >= as.Date("2022-01-01")) |> - distinct(time_value) |> - pull(time_value) -``` - - -```{r nowcasts-slided} -#| echo: true -#| eval: true - -nowcasts = nchs_archive |> - group_by(geo_value) |> - epix_slide( - nowcaster, - .before=Inf, - .versions = all_nowcast_dates, - .all_versions = TRUE -) +ratio_df <- data.frame(ratio_real_time_to_feb25, mean_ratio = mean(ratio_real_time_to_feb25)) + +# Create the boxplot with mean marked as a bold cross +ggplot(ratio_df, aes(y = ratio_real_time_to_feb25)) + + geom_boxplot(fill = "lightblue", color = "black") + + geom_point( + aes(x = 0, y = mean_ratio), # Place point at the mean + shape = 4, # Cross shape + size = 7, # Size of the cross + color = "darkblue", # Color of the cross + stroke = 2 # Boldness of the cross + ) + + labs( + title = "Distribution of Real-Time to Finalized Mortality Ratios", + y = "Real-Time to Finalized Ratio" + ) + + theme_minimal() + # Minimal theme for clean look + theme( + plot.title = element_text(hjust = 0.5), # Center the title + axis.text.x = element_blank() + ) + + coord_cartesian(ylim = c(0.2, 0.3)) # Limit y-axis between 0 and 0.5 to zoom in ``` +Tells us the distribution is right-skewed (mean > median) and so we should opt for the median. +## Nowcasting on Feb. 
25 -## Details of `epix_slide()` - - -```{r epix-slide-demo-call-allversions-FALSE} +* Since the [**median ratio**]{.primary} between real-time and finalized values is [**0.250**]{.primary} (i.e., real-time values are typically 25% of the finalized), then the nowcast is +```{r nowcast-feb25-point} #| echo: true -#| eval: false - -epix_slide( - .x, - .f, - ..., - .before = Inf, - .versions = NULL, - .new_col_name = NULL, - .all_versions = FALSE -) +# Now we can nowcast properly: +nowcast <- provisional * + 1 / median(ratio_real_time_to_feb25) +nowcast ``` +* To get the accompanying 95% prediction interval, calculate the 2.5th and 97.5th percentiles: -* `.f` in `epix_slide()` can be specified with the same form of custom function as `epi_slide()`. - -```{r epix-slide-form-of-custom-function} +```{r nowcast-feb25-lower} #| echo: true -#| eval: false +percentile_97.5 <- quantile(ratio_real_time_to_feb25, 0.975) |> unname() -function(x, g, t) { - # function body -} +(lower_PI <- provisional * 1 / percentile_97.5) ``` -* Mandatory variables of `.f` would have different forms depending on the value of `.all_versions`. - - -## Details of `epix_slide()` - -```{r epix-slide-demo-call-allversions-FALSE-again} +```{r nowcast-feb25-upper} #| echo: true -#| eval: false -#| code-line-numbers: "|8" - -epix_slide( - .x, - .f, - ..., - .before = Inf, - .versions = NULL, - .new_col_name = NULL, - .all_versions = FALSE -) +percentile_2.5 <- quantile(ratio_real_time_to_feb25, 0.025) |> unname() +(upper_PI <- provisional * 1 / percentile_2.5) ``` -::: {.fragment .fade-in} -* When `.all_versions = FALSE`, `epix_slide()` essentially iterates the templatized computation over snapshots. -* Said differently, when `.all_versions = FALSE`, data accessed at any sliding iteration [**only involves a single version**]{.primary}. -::: +* So, the [**nowcast is 92**]{.primary} with 95% PI: [61, 140], which is much closer to the [**finalized value of 104**]{.primary} than the [**provisional value of 23**]{.primary}. -::: {.fragment .fade-in} -* Hence: - + `x`: an `epi_df` with same column names as archive's `DT`, minus the `version` column. - + `g`: a one-row tibble containing the values of groupping variables of the associated group. - + `t`: the `ref_time_value` of the current window. - + `...`: additional arguments. -::: +## Summary of three main steps +So the main steps for this type of fixed lag nowcasting are... -## Details of `epix_slide()` +1. Obtain the [**provisional value**]{.primary} for the target. -```{r epix-slide-demo-call-allversions-TRUE} -#| echo: true -#| eval: false -#| code-line-numbers: "|8" +2. Estimate the ratio using the [**real-time**]{.primary} and [**"finalized"**]{.primary} data (for all previous dates that follow a consistent pattern in reporting). -epix_slide( - .x, - .f, - ..., - .before = Inf, - .versions = NULL, - .new_col_name = NULL, - .all_versions = TRUE -) -``` +3. Profit. -::: {.fragment .fade-in} -* When `.all_versions = FALSE`, data accessed at any sliding iteration involves versions [**up to and including .ref_time_value**]{.primary}. -::: - -::: {.fragment .fade-in} -* Hence: - + `x`: an `epi_archive`, with version up to and including `.ref_time_value`. - + `g`: a one-row tibble containing the values of groupping variables of the associated group. - + `t`: the `ref_time_value` of the current window. - + `...`: additional arguments. 
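For instance, a minimal (hypothetical) `.f` of this form, which just counts how many versions are visible in each window:

```{r epix-slide-custom-f-sketch}
#| echo: true
#| eval: false
count_versions <- function(x, g, t, ...) {
  # x is an epi_archive here, so the raw versioned data lives in x$DT
  tibble(n_versions = dplyr::n_distinct(x$DT$version))
}
```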
-::: - - -## Details of `epix_slide()` - -```{r nowcasts-slide-demo-only, eval=FALSE} +```{r ratio-summary-steps} #| echo: true #| eval: false -#| code-line-numbers: "|7,13,17" - -nowcasts <- nchs_archive |> - group_by(geo_value) |> - epix_slide( - nowcaster, - .before=Inf, - .versions = all_nowcast_dates, - .all_versions = TRUE -) +#| code-fold: true +#| code-summary: "Expand for the accompanying code" +# Today +as_of_date = as.Date("2024-02-25") -nowcaster <- function(x, g, t, wl=180, appx=49) { - +# 1. Obtain the provisional value +provisional <- epix_as_of(mortality_archive, as_of_date) |> + filter(time_value == as_of_date - 7) |> + pull(mortality) +provisional - initial_data = x$DT |> - # Function body, omitted +# 2. Estimate the ratio +mortality_real_time_df <- map_dfr(dates, mortality_real_time) |> filter(time_value != "2024-02-18") # Real-time +mortality_as_of_feb25 <- epix_as_of(mortality_archive, as_of_date) |> filter(time_value != "2024-02-18") # "Finalized" +ratio_real_time_to_feb25 <- mortality_real_time_df$mortality / mortality_as_of_feb25$mortality - finalized_data = x$DT |> - # Function body, omitted - -} +# 3. Profit. +(nowcast <- provisional * 1 / median(ratio_real_time_to_feb25)) +(upper_PI <- provisional * 1 / quantile(ratio_real_time_to_feb25, 0.025)) +(lower_PI <- provisional * 1 / quantile(ratio_real_time_to_feb25, 0.975)) ``` +## Nowcasting mortality for multiple dates - -## Visualize nowcasts - - -We are now finally able to compare nowcasts against first available reports: - -```{r nowcast-subsetting} -#| echo: false - - -intial_val_extracter <- function(x, g, t, wl=180, appx=49) { +* [**Define Nowcast Function**]{.primary}: + * [**Input**]{.primary}: Takes in the dates to nowcast and the fixed lag + * [**Output**]{.primary}: The nowcasted mortality rates based on the ratio of real-time to finalized data. +```{r nowcasting-multipl-dates-fun} +#| echo: true +#| code-fold: true +nowcast_function <- function(nowcast_date, fixed_lag) { + as_of_date = nowcast_date + fixed_lag - last_avail = epix_as_of(x, t) |> - slice_max(time_value) |> + # 1. Obtain the provisional value for the target. + provisional <- epix_as_of(mortality_archive, as_of_date) |> + filter(time_value == as_of_date - fixed_lag) |> pull(mortality) - res = tibble(geo_value = x$geo_value, target_date = t, value = last_avail) + #2. Estimate the ratio multiplier using + # real-time + dates_seq <- seq(start_time, (nowcast_date - fixed_lag), by = "week") + mortality_real_time <- map_dfr(dates_seq, mortality_real_time) + + # and "finalized" data + finalized <- epix_as_of(mortality_archive, as_of_date) |> filter(time_value >= start_time & time_value <= (nowcast_date - fixed_lag)) + + ratios <- mortality_real_time$mortality / finalized$mortality + + # Remove infinite or NaN ratios (i.e., keep only finite values) + median_ratio <- median(ratios[is.finite(ratios)]) - return(res) + #3. Profit. 
+ nowcast <- provisional * (1 / median_ratio) + upper_PI <- provisional * (1 / quantile(ratios[is.finite(ratios)], 0.025)) + lower_PI <- provisional * (1 / quantile(ratios[is.finite(ratios)], 0.975)) + # Return a dataframe with the nowcast and date + tibble( + time_value = nowcast_date, + nowcast = nowcast, + lower_PI = lower_PI, + upper_PI = upper_PI + ) } - - -provisional_val = epix_slide(nchs_archive, intial_val_extracter, .versions = all_nowcast_dates, .all_versions = TRUE) - -finalized_val = nchs_archive$DT |> - filter(time_value >= as.Date("2022-01-01")) |> - group_by(geo_value, time_value) |> - filter(version == max(version)) - ``` +## Map nowcast over multiple dates +* We can use `map2()` to apply the function to a series of weeks (e.g., Jan. 28 to Mar. 24). +* Returns a [**dataframe**]{.primary} with nowcasted results. -```{r nowcast-fun-plot-results} -#| echo: false - -ggplot() + - geom_line(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized")) + - geom_point(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) + - geom_line(data = provisional_val, aes(x = target_date, y = value, color = "Provisional")) + - geom_point(data = provisional_val, aes(x = target_date, y = value, color = "Provisional"), shape = 16) + - geom_line(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast")) + - geom_point(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast"), shape = 16) + - # geom_ribbon(data = nowcast_results_df, - # aes(x = time_value, ymin = lower_PI, ymax = upper_PI), - # alpha = 0.2, fill = "blue") + - ylab("Mortality") + - xlab("Date") + - scale_color_manual(name = "", - values = c("Nowcast" = "#71c5e8", "Provisional" = "#FF6900", "Finalized" = "black")) + - theme(legend.position = "bottom") +```{r map-nowcast-fun-multiple-dates} +#| echo: true +# Apply Nowcast Function Over Multiple Dates +nowcast_dates <- seq(as.Date("2024-01-28"), as.Date("2024-03-24"), by = "week") +fixed_lag <- 7 +nowcast_results_df <- map2(nowcast_dates, fixed_lag, nowcast_function) |> list_rbind() ``` -* The real-time counts tend to be biased below the finalized counts. Nowcasted values tend to provide a much better approximation of the truth (at least for these dates). - - -## Smoothing nowcasts - -* Nowcasts are quite volatile, reflecting the provisional counts are far from complete. -* We can use a trailing average to smooth them. 
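For a window of $k$ weeks, the trailing mean at time $t$ is simply

$$
\tilde{y}_t \;=\; \frac{1}{k} \sum_{j=0}^{k-1} \hat{y}_{t-j}.
$$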
- -```{r smooth-nowcasts-epi-slide} -#| echo: true - -smoothed_nowcasts <- epi_slide( - nowcasts |> as_epi_df(), - smoothed_nowcasts = mean(nowcast, na.rm = TRUE), - .window_size = as.difftime(3, units = "weeks") -) +Let's smooth with a rolling trailing mean (window size 4) & see the results: +```{r smooth-print-res} +# Smooth results: Apply rolling median to nowcast and bounds with a window size of 4 +library(zoo) +nowcast_results_df <- nowcast_results_df |> + mutate(across(.cols = -time_value, + .fns = ~ zoo::rollapply(.x, width = 4, FUN = mean, partial = TRUE, align = "right"), + .names = "{.col}")) +nowcast_results_df ``` -```{r nowcast-smoothed-vis} +## Visualize nowcast, real-time, and finalized values +Finally, we can compare these nowcast results to the real-time and finalized values: +```{r nowcast-fun-plot-results} #| echo: false - -cbPalette = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", - "#0072B2", "#D55E00", "#CC79A7") - ggplot() + - geom_line(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized")) + - geom_point(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) + - geom_line(data = smoothed_nowcasts, aes(x = time_value, y = smoothed_nowcasts, color = "Smoothed")) + - geom_point(data = smoothed_nowcasts, aes(x = time_value, y = smoothed_nowcasts, color = "Smoothed"), shape = 16) + - geom_line(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast")) + - geom_point(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast"), shape = 16) + - # geom_ribbon(data = nowcast_results_df, - # aes(x = time_value, ymin = lower_PI, ymax = upper_PI), - # alpha = 0.2, fill = "blue") + + geom_line(data = nowcast_results_df, aes(x = time_value, y = nowcast, color = "Nowcast")) + + geom_point(data = nowcast_results_df, aes(x = time_value, y = nowcast, color = "Nowcast"), shape = 16) + + geom_line(data = map_dfr(nowcast_dates, mortality_real_time), aes(x = time_value, y = mortality, color = "Real-Time")) + + geom_point(data = map_dfr(nowcast_dates, mortality_real_time), aes(x = time_value, y = mortality, color = "Real-Time"), shape = 16) + + geom_line(data = mortality_latest |> filter(time_value %in% nowcast_dates), aes(x = time_value, y = mortality, color = "Finalized")) + + geom_point(data = mortality_latest |> filter(time_value %in% nowcast_dates), aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) + + geom_ribbon(data = nowcast_results_df, + aes(x = time_value, ymin = lower_PI, ymax = upper_PI), + alpha = 0.2, fill = "blue") + ylab("Mortality") + xlab("Date") + - scale_color_manual(name = "", values = c("Nowcast" = "#71c5e8", "Smoothed" = "#97D700", "Finalized" = "black")) + - theme(legend.position = "bottom") + scale_color_manual(values = c("Nowcast" = "blue", "Real-Time" = "red", "Finalized" = "black")) + + labs(color = "Mortality Type") # Adds the legend title ``` - - +The real-time counts tend to be biased below the finalized counts. Nowcasted values tend to provide a much better approximation of the truth (at least for these dates). ## Evaluation using MAE @@ -1632,6 +1658,7 @@ ggplot() + * Then for $y_{t}$ over times $t = 1, \dots, N$, then we may compute error metrics like mean absolute error (MAE). + * MAE measures the average absolute difference between the nowcast and finalized values. 
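Concretely, writing $\hat{y}_t$ for the nowcast and $y_t$ for the finalized value,

$$
\mathrm{MAE} \;=\; \frac{1}{N} \sum_{t=1}^{N} \left| y_t - \hat{y}_t \right|.
$$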
@@ -1645,33 +1672,50 @@ Let's numerically evaluate our point nowcasts for the provisional values of a ti - ```{r mae-code} -#| echo: false +#| echo: true # Step 1: Join the mortality data with nowcast data -mae_data <- finalized_val |> - select(-version) |> - inner_join(nowcasts |> select(-version), by = join_by("geo_value", "time_value" == "target_date")) |> - inner_join(initial_data, by = c("geo_value", "time_value")) |> - inner_join(smoothed_nowcasts |> select(-version, -nowcast), by = c("geo_value", "time_value")) +mae_data <- mortality_latest |> + filter(time_value %in% nowcast_dates) |> + left_join(nowcast_results_df, by = "time_value") |> + left_join(map_dfr(nowcast_dates, mortality_real_time) |> rename(real_time = mortality), by = c("geo_value", "time_value")) # Step 2: Calculate the absolute error between actual and nowcasted values mae_data <- mae_data |> - mutate(raw_nc_res = abs(mortality - nowcast), - prov_res = abs(mortality - initial_val), - smoothed_res = abs(mortality - smoothed_nowcasts)) + mutate(nc_abs_error = abs(mortality - nowcast), + rt_abs_error = abs(mortality - real_time)) # Step 3: Compute the MAE (mean of absolute errors) mae_value <- mae_data |> - group_by(geo_value) |> - summarise(`Smoothed MAE` = mean(smoothed_res, na.rm = TRUE), - `Unsmoothed nowcast MAE` = mean(raw_nc_res, na.rm = TRUE), - `Provisional value MAE` = mean(prov_res, na.rm = TRUE)) |> - select(-geo_value) - + summarise(nc_MAE = mean(nc_abs_error), + rt_MAE = mean(rt_abs_error)) knitr::kable(mae_value) ``` +## Evaluation using MAE + +Finally, we may visualize the distribution of errors across time: +```{r abs-error-plot} +ggplot(mae_data) + + # Nowcast Absolute Error + geom_point(aes(x = time_value, y = nc_abs_error), color = "blue", size = 2) + + geom_line(aes(x = time_value, y = nc_abs_error), color = "blue") + + # Horizontal line for Nowcast Mean Absolute Error + geom_hline(aes(yintercept = mean(nc_abs_error), color = "Nowcast MAE"), linetype = "dashed") + + + # Real-Time Absolute Error + geom_point(aes(x = time_value, y = rt_abs_error), color = "red", size = 2) + + geom_line(aes(x = time_value, y = rt_abs_error), color = "red") + + # Horizontal line for Real-Time Mean Absolute Error + geom_hline(aes(yintercept = mean(rt_abs_error), color = "Real-Time MAE"), linetype = "dashed") + + + # Customize the x and y axes labels, legend & add title + xlab("Time") + ylab("Absolute Error") + + scale_color_manual(values = c("Real-Time MAE" = "red", "Nowcast MAE" = "blue")) + + labs(color = "Mean Absolute Error") +``` + +We can see that the absolute errors are almost always lower for nowcasting. # Nowcasting with Two Variables @@ -1939,7 +1983,8 @@ So visualizing can provide an important perspective that is missed from a simple * [**Sliding Window Approach**]{.primary}: Predictions are based on [data up to the current time]{.primary}, ensuring no future information influences the nowcast. * [**Evaluation**]{.primary}: Predictions are compared with actual admissions using numerical and visual perspectives. -# Case Study - Nowcasting Cases Using %CLI + +# Case Study - Nowcasting Cases Using %CLI ## Goal of this case study @@ -1952,7 +1997,7 @@ So visualizing can provide an important perspective that is missed from a simple ## Summary of main steps -The workflow is similar to the previous example where we nowcasted using two variables, only more involved. +The workflow is similar to the previous example where we nowcasted using two variables, only more involved. The main steps are... 1. 
[**Fetch Data**]{.primary}: Retrieve %CLI and COVID-19 case data (by specimen collection date) for MA. @@ -1961,7 +2006,7 @@ The main steps are... 3. [**Model & Prediction**]{.primary}: Fit a linear model to predict cases based on %CLI, trained on a 30-day rolling window. -4. [**Nowcast Execution**]{.primary}: Use `epix_slide` to nowcast the cases dynamically. +4. [**Nowcast Execution**]{.primary}: Use `epix_slide()` to nowcast the cases dynamically. 5. [**Visualization**]{.primary}: Plot actual vs. nowcasted cases with confidence intervals to assess model accuracy. @@ -1999,11 +2044,11 @@ Brief summary of this data: ## Main steps to construct the `epi_archive` 1. [**Load necessary Libraries**]{.primary}: Such as `tidyverse`, `readxl`, `epiprocess`. -2. [**Process Each Date's Data**]{.primary}: +2. [**Process Each Date's Data**]{.primary}: * A function we'll make (`process_covid_data`) downloads and processes daily COVID-19 data from the MA gov Excel files on their website. * The data is cleaned and formatted with columns: `geo_value`, `time_value`, `version`, and values. 3. [**Handle Missing Data**]{.primary}: Checks if a date's data is available (handle 404 errors). -4. [**Create `epi_archive`**]{.primary}: +4. [**Create `epi_archive`**]{.primary}: * Combine processed data into a tibble. * Convert the tibble to an `epi_archive` object using `as_epi_archive()`. @@ -2021,25 +2066,25 @@ library(epiprocess) # Function to download and process each Excel file for a given date process_covid_data <- function(Date) { # Generate the URL for the given date - url <- paste0("https://www.mass.gov/doc/covid-19-raw-data-", tolower(gsub("-0", "-", format(Date, "%B-%d-%Y"))), "/download") + url <- paste0("https://www.mass.gov/doc/covid-19-raw-data-", tolower(gsub("-0", "-", format(Date, "%B-%d-%Y"))), "/download") # Applies gsub("-0", "-", ...) to replace any occurrence of -0 (such as in "April-01") with just - (resulting in "April-1"). - + # Check if the URL exists (handle the 404 error by skipping that date) response <- GET(url) - + if (status_code(response) != 200) { return(NULL) # Skip if URL doesn't exist (404) } - + # Define the destination file path for the Excel file file_path <- tempfile(fileext = ".xlsx") - + # Download the Excel file GET(url, write_disk(file_path, overwrite = TRUE)) - + # Read the relevant sheet from the Excel file data <- read_excel(file_path, sheet = "CasesByDate (Test Date)") - + # Process the data: rename columns and convert Date data <- data |> rename( @@ -2049,7 +2094,7 @@ process_covid_data <- function(Date) { Case_Average_7day = `7-day confirmed case average` ) |> mutate(Date = as.Date(Date)) # Convert to Date class - + # Create a tibble with the required columns for the epi_archive tib <- tibble( geo_value = "ma", # Massachusetts (geo_value) @@ -2057,12 +2102,12 @@ process_covid_data <- function(Date) { version = Date, # The extracted version date case_rate_7d_av = data$Case_Average_7day # 7-day average case value ) - + return(tib) } ``` -## Fetch Data - Code breakdown +## Fetch Data - Code breakdown * This purpose of this function is to download and process each Excel file as of a date. * [**URL Creation**]{.primary}: Dynamically generates the URL based on the date, removing leading zeros in day values (e.g., "April-01" → "April-1"). 
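As a quick illustration of that date-to-URL transformation (with a hypothetical date; assumes an English locale for month names):

```{r url-slug-demo}
#| echo: true
#| eval: false
d <- as.Date("2021-04-01")
tolower(gsub("-0", "-", format(d, "%B-%d-%Y")))
#> [1] "april-1-2021"
```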
@@ -2084,20 +2129,20 @@ process_covid_data <- function(Date) { process_data_for_date_range <- function(start_date, end_date) { # Generate a sequence of dates between start_date and end_date date_sequence <- seq(as.Date(start_date), as.Date(end_date), by = "day") - + # Process data for each date and combine results covid_data_list <- lapply(date_sequence, function(Date) { process_covid_data(Date) # Skip over dates with no data (NULLs will be ignored) }) - + # Combine all non-null individual tibbles into one data frame combined_data <- bind_rows(covid_data_list[!sapply(covid_data_list, is.null)]) - + # Convert the combined data into an epi_archive object if (nrow(combined_data) > 0) { epi_archive_data <- combined_data |> as_epi_archive(compactify = FALSE) - + return(epi_archive_data) } else { message("No valid data available for the given date range.") @@ -2129,7 +2174,7 @@ y <- process_data_for_date_range("2021-01-10", "2021-12-01") # Raw .xlsx data i y ``` -* Alternatively, you may run the following to load `y` that was previously saved as an RDS file: +* Alternatively, you may run the following to load `y` that was previously saved as an RDS file: ```{r load-ma-case-data} #| echo: true #| eval: FALSE @@ -2146,7 +2191,7 @@ y <- readRDS("_data/ma_case_archive.rds") ```{r fetch-outpatient-cli} #| echo: true -# Step 1: Fetch Versioned Data +# Step 1: Fetch Versioned Data x <- pub_covidcast( source = "doctor-visits", signals = "smoothed_adj_cli", @@ -2192,15 +2237,15 @@ lm_mod_pred <- function(data, ...) { # Make predictions predictions = predict(model, newdata = data |> - fill(percent_cli, .direction = "down") |> + fill(percent_cli, .direction = "down") |> filter(time_value == max(time_value)), interval = "prediction", level = 0.9) - + # Pull off real-time value for later comparison to the nowcast value real_time_val = data |> filter(time_value == max(time_value)) |> pull(case_rate_7d_av) - + # Could clip predictions and bounds at 0 - return(data.frame(predictions, actual_nowcast_date = max(data$time_value), real_time_val = real_time_val)) + return(data.frame(predictions, actual_nowcast_date = max(data$time_value), real_time_val = real_time_val)) } ``` @@ -2213,8 +2258,8 @@ lm_mod_pred <- function(data, ...) { ```{r nowcasting-epix-slide-cases} #| echo: true # Define the reference time points (to give the training/test split) -targeted_nowcast_dates <- seq(as.Date("2021-04-01"), as.Date("2021-11-01"), by = "1 month") -ref_time_values = targeted_nowcast_dates + 1 # + 1 because the case data is 1 day latent. +targeted_nowcast_dates <- seq(as.Date("2021-04-01"), as.Date("2021-11-01"), by = "1 month") +ref_time_values = targeted_nowcast_dates + 1 # + 1 because the case data is 1 day latent. # Determine this from revision_summary(y) # Use epix_slide to perform the nowcasting with a training-test split @@ -2240,7 +2285,7 @@ Merge the nowcast results with the latest data for more direct comparison: ```{r join-with-actual-cases} #| echo: true x_latest <- epix_as_of(archive, max(archive$DT$version)) |> - select(-percent_cli) + select(-percent_cli) res <- nowcast_res |> left_join(x_latest, by = join_by(geo_value, time_value)) @@ -2269,7 +2314,7 @@ ggplot(res, aes(x = time_value)) + # Adjust colors scale_color_manual(values = c("Finalized Cases (7-dav)" = "black", "Nowcast" = "#1f78b4", - "Real Time" = "darkred")) + + "Real Time" = "darkred")) + scale_fill_manual(values = c("Pred. 
Interval" = "#a6cee3")) + # Light blue # Improve the theme theme_minimal() + @@ -2289,12 +2334,12 @@ ggplot(res, aes(x = time_value)) + ```{r mae-code-cases} #| echo: true # Calculate the absolute error between actual and nowcasted COVID-19 cases -mae_data_cases <- res |> +mae_data_cases <- res |> mutate(nc_abs_error = abs(case_rate_7d_av - fit), # Nowcast vs Finalized cases (7-day average) rt_abs_error = abs(case_rate_7d_av - real_time_val)) # Real-Time vs Finalized cases # Compute the MAE (mean of absolute errors) -mae_value_cases <- mae_data_cases |> +mae_value_cases <- mae_data_cases |> summarise(nc_MAE = mean(nc_abs_error), rt_MAE = mean(rt_abs_error)) knitr::kable(mae_value_cases) @@ -2306,18 +2351,18 @@ knitr::kable(mae_value_cases) ```{r abs-error-plot-cases} # Plot the absolute errors for Nowcast and Real-Time vs Finalized COVID-19 cases ggplot(mae_data_cases) + - # Nowcast Absolute Error + # Nowcast Absolute Error geom_point(aes(x = time_value, y = nc_abs_error), color = "#1f78b4", size = 2) + geom_line(aes(x = time_value, y = nc_abs_error), color = "#1f78b4") + # Horizontal line for Nowcast Mean Absolute Error geom_hline(aes(yintercept = mean(nc_abs_error), color = "Nowcast MAE"), linetype = "dashed") + - - # Real-Time Absolute Error + + # Real-Time Absolute Error geom_point(aes(x = time_value, y = rt_abs_error), color = "darkred", size = 2) + geom_line(aes(x = time_value, y = rt_abs_error), color = "darkred") + # Horizontal line for Real-Time Mean Absolute Error geom_hline(aes(yintercept = mean(rt_abs_error), color = "Real-Time MAE"), linetype = "dashed") + - + # Customize the x and y axes labels, legend & add title xlab("Date") + ylab("Absolute Error") + scale_color_manual(values = c("Real-Time MAE" = "darkred", "Nowcast MAE" = "#1f78b4")) + @@ -2347,39 +2392,3 @@ Main Steps: Overall, nowcasting, based on the linear model, provided a closer approximation of true cases compared to the real-time values. - - -## Aside on nowcasting - -* To some Epis, "nowcasting" can be equated with "estimate the time-varying instantaneous reproduction number, $R_t$" - -* Ex. using the number of reported COVID-19 cases in British Columbia between Jan. 2020 and Apr. 15, 2023. - - -```{r rtestim} -#| fig-width: 9 -#| fig-height: 3 -#| out-height: "400px" -#| label: nowcasting -library(rtestim) -source(here::here("_code/bccovid.R")) - -p1 <- bccovid|> - ggplot(aes(date, cases)) + - geom_line(colour = primary) + - geom_vline(xintercept = ymd("2023-04-15"), colour = secondary, - linewidth = 2) + - labs(y = "BC Covid-19 cases", x = "Date") + - scale_y_continuous(expand = expansion(c(0, NA))) -bc_rt <- estimate_rt(bccovid$cases, x = bccovid$date, - lambda = c(1e6, 1e5)) -p2 <- plot(confband(bc_rt, lambda = 1e5)) + - coord_cartesian(ylim = c(0.5, 2)) + - scale_y_continuous(expand = expansion(0)) -cowplot::plot_grid(p1, p2) -``` - -* Group built [`{rtestim}`](https://dajmcdon.github.io/rtestim) doing for this nonparametrically. - -* We may come back to this later... 
- From 0ea5dddb6973d2ea6e167b8837a0a179952be8c6 Mon Sep 17 00:00:00 2001 From: XuedaShen <110584221+XuedaShen@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:45:24 -0800 Subject: [PATCH 2/2] Fix section on ratio nowcaster to be in sync with `main` --- slides/day1-afternoon.qmd | 942 +++++++++++++++++++++++--------------- 1 file changed, 567 insertions(+), 375 deletions(-) diff --git a/slides/day1-afternoon.qmd b/slides/day1-afternoon.qmd index 24b8641..16fdbbc 100644 --- a/slides/day1-afternoon.qmd +++ b/slides/day1-afternoon.qmd @@ -1166,50 +1166,12 @@ ggplot(snapshots |> filter(!latest), ::: -* We also have access to $p$ other time series -$x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots,p$ +* We may also have access to $p$ other time series +$x_{ij},\; i=1,\ldots,t, \; j = 1,\ldots, p$ which may be subject to revisions. -* All may be subject to revisions. - -## Aside on nowcasting - -* To some Epis, "nowcasting" can be equated with "estimate the time-varying instantaneous reproduction number, $R_t$" - -* Ex. using the number of reported COVID-19 cases in British Columbia between Jan. 2020 and Apr. 15, 2023. - - -```{r rtestim} -#| fig-width: 9 -#| fig-height: 3 -#| out-height: "400px" -#| label: nowcasting -library(rtestim) -source(here::here("_code/bccovid.R")) - -p1 <- bccovid|> - ggplot(aes(date, cases)) + - geom_line(colour = primary) + - geom_vline(xintercept = ymd("2023-04-15"), colour = secondary, - linewidth = 2) + - labs(y = "BC Covid-19 cases", x = "Date") + - scale_y_continuous(expand = expansion(c(0, NA))) -bc_rt <- estimate_rt(bccovid$cases, x = bccovid$date, - lambda = c(1e6, 1e5)) -p2 <- plot(confband(bc_rt, lambda = 1e5)) + - coord_cartesian(ylim = c(0.5, 2)) + - scale_y_continuous(expand = expansion(0)) -cowplot::plot_grid(p1, p2) -``` - -* Group built [`{rtestim}`](https://dajmcdon.github.io/rtestim) doing for this nonparametrically. - -* We may come back to this later... - -# Nowcasting with One Variable - -## Nowcasting simple ratio Ex: NCHS mortality +## Case study: NCHS mortality * In this example, we'll demonstrate the concept of nowcasting using [**NHCS mortality data**]{.primary}. (the number of weekly new deaths with confirmed or presumed COVID-19, per 100,000 population). @@ -1219,47 +1181,65 @@ cowplot::plot_grid(p1, p2) ## Fetch versioned data Let's fetch versioned mortality data from the API (`pub_covidcast`) for CA (`geo_values = "ca"`) and the signal of interest (`deaths_covid_incidence_num`) over early 2024. + ```{r mortality-archive-construct} #| echo: true + # Fetch the versioned NCHS mortality data (weekly) -mortality_archive <- pub_covidcast( +nchs_archive <- pub_covidcast( source = "nchs-mortality", signals = "deaths_covid_incidence_num", geo_type = "state", time_type = "week", - geo_values = "ca", # California (CA) - time_values = epirange(202401, 202413), + geo_values = c("ca", "ut"), + time_values = epirange(202001, 202440), issues = "*" ) |> select(geo_value, time_value, version = issue, mortality = value) |> as_epi_archive(compactify = TRUE) -# Set the start and end days for the analysis -# corresponding to the weeks entered in time_values -start_time = as.Date("2023-12-31") -end_time = as.Date("2024-03-24") ``` -## Latency in reporting - Minimum lag -* A quick inspection reveals that mortality rates are systematically 7 days latent ([**fixed lag**]{.primary}). 
+## Analysis of versioning behavior
 
-```{r inspect-latency-dplyr-way}
+Recall that we need to watch out for:
+
+* [**Latency**]{.primary}: the time difference between the date of occurrence and the date of the initial report
+* [**Backfill**]{.primary}: how data for a given date is updated after the initial report.
+
+`revision_summary()` provides a summary of both aspects.
+
+```{r inspect-revision_summary}
 #| echo: true
-mortality_revision_inspect = mortality_archive$DT |> mutate(version_time_diff = version - time_value)
+revision_data = revision_summary(nchs_archive, mortality, print_inform = FALSE)
+```
+
-# Look at the first revision for each week
-mortality_revision_inspect |> group_by(time_value) |> slice(1) |> head()
+## Versioning analysis -- latency
+
+* [**Question:**]{.primary} What is the latency of NCHS data?
+
+```{r nchs-latency-via-revision-summary}
+#| echo: true
+
+revision_data |> select(geo_value, time_value, min_lag) |> slice_sample(n = 10)
 ```
 
-* Use `revision_summary()` from `epiprocess` to generate basic statistics about the revision behavior for the dataset.
+* We randomly sampled some dates to check whether there is a consistent latency pattern.
+* Understanding latency prevents us from using data that we would not actually have had access to.
 
-```{r revision-summary-ex}
-#| eval: false
-revision_summary(mortality_archive, print_inform = TRUE)
+## Versioning analysis -- backfill
+
+* [**Question:**]{.primary} How long does it take for the reported value to get close to the finalized value?
+
+```{r revision-summary-time-near-latest-demo}
+revision_data |> select(geo_value, time_value, time_near_latest) |> slice_sample(n = 10)
 ```
+* It generally takes at least 4 weeks for the reported value to be within 20\% of the finalized value.
+* We can change this percentage threshold via the `within_latest` argument of `revision_summary()`.
 
-## Latency in reporting - Finalized value attainment
+## Versioning analysis -- backfill
 
 * [**Question:**]{.primary} When is the [**finalized value**]{.primary} first attained for each date? Would we have access to any in real-time?
 * How fast are the final values attained & what's the pattern for these times, if any?
@@ -1293,35 +1273,55 @@ check_when_finalized <- function(epi_archive, start_date = NULL, end_date = NULL
 
 ```{r check-when-finalized-run}
 #| echo: false
-res <- check_when_finalized(mortality_archive, start_date = start_time, end_date = end_time)
+res <- check_when_finalized(nchs_archive,
+                            start_date = min(nchs_archive$DT$time_value),
+                            end_date = max(nchs_archive$DT$time_value))
+
 head(res)
 ```
 
-And here's a numerical summary:
+Here is a look at its quantiles:
```{r summary-diff}
 #| echo: false
 summary(as.numeric(res$diff))
 ```
 
-* [**Conclusion**]{.primary}: Tends to take a long time & varies. Even for this relatively small time period... Goes as low as 84 days or as high as 294 days. Yikes.
-* So if we were doing this in real-time, then we wouldn't have access to the finalized data.
+* [**Conclusion**]{.primary}: The revision behavior is quite long-tailed, but a value reported 4 weeks later is already reasonably close to the finalized value.
+
 
-## Comparison of final vs. 
multiple revisions +## Revision pattern visualization This shows the finalized rates in comparison to [**multiple revisions**]{.primary} to see how the data changes over time: -```{r mortality-by-revision-date} +```{r mortality-by-accessed-weeks-later} #| echo: false -# Visualize the mortality values for different revisions -revision_dates <- seq(min(mortality_archive$DT$version), end_time, by = "1 week") +ref_dates <- unique(nchs_archive$DT$time_value) +offsets = seq(1, 7) * 7 +max_version <- nchs_archive$versions_end -# Create a data frame for each version and label them by revision date -mortality_revisions <- map_dfr(revision_dates, function(date) { - epix_as_of(mortality_archive, date) |> - mutate(revision_date = date) # Add a column for the revision date -}) +get_val_asof <- function(time_val, archive, offsets) { + + as_of_dates <- pmin(time_val + offsets, max_version) + + result <- map(as_of_dates, function(x) { + + qd <- archive |> + epix_as_of(x) |> + filter(time_value == time_val) |> + select(geo_value, time_value, mortality) |> + mutate(lag = x - time_val) + }) |> + list_rbind() + + return(result) + + +} -# Extract the latest/finalized version -mortality_latest <- epix_as_of(mortality_archive, max(mortality_archive$DT$version)) +value_at_lags <- map(ref_dates, get_val_asof, + archive <- nchs_archive, offsets <- offsets) |> + list_rbind() + +values_final <- epix_as_of(nchs_archive, max(nchs_archive$versions_end)) ``` @@ -1330,327 +1330,503 @@ mortality_latest <- epix_as_of(mortality_archive, max(mortality_archive$DT$versi #| fig-width: 9 #| fig-height: 4 #| out-height: "500px" -ggplot() + - geom_line(data = mortality_latest, aes(x = time_value, y = mortality), color = "black", size = 1) + - geom_line(data = mortality_revisions, aes(x = time_value, y = mortality, color = as.factor(revision_date))) + - geom_point(data = mortality_revisions, aes(x = time_value, y = mortality, color = as.factor(revision_date)), size = 2) + - theme_gray() + - labs(color = "Revision Date", y = "Mortality", x = "Date") + - ggtitle("COVID-19 Mortality in CA: Finalized (black) vs Various Revisions") +ggplot(value_at_lags, aes(x = time_value, y = mortality)) + + geom_line(aes(color = factor(lag))) + + facet_wrap(~ geo_value, scales = "free_y", ncol = 1) + + scale_x_date(minor_breaks = "month", date_labels = "%b %Y") + + labs(x = "", y = "Weekly new COVID deaths") + + # scale_color_viridis_d(option="D", end=0.8) + + theme(legend.position = "none") + + geom_line(data = values_final, aes(x = time_value, y = mortality), + inherit.aes = FALSE, color = "black") + ``` -## Comparison of final vs. one revision +## Revision pattern visualization -The below figure compares the finalized rates (in black) to [**one revision**]{.primary} (in yellow) from March 3, 2024. 
+```{r nchs-plot-different-ver}
+#| echo: false
+#| eval: true
+
+versions = seq.Date(as.Date("2021-01-01"), nchs_archive$versions_end, by = "1 month")
+nchs_snapshots = map(versions, function(v) {
+  epix_as_of(nchs_archive, v) |>
+    mutate(version = v)}) |>
+  bind_rows() |>
+  mutate(latest = version == nchs_archive$versions_end)
+```
 
-```{r one-revision-final-plot}
-#| out-height: "400px"
-as_of_date = as.Date("2024-03-03")
+```{r nchs-plot-val-different-ver}
+#| echo: false
+#| fig-width: 9
+#| fig-height: 4
+#| out-height: "500px"
+
+ggplot(nchs_snapshots |> filter(!latest),
+       aes(x = time_value, y = mortality)) +
+  geom_line(aes(color = factor(version))) +
+  geom_vline(aes(color = factor(version), xintercept = version), lty = 3) +
+  facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
+  labs(x = "", y = "Weekly COVID deaths") +
+  scale_color_viridis_d(option = "B", end = .8) +
+  theme(legend.position = "none") +
+  geom_line(data = nchs_snapshots |> filter(latest),
+            aes(x = time_value, y = mortality),
+            inherit.aes = FALSE, color = "black")
+```
+
+## Aside: Do we need a burn-in/training set?
+
+* Typical stat-ML practice suggests a train/validation/test split.
+* But our exploratory analysis used all available data. Is that fine?
 
-ggplot() +
-  geom_line(data = mortality_latest,
-            aes(x = time_value, y = mortality, color = "Finalized"),
-            size = 1) +
-  geom_line(data = mortality_revisions |> filter(revision_date == as_of_date),
-            aes(x = time_value, y = mortality, color = "Revision"),
-            size = 1) +
-  ylab("Mortality") +
-  xlab("Date") +
-  scale_color_manual(values = c("Finalized" = "black", "Revision" = "#FFB300")) +
-  labs(color = "Mortality Type")
+* Generally, for exploratory analysis, it is fine to skip the train/test split.
+    + These analyses do not involve model fitting, so there is little risk of an overly optimistic performance evaluation (no overfitting to test data).
+* However, for a [**pseudo-prospective analysis**]{.primary}, the best practice is to use a train/test split.
+    + In such an analysis, one fits and validates many models, so a train/test split provides more rigorous control of overfitting to the test set.
+
+## Ratio nowcaster: jumping from provisional to finalized value
+
+* Recall, the goal of a nowcast at date $t$ is to project the [*finalized value*]{.primary} of $y_t,$ given the information available on date $t$.
+* A very simple nowcaster is the ratio between the finalized and provisional values.
+
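+Concretely, writing $Y_s$ for the value at occurrence date $s,$ the target multiplier is (this matches the `ratio` computed in code below):
+
+$$
+\text{ratio} = \frac{Y_s^{\text{finalized}}}{Y_s^{\text{provisional}}}
+$$
+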
+How can we sensibly estimate this quantity?
+
+::: {.fragment .fade-in}
+
+* At nowcast date $t,$ we would have received reports with versions up to and including $t.$
+* We need to build training samples that
+    + correctly align finalized values against provisional values
+    + use only features that would have been available at test time
+    + include enough data to ensure sensible estimation results
+:::
+
+```{r nchs-ca-only}
+#| echo: false
+#| eval: true
+nchs_archive <- pub_covidcast(
+  source = "nchs-mortality",
+  signals = "deaths_covid_incidence_num",
+  geo_type = "state",
+  time_type = "week",
+  geo_values = "ca",
+  time_values = epirange(202001, 202440),
+  issues = "*"
+) |>
+  select(geo_value, time_value, version = issue, mortality = value) |>
+  as_epi_archive(compactify = TRUE)
+```
 
-The real-time data is biased downwards (systematically below the true value). That is, the signal tends to get scaled up with future revisions.
-## Calculate one ratio: Provisional vs. finalized data
-
+## Ratio nowcaster: building training samples
 
-Suppose that the day is March 10, 2024. Then, because the data is 7 days latent, we can compute the ratio between provisional and finalized data for [**March 3, 2024**]{.primary}.
+* At nowcast date $t,$ we would have received reports with versions up to and including $t.$
+* We need to build training samples that
+    + correctly align finalized values against provisional values
+    + use only features that would have been available at test time
+    + include enough samples to ensure sensible estimation results
 
-```{r one-ratio-calc}
-#| echo: true
-as_of_date = as.Date("2024-03-10"); fixed_lag = 7
+::: {.fragment .fade-in}
+* Build training samples by treating dates prior to date $t$ as if they were actual nowcast dates.
+    + What was the provisional data on that date?
+    + Have we received the finalized value for that date?
+:::
 
-# Load the finalized mortality data for CA
-ca_finalized <- mortality_latest |>
-  filter(time_value == (as_of_date - fixed_lag)) |>
-  dplyr::select(mortality)
+## Ratio nowcaster: building training samples
 
-# Load the provisional mortality data for the same week
-mortality_old = epix_as_of(mortality_archive, as_of_date)
+* At an earlier nowcast date $t_0,$ we define
+    + the [**provisional value**]{.primary} as the reported value of $Y_{s_0}$ with version $t_0.$ Here $s_0$ is the largest occurrence date among all values reported up until $t_0.$
+    + the [**finalized value**]{.primary} as the (potentially unobserved) finalized value of $Y_{s_0}.$
+        - We only know in *hindsight* when the reported value of $Y_{s_0}$ is finalized -- we need an approximation.
 
-ca_provisional <- mortality_old |>
-  filter(time_value == (as_of_date - fixed_lag)) |>
-  dplyr::select(mortality)
+::: {.fragment .fade-in}
+```{r revision-summary-time-near-latest-show-again}
+#| echo: false
+#| eval: true
+
-# Calculate ratio between provisional and finalized cases for the week of interest
-ratio <- ca_provisional$mortality / ca_finalized$mortality
-ratio
+revision_data |> select(geo_value, time_value, time_near_latest) |> slice_sample(n = 5)
+quantile(revision_data$time_near_latest)
 ```
+:::
 
-[**Conclusion**]{.primary}: The real-time rate is well below the finalized for this time (26 vs 72 here).
+::: {.fragment .fade-in}
+We can say data reported 49 days after the occurrence date is good enough to be considered finalized.
+:::
 
-[**Question**]{.primary}: Can we generalize this over many days? 
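+
+Where does 49 days come from? One simple recipe (a sketch, reusing the `revision_data` summary computed earlier; the exact quantile is a judgment call) is to read a threshold off the distribution of `time_near_latest`:
+
+```{r finalization-lag-sketch}
+#| echo: true
+#| eval: false
+
+# An upper quantile of "time until within 20% of final" across dates;
+# 49 days = 7 weeks for this weekly signal
+quantile(revision_data$time_near_latest, probs = 0.75)
+```
+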
+## Ratio nowcaster: test time feature -## Calculating the ratio using multiple dates -Let's move from calculating the ratio by using one day to multiple days with the goal to use it to nowcast for Feb. 18, which has a [**provisional value**]{.primary} of 23 -```{r provisional-val-feb18} +* At test time $t,$ take provisional value to be $Y_s,$ where $s$ is the largest occureence date among all values reported up until time $t.$ + +## Nowcasting at a single date: building training samples + +::: {.fragment .fade-in} +* Searching for provisional values, at previous hypothetical nowcast dates. + +```{r one-date-look-for-provisional} #| echo: true -as_of_date = as.Date("2024-02-25") -provisional <- epix_as_of(mortality_archive, as_of_date) |> - filter(time_value == as_of_date - 7) |> - pull(mortality) -provisional +nowcast_date <- as.Date("2022-01-02"); window_length = 180 + +initial_data <- nchs_archive$DT |> + group_by(geo_value, time_value) |> + filter(version == min(version)) |> + rename(initial_val = mortality) |> + select(geo_value, time_value, initial_val) + ``` -
-and a [**finalized value**]{.primary} of 104 +::: + -```{r finalized-val-feb18} +::: {.fragment .fade-in} +* Searching for finalized values, at previous hypothetical nowcast dates. + +```{r one-date-look-for-final} #| echo: true -finalized <- mortality_latest |> - filter(time_value == as_of_date - 7) |> - pull(mortality) -finalized +#| +finalized_data <- epix_as_of(nchs_archive, nowcast_date) |> + filter(time_value >= nowcast_date - 49 - window_length & time_value <= nowcast_date - 49) |> + rename(finalized_val = mortality) |> + select(geo_value, time_value, finalized_val) ``` +::: -## Calculating the ratio using multiple dates -First, let's download the real-time rates for CA, and compare them to their finalized version. -```{r real-time-mortality} +::: {.fragment .fade-in} +* After searching for both provisional and finalized values, we merge them together and estimate the ratio. +```{r one-date-combine-provsional-and-final} #| echo: true -dates <- seq(start_time, (as_of_date - 7), by = "day") -mortality_real_time <- function(date) { - epix_as_of(mortality_archive, date + 7) |> - filter(time_value == date) -} -mortality_real_time_df <- map_dfr(dates, mortality_real_time) -head(mortality_real_time_df) -``` - -## Calculating the ratio using multiple dates -Now, let's plot the real-time vs the finalized mortality rates: -```{r real-time-vs-finalized} -ggplot() + - geom_line(data = mortality_latest |> filter(time_value <= (as_of_date - 7)), - aes(x = time_value, y = mortality, color = "Finalized")) + - geom_line(data = mortality_real_time_df, - aes(x = time_value, y = mortality, color = "Real-Time")) + - ylab("Mortality") + - xlab("Date") + - scale_color_manual(values = c("Finalized" = "black", "Real-Time" = "red")) + - labs(color = "Mortality Type") # Adds the legend title - + +ratio <- finalized_data |> + inner_join(initial_data, by = c("geo_value", "time_value")) |> + mutate(ratio = finalized_val / initial_val) |> + pull(ratio) |> + mean(na.rm = TRUE) ``` -* [**Takeaways**]{.primary}: The real-time counts are biased [**well below**]{.primary} the finalized counts. -* Systematic underreporting tends to lessen over time (the gap between the lines decreases). +::: -## Realistic limitation of nowcasting - Finalized data -* Recall that real-time access to finalized data is limited as finalized values can take months to report (e.g., Jan. 7 is finalized 294 days later). -* To nowcast accurately, we must rely on the [**best available approximation of finalized data**]{.primary} at the time of estimation (Feb. 25). +## Nowcasting at a single date: test feature and actual nowcast -```{r finalized-data-as-of-feb25} +```{r one-date-test-feat and nowcast} #| echo: true -mortality_as_of_feb25 <- epix_as_of(mortality_archive, as_of_date) -head(mortality_as_of_feb25) + +last_avail <- epix_as_of(nchs_archive, nowcast_date) |> + slice_max(time_value) |> + pull(mortality) + +nowcast <- last_avail * ratio +nowcast ``` -## Ratio calculation & summary -We then use these "finalized" and real-time values to compute the mean ratio: -```{r ratio-calc-summary} +## Nowcasting for multiple dates + +* All previous manipulations should really be seen as a template for all nowcast dates. +* The template computation sould be applied over all nowcast dates, [**but we must respect data versioning**]{.primary}! +* `epix_slide()` is designed just for this! It behaves similarly to `epi_slide`. 
+* Key exception: `epix_slide()` is version aware: the sliding computation at any reference time $t$ is performed on [**data that would have been available as of t**]{.primary}. + + + +## Nowcasting for multiple dates via `epix_slide()` + +We begin by templatizing our previous operations. + +```{r nowcaster-to-slide} #| echo: true -# exclude date we're nowcasting for -mortality_real_time_df = mortality_real_time_df |> filter(time_value != "2024-02-18") -mortality_as_of_feb25 = mortality_as_of_feb25 |> filter(time_value != "2024-02-18") -ratio_real_time_to_feb25 <- mortality_real_time_df$mortality / mortality_as_of_feb25$mortality -summary(ratio_real_time_to_feb25) + +nowcaster = function(x, g, t, wl=180, appx=49) { + + + initial_data = x$DT |> + group_by(geo_value, time_value) |> + filter(version == min(version)) |> + filter(time_value >= t - wl - appx & time_value <= t - appx) |> + rename(initial_val = mortality) |> + select(geo_value, time_value, initial_val) + + finalized_data = x$DT |> + group_by(geo_value, time_value) |> + filter(version == max(version)) |> + filter(time_value >= t - wl - appx & time_value <= t - appx) |> + rename(finalized_val = mortality) |> + select(geo_value, time_value, finalized_val) + + ratio = finalized_data |> + inner_join(initial_data, by = c("geo_value", "time_value")) |> + mutate(ratio = finalized_val / initial_val) |> + pull(ratio) |> + median(na.rm=TRUE) + + last_avail = epix_as_of(x, t) |> + slice_max(time_value) |> + pull(mortality) + + res = tibble(geo_value = x$geo_value, target_date = t, nowcast = last_avail * ratio) + + return(res) + +} + ``` -On average, the real-time rates are ~25.7% of the finalized. -```{r boxplot-ratio} +## Nowcasting for multiple dates via `epix_slide()` + +```{r epix-slide-extract-nowcast-date} #| echo: false -ratio_df <- data.frame(ratio_real_time_to_feb25, mean_ratio = mean(ratio_real_time_to_feb25)) - -# Create the boxplot with mean marked as a bold cross -ggplot(ratio_df, aes(y = ratio_real_time_to_feb25)) + - geom_boxplot(fill = "lightblue", color = "black") + - geom_point( - aes(x = 0, y = mean_ratio), # Place point at the mean - shape = 4, # Cross shape - size = 7, # Size of the cross - color = "darkblue", # Color of the cross - stroke = 2 # Boldness of the cross - ) + - labs( - title = "Distribution of Real-Time to Finalized Mortality Ratios", - y = "Real-Time to Finalized Ratio" - ) + - theme_minimal() + # Minimal theme for clean look - theme( - plot.title = element_text(hjust = 0.5), # Center the title - axis.text.x = element_blank() - ) + - coord_cartesian(ylim = c(0.2, 0.3)) # Limit y-axis between 0 and 0.5 to zoom in +#| eval: true + +all_nowcast_dates = nchs_archive$DT |> + filter(time_value >= as.Date("2022-01-01")) |> + distinct(time_value) |> + pull(time_value) +``` + + +```{r nowcasts-slided} +#| echo: true +#| eval: true + +nowcasts = nchs_archive |> + group_by(geo_value) |> + epix_slide( + nowcaster, + .before=Inf, + .versions = all_nowcast_dates, + .all_versions = TRUE +) ``` -Tells us the distribution is right-skewed (mean > median) and so we should opt for the median. -## Nowcasting on Feb. 
25 -* Since the [**median ratio**]{.primary} between real-time and finalized values is [**0.250**]{.primary} (i.e., real-time values are typically 25% of the finalized), then the nowcast is -```{r nowcast-feb25-point} +## Details of `epix_slide()` + + +```{r epix-slide-demo-call-allversions-FALSE} #| echo: true -# Now we can nowcast properly: -nowcast <- provisional * - 1 / median(ratio_real_time_to_feb25) -nowcast +#| eval: false + +epix_slide( + .x, + .f, + ..., + .before = Inf, + .versions = NULL, + .new_col_name = NULL, + .all_versions = FALSE +) ``` -* To get the accompanying 95% prediction interval, calculate the 2.5th and 97.5th percentiles: -```{r nowcast-feb25-lower} +* `.f` in `epix_slide()` can be specified with the same form of custom function as `epi_slide()`. + +```{r epix-slide-form-of-custom-function} #| echo: true -percentile_97.5 <- quantile(ratio_real_time_to_feb25, 0.975) |> unname() +#| eval: false -(lower_PI <- provisional * 1 / percentile_97.5) +function(x, g, t) { + # function body +} ``` -```{r nowcast-feb25-upper} +* Mandatory variables of `.f` would have different forms depending on the value of `.all_versions`. + + +## Details of `epix_slide()` + +```{r epix-slide-demo-call-allversions-FALSE-again} #| echo: true -percentile_2.5 <- quantile(ratio_real_time_to_feb25, 0.025) |> unname() -(upper_PI <- provisional * 1 / percentile_2.5) +#| eval: false +#| code-line-numbers: "|8" + +epix_slide( + .x, + .f, + ..., + .before = Inf, + .versions = NULL, + .new_col_name = NULL, + .all_versions = FALSE +) ``` -* So, the [**nowcast is 92**]{.primary} with 95% PI: [61, 140], which is much closer to the [**finalized value of 104**]{.primary} than the [**provisional value of 23**]{.primary}. +::: {.fragment .fade-in} +* When `.all_versions = FALSE`, `epix_slide()` essentially iterates the templatized computation over snapshots. +* Said differently, when `.all_versions = FALSE`, data accessed at any sliding iteration [**only involves a single version**]{.primary}. +::: + +::: {.fragment .fade-in} +* Hence: + + `x`: an `epi_df` with same column names as archive's `DT`, minus the `version` column. + + `g`: a one-row tibble containing the values of groupping variables of the associated group. + + `t`: the `ref_time_value` of the current window. + + `...`: additional arguments. +::: + +## Details of `epix_slide()` -## Summary of three main steps -So the main steps for this type of fixed lag nowcasting are... +```{r epix-slide-demo-call-allversions-TRUE} +#| echo: true +#| eval: false +#| code-line-numbers: "|8" -1. Obtain the [**provisional value**]{.primary} for the target. +epix_slide( + .x, + .f, + ..., + .before = Inf, + .versions = NULL, + .new_col_name = NULL, + .all_versions = TRUE +) +``` -2. Estimate the ratio using the [**real-time**]{.primary} and [**"finalized"**]{.primary} data (for all previous dates that follow a consistent pattern in reporting). +::: {.fragment .fade-in} +* When `.all_versions = FALSE`, data accessed at any sliding iteration involves versions [**up to and including .ref_time_value**]{.primary}. +::: + +::: {.fragment .fade-in} +* Hence: + + `x`: an `epi_archive`, with version up to and including `.ref_time_value`. + + `g`: a one-row tibble containing the values of groupping variables of the associated group. + + `t`: the `ref_time_value` of the current window. + + `...`: additional arguments. +::: -3. Profit. 
-```{r ratio-summary-steps} +## Details of `epix_slide()` + +```{r nowcasts-slide-demo-only, eval=FALSE} #| echo: true #| eval: false -#| code-fold: true -#| code-summary: "Expand for the accompanying code" -# Today -as_of_date = as.Date("2024-02-25") +#| code-line-numbers: "|7,13,17" -# 1. Obtain the provisional value -provisional <- epix_as_of(mortality_archive, as_of_date) |> - filter(time_value == as_of_date - 7) |> - pull(mortality) -provisional +nowcasts <- nchs_archive |> + group_by(geo_value) |> + epix_slide( + nowcaster, + .before=Inf, + .versions = all_nowcast_dates, + .all_versions = TRUE +) + +nowcaster <- function(x, g, t, wl=180, appx=49) { + -# 2. Estimate the ratio -mortality_real_time_df <- map_dfr(dates, mortality_real_time) |> filter(time_value != "2024-02-18") # Real-time -mortality_as_of_feb25 <- epix_as_of(mortality_archive, as_of_date) |> filter(time_value != "2024-02-18") # "Finalized" + initial_data = x$DT |> + # Function body, omitted -ratio_real_time_to_feb25 <- mortality_real_time_df$mortality / mortality_as_of_feb25$mortality -# 3. Profit. -(nowcast <- provisional * 1 / median(ratio_real_time_to_feb25)) + finalized_data = x$DT |> + # Function body, omitted + +} -(upper_PI <- provisional * 1 / quantile(ratio_real_time_to_feb25, 0.025)) -(lower_PI <- provisional * 1 / quantile(ratio_real_time_to_feb25, 0.975)) ``` -## Nowcasting mortality for multiple dates -* [**Define Nowcast Function**]{.primary}: - * [**Input**]{.primary}: Takes in the dates to nowcast and the fixed lag - * [**Output**]{.primary}: The nowcasted mortality rates based on the ratio of real-time to finalized data. -```{r nowcasting-multipl-dates-fun} -#| echo: true -#| code-fold: true -nowcast_function <- function(nowcast_date, fixed_lag) { - as_of_date = nowcast_date + fixed_lag + +## Visualize nowcasts + + +We are now finally able to compare nowcasts against first available reports: + +```{r nowcast-subsetting} +#| echo: false + + +intial_val_extracter <- function(x, g, t, wl=180, appx=49) { - # 1. Obtain the provisional value for the target. - provisional <- epix_as_of(mortality_archive, as_of_date) |> - filter(time_value == as_of_date - fixed_lag) |> + last_avail = epix_as_of(x, t) |> + slice_max(time_value) |> pull(mortality) - #2. Estimate the ratio multiplier using - # real-time - dates_seq <- seq(start_time, (nowcast_date - fixed_lag), by = "week") - mortality_real_time <- map_dfr(dates_seq, mortality_real_time) + res = tibble(geo_value = x$geo_value, target_date = t, value = last_avail) - # and "finalized" data - finalized <- epix_as_of(mortality_archive, as_of_date) |> filter(time_value >= start_time & time_value <= (nowcast_date - fixed_lag)) + return(res) - ratios <- mortality_real_time$mortality / finalized$mortality - - # Remove infinite or NaN ratios (i.e., keep only finite values) - median_ratio <- median(ratios[is.finite(ratios)]) - - #3. Profit. 
- nowcast <- provisional * (1 / median_ratio) - upper_PI <- provisional * (1 / quantile(ratios[is.finite(ratios)], 0.025)) - lower_PI <- provisional * (1 / quantile(ratios[is.finite(ratios)], 0.975)) - - # Return a dataframe with the nowcast and date - tibble( - time_value = nowcast_date, - nowcast = nowcast, - lower_PI = lower_PI, - upper_PI = upper_PI - ) } + + +provisional_val = epix_slide(nchs_archive, intial_val_extracter, .versions = all_nowcast_dates, .all_versions = TRUE) + +finalized_val = nchs_archive$DT |> + filter(time_value >= as.Date("2022-01-01")) |> + group_by(geo_value, time_value) |> + filter(version == max(version)) + ``` -## Map nowcast over multiple dates -* We can use `map2()` to apply the function to a series of weeks (e.g., Jan. 28 to Mar. 24). -* Returns a [**dataframe**]{.primary} with nowcasted results. -```{r map-nowcast-fun-multiple-dates} -#| echo: true -# Apply Nowcast Function Over Multiple Dates -nowcast_dates <- seq(as.Date("2024-01-28"), as.Date("2024-03-24"), by = "week") -fixed_lag <- 7 -nowcast_results_df <- map2(nowcast_dates, fixed_lag, nowcast_function) |> list_rbind() +```{r nowcast-fun-plot-results} +#| echo: false + +ggplot() + + geom_line(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized")) + + geom_point(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) + + geom_line(data = provisional_val, aes(x = target_date, y = value, color = "Provisional")) + + geom_point(data = provisional_val, aes(x = target_date, y = value, color = "Provisional"), shape = 16) + + geom_line(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast")) + + geom_point(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast"), shape = 16) + + # geom_ribbon(data = nowcast_results_df, + # aes(x = time_value, ymin = lower_PI, ymax = upper_PI), + # alpha = 0.2, fill = "blue") + + ylab("Mortality") + + xlab("Date") + + scale_color_manual(name = "", + values = c("Nowcast" = "#71c5e8", "Provisional" = "#FF6900", "Finalized" = "black")) + + theme(legend.position = "bottom") ``` -Let's smooth with a rolling trailing mean (window size 4) & see the results: +* The real-time counts tend to be biased below the finalized counts. Nowcasted values tend to provide a much better approximation of the truth (at least for these dates). + + +## Smoothing nowcasts + +* Nowcasts are quite volatile, reflecting the provisional counts are far from complete. +* We can use a trailing average to smooth them. 
+ +```{r smooth-nowcasts-epi-slide} +#| echo: true + +smoothed_nowcasts <- epi_slide( + nowcasts |> as_epi_df(), + smoothed_nowcasts = mean(nowcast, na.rm = TRUE), + .window_size = as.difftime(3, units = "weeks") +) -```{r smooth-print-res} -# Smooth results: Apply rolling median to nowcast and bounds with a window size of 4 -library(zoo) -nowcast_results_df <- nowcast_results_df |> - mutate(across(.cols = -time_value, - .fns = ~ zoo::rollapply(.x, width = 4, FUN = mean, partial = TRUE, align = "right"), - .names = "{.col}")) -nowcast_results_df ``` -## Visualize nowcast, real-time, and finalized values -Finally, we can compare these nowcast results to the real-time and finalized values: -```{r nowcast-fun-plot-results} +```{r nowcast-smoothed-vis} #| echo: false + +cbPalette = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", + "#0072B2", "#D55E00", "#CC79A7") + ggplot() + - geom_line(data = nowcast_results_df, aes(x = time_value, y = nowcast, color = "Nowcast")) + - geom_point(data = nowcast_results_df, aes(x = time_value, y = nowcast, color = "Nowcast"), shape = 16) + - geom_line(data = map_dfr(nowcast_dates, mortality_real_time), aes(x = time_value, y = mortality, color = "Real-Time")) + - geom_point(data = map_dfr(nowcast_dates, mortality_real_time), aes(x = time_value, y = mortality, color = "Real-Time"), shape = 16) + - geom_line(data = mortality_latest |> filter(time_value %in% nowcast_dates), aes(x = time_value, y = mortality, color = "Finalized")) + - geom_point(data = mortality_latest |> filter(time_value %in% nowcast_dates), aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) + - geom_ribbon(data = nowcast_results_df, - aes(x = time_value, ymin = lower_PI, ymax = upper_PI), - alpha = 0.2, fill = "blue") + + geom_line(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized")) + + geom_point(data = finalized_val, aes(x = time_value, y = mortality, color = "Finalized"), shape = 16) + + geom_line(data = smoothed_nowcasts, aes(x = time_value, y = smoothed_nowcasts, color = "Smoothed")) + + geom_point(data = smoothed_nowcasts, aes(x = time_value, y = smoothed_nowcasts, color = "Smoothed"), shape = 16) + + geom_line(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast")) + + geom_point(data = nowcasts, aes(x = target_date, y = nowcast, color = "Nowcast"), shape = 16) + + # geom_ribbon(data = nowcast_results_df, + # aes(x = time_value, ymin = lower_PI, ymax = upper_PI), + # alpha = 0.2, fill = "blue") + ylab("Mortality") + xlab("Date") + - scale_color_manual(values = c("Nowcast" = "blue", "Real-Time" = "red", "Finalized" = "black")) + - labs(color = "Mortality Type") # Adds the legend title + scale_color_manual(name = "", values = c("Nowcast" = "#71c5e8", "Smoothed" = "#97D700", "Finalized" = "black")) + + theme(legend.position = "bottom") ``` -The real-time counts tend to be biased below the finalized counts. Nowcasted values tend to provide a much better approximation of the truth (at least for these dates). + + ## Evaluation using MAE @@ -1658,7 +1834,6 @@ The real-time counts tend to be biased below the finalized counts. Nowcasted val * Then for $y_{t}$ over times $t = 1, \dots, N$, then we may compute error metrics like mean absolute error (MAE). - * MAE measures the average absolute difference between the nowcast and finalized values. 
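+
+In symbols, with $\hat{y}_t$ the nowcast and $y_t$ the finalized value over dates $t = 1, \dots, N$:
+
+$$
+\text{MAE} = \frac{1}{N} \sum_{t=1}^{N} \left| y_t - \hat{y}_t \right|
+$$
+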
@@ -1672,50 +1847,33 @@ Let's numerically evaluate our point nowcasts for the provisional values of a ti + ```{r mae-code} -#| echo: true +#| echo: false # Step 1: Join the mortality data with nowcast data -mae_data <- mortality_latest |> - filter(time_value %in% nowcast_dates) |> - left_join(nowcast_results_df, by = "time_value") |> - left_join(map_dfr(nowcast_dates, mortality_real_time) |> rename(real_time = mortality), by = c("geo_value", "time_value")) +mae_data <- finalized_val |> + select(-version) |> + inner_join(nowcasts |> select(-version), by = join_by("geo_value", "time_value" == "target_date")) |> + inner_join(initial_data, by = c("geo_value", "time_value")) |> + inner_join(smoothed_nowcasts |> select(-version, -nowcast), by = c("geo_value", "time_value")) # Step 2: Calculate the absolute error between actual and nowcasted values mae_data <- mae_data |> - mutate(nc_abs_error = abs(mortality - nowcast), - rt_abs_error = abs(mortality - real_time)) + mutate(raw_nc_res = abs(mortality - nowcast), + prov_res = abs(mortality - initial_val), + smoothed_res = abs(mortality - smoothed_nowcasts)) # Step 3: Compute the MAE (mean of absolute errors) mae_value <- mae_data |> - summarise(nc_MAE = mean(nc_abs_error), - rt_MAE = mean(rt_abs_error)) -knitr::kable(mae_value) -``` - -## Evaluation using MAE - -Finally, we may visualize the distribution of errors across time: -```{r abs-error-plot} -ggplot(mae_data) + - # Nowcast Absolute Error - geom_point(aes(x = time_value, y = nc_abs_error), color = "blue", size = 2) + - geom_line(aes(x = time_value, y = nc_abs_error), color = "blue") + - # Horizontal line for Nowcast Mean Absolute Error - geom_hline(aes(yintercept = mean(nc_abs_error), color = "Nowcast MAE"), linetype = "dashed") + - - # Real-Time Absolute Error - geom_point(aes(x = time_value, y = rt_abs_error), color = "red", size = 2) + - geom_line(aes(x = time_value, y = rt_abs_error), color = "red") + - # Horizontal line for Real-Time Mean Absolute Error - geom_hline(aes(yintercept = mean(rt_abs_error), color = "Real-Time MAE"), linetype = "dashed") + + group_by(geo_value) |> + summarise(`Smoothed MAE` = mean(smoothed_res, na.rm = TRUE), + `Unsmoothed nowcast MAE` = mean(raw_nc_res, na.rm = TRUE), + `Provisional value MAE` = mean(prov_res, na.rm = TRUE)) |> + select(-geo_value) - # Customize the x and y axes labels, legend & add title - xlab("Time") + ylab("Absolute Error") + - scale_color_manual(values = c("Real-Time MAE" = "red", "Nowcast MAE" = "blue")) + - labs(color = "Mean Absolute Error") +knitr::kable(mae_value) ``` -We can see that the absolute errors are almost always lower for nowcasting. # Nowcasting with Two Variables @@ -1983,8 +2141,7 @@ So visualizing can provide an important perspective that is missed from a simple * [**Sliding Window Approach**]{.primary}: Predictions are based on [data up to the current time]{.primary}, ensuring no future information influences the nowcast. * [**Evaluation**]{.primary}: Predictions are compared with actual admissions using numerical and visual perspectives. - -# Case Study - Nowcasting Cases Using %CLI +# Case Study - Nowcasting Cases Using %CLI ## Goal of this case study @@ -1997,7 +2154,7 @@ So visualizing can provide an important perspective that is missed from a simple ## Summary of main steps -The workflow is similar to the previous example where we nowcasted using two variables, only more involved. +The workflow is similar to the previous example where we nowcasted using two variables, only more involved. The main steps are... 
1. [**Fetch Data**]{.primary}: Retrieve %CLI and COVID-19 case data (by specimen collection date) for MA. @@ -2006,7 +2163,7 @@ The main steps are... 3. [**Model & Prediction**]{.primary}: Fit a linear model to predict cases based on %CLI, trained on a 30-day rolling window. -4. [**Nowcast Execution**]{.primary}: Use `epix_slide()` to nowcast the cases dynamically. +4. [**Nowcast Execution**]{.primary}: Use `epix_slide` to nowcast the cases dynamically. 5. [**Visualization**]{.primary}: Plot actual vs. nowcasted cases with confidence intervals to assess model accuracy. @@ -2044,11 +2201,11 @@ Brief summary of this data: ## Main steps to construct the `epi_archive` 1. [**Load necessary Libraries**]{.primary}: Such as `tidyverse`, `readxl`, `epiprocess`. -2. [**Process Each Date's Data**]{.primary}: +2. [**Process Each Date's Data**]{.primary}: * A function we'll make (`process_covid_data`) downloads and processes daily COVID-19 data from the MA gov Excel files on their website. * The data is cleaned and formatted with columns: `geo_value`, `time_value`, `version`, and values. 3. [**Handle Missing Data**]{.primary}: Checks if a date's data is available (handle 404 errors). -4. [**Create `epi_archive`**]{.primary}: +4. [**Create `epi_archive`**]{.primary}: * Combine processed data into a tibble. * Convert the tibble to an `epi_archive` object using `as_epi_archive()`. @@ -2066,25 +2223,25 @@ library(epiprocess) # Function to download and process each Excel file for a given date process_covid_data <- function(Date) { # Generate the URL for the given date - url <- paste0("https://www.mass.gov/doc/covid-19-raw-data-", tolower(gsub("-0", "-", format(Date, "%B-%d-%Y"))), "/download") + url <- paste0("https://www.mass.gov/doc/covid-19-raw-data-", tolower(gsub("-0", "-", format(Date, "%B-%d-%Y"))), "/download") # Applies gsub("-0", "-", ...) to replace any occurrence of -0 (such as in "April-01") with just - (resulting in "April-1"). - + # Check if the URL exists (handle the 404 error by skipping that date) response <- GET(url) - + if (status_code(response) != 200) { return(NULL) # Skip if URL doesn't exist (404) } - + # Define the destination file path for the Excel file file_path <- tempfile(fileext = ".xlsx") - + # Download the Excel file GET(url, write_disk(file_path, overwrite = TRUE)) - + # Read the relevant sheet from the Excel file data <- read_excel(file_path, sheet = "CasesByDate (Test Date)") - + # Process the data: rename columns and convert Date data <- data |> rename( @@ -2094,7 +2251,7 @@ process_covid_data <- function(Date) { Case_Average_7day = `7-day confirmed case average` ) |> mutate(Date = as.Date(Date)) # Convert to Date class - + # Create a tibble with the required columns for the epi_archive tib <- tibble( geo_value = "ma", # Massachusetts (geo_value) @@ -2102,12 +2259,12 @@ process_covid_data <- function(Date) { version = Date, # The extracted version date case_rate_7d_av = data$Case_Average_7day # 7-day average case value ) - + return(tib) } ``` -## Fetch Data - Code breakdown +## Fetch Data - Code breakdown * This purpose of this function is to download and process each Excel file as of a date. * [**URL Creation**]{.primary}: Dynamically generates the URL based on the date, removing leading zeros in day values (e.g., "April-01" → "April-1"). 
@@ -2129,20 +2286,20 @@ process_covid_data <- function(Date) { process_data_for_date_range <- function(start_date, end_date) { # Generate a sequence of dates between start_date and end_date date_sequence <- seq(as.Date(start_date), as.Date(end_date), by = "day") - + # Process data for each date and combine results covid_data_list <- lapply(date_sequence, function(Date) { process_covid_data(Date) # Skip over dates with no data (NULLs will be ignored) }) - + # Combine all non-null individual tibbles into one data frame combined_data <- bind_rows(covid_data_list[!sapply(covid_data_list, is.null)]) - + # Convert the combined data into an epi_archive object if (nrow(combined_data) > 0) { epi_archive_data <- combined_data |> as_epi_archive(compactify = FALSE) - + return(epi_archive_data) } else { message("No valid data available for the given date range.") @@ -2174,7 +2331,7 @@ y <- process_data_for_date_range("2021-01-10", "2021-12-01") # Raw .xlsx data i y ``` -* Alternatively, you may run the following to load `y` that was previously saved as an RDS file: +* Alternatively, you may run the following to load `y` that was previously saved as an RDS file: ```{r load-ma-case-data} #| echo: true #| eval: FALSE @@ -2191,7 +2348,7 @@ y <- readRDS("_data/ma_case_archive.rds") ```{r fetch-outpatient-cli} #| echo: true -# Step 1: Fetch Versioned Data +# Step 1: Fetch Versioned Data x <- pub_covidcast( source = "doctor-visits", signals = "smoothed_adj_cli", @@ -2237,15 +2394,15 @@ lm_mod_pred <- function(data, ...) { # Make predictions predictions = predict(model, newdata = data |> - fill(percent_cli, .direction = "down") |> + fill(percent_cli, .direction = "down") |> filter(time_value == max(time_value)), interval = "prediction", level = 0.9) - + # Pull off real-time value for later comparison to the nowcast value real_time_val = data |> filter(time_value == max(time_value)) |> pull(case_rate_7d_av) - + # Could clip predictions and bounds at 0 - return(data.frame(predictions, actual_nowcast_date = max(data$time_value), real_time_val = real_time_val)) + return(data.frame(predictions, actual_nowcast_date = max(data$time_value), real_time_val = real_time_val)) } ``` @@ -2258,8 +2415,8 @@ lm_mod_pred <- function(data, ...) { ```{r nowcasting-epix-slide-cases} #| echo: true # Define the reference time points (to give the training/test split) -targeted_nowcast_dates <- seq(as.Date("2021-04-01"), as.Date("2021-11-01"), by = "1 month") -ref_time_values = targeted_nowcast_dates + 1 # + 1 because the case data is 1 day latent. +targeted_nowcast_dates <- seq(as.Date("2021-04-01"), as.Date("2021-11-01"), by = "1 month") +ref_time_values = targeted_nowcast_dates + 1 # + 1 because the case data is 1 day latent. # Determine this from revision_summary(y) # Use epix_slide to perform the nowcasting with a training-test split @@ -2285,7 +2442,7 @@ Merge the nowcast results with the latest data for more direct comparison: ```{r join-with-actual-cases} #| echo: true x_latest <- epix_as_of(archive, max(archive$DT$version)) |> - select(-percent_cli) + select(-percent_cli) res <- nowcast_res |> left_join(x_latest, by = join_by(geo_value, time_value)) @@ -2314,7 +2471,7 @@ ggplot(res, aes(x = time_value)) + # Adjust colors scale_color_manual(values = c("Finalized Cases (7-dav)" = "black", "Nowcast" = "#1f78b4", - "Real Time" = "darkred")) + + "Real Time" = "darkred")) + scale_fill_manual(values = c("Pred. 
Interval" = "#a6cee3")) + # Light blue # Improve the theme theme_minimal() + @@ -2334,12 +2491,12 @@ ggplot(res, aes(x = time_value)) + ```{r mae-code-cases} #| echo: true # Calculate the absolute error between actual and nowcasted COVID-19 cases -mae_data_cases <- res |> +mae_data_cases <- res |> mutate(nc_abs_error = abs(case_rate_7d_av - fit), # Nowcast vs Finalized cases (7-day average) rt_abs_error = abs(case_rate_7d_av - real_time_val)) # Real-Time vs Finalized cases # Compute the MAE (mean of absolute errors) -mae_value_cases <- mae_data_cases |> +mae_value_cases <- mae_data_cases |> summarise(nc_MAE = mean(nc_abs_error), rt_MAE = mean(rt_abs_error)) knitr::kable(mae_value_cases) @@ -2351,18 +2508,18 @@ knitr::kable(mae_value_cases) ```{r abs-error-plot-cases} # Plot the absolute errors for Nowcast and Real-Time vs Finalized COVID-19 cases ggplot(mae_data_cases) + - # Nowcast Absolute Error + # Nowcast Absolute Error geom_point(aes(x = time_value, y = nc_abs_error), color = "#1f78b4", size = 2) + geom_line(aes(x = time_value, y = nc_abs_error), color = "#1f78b4") + # Horizontal line for Nowcast Mean Absolute Error geom_hline(aes(yintercept = mean(nc_abs_error), color = "Nowcast MAE"), linetype = "dashed") + - - # Real-Time Absolute Error + + # Real-Time Absolute Error geom_point(aes(x = time_value, y = rt_abs_error), color = "darkred", size = 2) + geom_line(aes(x = time_value, y = rt_abs_error), color = "darkred") + # Horizontal line for Real-Time Mean Absolute Error geom_hline(aes(yintercept = mean(rt_abs_error), color = "Real-Time MAE"), linetype = "dashed") + - + # Customize the x and y axes labels, legend & add title xlab("Date") + ylab("Absolute Error") + scale_color_manual(values = c("Real-Time MAE" = "darkred", "Nowcast MAE" = "#1f78b4")) + @@ -2392,3 +2549,38 @@ Main Steps: Overall, nowcasting, based on the linear model, provided a closer approximation of true cases compared to the real-time values. + + +## Aside on nowcasting + +* To some Epis, "nowcasting" can be equated with "estimate the time-varying instantaneous reproduction number, $R_t$" + +* Ex. using the number of reported COVID-19 cases in British Columbia between Jan. 2020 and Apr. 15, 2023. + + +```{r rtestim} +#| fig-width: 9 +#| fig-height: 3 +#| out-height: "400px" +#| label: nowcasting +library(rtestim) +source(here::here("_code/bccovid.R")) + +p1 <- bccovid|> + ggplot(aes(date, cases)) + + geom_line(colour = primary) + + geom_vline(xintercept = ymd("2023-04-15"), colour = secondary, + linewidth = 2) + + labs(y = "BC Covid-19 cases", x = "Date") + + scale_y_continuous(expand = expansion(c(0, NA))) +bc_rt <- estimate_rt(bccovid$cases, x = bccovid$date, + lambda = c(1e6, 1e5)) +p2 <- plot(confband(bc_rt, lambda = 1e5)) + + coord_cartesian(ylim = c(0.5, 2)) + + scale_y_continuous(expand = expansion(0)) +cowplot::plot_grid(p1, p2) +``` + +* Group built [`{rtestim}`](https://dajmcdon.github.io/rtestim) doing for this nonparametrically. + +* We may come back to this later... \ No newline at end of file