- calc_wqis now calculates 95% bootstrap confidence intervals by re…

…sampling with replacement on rows for 10^3 replicates
bcgov · Feb 11, 2015 · 2950263 · 2950263
1 parent e868279
commit 2950263
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 57 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,13 +1,13 @@
 Package: wqbc
 Type: Package
 Title: Water Quality Thresholds and Index Calculation for British Columbia
-Version: 0.0.0.9005
-Date: 2015-01-31
+Version: 0.0.0.9006
+Date: 2015-02-10
 Authors@R: as.person(c(
     "Joe Thorley <[email protected]> [aut, cre]",
     "Colin Millar <[email protected]> [aut]",
+    "Andy Teucher [aut]",
     "Wendy Wang [ctb]",
-    "Andy Teucher [ctb]",
     "Stephanie Hazlitt [ctb]",
     "Robyn Irvine [ctb]"))
 Description: Calculates water quality thresholds and water quality indices

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# wqbc 0.0.0.9006
+
+- `calc_wqis` now calculates 95% bootstrap confidence intervals by resampling with
+replacement on rows for 10^3 replicates
+
 # wqbc 0.0.0.9005
 
 - Released new version number as wqdata now dependent

diff --git a/R/calc-wqi.R b/R/calc-wqi.R
@@ -80,7 +80,46 @@ bootstrap_wqis_column <- function (x, R) {
   boot::boot(data = x, statistic = resample_wqi_column, R = R, strata = x$Variable)
 }
 
-boot_wqis <- function (x, ci, cesi_code) {
+resample_wqi_tidy <- function (x, i, nt = nt, nv = nv) {
+  x <- x[i,,drop = FALSE]
+  wqif_excursion_variable(x = x$Excursion, v = x$Variable, nt = nt, nv = nv)["WQI"]
+}
+
+bootstrap_wqis_tidy <- function (x, R) {
+  stopifnot(!any(is.na(x$Excursion)))
+
+  nt <- nrow(x)
+  nv <- length(unique(x$Variable))
+
+  x <- x[x$Variable %in% x$Variable[x$Excursion != 0],,drop = FALSE]
+
+  if(!nrow(x))
+    return (c(100, 100))
+
+  x$Variable <- factor(x$Variable)
+
+  boot::boot(data = x, statistic = resample_wqi_tidy, R = R,
+                     strata = x$Variable, nt = nt, nv = nv)
+}
+
+lower_upper <- function (x) {
+  round(quantile(x$t, c(0.025, 0.975)), 1)
+}
+
+boot_wqis <- function (x) {
+  ci <- "row"
+  cesi_code <- FALSE
+
+  stopifnot(is.flag(cesi_code) & noNA(cesi_code))
+  stopifnot(is.string(ci) & noNA(ci))
+  stopifnot(ci %in% c("row", "column", "tidy"))
+
+  R <- 10^3
+
+  if(ci == "tidy") {
+    boot <- bootstrap_wqis_tidy(x, R = R)
+    return (lower_upper(boot))
+  }
 
   if (cesi_code) {
     x <- dplyr::mutate_(x, Excursion = ~Excursion + 1)
@@ -90,19 +129,16 @@ boot_wqis <- function (x, ci, cesi_code) {
   x <- dplyr::select_(x, ~-Date)
   x <- as.matrix(x)
 
-  R <- 10^3
-
   if(cesi_code) {
+    stopifnot(ci %in% c("row", "column"))
     boot <- BootCICol(x, Column = (ci == "column"), Par = 4, CL = 0.95, R = R)
     names(boot) <- NULL
   } else {
-
     if(ci == "row") {
       boot <- boot::boot(x, statistic = wqi_matrix, R = R)
-    } else {
+    } else # ci == column
       boot <- bootstrap_wqis_column(x, R = R)
-    }
-    boot <-  round(quantile(boot$t, c(0.025, 0.975)),1)
+    boot <-  lower_upper(boot)
   }
   boot
 }
@@ -117,7 +153,7 @@ fourtimesfour <- function (x) {
   nrow(x) >= 4
 }
 
-calc_wqi_by <- function (x, ci, cesi_code, messages) {
+calc_wqi_by <- function (x, messages) {
 
   x$Excursion <- get_excursions(x$Value, x$LowerLimit, x$UpperLimit)
   check_excursions(x)
@@ -127,11 +163,7 @@ calc_wqi_by <- function (x, ci, cesi_code, messages) {
   nv <- length(unique(x$Variable))
   wqi <- wqif_excursion_variable(x = x$Excursion, v = x$Variable, nt = nt, nv = nv)
 
-  if(ci == "none") {
-    boot <- rep(NA_real_,2)
-  } else {
-    boot <- boot_wqis(x, ci = ci, cesi_code = cesi_code)
-  }
+  boot <- boot_wqis(x)
 
   wqi <- data.frame(WQI = round(wqi["WQI"], 1), Lower = boot[1], Upper = boot[2],
                     Category = categorize_wqi(wqi["WQI"]),
@@ -172,31 +204,28 @@ set_detection_limits <- function (x, messages) {
 #' and it will not be possible to calculate the WQI. In this case \code{calc_wqi}
 #' throws an informative error. Finally it is important to note that in order
 #' for the WQI to be calculated the data set must include four variables each
-#' with non-missing values on at least four separate days.
+#' with non-missing values on at least four separate days. In addition
+#' to calculating the WQI the function also generates
+#' 95% Lower and Upper bootstrap confidence intervals about the WQI. The confidence
+#' intervals are generated by first spreading the data into wide format where each
+#' row represents one date and then resampling the rows with replacement to
+#' generate 1,000 replicates. The confidence intervals are the 2.5% and 97.5%
+#' percentiles of the replicate WQIs.
 #'
 #' @param x A data.frame to calculate the WQI for.
 #' @param by An optional character vector of the columns in x to calculate the WQI by.
-#' @param ci A string indicating whether to calculate 95% bootstrap confidence
-#' intervals by "row" or "column" or "none". Currently the confidence intervals are
-#' considered unreliable and should not be used.
-#' @param cesi_code A temporary flag indicating whether to use wqbc or cesi code to calculate the
-#' confidence intervals. This argument should be removed prior to release of the package.
 #' @param messages A flag indicating whether to print messages.
 #' @examples
 #' data(ccme)
 #' calc_wqi(ccme, messages = TRUE)
 #' calc_wqi(ccme, by = "Date", messages = TRUE)
 #' @seealso \code{\link{calc_limits}} and \code{\link{wqbc}}
 #' @export
-calc_wqi <- function (x, by = NULL, ci = "row", cesi_code = FALSE,
-                       messages = getOption("wqbc.messages", default = TRUE)) {
+calc_wqi <- function (x, by = NULL,
+                      messages = getOption("wqbc.messages", default = TRUE)) {
   assert_that(is.data.frame(x))
   assert_that(is.null(by) || (is.character(by) && noNA(by)))
   assert_that(is.flag(messages) && noNA(messages))
-  assert_that(is.string(ci))
-
-  ci <- tolower(ci)
-  if(!ci %in% c("row", "column", "none")) stop("ci must be \"row\", \"column\", or \"none\"")
 
   check_rows(x)
   if(!any(c("LowerLimit", "UpperLimit") %in% colnames(x)))
@@ -239,9 +268,9 @@ calc_wqi <- function (x, by = NULL, ci = "row", cesi_code = FALSE,
   check_rows(x)
 
   if(is.null(by)) {
-    x <- calc_wqi_by(x, ci = ci, cesi_code = cesi_code, messages = messages)
+    x <- calc_wqi_by(x, messages = messages)
   } else {
-    x <- plyr::ddply(x, .variables = by, .fun = calc_wqi_by, ci = ci, cesi_code = cesi_code,
+    x <- plyr::ddply(x, .variables = by, .fun = calc_wqi_by,
                      messages = messages)
   }
   if(messages) message("Calculated water quality indices.")

diff --git a/man/calc_wqi.Rd b/man/calc_wqi.Rd
@@ -4,21 +4,14 @@
 \alias{calc_wqi}
 \title{Calculate Water Quality Index (WQI)}
 \usage{
-calc_wqi(x, by = NULL, ci = "row", cesi_code = FALSE,
-  messages = getOption("wqbc.messages", default = TRUE))
+calc_wqi(x, by = NULL, messages = getOption("wqbc.messages", default =
+  TRUE))
 }
 \arguments{
 \item{x}{A data.frame to calculate the WQI for.}
 
 \item{by}{An optional character vector of the columns in x to calculate the WQI by.}
 
-\item{ci}{A string indicating whether to calculate 95% bootstrap confidence
-intervals by "row" or "column" or "none". Currently the confidence intervals are
-considered unreliable and should not be used.}
-
-\item{cesi_code}{A temporary flag indicating whether to use wqbc or cesi code to calculate the
-confidence intervals. This argument should be removed prior to release of the package.}
-
 \item{messages}{A flag indicating whether to print messages.}
 }
 \description{
@@ -36,7 +29,13 @@ the variable has lower limits because otherwise the excursion will be infinity
 and it will not be possible to calculate the WQI. In this case \code{calc_wqi}
 throws an informative error. Finally it is important to note that in order
 for the WQI to be calculated the data set must include four variables each
-with non-missing values on at least four separate days.
+with non-missing values on at least four separate days. In addition
+to calculating the WQI the function also generates
+95% Lower and Upper bootstrap confidence intervals about the WQI. The confidence
+intervals are generated by first spreading the data into wide format where each
+row represents one date and then resampling the rows with replacement to
+generate 1,000 replicates. The confidence intervals are the 2.5% and 97.5%
+percentiles of the replicate WQIs.
 }
 \examples{
 data(ccme)

diff --git a/tests/testthat/test-calc-wqi.R b/tests/testthat/test-calc-wqi.R
@@ -22,7 +22,7 @@ test_that("ccme", {
 
   data(ccme)
 
-  x <- calc_wqi(ccme, ci = "column")
+  x <- calc_wqi(ccme)
 
   expect_is(x, "data.frame")
   expect_equal(nrow(x), 1)
@@ -84,19 +84,3 @@ test_that("calc_wqi zero values", {
   ccme$DetectionLimit <- 1
   expect_is(calc_wqi(ccme), "data.frame")
 })
-
-test_that("calc_wqi boot", {
-
-  opts <- options()
-  on.exit(options(opts))
-  options(wqbc.messages = FALSE)
-
-  data(ccme)
-
-  calc_wqi(ccme, ci = "column", cesi_code = FALSE)
-  calc_wqi(ccme, ci = "row", cesi_code = FALSE)
-  calc_wqi(ccme, ci = "column", cesi_code = TRUE)
-  calc_wqi(ccme, ci = "row", cesi_code = TRUE)
-})
-
-