From 56770b53cb89f24718a79e59e9bf616ba6f607db Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 12:59:55 +0200 Subject: [PATCH 01/16] w_threshold doc + reference --- R/explanation.R | 9 ++++++++- R/observations.R | 3 --- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/R/explanation.R b/R/explanation.R index e0aedb103..9d274c789 100644 --- a/R/explanation.R +++ b/R/explanation.R @@ -62,6 +62,10 @@ #' #' @author Camilla Lingjaerde, Nikolai Sellereite, Martin Jullum, Annabelle Redelmeier #' +#'@references +#' Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent: +#' More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502. +#' #' @examples #' if (requireNamespace("MASS", quietly = TRUE)) { #' # Load example data @@ -186,7 +190,10 @@ explain <- function(x, explainer, approach, prediction_zero, ...) { #' is only applicable when \code{approach = "empirical"}, and \code{type} is either equal to #' \code{"AICc_each_k"} or \code{"AICc_full"} #' -#' @param w_threshold Positive integer between 0 and 1. +#' @param w_threshold Numeric vector of length 1, with \code{0 < w_threshold <= 1} representing the minimum proportion +#' of the total empirical weight that data samples should use. If e.g. \code{w_threshold = .8} we will choose the +#' \code{K} samples with the largest weight so that the sum of the weights accounts for 80\% of the total weight. +#' \code{w_threshold} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021). #' #' @rdname explain #' diff --git a/R/observations.R b/R/observations.R index ed96a2add..37a6e177f 100644 --- a/R/observations.R +++ b/R/observations.R @@ -7,9 +7,6 @@ #' the total number of unique features, respectively. Note that \code{m = ncol(x_train)}. #' @param x_train Numeric matrix #' @param x_test Numeric matrix -#' @param w_threshold Numeric vector of length 1, where \code{w_threshold > 0} and -#' \code{w_threshold <= 1}. If \code{w_threshold = .8} we will choose the \code{K} samples with -#' the largest weight so that the sum of the weights accounts for 80\% of the total weight. #' #' @return data.table #' From 585bf5a8798163e1706409b39c1ec5feac4f1a8c Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 13:06:43 +0200 Subject: [PATCH 02/16] update rd --- R/observations.R | 3 +++ man/explain.Rd | 9 ++++++++- man/observation_impute.Rd | 11 ++++++++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/R/observations.R b/R/observations.R index 37a6e177f..5df93d570 100644 --- a/R/observations.R +++ b/R/observations.R @@ -8,6 +8,9 @@ #' @param x_train Numeric matrix #' @param x_test Numeric matrix #' +#' @inheritParams explain +#' @inherit explain references +#' #' @return data.table #' #' @keywords internal diff --git a/man/explain.Rd b/man/explain.Rd index afdf14e7f..8a27840d8 100644 --- a/man/explain.Rd +++ b/man/explain.Rd @@ -98,7 +98,10 @@ optimizing the AICc. Note that this argument is only applicable when is only applicable when \code{approach = "empirical"}, and \code{type} is either equal to \code{"AICc_each_k"} or \code{"AICc_full"}} -\item{w_threshold}{Positive integer between 0 and 1.} +\item{w_threshold}{Numeric vector of length 1, with \code{0 < w_threshold <= 1} representing the minimum proportion +of the total empirical weight that data samples should use. If e.g. \code{w_threshold = .8} we will choose the +\code{K} samples with the largest weight so that the sum of the weights accounts for 80\% of the total weight. +\code{w_threshold} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).} \item{mu}{Numeric vector. (Optional) Containing the mean of the data generating distribution. If \code{NULL} the expected values are estimated from the data. Note that this is only used @@ -241,6 +244,10 @@ if (requireNamespace("MASS", quietly = TRUE)) { print(explain_groups$dt) } } +\references{ +Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent: + More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502. +} \author{ Camilla Lingjaerde, Nikolai Sellereite, Martin Jullum, Annabelle Redelmeier } diff --git a/man/observation_impute.Rd b/man/observation_impute.Rd index e437f39c3..a9a40ab03 100644 --- a/man/observation_impute.Rd +++ b/man/observation_impute.Rd @@ -25,9 +25,10 @@ the total number of unique features, respectively. Note that \code{m = ncol(x_tr \item{x_test}{Numeric matrix} -\item{w_threshold}{Numeric vector of length 1, where \code{w_threshold > 0} and -\code{w_threshold <= 1}. If \code{w_threshold = .8} we will choose the \code{K} samples with -the largest weight so that the sum of the weights accounts for 80\% of the total weight.} +\item{w_threshold}{Numeric vector of length 1, with \code{0 < w_threshold <= 1} representing the minimum proportion +of the total empirical weight that data samples should use. If e.g. \code{w_threshold = .8} we will choose the +\code{K} samples with the largest weight so that the sum of the weights accounts for 80\% of the total weight. +\code{w_threshold} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).} } \value{ data.table @@ -35,6 +36,10 @@ data.table \description{ Generate permutations of training data using test observations } +\references{ +Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent: + More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502. +} \author{ Nikolai Sellereite } From 4ae95c89e10c57c28d9432663db897b126e006be Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 14:12:05 +0200 Subject: [PATCH 03/16] move n_samples to explain --- R/explanation.R | 30 +++++++++++++++++++++++------- R/observations.R | 29 ++++++++++++----------------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/R/explanation.R b/R/explanation.R index 9d274c789..e8cd237f3 100644 --- a/R/explanation.R +++ b/R/explanation.R @@ -11,6 +11,9 @@ #' either be \code{"gaussian"}, \code{"copula"}, \code{"empirical"}, or \code{"ctree"}. See details for more #' information. #' +#' @param n_samples Positive integer. Indicating the maximum number of samples to use in the +#' Monte Carlo integration for every conditional expectation. See also details. +#' #' @param prediction_zero Numeric. The prediction value for unseen data, typically equal to the mean of #' the response. #' @@ -19,7 +22,6 @@ #' @details The most important thing to notice is that \code{shapr} has implemented four different #' approaches for estimating the conditional distributions of the data, namely \code{"empirical"}, #' \code{"gaussian"}, \code{"copula"} and \code{"ctree"}. -#' #' In addition, the user also has the option of combining the four approaches. #' E.g. if you're in a situation where you have trained a model the consists of 10 features, #' and you'd like to use the \code{"gaussian"} approach when you condition on a single feature, @@ -29,6 +31,13 @@ #' \code{"approach[i]" = "gaussian"} it means that you'd like to use the \code{"gaussian"} approach #' when conditioning on \code{i} features. #' +#' For \code{approach="ctree"}, \code{n_samples} corresponds to the number of samples +#' from the leaf node (see an exception related to the \code{sample} argument). +#' For \code{approach="empirical"}, \code{n_samples} is the \eqn{K} parameter in equations (14-15) of +#' Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the +#' \code{w_threshold} argument. +#' +#' #' @return Object of class \code{c("shapr", "list")}. Contains the following items: #' \describe{ #' \item{dt}{data.table} @@ -135,7 +144,7 @@ #' ) #' print(explain_groups$dt) #' } -explain <- function(x, explainer, approach, prediction_zero, ...) { +explain <- function(x, explainer, approach, prediction_zero, n_samples = 1e3,...) { extras <- list(...) # Check input for x @@ -199,9 +208,10 @@ explain <- function(x, explainer, approach, prediction_zero, ...) { #' #' @export explain.empirical <- function(x, explainer, approach, prediction_zero, + n_samples = 1e3, w_threshold = 0.95, type = "fixed_sigma", fixed_sigma_vec = 0.1, n_samples_aicc = 1000, eval_max_aicc = 20, - start_aicc = 0.1, w_threshold = 0.95, ...) { + start_aicc = 0.1, ...) { # Add arguments to explainer object explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) @@ -212,6 +222,7 @@ explain.empirical <- function(x, explainer, approach, prediction_zero, explainer$eval_max_aicc <- eval_max_aicc explainer$start_aicc <- start_aicc explainer$w_threshold <- w_threshold + explainer$n_samples <- n_samples # Generate data dt <- prepare_data(explainer, ...) @@ -236,12 +247,14 @@ explain.empirical <- function(x, explainer, approach, prediction_zero, #' @rdname explain #' #' @export -explain.gaussian <- function(x, explainer, approach, prediction_zero, mu = NULL, cov_mat = NULL, ...) { +explain.gaussian <- function(x, explainer, approach, prediction_zero, n_samples = 1e3, mu = NULL, cov_mat = NULL, ...) { # Add arguments to explainer object explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) explainer$approach <- approach + explainer$n_samples <- n_samples + # If mu is not provided directly, use mean of training data if (is.null(mu)) { @@ -277,11 +290,12 @@ explain.gaussian <- function(x, explainer, approach, prediction_zero, mu = NULL, #' @rdname explain #' @export -explain.copula <- function(x, explainer, approach, prediction_zero, ...) { +explain.copula <- function(x, explainer, approach, prediction_zero, n_samples = 1e3, ...) { # Setup explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) explainer$approach <- approach + explainer$n_samples <- n_samples # Prepare transformed data x_train <- apply( @@ -341,7 +355,7 @@ explain.copula <- function(x, explainer, approach, prediction_zero, ...) { #' @name explain #' #' @export -explain.ctree <- function(x, explainer, approach, prediction_zero, +explain.ctree <- function(x, explainer, approach, prediction_zero, n_samples = 1e3, mincriterion = 0.95, minsplit = 20, minbucket = 7, sample = TRUE, ...) { # Checks input argument @@ -356,6 +370,7 @@ explain.ctree <- function(x, explainer, approach, prediction_zero, explainer$minsplit <- minsplit explainer$minbucket <- minbucket explainer$sample <- sample + explainer$n_samples <- n_samples # Generate data dt <- prepare_data(explainer, ...) @@ -374,12 +389,13 @@ explain.ctree <- function(x, explainer, approach, prediction_zero, #' @name explain #' #' @export -explain.combined <- function(x, explainer, approach, prediction_zero, +explain.combined <- function(x, explainer, approach, prediction_zero, n_samples = 1e3, mu = NULL, cov_mat = NULL, ...) { # Get indices of combinations l <- get_list_approaches(explainer$X$n_features, approach) explainer$return <- TRUE explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) + explainer$n_samples <- n_samples dt_l <- list() for (i in seq_along(l)) { diff --git a/R/observations.R b/R/observations.R index 5df93d570..8ba92bced 100644 --- a/R/observations.R +++ b/R/observations.R @@ -71,9 +71,6 @@ observation_impute <- function(W_kernel, S, x_train, x_test, w_threshold = .7, n #' #' @param x Explainer object. See \code{\link{explain}} for more information. #' -#' @param n_samples Positive integer. Indicating the maximum number of samples to use in the -#' Monte Carlo integration for every conditional expectation. -#' #' @param seed Positive integer. If \code{NULL} the seed will be inherited from the calling environment. #' #' @param index_features Positive integer vector. Specifies the indices of combinations to apply to the present method. @@ -94,7 +91,7 @@ prepare_data <- function(x, ...) { #' @rdname prepare_data #' @export -prepare_data.empirical <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, ...) { +prepare_data.empirical <- function(x, seed = 1, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check # Get distance matrix ---------------- @@ -158,7 +155,7 @@ prepare_data.empirical <- function(x, seed = 1, n_samples = 1e3, index_features x_train = as.matrix(x$x_train), x_test = x$x_test[i, , drop = FALSE], w_threshold = x$w_threshold, - n_samples = n_samples + n_samples = x$n_samples ) dt_l[[i]][, id := i] @@ -171,7 +168,7 @@ prepare_data.empirical <- function(x, seed = 1, n_samples = 1e3, index_features #' @rdname prepare_data #' @export -prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, ...) { +prepare_data.gaussian <- function(x, seed = 1, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check n_xtest <- nrow(x$x_test) @@ -187,7 +184,7 @@ prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = l <- lapply( X = features, FUN = sample_gaussian, - n_samples = n_samples, + n_samples = x$n_samples, mu = x$mu, cov_mat = x$cov_mat, m = ncol(x$x_test), @@ -195,7 +192,7 @@ prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, w := 1 / x$n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } @@ -206,7 +203,7 @@ prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = #' @rdname prepare_data #' @export -prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e3, index_features = NULL, ...) { +prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check n_xtest <- nrow(x$x_test) dt_l <- list() @@ -221,7 +218,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e l <- lapply( X = features, FUN = sample_copula, - n_samples = n_samples, + n_samples = x$n_samples, mu = x$mu, cov_mat = x$cov_mat, m = ncol(x$x_test), @@ -231,7 +228,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, w := 1 / x$n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } @@ -239,9 +236,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e return(dt) } -#' @param n_samples Integer. The number of obs to sample from the leaf if \code{sample} = TRUE or if \code{sample} -#' = FALSE but \code{n_samples} is less than the number of obs in the leaf. -#' + #' @param index_features List. Default is NULL but if either various methods are being used or various mincriterion are #' used for different numbers of conditioned features, this will be a list with the features to pass. #' @@ -258,7 +253,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e #' #' @rdname prepare_data #' @export -prepare_data.ctree <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, +prepare_data.ctree <- function(x, seed = 1, index_features = NULL, mc_cores = 1, mc_cores_create_ctree = mc_cores, mc_cores_sample_ctree = mc_cores, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check @@ -290,7 +285,7 @@ prepare_data.ctree <- function(x, seed = 1, n_samples = 1e3, index_features = NU l <- parallel::mclapply( X = all_trees, FUN = sample_ctree, - n_samples = n_samples, + n_samples = x$n_samples, x_test = x$x_test[i, , drop = FALSE], x_train = x$x_train, p = ncol(x$x_test), @@ -300,7 +295,7 @@ prepare_data.ctree <- function(x, seed = 1, n_samples = 1e3, index_features = NU ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, w := 1 / x$n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } From 38e2635944f90b849232d465f96b8deffe4cb5f1 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 14:12:14 +0200 Subject: [PATCH 04/16] update rd --- R/observations.R | 29 +++++++++++++++++------------ man/explain.Rd | 30 +++++++++++++++++++++--------- man/observation_impute.Rd | 3 +++ man/prepare_data.Rd | 17 +++-------------- 4 files changed, 44 insertions(+), 35 deletions(-) diff --git a/R/observations.R b/R/observations.R index 8ba92bced..5df93d570 100644 --- a/R/observations.R +++ b/R/observations.R @@ -71,6 +71,9 @@ observation_impute <- function(W_kernel, S, x_train, x_test, w_threshold = .7, n #' #' @param x Explainer object. See \code{\link{explain}} for more information. #' +#' @param n_samples Positive integer. Indicating the maximum number of samples to use in the +#' Monte Carlo integration for every conditional expectation. +#' #' @param seed Positive integer. If \code{NULL} the seed will be inherited from the calling environment. #' #' @param index_features Positive integer vector. Specifies the indices of combinations to apply to the present method. @@ -91,7 +94,7 @@ prepare_data <- function(x, ...) { #' @rdname prepare_data #' @export -prepare_data.empirical <- function(x, seed = 1, index_features = NULL, ...) { +prepare_data.empirical <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check # Get distance matrix ---------------- @@ -155,7 +158,7 @@ prepare_data.empirical <- function(x, seed = 1, index_features = NULL, ...) { x_train = as.matrix(x$x_train), x_test = x$x_test[i, , drop = FALSE], w_threshold = x$w_threshold, - n_samples = x$n_samples + n_samples = n_samples ) dt_l[[i]][, id := i] @@ -168,7 +171,7 @@ prepare_data.empirical <- function(x, seed = 1, index_features = NULL, ...) { #' @rdname prepare_data #' @export -prepare_data.gaussian <- function(x, seed = 1, index_features = NULL, ...) { +prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check n_xtest <- nrow(x$x_test) @@ -184,7 +187,7 @@ prepare_data.gaussian <- function(x, seed = 1, index_features = NULL, ...) { l <- lapply( X = features, FUN = sample_gaussian, - n_samples = x$n_samples, + n_samples = n_samples, mu = x$mu, cov_mat = x$cov_mat, m = ncol(x$x_test), @@ -192,7 +195,7 @@ prepare_data.gaussian <- function(x, seed = 1, index_features = NULL, ...) { ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / x$n_samples] + dt_l[[i]][, w := 1 / n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } @@ -203,7 +206,7 @@ prepare_data.gaussian <- function(x, seed = 1, index_features = NULL, ...) { #' @rdname prepare_data #' @export -prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, index_features = NULL, ...) { +prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e3, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check n_xtest <- nrow(x$x_test) dt_l <- list() @@ -218,7 +221,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, index_features l <- lapply( X = features, FUN = sample_copula, - n_samples = x$n_samples, + n_samples = n_samples, mu = x$mu, cov_mat = x$cov_mat, m = ncol(x$x_test), @@ -228,7 +231,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, index_features ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / x$n_samples] + dt_l[[i]][, w := 1 / n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } @@ -236,7 +239,9 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, index_features return(dt) } - +#' @param n_samples Integer. The number of obs to sample from the leaf if \code{sample} = TRUE or if \code{sample} +#' = FALSE but \code{n_samples} is less than the number of obs in the leaf. +#' #' @param index_features List. Default is NULL but if either various methods are being used or various mincriterion are #' used for different numbers of conditioned features, this will be a list with the features to pass. #' @@ -253,7 +258,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, index_features #' #' @rdname prepare_data #' @export -prepare_data.ctree <- function(x, seed = 1, index_features = NULL, +prepare_data.ctree <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, mc_cores = 1, mc_cores_create_ctree = mc_cores, mc_cores_sample_ctree = mc_cores, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check @@ -285,7 +290,7 @@ prepare_data.ctree <- function(x, seed = 1, index_features = NULL, l <- parallel::mclapply( X = all_trees, FUN = sample_ctree, - n_samples = x$n_samples, + n_samples = n_samples, x_test = x$x_test[i, , drop = FALSE], x_train = x$x_train, p = ncol(x$x_test), @@ -295,7 +300,7 @@ prepare_data.ctree <- function(x, seed = 1, index_features = NULL, ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / x$n_samples] + dt_l[[i]][, w := 1 / n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } diff --git a/man/explain.Rd b/man/explain.Rd index 8a27840d8..3e4d1c67f 100644 --- a/man/explain.Rd +++ b/man/explain.Rd @@ -10,19 +10,20 @@ \alias{explain.ctree_comb_mincrit} \title{Explain the output of machine learning models with more accurately estimated Shapley values} \usage{ -explain(x, explainer, approach, prediction_zero, ...) +explain(x, explainer, approach, prediction_zero, n_samples = 1000, ...) \method{explain}{empirical}( x, explainer, approach, prediction_zero, + n_samples = 1000, + w_threshold = 0.95, type = "fixed_sigma", fixed_sigma_vec = 0.1, n_samples_aicc = 1000, eval_max_aicc = 20, start_aicc = 0.1, - w_threshold = 0.95, ... ) @@ -31,18 +32,20 @@ explain(x, explainer, approach, prediction_zero, ...) explainer, approach, prediction_zero, + n_samples = 1000, mu = NULL, cov_mat = NULL, ... ) -\method{explain}{copula}(x, explainer, approach, prediction_zero, ...) +\method{explain}{copula}(x, explainer, approach, prediction_zero, n_samples = 1000, ...) \method{explain}{ctree}( x, explainer, approach, prediction_zero, + n_samples = 1000, mincriterion = 0.95, minsplit = 20, minbucket = 7, @@ -55,6 +58,7 @@ explain(x, explainer, approach, prediction_zero, ...) explainer, approach, prediction_zero, + n_samples = 1000, mu = NULL, cov_mat = NULL, ... @@ -77,8 +81,16 @@ information.} \item{prediction_zero}{Numeric. The prediction value for unseen data, typically equal to the mean of the response.} +\item{n_samples}{Positive integer. Indicating the maximum number of samples to use in the +Monte Carlo integration for every conditional expectation. See also details.} + \item{...}{Additional arguments passed to \code{\link{prepare_data}}} +\item{w_threshold}{Numeric vector of length 1, with \code{0 < w_threshold <= 1} representing the minimum proportion +of the total empirical weight that data samples should use. If e.g. \code{w_threshold = .8} we will choose the +\code{K} samples with the largest weight so that the sum of the weights accounts for 80\% of the total weight. +\code{w_threshold} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).} + \item{type}{Character. Should be equal to either \code{"independence"}, \code{"fixed_sigma"}, \code{"AICc_each_k"} or \code{"AICc_full"}.} @@ -98,11 +110,6 @@ optimizing the AICc. Note that this argument is only applicable when is only applicable when \code{approach = "empirical"}, and \code{type} is either equal to \code{"AICc_each_k"} or \code{"AICc_full"}} -\item{w_threshold}{Numeric vector of length 1, with \code{0 < w_threshold <= 1} representing the minimum proportion -of the total empirical weight that data samples should use. If e.g. \code{w_threshold = .8} we will choose the -\code{K} samples with the largest weight so that the sum of the weights accounts for 80\% of the total weight. -\code{w_threshold} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).} - \item{mu}{Numeric vector. (Optional) Containing the mean of the data generating distribution. If \code{NULL} the expected values are estimated from the data. Note that this is only used when \code{approach = "gaussian"}.} @@ -164,7 +171,6 @@ Explain the output of machine learning models with more accurately estimated Sha The most important thing to notice is that \code{shapr} has implemented four different approaches for estimating the conditional distributions of the data, namely \code{"empirical"}, \code{"gaussian"}, \code{"copula"} and \code{"ctree"}. - In addition, the user also has the option of combining the four approaches. E.g. if you're in a situation where you have trained a model the consists of 10 features, and you'd like to use the \code{"gaussian"} approach when you condition on a single feature, @@ -173,6 +179,12 @@ if you condition on more than 5 features this can be done by simply passing \code{approach = c("gaussian", rep("empirical", 4), rep("copula", 5))}. If \code{"approach[i]" = "gaussian"} it means that you'd like to use the \code{"gaussian"} approach when conditioning on \code{i} features. + +For \code{approach="ctree"}, \code{n_samples} corresponds to the number of samples +from the leaf node (see an exception related to the \code{sample} argument). +For \code{approach="empirical"}, \code{n_samples} is the \eqn{K} parameter in equations (14-15) of +Aas et al. (2021), i.e. the maximum number of observations (with largest weights) that is used, see also the +\code{w_threshold} argument. } \examples{ if (requireNamespace("MASS", quietly = TRUE)) { diff --git a/man/observation_impute.Rd b/man/observation_impute.Rd index a9a40ab03..dd48d6c1f 100644 --- a/man/observation_impute.Rd +++ b/man/observation_impute.Rd @@ -29,6 +29,9 @@ the total number of unique features, respectively. Note that \code{m = ncol(x_tr of the total empirical weight that data samples should use. If e.g. \code{w_threshold = .8} we will choose the \code{K} samples with the largest weight so that the sum of the weights accounts for 80\% of the total weight. \code{w_threshold} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).} + +\item{n_samples}{Positive integer. Indicating the maximum number of samples to use in the +Monte Carlo integration for every conditional expectation. See also details.} } \value{ data.table diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd index 49977759a..15a1f07e9 100644 --- a/man/prepare_data.Rd +++ b/man/prepare_data.Rd @@ -10,23 +10,15 @@ \usage{ prepare_data(x, ...) -\method{prepare_data}{empirical}(x, seed = 1, n_samples = 1000, index_features = NULL, ...) +\method{prepare_data}{empirical}(x, seed = 1, index_features = NULL, ...) -\method{prepare_data}{gaussian}(x, seed = 1, n_samples = 1000, index_features = NULL, ...) +\method{prepare_data}{gaussian}(x, seed = 1, index_features = NULL, ...) -\method{prepare_data}{copula}( - x, - x_test_gaussian = 1, - seed = 1, - n_samples = 1000, - index_features = NULL, - ... -) +\method{prepare_data}{copula}(x, x_test_gaussian = 1, seed = 1, index_features = NULL, ...) \method{prepare_data}{ctree}( x, seed = 1, - n_samples = 1000, index_features = NULL, mc_cores = 1, mc_cores_create_ctree = mc_cores, @@ -41,9 +33,6 @@ prepare_data(x, ...) \item{seed}{Positive integer. If \code{NULL} the seed will be inherited from the calling environment.} -\item{n_samples}{Integer. The number of obs to sample from the leaf if \code{sample} = TRUE or if \code{sample} -= FALSE but \code{n_samples} is less than the number of obs in the leaf.} - \item{index_features}{List. Default is NULL but if either various methods are being used or various mincriterion are used for different numbers of conditioned features, this will be a list with the features to pass.} From 153b66fd894c08c6f5cc47c6121b172e631c55d8 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 14:20:12 +0200 Subject: [PATCH 05/16] more --- R/observations.R | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/R/observations.R b/R/observations.R index 5df93d570..8ba92bced 100644 --- a/R/observations.R +++ b/R/observations.R @@ -71,9 +71,6 @@ observation_impute <- function(W_kernel, S, x_train, x_test, w_threshold = .7, n #' #' @param x Explainer object. See \code{\link{explain}} for more information. #' -#' @param n_samples Positive integer. Indicating the maximum number of samples to use in the -#' Monte Carlo integration for every conditional expectation. -#' #' @param seed Positive integer. If \code{NULL} the seed will be inherited from the calling environment. #' #' @param index_features Positive integer vector. Specifies the indices of combinations to apply to the present method. @@ -94,7 +91,7 @@ prepare_data <- function(x, ...) { #' @rdname prepare_data #' @export -prepare_data.empirical <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, ...) { +prepare_data.empirical <- function(x, seed = 1, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check # Get distance matrix ---------------- @@ -158,7 +155,7 @@ prepare_data.empirical <- function(x, seed = 1, n_samples = 1e3, index_features x_train = as.matrix(x$x_train), x_test = x$x_test[i, , drop = FALSE], w_threshold = x$w_threshold, - n_samples = n_samples + n_samples = x$n_samples ) dt_l[[i]][, id := i] @@ -171,7 +168,7 @@ prepare_data.empirical <- function(x, seed = 1, n_samples = 1e3, index_features #' @rdname prepare_data #' @export -prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, ...) { +prepare_data.gaussian <- function(x, seed = 1, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check n_xtest <- nrow(x$x_test) @@ -187,7 +184,7 @@ prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = l <- lapply( X = features, FUN = sample_gaussian, - n_samples = n_samples, + n_samples = x$n_samples, mu = x$mu, cov_mat = x$cov_mat, m = ncol(x$x_test), @@ -195,7 +192,7 @@ prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, w := 1 / x$n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } @@ -206,7 +203,7 @@ prepare_data.gaussian <- function(x, seed = 1, n_samples = 1e3, index_features = #' @rdname prepare_data #' @export -prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e3, index_features = NULL, ...) { +prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, index_features = NULL, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check n_xtest <- nrow(x$x_test) dt_l <- list() @@ -221,7 +218,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e l <- lapply( X = features, FUN = sample_copula, - n_samples = n_samples, + n_samples = x$n_samples, mu = x$mu, cov_mat = x$cov_mat, m = ncol(x$x_test), @@ -231,7 +228,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, w := 1 / x$n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } @@ -239,9 +236,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e return(dt) } -#' @param n_samples Integer. The number of obs to sample from the leaf if \code{sample} = TRUE or if \code{sample} -#' = FALSE but \code{n_samples} is less than the number of obs in the leaf. -#' + #' @param index_features List. Default is NULL but if either various methods are being used or various mincriterion are #' used for different numbers of conditioned features, this will be a list with the features to pass. #' @@ -258,7 +253,7 @@ prepare_data.copula <- function(x, x_test_gaussian = 1, seed = 1, n_samples = 1e #' #' @rdname prepare_data #' @export -prepare_data.ctree <- function(x, seed = 1, n_samples = 1e3, index_features = NULL, +prepare_data.ctree <- function(x, seed = 1, index_features = NULL, mc_cores = 1, mc_cores_create_ctree = mc_cores, mc_cores_sample_ctree = mc_cores, ...) { id <- id_combination <- w <- NULL # due to NSE notes in R CMD check @@ -290,7 +285,7 @@ prepare_data.ctree <- function(x, seed = 1, n_samples = 1e3, index_features = NU l <- parallel::mclapply( X = all_trees, FUN = sample_ctree, - n_samples = n_samples, + n_samples = x$n_samples, x_test = x$x_test[i, , drop = FALSE], x_train = x$x_train, p = ncol(x$x_test), @@ -300,7 +295,7 @@ prepare_data.ctree <- function(x, seed = 1, n_samples = 1e3, index_features = NU ) dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") - dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, w := 1 / x$n_samples] dt_l[[i]][, id := i] if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } From bb535e148e1a0d98019dfa190d7e21c0c3130d7a Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 14:25:24 +0200 Subject: [PATCH 06/16] bug --- R/explanation.R | 2 +- man/explain.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/explanation.R b/R/explanation.R index e8cd237f3..649d8cfe7 100644 --- a/R/explanation.R +++ b/R/explanation.R @@ -458,7 +458,7 @@ get_list_approaches <- function(n_features, approach) { #' #' @export explain.ctree_comb_mincrit <- function(x, explainer, approach, - prediction_zero, mincriterion, ...) { + prediction_zero, n_samples, mincriterion, ...) { # Get indices of combinations l <- get_list_ctree_mincrit(explainer$X$n_features, mincriterion) diff --git a/man/explain.Rd b/man/explain.Rd index 3e4d1c67f..2491332e9 100644 --- a/man/explain.Rd +++ b/man/explain.Rd @@ -64,7 +64,7 @@ explain(x, explainer, approach, prediction_zero, n_samples = 1000, ...) ... ) -\method{explain}{ctree_comb_mincrit}(x, explainer, approach, prediction_zero, mincriterion, ...) +\method{explain}{ctree_comb_mincrit}(x, explainer, approach, prediction_zero, n_samples, mincriterion, ...) } \arguments{ \item{x}{A matrix or data.frame. Contains the the features, whose From d14b3ddf04bf807fef5ddc3bf7ed67e39a392899 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 14:48:14 +0200 Subject: [PATCH 07/16] Add "Success with message:" to all messages for clarity --- R/features.R | 3 ++- R/preprocess_data.R | 15 +++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/R/features.R b/R/features.R index c51ae9ab0..d120917f3 100644 --- a/R/features.R +++ b/R/features.R @@ -73,7 +73,8 @@ feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_z exact <- TRUE message( paste0( - "\nn_combinations is larger than or equal to 2^m = ", 2^m, ". \n", + "\nSuccess with message:\n", + "n_combinations is larger than or equal to 2^m = ", 2^m, ". \n", "Using exact instead." ) ) diff --git a/R/preprocess_data.R b/R/preprocess_data.R index 746a575c6..49283972f 100644 --- a/R/preprocess_data.R +++ b/R/preprocess_data.R @@ -156,6 +156,7 @@ check_features <- function(f_list_1, f_list_2, } if (NULL_1 & use_1_as_truth) { message(paste0( + "\nSuccess with message:\n", "The specified ", name_1, " provides NULL feature labels. ", "The labels of ", name_2, " are taken as the truth." )) @@ -170,6 +171,7 @@ check_features <- function(f_list_1, f_list_2, } if ((NA_1 & use_1_as_truth)) { message(paste0( + "\nSuccess with message:\n", "The specified ", name_1, " provides feature labels that are NA. ", "The labels of ", name_2, " are taken as the truth." )) @@ -245,6 +247,7 @@ check_features <- function(f_list_1, f_list_2, #### Checking classes #### if (any(is.na(f_list_1$classes)) & use_1_as_truth) { # Only relevant when f_list_1 is a model message(paste0( + "\nSuccess with message:\n", "The specified ", name_1, " provides feature classes that are NA. ", "The classes of ", name_2, " are taken as the truth." )) @@ -272,6 +275,7 @@ check_features <- function(f_list_1, f_list_2, is_NULL <- any(is.null(relevant_factor_levels)) if ((is_NA | is_NULL) & use_1_as_truth) { message(paste0( + "\nSuccess with message:\n", "The specified ", name_1, " provides factor feature levels that are NULL or NA. ", "The factor levels of ", name_2, " are taken as the truth." )) @@ -330,9 +334,9 @@ update_data <- function(data, updater) { # Reorder and delete unused columns cnms_remove <- setdiff(colnames(data), new_labels) if (length(cnms_remove) > 0) { - message( - paste0( - "The columns(s) ", + message(paste0( + "\nSuccess with message:\n", + "The columns(s) ", paste0(cnms_remove, collapse = ", "), " is not used by the model and thus removed from the data." ) @@ -348,6 +352,7 @@ update_data <- function(data, updater) { if (any(!identical_levels)) { changed_levels <- which(!identical_levels) message(paste0( + "\nSuccess with message:\n", "Levels are reordered for the factor feature(s) ", paste0(new_labels[changed_levels], collapse = ", "), "." )) @@ -383,7 +388,9 @@ process_groups <- function(group, feature_labels) { # Make group names if not existing if (is.null(names(group))) { - message("Group names not provided. Assigning them the default names 'group1', 'group2', 'group3' etc.") + message( + "\nSuccess with message:\n + Group names not provided. Assigning them the default names 'group1', 'group2', 'group3' etc.") names(group) <- paste0("group", seq_along(group)) } From bf2d2e69fde3b11c90069df622dfb83688379cd4 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 15:02:26 +0200 Subject: [PATCH 08/16] update GHA cmd checking --- .github/workflows/R-CMD-check.yaml | 64 ++++++++++++++---------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 5e10ea6be..8818274f0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,12 +1,17 @@ +# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. +# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +# This is slightly modified verison of this https://github.com/r-lib/actions/blob/master/examples/check-full.yaml +(added aug 10th 2021) + on: push: branches: + - main - master - - cranversion pull_request: branches: + - main - master - - cranversion name: R-CMD-check @@ -20,12 +25,12 @@ jobs: fail-fast: false matrix: config: - - {os: macOS-latest, r: 'release'} - - {os: macOS-latest, r: '4.0'} - - {os: windows-latest, r: '4.0'} - - {os: ubuntu-16.04, r: '4.0', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} - - {os: ubuntu-16.04, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} - - {os: ubuntu-16.04, r: '3.5', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} + - {os: windows-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } + - {os: ubuntu-20.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"} # Added manually by martinju + - {os: ubuntu-20.04, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} # Added manually by martinju, remove some time in 2022 env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true @@ -35,34 +40,33 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@master + - uses: r-lib/actions/setup-r@v1 with: r-version: ${{ matrix.config.r }} - - uses: r-lib/actions/setup-pandoc@master + - uses: r-lib/actions/setup-pandoc@v1 - name: Query dependencies run: | install.packages('remotes') saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") shell: Rscript {0} - - name: Cache R packages - if: runner.os != 'Windows' - uses: actions/cache@v1 + - name: Restore R package cache + uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-1- + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- - name: Install system dependencies if: runner.os == 'Linux' - env: - RHUB_PLATFORM: linux-x86_64-ubuntu-gcc run: | - Rscript -e "remotes::install_github('r-hub/sysreqs')" - sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") - sudo -s eval "$sysreqs" + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') - name: Install dependencies run: | @@ -70,27 +74,17 @@ jobs: remotes::install_cran("rcmdcheck") shell: Rscript {0} - - name: Session info - run: | - options(width = 100) - pkgs <- installed.packages()[, "Package"] - sessioninfo::session_info(pkgs, include_base = TRUE) - shell: Rscript {0} - - name: Check env: - _R_CHECK_CRAN_INCOMING_: false - run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") shell: Rscript {0} - - name: Show testthat output - if: always() - run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true - shell: bash - - name: Upload check results if: failure() - uses: actions/upload-artifact@master + uses: actions/upload-artifact@main with: name: ${{ runner.os }}-r${{ matrix.config.r }}-results path: check From cafe69fbd7cce74cba738bbda2d5fa8df7e2830b Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 15:08:29 +0200 Subject: [PATCH 09/16] bugfix --- .github/workflows/R-CMD-check.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 8818274f0..0fc731373 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,17 +1,17 @@ # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions # This is slightly modified verison of this https://github.com/r-lib/actions/blob/master/examples/check-full.yaml -(added aug 10th 2021) +# (added aug 10th 2021) on: push: branches: - - main - master + - cranversion pull_request: branches: - - main - master + -cranversion name: R-CMD-check From e0941ea0bcea08631b9fcc97b66898037f69e019 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 15:10:22 +0200 Subject: [PATCH 10/16] . --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 0fc731373..1f93255ab 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -11,7 +11,7 @@ on: pull_request: branches: - master - -cranversion + - cranversion name: R-CMD-check From 2a6e60d3c8afa03c507e14868503ba83979b4074 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 15:30:43 +0200 Subject: [PATCH 11/16] adding no-vignette version of R-CMD-check --- .../workflows/R-CMD-check-no-vignette.yaml | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 .github/workflows/R-CMD-check-no-vignette.yaml diff --git a/.github/workflows/R-CMD-check-no-vignette.yaml b/.github/workflows/R-CMD-check-no-vignette.yaml new file mode 100644 index 000000000..e74ab31b8 --- /dev/null +++ b/.github/workflows/R-CMD-check-no-vignette.yaml @@ -0,0 +1,89 @@ +# This is duplicate of R-CMD-check, but with arguments skipping the building of vignettes which often tends to fail +# on older R versions. + +on: + push: + branches: + - master + - cranversion + pull_request: + branches: + - master + - cranversion + +name: R-CMD-check-no-vignette + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: windows-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } + - {os: ubuntu-20.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"} # Added manually by martinju + - {os: ubuntu-20.04, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} # Added manually by martinju, remove some time in 2022 + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--ignore-vignettes", "--no-build-vignettes"), + build_args = c("--no-manual", "--ignore-vignettes", "--no-build-vignettes"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@main + with: + name: ${{ runner.os }}-r${{ matrix.config.r }}-results + path: check From ecb7970c41aee4bda6ee32e1fc67563a0910ddcc Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 10 Aug 2021 15:59:01 +0200 Subject: [PATCH 12/16] smoother version just running the manually added version without vignette, the others the standard way --- .github/workflows/R-CMD-check-no-vignette.yaml | 8 ++++---- .github/workflows/R-CMD-check.yaml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/R-CMD-check-no-vignette.yaml b/.github/workflows/R-CMD-check-no-vignette.yaml index e74ab31b8..5f5e261bd 100644 --- a/.github/workflows/R-CMD-check-no-vignette.yaml +++ b/.github/workflows/R-CMD-check-no-vignette.yaml @@ -23,10 +23,10 @@ jobs: fail-fast: false matrix: config: - - {os: windows-latest, r: 'release'} - - {os: macOS-latest, r: 'release'} - - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } +# - {os: windows-latest, r: 'release'} +# - {os: macOS-latest, r: 'release'} +# - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} +# - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } - {os: ubuntu-20.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"} # Added manually by martinju - {os: ubuntu-20.04, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} # Added manually by martinju, remove some time in 2022 diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 1f93255ab..a0952ddd4 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -29,8 +29,8 @@ jobs: - {os: macOS-latest, r: 'release'} - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } - - {os: ubuntu-20.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"} # Added manually by martinju - - {os: ubuntu-20.04, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} # Added manually by martinju, remove some time in 2022 +# - {os: ubuntu-20.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"} # Added manually by martinju +# - {os: ubuntu-20.04, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} # Added manually by martinju, remove some time in 2022 env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From 2480134476a953ff9c1f23f3928b75b1c1adf94a Mon Sep 17 00:00:00 2001 From: Martin Date: Fri, 13 Aug 2021 13:57:26 +0200 Subject: [PATCH 13/16] testing lint.yaml updating --- .github/workflows/lint.yaml | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 92a57554d..41c78890b 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -18,27 +18,32 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@master + - uses: r-lib/actions/setup-r@v1 - name: Query dependencies run: | install.packages('remotes') saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") shell: Rscript {0} - - name: Cache R packages - uses: actions/cache@v1 + - name: Restore R package cache + uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: macOS-r-4.0-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: macOS-r-4.0-1- + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- - name: Install dependencies run: | - remotes::install_deps(dependencies = TRUE, type = "binary") - remotes::install_cran("lintr", type = "binary") + install.packages(c("remotes")) + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("lintr") shell: Rscript {0} + - name: Install package + run: R CMD INSTALL . + - name: Lint run: lintr::lint_package() shell: Rscript {0} From f419659d1e29ff63dfec5e7ff12dbb3a5d23a6a1 Mon Sep 17 00:00:00 2001 From: Martin Date: Fri, 13 Aug 2021 14:04:34 +0200 Subject: [PATCH 14/16] linting --- R/explanation.R | 2 +- tests/testthat/test-explanation.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/explanation.R b/R/explanation.R index 649d8cfe7..ce0ef04ff 100644 --- a/R/explanation.R +++ b/R/explanation.R @@ -144,7 +144,7 @@ #' ) #' print(explain_groups$dt) #' } -explain <- function(x, explainer, approach, prediction_zero, n_samples = 1e3,...) { +explain <- function(x, explainer, approach, prediction_zero, n_samples = 1e3, ...) { extras <- list(...) # Check input for x diff --git a/tests/testthat/test-explanation.R b/tests/testthat/test-explanation.R index c82584c41..ce2917838 100644 --- a/tests/testthat/test-explanation.R +++ b/tests/testthat/test-explanation.R @@ -33,8 +33,8 @@ test_that("Test functions in explanation.R", { # Test way to insert test data (shapr=4.0) expect_silent(explain(x_test, explainer, approach = "gaussian", prediction_zero = p0)) expect_silent(explain(head(x_test), explainer, approach = "gaussian", prediction_zero = p0)) - expect_silent(explain(x_test[,1:4], explainer, approach = "gaussian", prediction_zero = p0)) - expect_silent(explain(x_test[1:2,], explainer, approach = "gaussian", prediction_zero = p0)) + expect_silent(explain(x_test[, 1:4], explainer, approach = "gaussian", prediction_zero = p0)) + expect_silent(explain(x_test[1:2, ], explainer, approach = "gaussian", prediction_zero = p0)) # Creating list with lots of different explainer objects From bd695769cef615f03b1dd9cb13441041084a4ba8 Mon Sep 17 00:00:00 2001 From: Martin Jullum Date: Tue, 17 Aug 2021 08:27:02 +0200 Subject: [PATCH 15/16] Update R/explanation.R Co-authored-by: Annabelle Redelmeier --- R/explanation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/explanation.R b/R/explanation.R index ce0ef04ff..5443195d6 100644 --- a/R/explanation.R +++ b/R/explanation.R @@ -23,7 +23,7 @@ #' approaches for estimating the conditional distributions of the data, namely \code{"empirical"}, #' \code{"gaussian"}, \code{"copula"} and \code{"ctree"}. #' In addition, the user also has the option of combining the four approaches. -#' E.g. if you're in a situation where you have trained a model the consists of 10 features, +#' E.g., if you're in a situation where you have trained a model that consists of 10 features, #' and you'd like to use the \code{"gaussian"} approach when you condition on a single feature, #' the \code{"empirical"} approach if you condition on 2-5 features, and \code{"copula"} version #' if you condition on more than 5 features this can be done by simply passing From 277e15b2e63ac0e7d19cce6ec5f59e0eff3f35c6 Mon Sep 17 00:00:00 2001 From: Martin Jullum Date: Tue, 17 Aug 2021 08:27:10 +0200 Subject: [PATCH 16/16] Update R/explanation.R Co-authored-by: Annabelle Redelmeier --- R/explanation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/explanation.R b/R/explanation.R index 5443195d6..374aba5c1 100644 --- a/R/explanation.R +++ b/R/explanation.R @@ -28,7 +28,7 @@ #' the \code{"empirical"} approach if you condition on 2-5 features, and \code{"copula"} version #' if you condition on more than 5 features this can be done by simply passing #' \code{approach = c("gaussian", rep("empirical", 4), rep("copula", 5))}. If -#' \code{"approach[i]" = "gaussian"} it means that you'd like to use the \code{"gaussian"} approach +#' \code{"approach[i]" = "gaussian"} means that you'd like to use the \code{"gaussian"} approach #' when conditioning on \code{i} features. #' #' For \code{approach="ctree"}, \code{n_samples} corresponds to the number of samples