-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#24 add descriptive stat fns from manuscript w roxygen code
- Loading branch information
Showing
1 changed file
with
200 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
# ----- Define Helper Functions for Calculating Summary Statistics ----- | ||
|
||
#' Prepare data for summarising descriptive statistics | ||
#' | ||
#' @description | ||
#' Calculates the number of fixed variables, the number of random variables, | ||
#' the sample size, the number of interactions, the number of linear models, | ||
#' the number of generalised models, the number of fixed effects, and the number | ||
#' of random effects for each analysis in the dataset. | ||
#' Also codes whether each analysis employs a linear model, a generalised model, | ||
#' or a Bayesian model. | ||
#' | ||
#' @param data A tibble containing many-analyst data to be summarised. | ||
#' | ||
#' @return A tibble containing the data prepared for summarising. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' ManyEcoEvo::ManyEcoEvo %>% | ||
#' select(data) %>% | ||
#' unnest(everything()) %>% | ||
#' prepare_df_for_summarising() | ||
#' @import dplyr | ||
prepare_df_for_summarising <- function(data){ | ||
data %>% | ||
mutate(across(.cols = c(num_fixed_variables, | ||
num_random_variables, | ||
sample_size, | ||
num_interactions, | ||
Bayesian, #NA's coming from CHECK values | ||
mixed_model, | ||
num_fixed_effects, | ||
num_random_effects), | ||
as.numeric), | ||
lm = ifelse(linear_model == "linear", 1, 0), | ||
glm = ifelse(linear_model == "generalised", 1, 0)) | ||
} | ||
|
||
#' Calculate the number of teams per dataset for a given subset | ||
#' | ||
#' @param data A tibble containing the data to be analysed. | ||
#' @param subset_name A character vector of length 1, the name of the subset of data being analysed. | ||
#' | ||
#' @return A tibble containing the number of `teams` per `dataset` for a given `subset_name`. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' ManyEcoEvo::ManyEcoEvo %>% | ||
#' select(data) %>% | ||
#' unnest(everything()) %>% | ||
#' prepare_df_for_summarising() %>% | ||
#' calc_teams_per_dataset("all") | ||
#' @import dplyr | ||
calc_teams_per_dataset <- function(data, subset_name = character(1L)){ | ||
data %>% | ||
group_by(dataset) %>% | ||
count(TeamIdentifier) %>% | ||
tally(name = "teams") %>% | ||
mutate(subset = subset_name) | ||
} | ||
|
||
#' Calculate total number of analyses per team for a given subset | ||
#' | ||
#' @description Calculates the number of analyses conducted by each team for each dataset in a given subset. | ||
#' | ||
#' @param data A tibble containing the data to be analysed. | ||
#' @param subset_name A character vector of length 1, the name of the subset of data being analysed. | ||
#' | ||
#' @return A tibble containing the number of `analyses` per `team` for each `dataset` in a given `subset_name`. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' ManyEcoEvo::ManyEcoEvo %>% | ||
#' select(data) %>% | ||
#' unnest(everything()) %>% | ||
#' prepare_df_for_summarising() %>% | ||
#' calc_analyses_per_team("All") | ||
#' @import dplyr | ||
calc_analyses_per_team <- function(data, subset_name = character(1L)){ #TODO this is calculating number of analyses per dataset not number of analyses per team per dataset ... | ||
data %>% | ||
count(dataset, name = "totalanalyses") %>% | ||
mutate(subset = subset_name) | ||
} | ||
|
||
#' Calculate summary statistics for numeric summary variables | ||
#' | ||
#' @description | ||
#' Calculates the mean, standard deviation, minimum and maximum for each numeric summary variable (See [prepare_df_for_summarising()]). | ||
#' | ||
#' for numeric | ||
#' variables used in analyses of each dataset, for a given subset. | ||
#' Summary statistics are aggregated across variable type \(Number of fixed variables | ||
#' within the analysis, number of random variables within the analysis, | ||
#' analysis samplesize, number of interaction terms within the analysis\). | ||
#' | ||
#' @param data A tibble containing the data to be analysed. | ||
#' @param subset_name A character vector of length 1, the name of the subset of | ||
#' data being analysed. | ||
#' | ||
#' @return A tibble containing the mean, standard deviation, minimum and maximum | ||
#' values for each numeric variable used in analyses of each dataset for a given subset. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' ManyEcoEvo::ManyEcoEvo %>% | ||
#' select(data) %>% | ||
#' unnest(everything()) %>% | ||
#' prepare_df_for_summarising() %>% | ||
#' calc_summary_stats_numeric("All") | ||
calc_summary_stats_numeric <- function(data, subset_name = character(1L)){ | ||
data %>% | ||
group_by(dataset) %>% | ||
summarise(across(.cols = c(fixed = num_fixed_effects, | ||
random = num_random_effects, | ||
samplesize = sample_size, | ||
interactions = num_interactions, | ||
), | ||
.fns = list(mean = ~ mean(.x, na.rm = T) %>% round(2), | ||
sd = ~ sd(.x, na.rm = T) %>% round(2), | ||
min = ~ min(.x, na.rm = T), | ||
max = ~ max(.x, na.rm = T)), | ||
.names = "{.fn}_{.col}"), | ||
subset = subset_name) | ||
} | ||
|
||
#' Calculate summary statistics for binary summary variables | ||
#' | ||
#' @description | ||
#' Calculates the total number of analyses using linear models, mixed models, | ||
#' and Bayesian models for each dataset, for a given subset. | ||
#' See [prepare_df_for_summarising()] for details on the binary variables. | ||
#' | ||
#' | ||
#' @param data A tibble containing the data to be analysed. | ||
#' @param subset_name A character vector of length 1, the name of the subset of data being analysed. | ||
#' | ||
#' @return A tibble containing the sum of binary variables used in analyses of each dataset for a given subset. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' ManyEcoEvo::ManyEcoEvo %>% | ||
#' select(data) %>% | ||
#' unnest(everything()) %>% | ||
#' prepare_df_for_summarising() %>% | ||
#' calc_summary_stats_binary("All") | ||
calc_summary_stats_binary <- function(data, subset_name = character(1L)) { | ||
data %>% | ||
group_by(dataset) %>% | ||
summarise(., | ||
sum_linear =sum(lm,na.rm=T), | ||
sum_mixed= sum(mixed_model,na.rm=T), | ||
sum_Bayesian= sum(Bayesian,na.rm=T), subset = subset_name) | ||
} | ||
|
||
#' Count the number of times variables are used across analyses | ||
#' | ||
#' @description | ||
#' This function is used to count the number of times each variable is used across | ||
#' the analyses in the dataset. The output is a tibble with the columns `variable` | ||
#' and `count` which contains the number of times each variable is used across the | ||
#' analyses in the dataset. | ||
|
||
#' Count the number of times variables are used across analyses | ||
#' | ||
#' @param data A tibble of variables used in analyses of each dataset, for a given subset. | ||
#' @param subset_name A character vector of length 1, the name of the subset of data being analysed. | ||
#' | ||
#' @return A tibble containing the number of times each variable is used across the analyses in the dataset. | ||
#' @export | ||
#' | ||
#' @details | ||
#' Takes a tibble of diversity data, i.e. data that is ready for computing Sorensen | ||
#' diversity indices and computes the number of times each variable is used across | ||
#' the analyses. Note, that the function does not group by dataset, as the layout of the | ||
#' dataset assumes that each variable within a given dataset does not occur in another dataset. | ||
#' @importFrom tibble enframe | ||
#' @import dplyr | ||
calculate_variable_counts <- function(data, subset_name = character(1L)){ | ||
colSums(!is.na(data)) %>% | ||
enframe(name = "variable", value = "count") %>% | ||
mutate( subset = subset_name ) | ||
} | ||
|
||
#' Count the number of different conclusions made by analysts across each dataset. | ||
#' | ||
#' @param data A tibble containing the data to be analysed. | ||
#' @param subset_name A character vector of length 1, the name of the subset of data being analysed. | ||
#' | ||
#' @return A tibble containing counts of each conclusion type made by analysts across each dataset, for a given subset. | ||
#' @export | ||
#' @import dplyr | ||
count_conclusions <- function(data, subset_name = character(1L)){ | ||
data %>% | ||
filter(split_id == 1 & analysis_id == 1) %>% | ||
group_by(dataset, pick(contains("Conclusion"))) %>% | ||
summarise(count = n(), .groups = "drop") %>% | ||
filter(if_any(contains("Conclusion"), ~ !is.na(.x)), | ||
if_any(contains("Conclusion"), ~ .x != "CHECK")) %>% | ||
mutate(subset = subset_name) | ||
} |