From 87128df92d72e6b4c151b93be2a02c0848fa506c Mon Sep 17 00:00:00 2001 From: Tomer Iwan Date: Wed, 7 Feb 2024 12:16:34 +0100 Subject: [PATCH] feat: create summary table for dataframe --- DESCRIPTION | 5 +- NAMESPACE | 11 ++ R/create_dataset_summary_table.R | 257 ++++++++++++++++++++++++++ man/calculate_category_percentages.Rd | 28 +++ man/create_dataset_summary_table.Rd | 20 ++ man/find_maximum_value.Rd | 27 +++ man/find_minimum_value.Rd | 27 +++ man/get_distribution_statistics.Rd | 22 +++ man/get_first_element_class.Rd | 24 +++ man/is_unique_column.Rd | 34 ++++ 10 files changed, 454 insertions(+), 1 deletion(-) create mode 100644 R/create_dataset_summary_table.R create mode 100644 man/calculate_category_percentages.Rd create mode 100644 man/create_dataset_summary_table.Rd create mode 100644 man/find_maximum_value.Rd create mode 100644 man/find_minimum_value.Rd create mode 100644 man/get_distribution_statistics.Rd create mode 100644 man/get_first_element_class.Rd create mode 100644 man/is_unique_column.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 4fe6e16..9b6ea99 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,11 +16,14 @@ Imports: cli, dplyr, findR, + janitor, kit, + lubridate, magrittr, purrr, stats, - stringr + stringr, + tibble Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 6cfa9fb..de73228 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(assert_date_named) export(assert_logical_named) export(assert_no_duplicates_in_group) export(assertion_message) +export(calculate_category_percentages) export(check_double_columns) export(check_duplicates) export(check_na_columns) @@ -15,13 +16,19 @@ export(check_posixct_type) export(check_rows) export(check_zero_columns) export(count_more_than_1) +export(create_dataset_summary_table) export(drop_na_column_names) export(duplicates_in_column) export(find_common_columns) +export(find_maximum_value) +export(find_minimum_value) export(find_pattern_r) +export(get_distribution_statistics) +export(get_first_element_class) export(get_values) export(identify_join_pairs) export(identify_outliers) +export(is_unique_column) export(md_complete_cases) export(regex_content_parameter) export(regex_time) @@ -37,3 +44,7 @@ export(str_detect_in_file) export(test_all_equal) export(unique_id) importFrom(magrittr,"%>%") +importFrom(purrr,map_chr) +importFrom(purrr,map_dbl) +importFrom(purrr,map_lgl) +importFrom(purrr,possibly) diff --git a/R/create_dataset_summary_table.R b/R/create_dataset_summary_table.R new file mode 100644 index 0000000..5d9a81f --- /dev/null +++ b/R/create_dataset_summary_table.R @@ -0,0 +1,257 @@ +#' Create dataset summary statistics table +#' +#' This function creates a summary statistics table for a dataframe, providing insights into +#' the nature of the data contained within. It includes detailed statistics for each column, +#' such as column types, missing value percentages, minimum and maximum values for numeric +#' columns, patterns for character columns, uniqueness of identifiers, and distributions. +#' +#' @param df_input A dataframe for which to create a summary statistics table. +#' @return A tibble with comprehensive summary statistics for each column in the input dataframe. +#' @importFrom purrr map_chr map_dbl map_lgl possibly +#' @export +create_dataset_summary_table <- function(df_input) { + # Store column names + column_names <- names(df_input) + + # Store column types + column_types <- purrr::map_chr(df_input, get_first_element_class) + + # Calculate missing value percentages + missing_value_percentages <- colMeans(is.na(df_input)) * 100 + + # Find minimum and maximum values for numeric columns + min_values <- purrr::map_dbl(df_input, find_minimum_value) + max_values <- purrr::map_dbl(df_input, find_maximum_value) + + # Check for uniqueness of identifiers + unique_identifiers <- purrr::map_lgl(column_names, is_unique_column, data_frame = df_input) + + # Calculate distribution statistics + distribution_stats <- purrr::map_chr(df_input, get_distribution_statistics) + + # Calculate distribution percentages + distribution_percentages <- purrr::map_chr(df_input, calculate_category_percentages) + + # Create a tibble with all summary statistics + summary_table <- tibble::tibble( + column_name = column_names, + column_type = column_types, + missing_value_percentage = missing_value_percentages, + min_value = min_values, + max_value = max_values, + unique_identifier = unique_identifiers, + distribution_statistics = distribution_stats, + distribution_percentages = distribution_percentages + ) + + return(summary_table) +} + + +#' Check if a column in a dataframe has unique values +#' +#' @param column_name The name of the column to check for uniqueness. +#' @param data_frame A dataframe containing the column to check. +#' @return \code{TRUE} if the column has unique values, \code{FALSE} otherwise. +#' @export +#' @examples +#' # Create a dataframe with a unique ID column +#' data_frame <- tibble::tibble( +#' id = c(1, 2, 3, 4, 5), +#' value = c("a", "b", "c", "d", "e") +#' ) +#' is_unique_column("id", data_frame) # Returns TRUE +#' +#' # Create a dataframe with duplicate values in the ID column +#' data_frame <- tibble::tibble( +#' id = c(1, 2, 3, 4, 5, 1), +#' value = c("a", "b", "c", "d", "e", "a") +#' ) +#' is_unique_column("id", data_frame) # Returns FALSE +is_unique_column <- function(column_name, data_frame) { + column_values <- data_frame %>% dplyr::pull(column_name) + return(nrow(data_frame) == length(unique(column_values))) +} + +#' Retrieve the class of the first element of a vector +#' +#' @param input_vector A vector whose first element's class is to be retrieved. +#' @return The class of the first element of the input vector. +#' @export +#' @examples +#' # Get the class of the first element in a numeric vector +#' get_first_element_class(c(1, 2, 3)) # Returns "numeric" +#' +#' # Get the class of the first element in a character vector +#' get_first_element_class(c("apple", "banana", "cherry")) # Returns "character" +get_first_element_class <- function(input_vector) { + return(class(input_vector)[1]) +} + +#' Find the minimum numeric value in a vector, ignoring non-numeric values +#' +#' @param numeric_vector A vector from which to find the minimum numeric value. +#' @return The minimum numeric value in the input vector, or NA if none exist. +#' @export +#' @examples +#' # Find the minimum of a numeric vector +#' find_minimum_value(c(3, 1, 4, 1, 5, 9)) # Returns 1 +#' +#' # Find the minimum of a mixed vector with non-numeric values +#' find_minimum_value(c(3, 1, 4, "two", 5, 9)) # Returns 1 +#' +#' # Attempt to find the minimum of a vector with only non-numeric values +#' find_minimum_value(c("one", "two", "three")) # Returns NA +find_minimum_value <- function(numeric_vector) { + if (is.numeric(numeric_vector)) { + return(min(numeric_vector, na.rm = TRUE)) + } else { + return(NA) + } +} + +#' Find the maximum numeric value in a vector, ignoring non-numeric values +#' +#' @param numeric_vector A vector from which to find the maximum numeric value. +#' @return The maximum numeric value in the input vector, or NA if none exist. +#' @export +#' @examples +#' # Find the maximum of a numeric vector +#' find_maximum_value(c(3, 1, 4, 1, 5, 9)) # Returns 9 +#' +#' # Find the maximum of a mixed vector with non-numeric values +#' find_maximum_value(c(3, 1, 4, "two", 5, 9)) # Returns 9 +#' +#' # Attempt to find the maximum of a vector with only non-numeric values +#' find_maximum_value(c("one", "two", "three")) # Returns NA +find_maximum_value <- function(numeric_vector) { + if (is.numeric(numeric_vector)) { + return(max(numeric_vector, na.rm = TRUE)) + } else { + return(NA) + } +} + +#' Compute distribution statistics for a numeric vector +#' +#' This function computes summary statistics such as quartiles, mean, and standard deviation for a numeric vector. +#' +#' @param data_vector A numeric vector for which to compute summary statistics. +#' @return A character string describing the summary statistics of the input vector. +#' @export +#' @examples +#' # Compute summary statistics for a numeric vector +#' data_vector <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) +#' get_distribution_statistics(data_vector) +get_distribution_statistics <- function(data_vector) { + if (is.numeric(data_vector)) { + calculated_quantiles <- stats::quantile(data_vector, probs = c(0.25, 0.5, 0.75), na.rm = TRUE) + calculated_summary_stats <- c( + Mean = mean(data_vector, na.rm = TRUE), + Std_Dev = stats::sd(data_vector, na.rm = TRUE) + ) + formatted_output <- paste( + "Q1 =", calculated_quantiles[1], " | median =", calculated_quantiles[2], " | Q3 = ", calculated_quantiles[3], + " | mean =", format(calculated_summary_stats["Mean"], digits = 3), " | stdev =", format(calculated_summary_stats["Std_Dev"], digits = 3) + ) + return(formatted_output) + } else if (is.character(data_vector)) { + # Filter out non-numeric characters + numeric_elements <- grepl("^\\d*\\.?\\d*$", data_vector) + if (any(numeric_elements)) { + numeric_values <- as.numeric(data_vector[numeric_elements]) + calculated_quantiles <- stats::quantile(numeric_values, probs = c(0.25, 0.5, 0.75), na.rm = TRUE) + calculated_summary_stats <- c( + Mean = mean(numeric_values, na.rm = TRUE), + Std_Dev = stats::sd(numeric_values, na.rm = TRUE) + ) + formatted_output <- paste( + "Q1 =", calculated_quantiles[1], " | median =", calculated_quantiles[2], " | Q3 = ", calculated_quantiles[3], + " | mean =", format(calculated_summary_stats["Mean"], digits = 3), " | stdev =", format(calculated_summary_stats["Std_Dev"], digits = 3) + ) + return(formatted_output) + } else { + return(NA) + } + } else { + return(NA) + } +} + +#' Calculate the percentage of categories in a data vector +#' +#' This function calculates the percentage of each category in a given data vector and returns the top 10 categories along with their percentages. +#' If the data vector is of Date class, it is converted to POSIXct. If the sum of the percentages is not 100%, an "Other" category is added to make up the difference, but only if the number of unique values exceeds 10. +#' If the data vector is of POSIXct class and the smallest percentage is less than 1%, the function returns "Not enough occurrences." +#' +#' @param data_vector A vector of categorical data. +#' @return A character string detailing the top 10 categories and their percentages, or a special message indicating not enough occurrences or unsupported data type. +#' @export +#' @examples +#' # Example with a character vector +#' data_vector <- c("cat", "dog", "bird", "cat", "dog", "cat", "other") +#' calculate_category_percentages(data_vector) +#' +#' # Example with a Date vector +#' data_vector <- as.Date(c("2020-01-01", "2020-01-02", "2020-01-03")) +#' calculate_category_percentages(data_vector) +calculate_category_percentages <- function(data_vector) { + # Initialize variables + percentages <- n <- percent <- categories <- .data <- NULL + + # Convert Date objects to POSIXct + if (inherits(data_vector, "Date")) { + data_vector <- as.POSIXct(data_vector) + } + + # Check if data_vector is a POSIXct object + if (!lubridate::is.POSIXct(data_vector)) { + # Calculate frequencies and percentages for non-Date vectors + frequency_table <- janitor::tabyl(data_vector) %>% + dplyr::arrange(dplyr::desc(n)) %>% + dplyr::mutate(percent = n / sum(n) * 100) %>% + dplyr::filter(!is.na(.data)) %>% + dplyr::slice_max(order_by = percent, n = 10) + + categories <- frequency_table$data_vector + percentages <- format(round(frequency_table$percent, 2), nsmall = 2) + + # Adjust percentages if total is not 100% and there are more than 10 unique values + if (sum(as.numeric(percentages)) < 100 && length(unique(data_vector)) > 10) { + percentages <- c(percentages, sprintf("%.2f", 100 - sum(as.numeric(percentages)))) + categories <- c(categories, "Other") + } else { + percentages[length(percentages)] <- sprintf("%.2f", 100 - sum(as.numeric(percentages[-length(percentages)]))) + } + + result <- paste0(categories, ": (", percentages, "%)", collapse = " | ") + return(result) + } else { + # Calculate frequencies and percentages for POSIXct vectors + frequency_table <- janitor::tabyl(data_vector) %>% + dplyr::arrange(dplyr::desc(n)) %>% + dplyr::mutate(percent = n / sum(n) * 100) %>% + dplyr::filter(!is.na(.data)) %>% + dplyr::slice_max(order_by = percent, n = 10) + + categories <- as.character(frequency_table$data_vector) + percentages <- format(round(frequency_table$percent, 2), nsmall = 2) + + # Adjust percentages if total is not 100% and there are more than 10 unique values + if (sum(as.numeric(percentages)) < 100 && length(unique(data_vector)) > 10) { + percentages <- c(percentages, sprintf("%.2f", 100 - sum(as.numeric(percentages)))) + categories <- c(categories, "Other") + } else { + percentages[length(percentages)] <- sprintf("%.2f", 100 - sum(as.numeric(percentages[-length(percentages)]))) + } + + # Check if the smallest percentage is less than 1% + if (as.numeric(percentages[1]) < 1) { + return("Not enough occurrences") + } else { + result <- paste0(categories, ": (", percentages, "%)", collapse = " | ") + return(result) + } + } + return("Data type not supported") +} diff --git a/man/calculate_category_percentages.Rd b/man/calculate_category_percentages.Rd new file mode 100644 index 0000000..4454633 --- /dev/null +++ b/man/calculate_category_percentages.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_dataset_summary_table.R +\name{calculate_category_percentages} +\alias{calculate_category_percentages} +\title{Calculate the percentage of categories in a data vector} +\usage{ +calculate_category_percentages(data_vector) +} +\arguments{ +\item{data_vector}{A vector of categorical data.} +} +\value{ +A character string detailing the top 10 categories and their percentages, or a special message indicating not enough occurrences or unsupported data type. +} +\description{ +This function calculates the percentage of each category in a given data vector and returns the top 10 categories along with their percentages. +If the data vector is of Date class, it is converted to POSIXct. If the sum of the percentages is not 100\%, an "Other" category is added to make up the difference, but only if the number of unique values exceeds 10. +If the data vector is of POSIXct class and the smallest percentage is less than 1\%, the function returns "Not enough occurrences." +} +\examples{ +# Example with a character vector +data_vector <- c("cat", "dog", "bird", "cat", "dog", "cat", "other") +calculate_category_percentages(data_vector) + +# Example with a Date vector +data_vector <- as.Date(c("2020-01-01", "2020-01-02", "2020-01-03")) +calculate_category_percentages(data_vector) +} diff --git a/man/create_dataset_summary_table.Rd b/man/create_dataset_summary_table.Rd new file mode 100644 index 0000000..7715124 --- /dev/null +++ b/man/create_dataset_summary_table.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_dataset_summary_table.R +\name{create_dataset_summary_table} +\alias{create_dataset_summary_table} +\title{Create dataset summary statistics table} +\usage{ +create_dataset_summary_table(df_input) +} +\arguments{ +\item{df_input}{A dataframe for which to create a summary statistics table.} +} +\value{ +A tibble with comprehensive summary statistics for each column in the input dataframe. +} +\description{ +This function creates a summary statistics table for a dataframe, providing insights into +the nature of the data contained within. It includes detailed statistics for each column, +such as column types, missing value percentages, minimum and maximum values for numeric +columns, patterns for character columns, uniqueness of identifiers, and distributions. +} diff --git a/man/find_maximum_value.Rd b/man/find_maximum_value.Rd new file mode 100644 index 0000000..3361ae2 --- /dev/null +++ b/man/find_maximum_value.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_dataset_summary_table.R +\name{find_maximum_value} +\alias{find_maximum_value} +\title{Find the maximum numeric value in a vector, ignoring non-numeric values} +\usage{ +find_maximum_value(numeric_vector) +} +\arguments{ +\item{numeric_vector}{A vector from which to find the maximum numeric value.} +} +\value{ +The maximum numeric value in the input vector, or NA if none exist. +} +\description{ +Find the maximum numeric value in a vector, ignoring non-numeric values +} +\examples{ +# Find the maximum of a numeric vector +find_maximum_value(c(3, 1, 4, 1, 5, 9)) # Returns 9 + +# Find the maximum of a mixed vector with non-numeric values +find_maximum_value(c(3, 1, 4, "two", 5, 9)) # Returns 9 + +# Attempt to find the maximum of a vector with only non-numeric values +find_maximum_value(c("one", "two", "three")) # Returns NA +} diff --git a/man/find_minimum_value.Rd b/man/find_minimum_value.Rd new file mode 100644 index 0000000..53769be --- /dev/null +++ b/man/find_minimum_value.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_dataset_summary_table.R +\name{find_minimum_value} +\alias{find_minimum_value} +\title{Find the minimum numeric value in a vector, ignoring non-numeric values} +\usage{ +find_minimum_value(numeric_vector) +} +\arguments{ +\item{numeric_vector}{A vector from which to find the minimum numeric value.} +} +\value{ +The minimum numeric value in the input vector, or NA if none exist. +} +\description{ +Find the minimum numeric value in a vector, ignoring non-numeric values +} +\examples{ +# Find the minimum of a numeric vector +find_minimum_value(c(3, 1, 4, 1, 5, 9)) # Returns 1 + +# Find the minimum of a mixed vector with non-numeric values +find_minimum_value(c(3, 1, 4, "two", 5, 9)) # Returns 1 + +# Attempt to find the minimum of a vector with only non-numeric values +find_minimum_value(c("one", "two", "three")) # Returns NA +} diff --git a/man/get_distribution_statistics.Rd b/man/get_distribution_statistics.Rd new file mode 100644 index 0000000..55063f7 --- /dev/null +++ b/man/get_distribution_statistics.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_dataset_summary_table.R +\name{get_distribution_statistics} +\alias{get_distribution_statistics} +\title{Compute distribution statistics for a numeric vector} +\usage{ +get_distribution_statistics(data_vector) +} +\arguments{ +\item{data_vector}{A numeric vector for which to compute summary statistics.} +} +\value{ +A character string describing the summary statistics of the input vector. +} +\description{ +This function computes summary statistics such as quartiles, mean, and standard deviation for a numeric vector. +} +\examples{ +# Compute summary statistics for a numeric vector +data_vector <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) +get_distribution_statistics(data_vector) +} diff --git a/man/get_first_element_class.Rd b/man/get_first_element_class.Rd new file mode 100644 index 0000000..a0cb067 --- /dev/null +++ b/man/get_first_element_class.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_dataset_summary_table.R +\name{get_first_element_class} +\alias{get_first_element_class} +\title{Retrieve the class of the first element of a vector} +\usage{ +get_first_element_class(input_vector) +} +\arguments{ +\item{input_vector}{A vector whose first element's class is to be retrieved.} +} +\value{ +The class of the first element of the input vector. +} +\description{ +Retrieve the class of the first element of a vector +} +\examples{ +# Get the class of the first element in a numeric vector +get_first_element_class(c(1, 2, 3)) # Returns "numeric" + +# Get the class of the first element in a character vector +get_first_element_class(c("apple", "banana", "cherry")) # Returns "character" +} diff --git a/man/is_unique_column.Rd b/man/is_unique_column.Rd new file mode 100644 index 0000000..7fdf95c --- /dev/null +++ b/man/is_unique_column.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_dataset_summary_table.R +\name{is_unique_column} +\alias{is_unique_column} +\title{Check if a column in a dataframe has unique values} +\usage{ +is_unique_column(column_name, data_frame) +} +\arguments{ +\item{column_name}{The name of the column to check for uniqueness.} + +\item{data_frame}{A dataframe containing the column to check.} +} +\value{ +\code{TRUE} if the column has unique values, \code{FALSE} otherwise. +} +\description{ +Check if a column in a dataframe has unique values +} +\examples{ +# Create a dataframe with a unique ID column +data_frame <- tibble::tibble( + id = c(1, 2, 3, 4, 5), + value = c("a", "b", "c", "d", "e") +) +is_unique_column("id", data_frame) # Returns TRUE + +# Create a dataframe with duplicate values in the ID column +data_frame <- tibble::tibble( + id = c(1, 2, 3, 4, 5, 1), + value = c("a", "b", "c", "d", "e", "a") +) +is_unique_column("id", data_frame) # Returns FALSE +}