feat: create summary table for dataframe

vusaverse · Feb 7, 2024 · 87128df · 87128df
1 parent 45f4363
commit 87128df
Show file tree

Hide file tree

Showing 10 changed files with 454 additions and 1 deletion.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -16,11 +16,14 @@ Imports:
     cli,
     dplyr,
     findR,
+    janitor,
     kit,
+    lubridate,
     magrittr,
     purrr,
     stats,
-    stringr
+    stringr,
+    tibble
 Suggests: 
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,6 +4,7 @@ export(assert_date_named)
 export(assert_logical_named)
 export(assert_no_duplicates_in_group)
 export(assertion_message)
+export(calculate_category_percentages)
 export(check_double_columns)
 export(check_duplicates)
 export(check_na_columns)
@@ -15,13 +16,19 @@ export(check_posixct_type)
 export(check_rows)
 export(check_zero_columns)
 export(count_more_than_1)
+export(create_dataset_summary_table)
 export(drop_na_column_names)
 export(duplicates_in_column)
 export(find_common_columns)
+export(find_maximum_value)
+export(find_minimum_value)
 export(find_pattern_r)
+export(get_distribution_statistics)
+export(get_first_element_class)
 export(get_values)
 export(identify_join_pairs)
 export(identify_outliers)
+export(is_unique_column)
 export(md_complete_cases)
 export(regex_content_parameter)
 export(regex_time)
@@ -37,3 +44,7 @@ export(str_detect_in_file)
 export(test_all_equal)
 export(unique_id)
 importFrom(magrittr,"%>%")
+importFrom(purrr,map_chr)
+importFrom(purrr,map_dbl)
+importFrom(purrr,map_lgl)
+importFrom(purrr,possibly)
diff --git a/R/create_dataset_summary_table.R b/R/create_dataset_summary_table.R
@@ -0,0 +1,257 @@
+#' Create dataset summary statistics table
+#'
+#' This function creates a summary statistics table for a dataframe, providing insights into
+#' the nature of the data contained within. It includes detailed statistics for each column,
+#' such as column types, missing value percentages, minimum and maximum values for numeric
+#' columns, patterns for character columns, uniqueness of identifiers, and distributions.
+#'
+#' @param df_input A dataframe for which to create a summary statistics table.
+#' @return A tibble with comprehensive summary statistics for each column in the input dataframe.
+#' @importFrom purrr map_chr map_dbl map_lgl possibly
+#' @export
+create_dataset_summary_table <- function(df_input) {
+  # Store column names
+  column_names <- names(df_input)
+
+  # Store column types
+  column_types <- purrr::map_chr(df_input, get_first_element_class)
+
+  # Calculate missing value percentages
+  missing_value_percentages <- colMeans(is.na(df_input)) * 100
+
+  # Find minimum and maximum values for numeric columns
+  min_values <- purrr::map_dbl(df_input, find_minimum_value)
+  max_values <- purrr::map_dbl(df_input, find_maximum_value)
+
+  # Check for uniqueness of identifiers
+  unique_identifiers <- purrr::map_lgl(column_names, is_unique_column, data_frame = df_input)
+
+  # Calculate distribution statistics
+  distribution_stats <- purrr::map_chr(df_input, get_distribution_statistics)
+
+  # Calculate distribution percentages
+  distribution_percentages <- purrr::map_chr(df_input, calculate_category_percentages)
+
+  # Create a tibble with all summary statistics
+  summary_table <- tibble::tibble(
+    column_name = column_names,
+    column_type = column_types,
+    missing_value_percentage = missing_value_percentages,
+    min_value = min_values,
+    max_value = max_values,
+    unique_identifier = unique_identifiers,
+    distribution_statistics = distribution_stats,
+    distribution_percentages = distribution_percentages
+  )
+
+  return(summary_table)
+}
+
+
+#' Check if a column in a dataframe has unique values
+#'
+#' @param column_name The name of the column to check for uniqueness.
+#' @param data_frame A dataframe containing the column to check.
+#' @return \code{TRUE} if the column has unique values, \code{FALSE} otherwise.
+#' @export
+#' @examples
+#' # Create a dataframe with a unique ID column
+#' data_frame <- tibble::tibble(
+#'   id = c(1, 2, 3, 4, 5),
+#'   value = c("a", "b", "c", "d", "e")
+#' )
+#' is_unique_column("id", data_frame) # Returns TRUE
+#'
+#' # Create a dataframe with duplicate values in the ID column
+#' data_frame <- tibble::tibble(
+#'   id = c(1, 2, 3, 4, 5, 1),
+#'   value = c("a", "b", "c", "d", "e", "a")
+#' )
+#' is_unique_column("id", data_frame) # Returns FALSE
+is_unique_column <- function(column_name, data_frame) {
+  column_values <- data_frame %>% dplyr::pull(column_name)
+  return(nrow(data_frame) == length(unique(column_values)))
+}
+
+#' Retrieve the class of the first element of a vector
+#'
+#' @param input_vector A vector whose first element's class is to be retrieved.
+#' @return The class of the first element of the input vector.
+#' @export
+#' @examples
+#' # Get the class of the first element in a numeric vector
+#' get_first_element_class(c(1, 2, 3)) # Returns "numeric"
+#'
+#' # Get the class of the first element in a character vector
+#' get_first_element_class(c("apple", "banana", "cherry")) # Returns "character"
+get_first_element_class <- function(input_vector) {
+  return(class(input_vector)[1])
+}
+
+#' Find the minimum numeric value in a vector, ignoring non-numeric values
+#'
+#' @param numeric_vector A vector from which to find the minimum numeric value.
+#' @return The minimum numeric value in the input vector, or NA if none exist.
+#' @export
+#' @examples
+#' # Find the minimum of a numeric vector
+#' find_minimum_value(c(3, 1, 4, 1, 5, 9)) # Returns   1
+#'
+#' # Find the minimum of a mixed vector with non-numeric values
+#' find_minimum_value(c(3, 1, 4, "two", 5, 9)) # Returns   1
+#'
+#' # Attempt to find the minimum of a vector with only non-numeric values
+#' find_minimum_value(c("one", "two", "three")) # Returns NA
+find_minimum_value <- function(numeric_vector) {
+  if (is.numeric(numeric_vector)) {
+    return(min(numeric_vector, na.rm = TRUE))
+  } else {
+    return(NA)
+  }
+}
+
+#' Find the maximum numeric value in a vector, ignoring non-numeric values
+#'
+#' @param numeric_vector A vector from which to find the maximum numeric value.
+#' @return The maximum numeric value in the input vector, or NA if none exist.
+#' @export
+#' @examples
+#' # Find the maximum of a numeric vector
+#' find_maximum_value(c(3, 1, 4, 1, 5, 9)) # Returns   9
+#'
+#' # Find the maximum of a mixed vector with non-numeric values
+#' find_maximum_value(c(3, 1, 4, "two", 5, 9)) # Returns   9
+#'
+#' # Attempt to find the maximum of a vector with only non-numeric values
+#' find_maximum_value(c("one", "two", "three")) # Returns NA
+find_maximum_value <- function(numeric_vector) {
+  if (is.numeric(numeric_vector)) {
+    return(max(numeric_vector, na.rm = TRUE))
+  } else {
+    return(NA)
+  }
+}
+
+#' Compute distribution statistics for a numeric vector
+#'
+#' This function computes summary statistics such as quartiles, mean, and standard deviation for a numeric vector.
+#'
+#' @param data_vector A numeric vector for which to compute summary statistics.
+#' @return A character string describing the summary statistics of the input vector.
+#' @export
+#' @examples
+#' # Compute summary statistics for a numeric vector
+#' data_vector <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+#' get_distribution_statistics(data_vector)
+get_distribution_statistics <- function(data_vector) {
+  if (is.numeric(data_vector)) {
+    calculated_quantiles <- stats::quantile(data_vector, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
+    calculated_summary_stats <- c(
+      Mean = mean(data_vector, na.rm = TRUE),
+      Std_Dev = stats::sd(data_vector, na.rm = TRUE)
+    )
+    formatted_output <- paste(
+      "Q1 =", calculated_quantiles[1], " | median =", calculated_quantiles[2], " | Q3 = ", calculated_quantiles[3],
+      " | mean =", format(calculated_summary_stats["Mean"], digits = 3), " | stdev =", format(calculated_summary_stats["Std_Dev"], digits = 3)
+    )
+    return(formatted_output)
+  } else if (is.character(data_vector)) {
+    # Filter out non-numeric characters
+    numeric_elements <- grepl("^\\d*\\.?\\d*$", data_vector)
+    if (any(numeric_elements)) {
+      numeric_values <- as.numeric(data_vector[numeric_elements])
+      calculated_quantiles <- stats::quantile(numeric_values, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
+      calculated_summary_stats <- c(
+        Mean = mean(numeric_values, na.rm = TRUE),
+        Std_Dev = stats::sd(numeric_values, na.rm = TRUE)
+      )
+      formatted_output <- paste(
+        "Q1 =", calculated_quantiles[1], " | median =", calculated_quantiles[2], " | Q3 = ", calculated_quantiles[3],
+        " | mean =", format(calculated_summary_stats["Mean"], digits = 3), " | stdev =", format(calculated_summary_stats["Std_Dev"], digits = 3)
+      )
+      return(formatted_output)
+    } else {
+      return(NA)
+    }
+  } else {
+    return(NA)
+  }
+}
+
+#' Calculate the percentage of categories in a data vector
+#'
+#' This function calculates the percentage of each category in a given data vector and returns the top   10 categories along with their percentages.
+#' If the data vector is of Date class, it is converted to POSIXct. If the sum of the percentages is not   100%, an "Other" category is added to make up the difference, but only if the number of unique values exceeds  10.
+#' If the data vector is of POSIXct class and the smallest percentage is less than   1%, the function returns "Not enough occurrences."
+#'
+#' @param data_vector A vector of categorical data.
+#' @return A character string detailing the top   10 categories and their percentages, or a special message indicating not enough occurrences or unsupported data type.
+#' @export
+#' @examples
+#' # Example with a character vector
+#' data_vector <- c("cat", "dog", "bird", "cat", "dog", "cat", "other")
+#' calculate_category_percentages(data_vector)
+#'
+#' # Example with a Date vector
+#' data_vector <- as.Date(c("2020-01-01", "2020-01-02", "2020-01-03"))
+#' calculate_category_percentages(data_vector)
+calculate_category_percentages <- function(data_vector) {
+  # Initialize variables
+  percentages <- n <- percent <- categories <- .data <- NULL
+
+  # Convert Date objects to POSIXct
+  if (inherits(data_vector, "Date")) {
+    data_vector <- as.POSIXct(data_vector)
+  }
+
+  # Check if data_vector is a POSIXct object
+  if (!lubridate::is.POSIXct(data_vector)) {
+    # Calculate frequencies and percentages for non-Date vectors
+    frequency_table <- janitor::tabyl(data_vector) %>%
+      dplyr::arrange(dplyr::desc(n)) %>%
+      dplyr::mutate(percent = n / sum(n) * 100) %>%
+      dplyr::filter(!is.na(.data)) %>%
+      dplyr::slice_max(order_by = percent, n = 10)
+
+    categories <- frequency_table$data_vector
+    percentages <- format(round(frequency_table$percent, 2), nsmall = 2)
+
+    # Adjust percentages if total is not   100% and there are more than  10 unique values
+    if (sum(as.numeric(percentages)) < 100 && length(unique(data_vector)) > 10) {
+      percentages <- c(percentages, sprintf("%.2f", 100 - sum(as.numeric(percentages))))
+      categories <- c(categories, "Other")
+    } else {
+      percentages[length(percentages)] <- sprintf("%.2f", 100 - sum(as.numeric(percentages[-length(percentages)])))
+    }
+
+    result <- paste0(categories, ": (", percentages, "%)", collapse = " | ")
+    return(result)
+  } else {
+    # Calculate frequencies and percentages for POSIXct vectors
+    frequency_table <- janitor::tabyl(data_vector) %>%
+      dplyr::arrange(dplyr::desc(n)) %>%
+      dplyr::mutate(percent = n / sum(n) * 100) %>%
+      dplyr::filter(!is.na(.data)) %>%
+      dplyr::slice_max(order_by = percent, n = 10)
+
+    categories <- as.character(frequency_table$data_vector)
+    percentages <- format(round(frequency_table$percent, 2), nsmall = 2)
+
+    # Adjust percentages if total is not   100% and there are more than  10 unique values
+    if (sum(as.numeric(percentages)) < 100 && length(unique(data_vector)) > 10) {
+      percentages <- c(percentages, sprintf("%.2f", 100 - sum(as.numeric(percentages))))
+      categories <- c(categories, "Other")
+    } else {
+      percentages[length(percentages)] <- sprintf("%.2f", 100 - sum(as.numeric(percentages[-length(percentages)])))
+    }
+
+    # Check if the smallest percentage is less than   1%
+    if (as.numeric(percentages[1]) < 1) {
+      return("Not enough occurrences")
+    } else {
+      result <- paste0(categories, ": (", percentages, "%)", collapse = " | ")
+      return(result)
+    }
+  }
+  return("Data type not supported")
+}
diff --git a/man/calculate_category_percentages.Rd b/man/calculate_category_percentages.Rd
diff --git a/man/create_dataset_summary_table.Rd b/man/create_dataset_summary_table.Rd
diff --git a/man/find_maximum_value.Rd b/man/find_maximum_value.Rd