From 87128df92d72e6b4c151b93be2a02c0848fa506c Mon Sep 17 00:00:00 2001
From: Tomer Iwan <t.iwan@vu.nl>
Date: Wed, 7 Feb 2024 12:16:34 +0100
Subject: [PATCH] feat: create summary table for dataframe

---
 DESCRIPTION                           |   5 +-
 NAMESPACE                             |  11 ++
 R/create_dataset_summary_table.R      | 257 ++++++++++++++++++++++++++
 man/calculate_category_percentages.Rd |  28 +++
 man/create_dataset_summary_table.Rd   |  20 ++
 man/find_maximum_value.Rd             |  27 +++
 man/find_minimum_value.Rd             |  27 +++
 man/get_distribution_statistics.Rd    |  22 +++
 man/get_first_element_class.Rd        |  24 +++
 man/is_unique_column.Rd               |  34 ++++
 10 files changed, 454 insertions(+), 1 deletion(-)
 create mode 100644 R/create_dataset_summary_table.R
 create mode 100644 man/calculate_category_percentages.Rd
 create mode 100644 man/create_dataset_summary_table.Rd
 create mode 100644 man/find_maximum_value.Rd
 create mode 100644 man/find_minimum_value.Rd
 create mode 100644 man/get_distribution_statistics.Rd
 create mode 100644 man/get_first_element_class.Rd
 create mode 100644 man/is_unique_column.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 4fe6e16..9b6ea99 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -16,11 +16,14 @@ Imports:
     cli,
     dplyr,
     findR,
+    janitor,
     kit,
+    lubridate,
     magrittr,
     purrr,
     stats,
-    stringr
+    stringr,
+    tibble
 Suggests: 
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index 6cfa9fb..de73228 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ export(assert_date_named)
 export(assert_logical_named)
 export(assert_no_duplicates_in_group)
 export(assertion_message)
+export(calculate_category_percentages)
 export(check_double_columns)
 export(check_duplicates)
 export(check_na_columns)
@@ -15,13 +16,19 @@ export(check_posixct_type)
 export(check_rows)
 export(check_zero_columns)
 export(count_more_than_1)
+export(create_dataset_summary_table)
 export(drop_na_column_names)
 export(duplicates_in_column)
 export(find_common_columns)
+export(find_maximum_value)
+export(find_minimum_value)
 export(find_pattern_r)
+export(get_distribution_statistics)
+export(get_first_element_class)
 export(get_values)
 export(identify_join_pairs)
 export(identify_outliers)
+export(is_unique_column)
 export(md_complete_cases)
 export(regex_content_parameter)
 export(regex_time)
@@ -37,3 +44,7 @@ export(str_detect_in_file)
 export(test_all_equal)
 export(unique_id)
 importFrom(magrittr,"%>%")
+importFrom(purrr,map_chr)
+importFrom(purrr,map_dbl)
+importFrom(purrr,map_lgl)
+importFrom(purrr,possibly)
diff --git a/R/create_dataset_summary_table.R b/R/create_dataset_summary_table.R
new file mode 100644
index 0000000..5d9a81f
--- /dev/null
+++ b/R/create_dataset_summary_table.R
@@ -0,0 +1,257 @@
+#' Create dataset summary statistics table
+#'
+#' This function creates a summary statistics table for a dataframe, providing insights into
+#' the nature of the data contained within. It includes detailed statistics for each column,
+#' such as column types, missing value percentages, minimum and maximum values for numeric
+#' columns, patterns for character columns, uniqueness of identifiers, and distributions.
+#'
+#' @param df_input A dataframe for which to create a summary statistics table.
+#' @return A tibble with comprehensive summary statistics for each column in the input dataframe.
+#' @importFrom purrr map_chr map_dbl map_lgl possibly
+#' @export
+create_dataset_summary_table <- function(df_input) {
+  # Store column names
+  column_names <- names(df_input)
+
+  # Store column types
+  column_types <- purrr::map_chr(df_input, get_first_element_class)
+
+  # Calculate missing value percentages
+  missing_value_percentages <- colMeans(is.na(df_input)) * 100
+
+  # Find minimum and maximum values for numeric columns
+  min_values <- purrr::map_dbl(df_input, find_minimum_value)
+  max_values <- purrr::map_dbl(df_input, find_maximum_value)
+
+  # Check for uniqueness of identifiers
+  unique_identifiers <- purrr::map_lgl(column_names, is_unique_column, data_frame = df_input)
+
+  # Calculate distribution statistics
+  distribution_stats <- purrr::map_chr(df_input, get_distribution_statistics)
+
+  # Calculate distribution percentages
+  distribution_percentages <- purrr::map_chr(df_input, calculate_category_percentages)
+
+  # Create a tibble with all summary statistics
+  summary_table <- tibble::tibble(
+    column_name = column_names,
+    column_type = column_types,
+    missing_value_percentage = missing_value_percentages,
+    min_value = min_values,
+    max_value = max_values,
+    unique_identifier = unique_identifiers,
+    distribution_statistics = distribution_stats,
+    distribution_percentages = distribution_percentages
+  )
+
+  return(summary_table)
+}
+
+
+#' Check if a column in a dataframe has unique values
+#'
+#' @param column_name The name of the column to check for uniqueness.
+#' @param data_frame A dataframe containing the column to check.
+#' @return \code{TRUE} if the column has unique values, \code{FALSE} otherwise.
+#' @export
+#' @examples
+#' # Create a dataframe with a unique ID column
+#' data_frame <- tibble::tibble(
+#'   id = c(1, 2, 3, 4, 5),
+#'   value = c("a", "b", "c", "d", "e")
+#' )
+#' is_unique_column("id", data_frame) # Returns TRUE
+#'
+#' # Create a dataframe with duplicate values in the ID column
+#' data_frame <- tibble::tibble(
+#'   id = c(1, 2, 3, 4, 5, 1),
+#'   value = c("a", "b", "c", "d", "e", "a")
+#' )
+#' is_unique_column("id", data_frame) # Returns FALSE
+is_unique_column <- function(column_name, data_frame) {
+  column_values <- data_frame %>% dplyr::pull(column_name)
+  return(nrow(data_frame) == length(unique(column_values)))
+}
+
+#' Retrieve the class of the first element of a vector
+#'
+#' @param input_vector A vector whose first element's class is to be retrieved.
+#' @return The class of the first element of the input vector.
+#' @export
+#' @examples
+#' # Get the class of the first element in a numeric vector
+#' get_first_element_class(c(1, 2, 3)) # Returns "numeric"
+#'
+#' # Get the class of the first element in a character vector
+#' get_first_element_class(c("apple", "banana", "cherry")) # Returns "character"
+get_first_element_class <- function(input_vector) {
+  return(class(input_vector)[1])
+}
+
+#' Find the minimum numeric value in a vector, ignoring non-numeric values
+#'
+#' @param numeric_vector A vector from which to find the minimum numeric value.
+#' @return The minimum numeric value in the input vector, or NA if none exist.
+#' @export
+#' @examples
+#' # Find the minimum of a numeric vector
+#' find_minimum_value(c(3, 1, 4, 1, 5, 9)) # Returns   1
+#'
+#' # Find the minimum of a mixed vector with non-numeric values
+#' find_minimum_value(c(3, 1, 4, "two", 5, 9)) # Returns   1
+#'
+#' # Attempt to find the minimum of a vector with only non-numeric values
+#' find_minimum_value(c("one", "two", "three")) # Returns NA
+find_minimum_value <- function(numeric_vector) {
+  if (is.numeric(numeric_vector)) {
+    return(min(numeric_vector, na.rm = TRUE))
+  } else {
+    return(NA)
+  }
+}
+
+#' Find the maximum numeric value in a vector, ignoring non-numeric values
+#'
+#' @param numeric_vector A vector from which to find the maximum numeric value.
+#' @return The maximum numeric value in the input vector, or NA if none exist.
+#' @export
+#' @examples
+#' # Find the maximum of a numeric vector
+#' find_maximum_value(c(3, 1, 4, 1, 5, 9)) # Returns   9
+#'
+#' # Find the maximum of a mixed vector with non-numeric values
+#' find_maximum_value(c(3, 1, 4, "two", 5, 9)) # Returns   9
+#'
+#' # Attempt to find the maximum of a vector with only non-numeric values
+#' find_maximum_value(c("one", "two", "three")) # Returns NA
+find_maximum_value <- function(numeric_vector) {
+  if (is.numeric(numeric_vector)) {
+    return(max(numeric_vector, na.rm = TRUE))
+  } else {
+    return(NA)
+  }
+}
+
+#' Compute distribution statistics for a numeric vector
+#'
+#' This function computes summary statistics such as quartiles, mean, and standard deviation for a numeric vector.
+#'
+#' @param data_vector A numeric vector for which to compute summary statistics.
+#' @return A character string describing the summary statistics of the input vector.
+#' @export
+#' @examples
+#' # Compute summary statistics for a numeric vector
+#' data_vector <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+#' get_distribution_statistics(data_vector)
+get_distribution_statistics <- function(data_vector) {
+  if (is.numeric(data_vector)) {
+    calculated_quantiles <- stats::quantile(data_vector, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
+    calculated_summary_stats <- c(
+      Mean = mean(data_vector, na.rm = TRUE),
+      Std_Dev = stats::sd(data_vector, na.rm = TRUE)
+    )
+    formatted_output <- paste(
+      "Q1 =", calculated_quantiles[1], " | median =", calculated_quantiles[2], " | Q3 = ", calculated_quantiles[3],
+      " | mean =", format(calculated_summary_stats["Mean"], digits = 3), " | stdev =", format(calculated_summary_stats["Std_Dev"], digits = 3)
+    )
+    return(formatted_output)
+  } else if (is.character(data_vector)) {
+    # Filter out non-numeric characters
+    numeric_elements <- grepl("^\\d*\\.?\\d*$", data_vector)
+    if (any(numeric_elements)) {
+      numeric_values <- as.numeric(data_vector[numeric_elements])
+      calculated_quantiles <- stats::quantile(numeric_values, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
+      calculated_summary_stats <- c(
+        Mean = mean(numeric_values, na.rm = TRUE),
+        Std_Dev = stats::sd(numeric_values, na.rm = TRUE)
+      )
+      formatted_output <- paste(
+        "Q1 =", calculated_quantiles[1], " | median =", calculated_quantiles[2], " | Q3 = ", calculated_quantiles[3],
+        " | mean =", format(calculated_summary_stats["Mean"], digits = 3), " | stdev =", format(calculated_summary_stats["Std_Dev"], digits = 3)
+      )
+      return(formatted_output)
+    } else {
+      return(NA)
+    }
+  } else {
+    return(NA)
+  }
+}
+
+#' Calculate the percentage of categories in a data vector
+#'
+#' This function calculates the percentage of each category in a given data vector and returns the top   10 categories along with their percentages.
+#' If the data vector is of Date class, it is converted to POSIXct. If the sum of the percentages is not   100%, an "Other" category is added to make up the difference, but only if the number of unique values exceeds  10.
+#' If the data vector is of POSIXct class and the smallest percentage is less than   1%, the function returns "Not enough occurrences."
+#'
+#' @param data_vector A vector of categorical data.
+#' @return A character string detailing the top   10 categories and their percentages, or a special message indicating not enough occurrences or unsupported data type.
+#' @export
+#' @examples
+#' # Example with a character vector
+#' data_vector <- c("cat", "dog", "bird", "cat", "dog", "cat", "other")
+#' calculate_category_percentages(data_vector)
+#'
+#' # Example with a Date vector
+#' data_vector <- as.Date(c("2020-01-01", "2020-01-02", "2020-01-03"))
+#' calculate_category_percentages(data_vector)
+calculate_category_percentages <- function(data_vector) {
+  # Initialize variables
+  percentages <- n <- percent <- categories <- .data <- NULL
+
+  # Convert Date objects to POSIXct
+  if (inherits(data_vector, "Date")) {
+    data_vector <- as.POSIXct(data_vector)
+  }
+
+  # Check if data_vector is a POSIXct object
+  if (!lubridate::is.POSIXct(data_vector)) {
+    # Calculate frequencies and percentages for non-Date vectors
+    frequency_table <- janitor::tabyl(data_vector) %>%
+      dplyr::arrange(dplyr::desc(n)) %>%
+      dplyr::mutate(percent = n / sum(n) * 100) %>%
+      dplyr::filter(!is.na(.data)) %>%
+      dplyr::slice_max(order_by = percent, n = 10)
+
+    categories <- frequency_table$data_vector
+    percentages <- format(round(frequency_table$percent, 2), nsmall = 2)
+
+    # Adjust percentages if total is not   100% and there are more than  10 unique values
+    if (sum(as.numeric(percentages)) < 100 && length(unique(data_vector)) > 10) {
+      percentages <- c(percentages, sprintf("%.2f", 100 - sum(as.numeric(percentages))))
+      categories <- c(categories, "Other")
+    } else {
+      percentages[length(percentages)] <- sprintf("%.2f", 100 - sum(as.numeric(percentages[-length(percentages)])))
+    }
+
+    result <- paste0(categories, ": (", percentages, "%)", collapse = " | ")
+    return(result)
+  } else {
+    # Calculate frequencies and percentages for POSIXct vectors
+    frequency_table <- janitor::tabyl(data_vector) %>%
+      dplyr::arrange(dplyr::desc(n)) %>%
+      dplyr::mutate(percent = n / sum(n) * 100) %>%
+      dplyr::filter(!is.na(.data)) %>%
+      dplyr::slice_max(order_by = percent, n = 10)
+
+    categories <- as.character(frequency_table$data_vector)
+    percentages <- format(round(frequency_table$percent, 2), nsmall = 2)
+
+    # Adjust percentages if total is not   100% and there are more than  10 unique values
+    if (sum(as.numeric(percentages)) < 100 && length(unique(data_vector)) > 10) {
+      percentages <- c(percentages, sprintf("%.2f", 100 - sum(as.numeric(percentages))))
+      categories <- c(categories, "Other")
+    } else {
+      percentages[length(percentages)] <- sprintf("%.2f", 100 - sum(as.numeric(percentages[-length(percentages)])))
+    }
+
+    # Check if the smallest percentage is less than   1%
+    if (as.numeric(percentages[1]) < 1) {
+      return("Not enough occurrences")
+    } else {
+      result <- paste0(categories, ": (", percentages, "%)", collapse = " | ")
+      return(result)
+    }
+  }
+  return("Data type not supported")
+}
diff --git a/man/calculate_category_percentages.Rd b/man/calculate_category_percentages.Rd
new file mode 100644
index 0000000..4454633
--- /dev/null
+++ b/man/calculate_category_percentages.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_dataset_summary_table.R
+\name{calculate_category_percentages}
+\alias{calculate_category_percentages}
+\title{Calculate the percentage of categories in a data vector}
+\usage{
+calculate_category_percentages(data_vector)
+}
+\arguments{
+\item{data_vector}{A vector of categorical data.}
+}
+\value{
+A character string detailing the top   10 categories and their percentages, or a special message indicating not enough occurrences or unsupported data type.
+}
+\description{
+This function calculates the percentage of each category in a given data vector and returns the top   10 categories along with their percentages.
+If the data vector is of Date class, it is converted to POSIXct. If the sum of the percentages is not   100\%, an "Other" category is added to make up the difference, but only if the number of unique values exceeds  10.
+If the data vector is of POSIXct class and the smallest percentage is less than   1\%, the function returns "Not enough occurrences."
+}
+\examples{
+# Example with a character vector
+data_vector <- c("cat", "dog", "bird", "cat", "dog", "cat", "other")
+calculate_category_percentages(data_vector)
+
+# Example with a Date vector
+data_vector <- as.Date(c("2020-01-01", "2020-01-02", "2020-01-03"))
+calculate_category_percentages(data_vector)
+}
diff --git a/man/create_dataset_summary_table.Rd b/man/create_dataset_summary_table.Rd
new file mode 100644
index 0000000..7715124
--- /dev/null
+++ b/man/create_dataset_summary_table.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_dataset_summary_table.R
+\name{create_dataset_summary_table}
+\alias{create_dataset_summary_table}
+\title{Create dataset summary statistics table}
+\usage{
+create_dataset_summary_table(df_input)
+}
+\arguments{
+\item{df_input}{A dataframe for which to create a summary statistics table.}
+}
+\value{
+A tibble with comprehensive summary statistics for each column in the input dataframe.
+}
+\description{
+This function creates a summary statistics table for a dataframe, providing insights into
+the nature of the data contained within. It includes detailed statistics for each column,
+such as column types, missing value percentages, minimum and maximum values for numeric
+columns, patterns for character columns, uniqueness of identifiers, and distributions.
+}
diff --git a/man/find_maximum_value.Rd b/man/find_maximum_value.Rd
new file mode 100644
index 0000000..3361ae2
--- /dev/null
+++ b/man/find_maximum_value.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_dataset_summary_table.R
+\name{find_maximum_value}
+\alias{find_maximum_value}
+\title{Find the maximum numeric value in a vector, ignoring non-numeric values}
+\usage{
+find_maximum_value(numeric_vector)
+}
+\arguments{
+\item{numeric_vector}{A vector from which to find the maximum numeric value.}
+}
+\value{
+The maximum numeric value in the input vector, or NA if none exist.
+}
+\description{
+Find the maximum numeric value in a vector, ignoring non-numeric values
+}
+\examples{
+# Find the maximum of a numeric vector
+find_maximum_value(c(3, 1, 4, 1, 5, 9)) # Returns   9
+
+# Find the maximum of a mixed vector with non-numeric values
+find_maximum_value(c(3, 1, 4, "two", 5, 9)) # Returns   9
+
+# Attempt to find the maximum of a vector with only non-numeric values
+find_maximum_value(c("one", "two", "three")) # Returns NA
+}
diff --git a/man/find_minimum_value.Rd b/man/find_minimum_value.Rd
new file mode 100644
index 0000000..53769be
--- /dev/null
+++ b/man/find_minimum_value.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_dataset_summary_table.R
+\name{find_minimum_value}
+\alias{find_minimum_value}
+\title{Find the minimum numeric value in a vector, ignoring non-numeric values}
+\usage{
+find_minimum_value(numeric_vector)
+}
+\arguments{
+\item{numeric_vector}{A vector from which to find the minimum numeric value.}
+}
+\value{
+The minimum numeric value in the input vector, or NA if none exist.
+}
+\description{
+Find the minimum numeric value in a vector, ignoring non-numeric values
+}
+\examples{
+# Find the minimum of a numeric vector
+find_minimum_value(c(3, 1, 4, 1, 5, 9)) # Returns   1
+
+# Find the minimum of a mixed vector with non-numeric values
+find_minimum_value(c(3, 1, 4, "two", 5, 9)) # Returns   1
+
+# Attempt to find the minimum of a vector with only non-numeric values
+find_minimum_value(c("one", "two", "three")) # Returns NA
+}
diff --git a/man/get_distribution_statistics.Rd b/man/get_distribution_statistics.Rd
new file mode 100644
index 0000000..55063f7
--- /dev/null
+++ b/man/get_distribution_statistics.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_dataset_summary_table.R
+\name{get_distribution_statistics}
+\alias{get_distribution_statistics}
+\title{Compute distribution statistics for a numeric vector}
+\usage{
+get_distribution_statistics(data_vector)
+}
+\arguments{
+\item{data_vector}{A numeric vector for which to compute summary statistics.}
+}
+\value{
+A character string describing the summary statistics of the input vector.
+}
+\description{
+This function computes summary statistics such as quartiles, mean, and standard deviation for a numeric vector.
+}
+\examples{
+# Compute summary statistics for a numeric vector
+data_vector <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+get_distribution_statistics(data_vector)
+}
diff --git a/man/get_first_element_class.Rd b/man/get_first_element_class.Rd
new file mode 100644
index 0000000..a0cb067
--- /dev/null
+++ b/man/get_first_element_class.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_dataset_summary_table.R
+\name{get_first_element_class}
+\alias{get_first_element_class}
+\title{Retrieve the class of the first element of a vector}
+\usage{
+get_first_element_class(input_vector)
+}
+\arguments{
+\item{input_vector}{A vector whose first element's class is to be retrieved.}
+}
+\value{
+The class of the first element of the input vector.
+}
+\description{
+Retrieve the class of the first element of a vector
+}
+\examples{
+# Get the class of the first element in a numeric vector
+get_first_element_class(c(1, 2, 3)) # Returns "numeric"
+
+# Get the class of the first element in a character vector
+get_first_element_class(c("apple", "banana", "cherry")) # Returns "character"
+}
diff --git a/man/is_unique_column.Rd b/man/is_unique_column.Rd
new file mode 100644
index 0000000..7fdf95c
--- /dev/null
+++ b/man/is_unique_column.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_dataset_summary_table.R
+\name{is_unique_column}
+\alias{is_unique_column}
+\title{Check if a column in a dataframe has unique values}
+\usage{
+is_unique_column(column_name, data_frame)
+}
+\arguments{
+\item{column_name}{The name of the column to check for uniqueness.}
+
+\item{data_frame}{A dataframe containing the column to check.}
+}
+\value{
+\code{TRUE} if the column has unique values, \code{FALSE} otherwise.
+}
+\description{
+Check if a column in a dataframe has unique values
+}
+\examples{
+# Create a dataframe with a unique ID column
+data_frame <- tibble::tibble(
+  id = c(1, 2, 3, 4, 5),
+  value = c("a", "b", "c", "d", "e")
+)
+is_unique_column("id", data_frame) # Returns TRUE
+
+# Create a dataframe with duplicate values in the ID column
+data_frame <- tibble::tibble(
+  id = c(1, 2, 3, 4, 5, 1),
+  value = c("a", "b", "c", "d", "e", "a")
+)
+is_unique_column("id", data_frame) # Returns FALSE
+}