Skip to content

Commit

Permalink
feat: create summary table for dataframe
Browse files Browse the repository at this point in the history
  • Loading branch information
tin900 committed Feb 7, 2024
1 parent 45f4363 commit 87128df
Show file tree
Hide file tree
Showing 10 changed files with 454 additions and 1 deletion.
5 changes: 4 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@ Imports:
cli,
dplyr,
findR,
janitor,
kit,
lubridate,
magrittr,
purrr,
stats,
stringr
stringr,
tibble
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
11 changes: 11 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export(assert_date_named)
export(assert_logical_named)
export(assert_no_duplicates_in_group)
export(assertion_message)
export(calculate_category_percentages)
export(check_double_columns)
export(check_duplicates)
export(check_na_columns)
Expand All @@ -15,13 +16,19 @@ export(check_posixct_type)
export(check_rows)
export(check_zero_columns)
export(count_more_than_1)
export(create_dataset_summary_table)
export(drop_na_column_names)
export(duplicates_in_column)
export(find_common_columns)
export(find_maximum_value)
export(find_minimum_value)
export(find_pattern_r)
export(get_distribution_statistics)
export(get_first_element_class)
export(get_values)
export(identify_join_pairs)
export(identify_outliers)
export(is_unique_column)
export(md_complete_cases)
export(regex_content_parameter)
export(regex_time)
Expand All @@ -37,3 +44,7 @@ export(str_detect_in_file)
export(test_all_equal)
export(unique_id)
importFrom(magrittr,"%>%")
importFrom(purrr,map_chr)
importFrom(purrr,map_dbl)
importFrom(purrr,map_lgl)
importFrom(purrr,possibly)
257 changes: 257 additions & 0 deletions R/create_dataset_summary_table.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
#' Create dataset summary statistics table
#'
#' This function creates a summary statistics table for a dataframe, providing insights into
#' the nature of the data contained within. It includes detailed statistics for each column,
#' such as column types, missing value percentages, minimum and maximum values for numeric
#' columns, patterns for character columns, uniqueness of identifiers, and distributions.
#'
#' @param df_input A dataframe for which to create a summary statistics table.
#' @return A tibble with comprehensive summary statistics for each column in the input dataframe.
#' @importFrom purrr map_chr map_dbl map_lgl possibly
#' @export
create_dataset_summary_table <- function(df_input) {
# Store column names
column_names <- names(df_input)

# Store column types
column_types <- purrr::map_chr(df_input, get_first_element_class)

# Calculate missing value percentages
missing_value_percentages <- colMeans(is.na(df_input)) * 100

# Find minimum and maximum values for numeric columns
min_values <- purrr::map_dbl(df_input, find_minimum_value)
max_values <- purrr::map_dbl(df_input, find_maximum_value)

# Check for uniqueness of identifiers
unique_identifiers <- purrr::map_lgl(column_names, is_unique_column, data_frame = df_input)

# Calculate distribution statistics
distribution_stats <- purrr::map_chr(df_input, get_distribution_statistics)

# Calculate distribution percentages
distribution_percentages <- purrr::map_chr(df_input, calculate_category_percentages)

# Create a tibble with all summary statistics
summary_table <- tibble::tibble(
column_name = column_names,
column_type = column_types,
missing_value_percentage = missing_value_percentages,
min_value = min_values,
max_value = max_values,
unique_identifier = unique_identifiers,
distribution_statistics = distribution_stats,
distribution_percentages = distribution_percentages
)

return(summary_table)
}


#' Check if a column in a dataframe has unique values
#'
#' @param column_name The name of the column to check for uniqueness.
#' @param data_frame A dataframe containing the column to check.
#' @return \code{TRUE} if the column has unique values, \code{FALSE} otherwise.
#' @export
#' @examples
#' # Create a dataframe with a unique ID column
#' data_frame <- tibble::tibble(
#' id = c(1, 2, 3, 4, 5),
#' value = c("a", "b", "c", "d", "e")
#' )
#' is_unique_column("id", data_frame) # Returns TRUE
#'
#' # Create a dataframe with duplicate values in the ID column
#' data_frame <- tibble::tibble(
#' id = c(1, 2, 3, 4, 5, 1),
#' value = c("a", "b", "c", "d", "e", "a")
#' )
#' is_unique_column("id", data_frame) # Returns FALSE
is_unique_column <- function(column_name, data_frame) {
column_values <- data_frame %>% dplyr::pull(column_name)
return(nrow(data_frame) == length(unique(column_values)))
}

#' Retrieve the class of the first element of a vector
#'
#' @param input_vector A vector whose first element's class is to be retrieved.
#' @return The class of the first element of the input vector.
#' @export
#' @examples
#' # Get the class of the first element in a numeric vector
#' get_first_element_class(c(1, 2, 3)) # Returns "numeric"
#'
#' # Get the class of the first element in a character vector
#' get_first_element_class(c("apple", "banana", "cherry")) # Returns "character"
get_first_element_class <- function(input_vector) {
return(class(input_vector)[1])
}

#' Find the minimum numeric value in a vector, ignoring non-numeric values
#'
#' @param numeric_vector A vector from which to find the minimum numeric value.
#' @return The minimum numeric value in the input vector, or NA if none exist.
#' @export
#' @examples
#' # Find the minimum of a numeric vector
#' find_minimum_value(c(3, 1, 4, 1, 5, 9)) # Returns 1
#'
#' # Find the minimum of a mixed vector with non-numeric values
#' find_minimum_value(c(3, 1, 4, "two", 5, 9)) # Returns 1
#'
#' # Attempt to find the minimum of a vector with only non-numeric values
#' find_minimum_value(c("one", "two", "three")) # Returns NA
find_minimum_value <- function(numeric_vector) {
if (is.numeric(numeric_vector)) {
return(min(numeric_vector, na.rm = TRUE))
} else {
return(NA)
}
}

#' Find the maximum numeric value in a vector, ignoring non-numeric values
#'
#' @param numeric_vector A vector from which to find the maximum numeric value.
#' @return The maximum numeric value in the input vector, or NA if none exist.
#' @export
#' @examples
#' # Find the maximum of a numeric vector
#' find_maximum_value(c(3, 1, 4, 1, 5, 9)) # Returns 9
#'
#' # Find the maximum of a mixed vector with non-numeric values
#' find_maximum_value(c(3, 1, 4, "two", 5, 9)) # Returns 9
#'
#' # Attempt to find the maximum of a vector with only non-numeric values
#' find_maximum_value(c("one", "two", "three")) # Returns NA
find_maximum_value <- function(numeric_vector) {
if (is.numeric(numeric_vector)) {
return(max(numeric_vector, na.rm = TRUE))
} else {
return(NA)
}
}

#' Compute distribution statistics for a numeric vector
#'
#' This function computes summary statistics such as quartiles, mean, and standard deviation for a numeric vector.
#'
#' @param data_vector A numeric vector for which to compute summary statistics.
#' @return A character string describing the summary statistics of the input vector.
#' @export
#' @examples
#' # Compute summary statistics for a numeric vector
#' data_vector <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
#' get_distribution_statistics(data_vector)
get_distribution_statistics <- function(data_vector) {
if (is.numeric(data_vector)) {
calculated_quantiles <- stats::quantile(data_vector, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
calculated_summary_stats <- c(
Mean = mean(data_vector, na.rm = TRUE),
Std_Dev = stats::sd(data_vector, na.rm = TRUE)
)
formatted_output <- paste(
"Q1 =", calculated_quantiles[1], " | median =", calculated_quantiles[2], " | Q3 = ", calculated_quantiles[3],
" | mean =", format(calculated_summary_stats["Mean"], digits = 3), " | stdev =", format(calculated_summary_stats["Std_Dev"], digits = 3)
)
return(formatted_output)
} else if (is.character(data_vector)) {
# Filter out non-numeric characters
numeric_elements <- grepl("^\\d*\\.?\\d*$", data_vector)
if (any(numeric_elements)) {
numeric_values <- as.numeric(data_vector[numeric_elements])
calculated_quantiles <- stats::quantile(numeric_values, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
calculated_summary_stats <- c(
Mean = mean(numeric_values, na.rm = TRUE),
Std_Dev = stats::sd(numeric_values, na.rm = TRUE)
)
formatted_output <- paste(
"Q1 =", calculated_quantiles[1], " | median =", calculated_quantiles[2], " | Q3 = ", calculated_quantiles[3],
" | mean =", format(calculated_summary_stats["Mean"], digits = 3), " | stdev =", format(calculated_summary_stats["Std_Dev"], digits = 3)
)
return(formatted_output)
} else {
return(NA)
}
} else {
return(NA)
}
}

#' Calculate the percentage of categories in a data vector
#'
#' This function calculates the percentage of each category in a given data vector and returns the top 10 categories along with their percentages.
#' If the data vector is of Date class, it is converted to POSIXct. If the sum of the percentages is not 100%, an "Other" category is added to make up the difference, but only if the number of unique values exceeds 10.
#' If the data vector is of POSIXct class and the smallest percentage is less than 1%, the function returns "Not enough occurrences."
#'
#' @param data_vector A vector of categorical data.
#' @return A character string detailing the top 10 categories and their percentages, or a special message indicating not enough occurrences or unsupported data type.
#' @export
#' @examples
#' # Example with a character vector
#' data_vector <- c("cat", "dog", "bird", "cat", "dog", "cat", "other")
#' calculate_category_percentages(data_vector)
#'
#' # Example with a Date vector
#' data_vector <- as.Date(c("2020-01-01", "2020-01-02", "2020-01-03"))
#' calculate_category_percentages(data_vector)
calculate_category_percentages <- function(data_vector) {
# Initialize variables
percentages <- n <- percent <- categories <- .data <- NULL

# Convert Date objects to POSIXct
if (inherits(data_vector, "Date")) {
data_vector <- as.POSIXct(data_vector)
}

# Check if data_vector is a POSIXct object
if (!lubridate::is.POSIXct(data_vector)) {
# Calculate frequencies and percentages for non-Date vectors
frequency_table <- janitor::tabyl(data_vector) %>%
dplyr::arrange(dplyr::desc(n)) %>%
dplyr::mutate(percent = n / sum(n) * 100) %>%
dplyr::filter(!is.na(.data)) %>%
dplyr::slice_max(order_by = percent, n = 10)

categories <- frequency_table$data_vector
percentages <- format(round(frequency_table$percent, 2), nsmall = 2)

# Adjust percentages if total is not 100% and there are more than 10 unique values
if (sum(as.numeric(percentages)) < 100 && length(unique(data_vector)) > 10) {
percentages <- c(percentages, sprintf("%.2f", 100 - sum(as.numeric(percentages))))
categories <- c(categories, "Other")
} else {
percentages[length(percentages)] <- sprintf("%.2f", 100 - sum(as.numeric(percentages[-length(percentages)])))
}

result <- paste0(categories, ": (", percentages, "%)", collapse = " | ")
return(result)
} else {
# Calculate frequencies and percentages for POSIXct vectors
frequency_table <- janitor::tabyl(data_vector) %>%
dplyr::arrange(dplyr::desc(n)) %>%
dplyr::mutate(percent = n / sum(n) * 100) %>%
dplyr::filter(!is.na(.data)) %>%
dplyr::slice_max(order_by = percent, n = 10)

categories <- as.character(frequency_table$data_vector)
percentages <- format(round(frequency_table$percent, 2), nsmall = 2)

# Adjust percentages if total is not 100% and there are more than 10 unique values
if (sum(as.numeric(percentages)) < 100 && length(unique(data_vector)) > 10) {
percentages <- c(percentages, sprintf("%.2f", 100 - sum(as.numeric(percentages))))
categories <- c(categories, "Other")
} else {
percentages[length(percentages)] <- sprintf("%.2f", 100 - sum(as.numeric(percentages[-length(percentages)])))
}

# Check if the smallest percentage is less than 1%
if (as.numeric(percentages[1]) < 1) {
return("Not enough occurrences")
} else {
result <- paste0(categories, ": (", percentages, "%)", collapse = " | ")
return(result)
}
}
return("Data type not supported")
}
28 changes: 28 additions & 0 deletions man/calculate_category_percentages.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions man/create_dataset_summary_table.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions man/find_maximum_value.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 87128df

Please sign in to comment.