Skip to content

Commit

Permalink
Support measurement col checks in check_dataset() #3
Browse files Browse the repository at this point in the history
* Add first draft of tests for `check_dataset()` #5
* Add ideas to fix weird 'reached time limit' error #9
  • Loading branch information
daxkellie committed Jan 21, 2025
1 parent 091f1c6 commit 5150e5a
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 11 deletions.
23 changes: 17 additions & 6 deletions R/check_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ check_dataset <- function(.df){
pull()
checkable_fields <- fields[fields %in% available_checks]

# If measurementOrFact is a column, add nested column names
if(any(fields %in% "measurementOrFact")) {
checkable_fields <- c(checkable_fields,
"measurementValue", "measurementID", "measurementUnit", "measurementType")
}

# find fields in .df with available checks
check_functions_names <- c(glue("check_{checkable_fields}"))
check_functions <- as.list(check_functions_names)
Expand All @@ -53,6 +59,7 @@ check_dataset <- function(.df){
add_table_headers(checkable_fields)
invisible() # prevent df results from printing with headers

# browser()
# check all checkable fields, save fields & error messages
check_results <- check_functions_names |>
map(~ check_all(.x, .df, checkable_fields)) |>
Expand All @@ -63,7 +70,12 @@ check_dataset <- function(.df){
cat_line()

## Darwin Core compliance
# dwc_spinny_message(c("Meets minimum requirements for Darwin Core terms"))
# inform user
cli::cli_alert_info("Checking Darwin Core compliance")
for(i in 1:100) {
wait(0.001)
}

dwc_compliant <- check_min_req_dwc(checkable_fields)
min_req_dwc_message(dwc_compliant)

Expand All @@ -75,7 +87,7 @@ check_dataset <- function(.df){

if(length(check_results$messages) > 0) {

dwc_spinny_message(paste0("Collecting error messages"))
dwc_spinny_message(paste0("Collecting error messages..."))

# split messages by function for message formatting
results_split <- check_results |>
Expand All @@ -96,7 +108,6 @@ check_dataset <- function(.df){
}
}

# TODO: check_dataset() should only celebrate when Data meets minimum requirements AND column checks pass
invisible(.df)
}

Expand Down Expand Up @@ -257,7 +268,7 @@ summary_message <- function(results, checkable_fields) {
check_min_req_dwc <- function(checkable_fields) {

# message
dwc_spinny_message(glue("Data meets minimum Darwin Core requirements"))
dwc_spinny_message(glue("Data meets minimum Darwin Core column requirements"))

# check matching user columns with minimum required DwC terms
req_terms_results <- check_required_terms(checkable_fields)
Expand All @@ -276,11 +287,11 @@ check_min_req_dwc <- function(checkable_fields) {
min_req_dwc_message <- function(is_dwc_compliant) {

if(isTRUE(is_dwc_compliant)) {
complies_text <- "Data meets minimum Darwin Core requirements"
complies_text <- "Data meets minimum Darwin Core column requirements"
cli::cli_status_clear()
cat_line(glue("{col_green(symbol$tick)} {complies_text}"))
} else {
noncomplies_text <- "Data does not meet minimum Darwin Core requirements"
noncomplies_text <- "Data does not meet minimum Darwin Core column requirements"
cli::cli_status_clear()
cat_line(glue("{col_red(symbol$cross)} {noncomplies_text}"))
cli_bullets(c(i = "Use `suggest_workflow()` to see more information."))
Expand Down
15 changes: 15 additions & 0 deletions R/set_coordinates.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ set_coordinates <- function(
# run column checks
check_decimalLatitude(result, level = "abort")
check_decimalLongitude(result, level = "abort")
check_coordinateUncertaintyInMeters(result, level = "abort")
check_geodeticDatum(result, level = "abort")

return(result)
Expand Down Expand Up @@ -124,6 +125,20 @@ check_decimalLongitude <- function(.df,
}
}

#' Check coordinateUncertaintyInMeters
#' @noRd
#' @keywords Internal
check_coordinateUncertaintyInMeters <- function(.df,
level = c("inform", "warn", "abort")
){
level <- match.arg(level)
if(any(colnames(.df) == "coordinateUncertaintyInMeters")){
.df |>
select("coordinateUncertaintyInMeters") |>
check_is_numeric(level = level)
}
}

#' Check geodeticDatum
#' @noRd
#' @keywords Internal
Expand Down
30 changes: 28 additions & 2 deletions R/set_measurements.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,24 +101,50 @@ set_measurements <- function(

# inform user which columns will be checked
# if they've made it this far, these columns should exist (and be checked)
matched_cols <- c("measurementID", "measurementUnit", "measurementType")
matched_cols <- c("measurementValue", "measurementID", "measurementUnit", "measurementType")

if(isTRUE(.messages)) {
if(length(matched_cols > 0)) {
col_progress_bar(cols = matched_cols)
}
}

check_measurementValue(result, level = "abort")
check_measurementID(result, level = "abort")
check_measurementUnit(result, level = "abort")
check_measurementType(result, level = "abort")

cli::cli_progress_step("Successfully nested measurement columns in column {.field measurementOrFact}.")

# To fix 'reached time limit' error:
# could set time limit, but needs testing how long? From: https://stackoverflow.com/questions/51247102/reached-elapsed-time-limit-errors-in-r
setTimeLimit(1)
# clear memory (NOTE: Can we do this?)
gc()
return(result)
}

#' TODO: select & unnest nested columns, then run normal individual term/column checks like other use functions
#' Check measurementValue
#' @importFrom tidyr unnest
#' @noRd
#' @keywords Internal
check_measurementValue <- function(.df,
level = c("inform", "warn", "abort")
){
level <- match.arg(level)
if(any(colnames(.df) == "measurementOrFact")) {
#unnest columns
result <- .df |>
unnest(cols = measurementOrFact)


if(any(colnames(.df) == "measurementValue")){
.df |>
select("measurementValue")
check_is_numeric(level = level) # NOTE: Is this always true?
}
}
}

#' Check measurementID
#' @importFrom tidyr unnest
Expand Down
8 changes: 6 additions & 2 deletions R/suggest_workflow.R
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ minreq_terms_message <- function(req_terms_results) {
cat_line(req_terms_table)
if(isTRUE(all_req_terms_found)) {
# celebrate
cat_line(paste0("\n", add_emoji(), " ", col_green("All minimum requirements met!"), "\n"))
cat_line(paste0("\n", add_emoji(), " ", col_green("All minimum column requirements met!"), "\n"))
}
}

Expand Down Expand Up @@ -375,7 +375,11 @@ fn_to_term_table <- function() {
"set_observer()", "recordedByID",
"set_events()", "eventID",
"set_events()", "eventType",
"set_events()", "parentEventID"
"set_events()", "parentEventID",
"set_measurements()", "measurementValue",
"set_measurements()", "measurementID",
"set_measurements()", "measurementUnit",
"set_measurements()", "measurementType"
)

table <- lst(main, optional) # named list
Expand Down
9 changes: 8 additions & 1 deletion man/set_measurements.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

82 changes: 82 additions & 0 deletions tests/testthat/test-check_dataset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@

test_that("check_dataset prints table and results", {
df <- tibble::tibble(
scientificName = c("Callocephalon fimbriatum", "Eolophus roseicapilla")
)

expect_snapshot(cat(check_dataset(df)))
})

test_that("check_dataset errors in table and results", {
df <- tibble::tibble(
scientificName = c("Callocephalon fimbriatum", 2)
)

expect_snapshot(cat(check_dataset(df)))
})

test_that("check_dataset handles multiple rows with errors", {
df <- tibble::tibble(
scientificName = c("Callocephalon fimbriatum", "Eolophus roseicapilla"),
occurrenceStatus = c("present", "present"),
decimalLatitude = c(-35.310, "-35.273")
)

expect_snapshot(cat(check_dataset(df)))
})

test_that("check_dataset only checks columns that match DwC terms", {
df <- tibble::tibble(
scientificName = c("Callocephalon fimbriatum", "Eolophus roseicapilla"),
occurrenceStatus = c("present", "present"),
decimalLatitude = c(-35.310, "-35.273"),
longitude = c(149.125, 149.133)
)

expect_snapshot(cat(check_dataset(df)))
})

test_that("check_dataset prints a maximum of 5 error messages", {
df <- tibble::tibble(
scientificName = c("Callocephalon fimbriatum", 2),
occurrenceStatus = c("present", "blop"),
decimalLatitude = c(-35.310, "-35.273"),
decimalLongitude = c(149.125, "149.133"),
coordinatePrecision = c(.0001, ".0001"),
genus = 1:2
)

expect_snapshot(cat(check_dataset(df)))
})

test_that("check_dataset notifies when data meets minimum Darwin Core column requirements", {
df <- tibble::tibble(
scientificName = c("Callocephalon fimbriatum", "Eolophus roseicapilla"),
decimalLatitude = c(-35.310, -35.273), # deliberate error for demonstration purposes
decimalLongitude = c(149.125, 149.133),
eventDate = lubridate::dmy("14-01-2023", "15-01-2023"),
occurrenceStatus = c("present", "present"),
occurrenceID = c("d32ed0c8-d791-11ef-8000-01ff50b5e852", "d32ed0e6-d791-11ef-8000-01ff50b5e852"),
basisOfRecord = "humanObservation",
coordinateUncertaintyInMeters = 10,
geodeticDatum = "WGS84"
)

expect_snapshot(cat(check_dataset(df)))
})

test_that("check_dataset handles `set_measurements()`", {
df <- tibble::tibble(
Species = c("Toechima", "Callicoma serratifolia"),
Latitude = c(-17.1, -30.3),
Longitude = c(146.002, 153.003),
measurementValue = c(81.4, NA),
measurementID = c("LMA_g.m2|1", "LMA_g.m2|2"),
measurementUnit = "g/m2",
measurementType = "LMA"
) |>
tidyr::nest(measurementOrFact = c(measurementValue, measurementID,
measurementUnit, measurementType))

expect_snapshot(cat(check_dataset(df)))
})

0 comments on commit 5150e5a

Please sign in to comment.