diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION index b2590784b5..e8b7976070 100644 --- a/base/db/DESCRIPTION +++ b/base/db/DESCRIPTION @@ -80,5 +80,5 @@ Copyright: Authors LazyLoad: yes LazyData: FALSE Encoding: UTF-8 -RoxygenNote: 7.3.3 X-schema.org-keywords: PEcAn, database +RoxygenNote: 7.3.3 diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index c9d6c10b2d..e88ff29962 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -33,6 +33,7 @@ export(get.trait.data) export(get.trait.data.pft) export(get_postgres_envvars) export(get_run_ids) +export(get_trait_data_pft) export(get_users) export(get_var_names) export(get_workflow_ids) diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R index 30c4005915..64e24b854d 100644 --- a/base/db/R/get.trait.data.pft.R +++ b/base/db/R/get.trait.data.pft.R @@ -1,147 +1,130 @@ ##' Get trait data from the database for a single PFT ##' -##' @md ##' Queries BETYdb for trait observations and prior distributions for a single -##' plant functional type (PFT). Results are saved to files -##' in the PFT output directory (`pft$outdir`), and also registered in the -##' database as posterior records when `write = TRUE`. +##' plant functional type (PFT). Results are saved to files in the PFT output +##' directory (`pft$outdir`), and also registered in the database as posterior +##' records when `write = TRUE`. ##' ##' @details ##' `pft` should be a list containing at least `name` and `outdir`, and ##' optionally `posteriorid` and `constants`. ##' -##' **File-based side effects (saved to `pft$outdir`):** -##' \describe{ -##' \item{`trait.data.Rdata`}{Contains a single object `trait.data`: a named -##' list of data frames, one per trait. Each data frame has columns from -##' BETYdb's traits/yields views (e.g., `mean`, `stat`, `n`, `site_id`, -##' `treatment_id`). Names correspond to trait variable names -##' (e.g., `"SLA"`, `"Vcmax"`).} -##' \item{`prior.distns.Rdata`}{Contains a single object `prior.distns`: a -##' data frame with one row per trait and columns `distn`, `parama`, -##' `paramb`, and `n`. Row names are trait variable names. Traits listed -##' in `pft$constants` are excluded.} -##' \item{`trait.data.csv`}{CSV export of `trait.data` (all traits -##' row-bound).} -##' \item{`prior.distns.csv`}{CSV export of `prior.distns`.} -##' \item{`species.csv` or `cultivars.csv`}{PFT membership list used to -##' detect changes between runs.} -##' } -##' -##' **Downstream contract:** The files `trait.data.Rdata` and -##' `prior.distns.Rdata` are expected by \code{run.meta.analysis.pft}, which -##' loads them from `pft$outdir`. This implicit file-based coupling means -##' the two functions must agree on directory path and object names. A future -##' refactoring goal is to pass these objects directly via function arguments -##' instead. -##' +##' @md ##' @param pft list of settings for the pft whose traits to retrieve. See details. ##' @param modeltype type of model that is used, this is used to distinguish ##' between different pfts with the same name. ##' @param dbfiles location where previous results are found ##' @param dbcon database connection +##' @param trait.names list of trait names to retrieve ##' @param forceupdate set this to true to force an update, auto will check to ##' see if an update is needed. ##' @param write (Logical) If `TRUE` updated posteriors will be written to ##' BETYdb. Defaults to `FALSE`. -##' @param trait.names list of trait names to retrieve -##' @return The `pft` input list, updated with `pft$posteriorid` set to the -##' ID of the (possibly new) posterior record in BETYdb. The posterior ID can -##' be used to locate the output files (`trait.data.Rdata`, `prior.distns.Rdata`, -##' etc.) via BETYdb's `dbfiles` table. +##' @param return_data (Logical) If `TRUE`, the returned `pft` list also +##' includes `trait_data` and `prior_distns` as in-memory objects. +##' Defaults to `FALSE` to preserve legacy behavior — when `pft` is +##' embedded inside a `settings` object, attaching these data frames by +##' default would inflate the settings and break serialization. +##' @return The updated \code{pft} list with \code{posteriorid} set to the +##' ID of the (possibly new) posterior record in BETYdb. The posterior ID +##' can be used to locate the output files (\code{trait.data.Rdata}, +##' \code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table. +##' +##' When \code{return_data = TRUE}, the returned list also includes +##' \code{trait_data} and \code{prior_distns} as in-memory objects, so +##' downstream callers can use them without reloading from \code{pft$outdir}. +##' +##' @section File-based side effects: +##' The following files are written to \code{pft$outdir}: +##' \describe{ +##' \item{\code{trait.data.Rdata}}{Named list of trait data frames, one per +##' trait that has observations. Read by \code{run.meta.analysis.pft()}.} +##' \item{\code{prior.distns.Rdata}}{Data frame of prior distributions, rows +##' named by trait. Read by \code{run.meta.analysis.pft()}.} +##' \item{\code{species.csv} or \code{cultivars.csv}}{Data frame of PFT +##' member species or cultivar IDs, depending on PFT type.} +##' \item{\code{trait.data.csv}}{Flattened CSV of all trait observations, +##' one row per observation, for human inspection.} +##' } +##' In addition, each output file is registered in BETYdb via +##' \code{dbfile.insert()} and associated with the posterior record whose ID +##' is returned as \code{pft$posteriorid}. +##' +##' @section Downstream contract: +##' \code{run.meta.analysis.pft()} reads \code{trait.data.Rdata} and +##' \code{prior.distns.Rdata} from \code{pft$outdir} at the start of the +##' meta-analysis step. These two files are therefore a required output of +##' this wrapper. +##' +##' Note: this file-based contract will be removed once +##' \code{run.meta.analysis.pft()} is refactored to accept in-memory inputs +##' (GSoC Week 2). +##' ##' @author David LeBauer, Shawn Serbin, Rob Kooper ##' @export -get.trait.data.pft <- - function(pft, - modeltype, - dbfiles, - dbcon, - trait.names, - forceupdate = FALSE, - write = FALSE) { - +get.trait.data.pft <- function(pft, modeltype, dbfiles, dbcon, + trait.names = NULL, + forceupdate = FALSE, + write = TRUE, + return_data = FALSE) { # Create directory if necessary if (!file.exists(pft$outdir) && !dir.create(pft$outdir, recursive = TRUE)) { - PEcAn.logger::logger.error(paste0("Couldn't create PFT output directory: ", pft$outdir)) - } - - # find appropriate pft - pftres <- query_pfts(dbcon, pft[["name"]], modeltype) - pfttype <- pftres[["pft_type"]] - pftid <- pftres[["id"]] - - if (nrow(pftres) > 1) { - PEcAn.logger::logger.severe( - "Multiple PFTs named", pft[["name"]], "found,", - "with ids", PEcAn.utils::vecpaste(pftres[["id"]]), ".", - "Specify modeltype to fix this.") - } - - if (nrow(pftres) == 0) { - PEcAn.logger::logger.severe("Could not find pft", pft[["name"]]) - return(NA) - } - - # get the member species/cultivars, we need to check if anything changed - if (pfttype == "plant") { - pft_member_filename = "species.csv" - pft_members <- PEcAn.DB::query.pft_species(pft$name, modeltype, dbcon) - } else if (pfttype == "cultivar") { - pft_member_filename = "cultivars.csv" - pft_members <- PEcAn.DB::query.pft_cultivars(pft$name, modeltype, dbcon) - } else { - PEcAn.logger::logger.severe("Unknown pft type! Expected 'plant' or 'cultivar', got", pfttype) + PEcAn.logger::logger.error( + paste0("Couldn't create PFT output directory: ", pft$outdir) + ) } - # ANS: Need to do this conversion for the check against existing - # membership later on. Otherwise, `NA` from the CSV is interpreted - # as different from `""` returned here, even though they are really - # the same thing. - pft_members <- pft_members %>% - dplyr::mutate_if(is.character, ~dplyr::na_if(., "")) - - # get the priors - prior.distns <- PEcAn.DB::query.priors( - pft = pftid, - trstr = PEcAn.utils::vecpaste(trait.names), - con = dbcon) - prior.distns <- prior.distns[which(!rownames(prior.distns) %in% names(pft$constants)),] - traits <- rownames(prior.distns) - - # get the trait data (don't bother sampling derived traits until after update check) - trait.data.check <- PEcAn.DB::query.traits(ids = pft_members$id, priors = traits, con = dbcon, update.check.only = TRUE, ids_are_cultivars = (pfttype=="cultivar")) - traits <- names(trait.data.check) - - # Set forceupdate FALSE if it's a string (backwards compatible with 'AUTO' flag used in the past) + # Backwards-compatible: forceupdate may arrive as "AUTO" or other strings forceupdate <- isTRUE(as.logical(forceupdate)) - # check to see if we need to update + # All database queries are delegated to get_trait_data_pft() exactly once. + # The returned objects feed both the cache-staleness check and the save + # step, so the database is never queried more than once per call. + # Input validation (pft_name, trait_names, dbcon) and the pft_type guard + # also live inside get_trait_data_pft(); errors surface from there. + computed <- get_trait_data_pft( + pft_name = pft[["name"]], + modeltype = modeltype, + dbcon = dbcon, + trait_names = trait.names, + constants = if (!is.null(pft$constants)) pft$constants else list() + ) + trait.data <- computed$trait_data + prior.distns <- computed$prior_distns + pft_members <- computed$pft_info$pft_members + pftid <- computed$pft_info$pft_id + pfttype <- computed$pft_info$pft_type + pft_member_filename <- computed$pft_info$pft_member_filename + + # ---- Cache staleness check ---- if (!forceupdate) { if (is.null(pft$posteriorid)) { - recent_posterior <- dplyr::tbl(dbcon, "posteriors") %>% - dplyr::filter(.data$pft_id == !!pftid) %>% + recent_posterior <- dplyr::tbl(dbcon, "posteriors") |> + dplyr::filter(.data$pft_id == !!pftid) |> dplyr::collect() if (length(recent_posterior) > 0) { - pft$posteriorid <- dplyr::tbl(dbcon, "posteriors") %>% - dplyr::filter(.data$pft_id == !!pftid) %>% - dplyr::arrange(dplyr::desc(.data$created_at)) %>% - utils::head(1) %>% + pft$posteriorid <- dplyr::tbl(dbcon, "posteriors") |> + dplyr::filter(.data$pft_id == !!pftid) |> + dplyr::arrange(dplyr::desc(.data$created_at)) |> + utils::head(1) |> dplyr::pull("id") } else { PEcAn.logger::logger.info("No previous posterior found. Forcing update") } } + if (!is.null(pft$posteriorid)) { - files <- dbfile.check(type = "Posterior", container.id = pft$posteriorid, con = dbcon, - return.all = TRUE) + files <- dbfile.check(type = "Posterior", container.id = pft$posteriorid, + con = dbcon, return.all = TRUE) need_files <- c( - trait_data = "trait.data.Rdata", - priors = "prior.distns.Rdata", + trait_data = "trait.data.Rdata", + priors = "prior.distns.Rdata", pft_membership = pft_member_filename ) ids <- match(need_files, files$file_name) names(ids) <- names(need_files) + if (any(is.na(ids))) { missing_files <- need_files[is.na(ids)] PEcAn.logger::logger.info(paste0( @@ -159,11 +142,13 @@ get.trait.data.pft <- "All posterior files are present. Performing additional checks ", "to determine if meta-analysis needs to be updated." ) - # check if all files exist + + # Check all required files exist on disk need_paths <- file.path(files$file_path[ids], need_files) names(need_paths) <- names(need_files) files_exist <- file.exists(need_paths) foundallfiles <- all(files_exist) + if (!foundallfiles) { PEcAn.logger::logger.warn( "The following files are in database but not found on disk: ", @@ -175,24 +160,24 @@ get.trait.data.pft <- PEcAn.logger::logger.debug("Checking if PFT membership has changed.") if (pfttype == "plant") { # Columns are: id, genus, species, scientificname - colClass = c("double", "character", "character", "character") + colClass <- c("double", "character", "character", "character") } else if (pfttype == "cultivar") { # Columns are: id, specie_id, genus, species, scientificname, cultivar - colClass = c("double", "double", "character", "character", "character", "character") - } + colClass <- c("double", "double", "character", "character", + "character", "character") + } existing_membership <- utils::read.csv( need_paths[["pft_membership"]], - # Need this so NA values are formatted consistently - colClasses = colClass, + colClasses = colClass, stringsAsFactors = FALSE, - na.strings = c("", "NA") - ) + na.strings = c("", "NA") + ) diff_membership <- symmetric_setdiff( existing_membership, pft_members, xname = "existing", yname = "current" - ) + ) if (nrow(diff_membership) > 0) { PEcAn.logger::logger.error( "\n PFT membership has changed. \n", @@ -205,7 +190,9 @@ get.trait.data.pft <- # Check if priors have changed PEcAn.logger::logger.debug("Checking if priors have changed") - existing_prior <- PEcAn.utils::load_local(need_paths[["priors"]])[["prior.distns"]] + existing_prior <- PEcAn.utils::load_local( + need_paths[["priors"]] + )[["prior.distns"]] diff_prior <- symmetric_setdiff( dplyr::as_tibble(prior.distns, rownames = "trait"), dplyr::as_tibble(existing_prior, rownames = "trait") @@ -225,25 +212,28 @@ get.trait.data.pft <- existing_trait_data <- PEcAn.utils::load_local( need_paths[["trait_data"]] )[["trait.data"]] - if (length(trait.data.check) != length(existing_trait_data)) { + if (length(trait.data) != length(existing_trait_data)) { PEcAn.logger::logger.warn( "Lengths of new and existing `trait.data` differ. ", "Re-running meta-analysis." ) foundallfiles <- FALSE - } else if (length(trait.data.check) == 0) { - PEcAn.logger::logger.warn("New and existing trait data are both empty. Skipping this check.") + } else if (length(trait.data) == 0) { + PEcAn.logger::logger.warn( + "New and existing trait data are both empty. Skipping this check." + ) } else { - current_traits <- dplyr::bind_rows(trait.data.check, .id = "trait") %>% + current_traits <- dplyr::bind_rows(trait.data, .id = "trait") |> dplyr::select(-mean, -"stat") - existing_traits <- dplyr::bind_rows(existing_trait_data, .id = "trait") %>% + existing_traits <- dplyr::bind_rows(existing_trait_data, + .id = "trait") |> dplyr::select(-mean, -"stat") diff_traits <- symmetric_setdiff(current_traits, existing_traits) if (nrow(diff_traits) > 0) { - diff_summary <- diff_traits %>% + diff_summary <- diff_traits |> dplyr::count(source, .data$trait) PEcAn.logger::logger.error( - "\n Prior has changed. \n", + "\n Trait data has changed. \n", "Here are the number of differing trait records by trait:\n", PEcAn.logger::print2string(diff_summary), wrap = FALSE @@ -251,8 +241,7 @@ get.trait.data.pft <- foundallfiles <- FALSE } } - } - + } # end else (all files on disk) if (foundallfiles) { PEcAn.logger::logger.info( @@ -260,26 +249,29 @@ get.trait.data.pft <- "for PFT", shQuote(pft$name) ) for (id in seq_len(nrow(files))) { - file.copy(from = file.path(files[[id, "file_path"]], files[[id, "file_name"]]), - to = file.path(pft$outdir, files[[id, "file_name"]])) + file.copy( + from = file.path(files[[id, "file_path"]], files[[id, "file_name"]]), + to = file.path(pft$outdir, files[[id, "file_name"]]) + ) } done <- TRUE - # May need to symlink the generic post.distns.Rdata to a specific post.distns.*.Rdata file. + # May need to symlink the generic post.distns.Rdata to the + # model-specific post.distns.*.Rdata file. if (length(list.files(pft$outdir, "post.distns.Rdata")) == 0) { all.files <- list.files(pft$outdir) - post.distn.file <- all.files[grep("post\\.distns\\..*\\.Rdata", all.files)] - if (length(post.distn.file) > 1) + post.distn.file <- all.files[grep("post\\.distns\\..*\\.Rdata", + all.files)] + if (length(post.distn.file) > 1) { PEcAn.logger::logger.severe( "get.trait.data.pft() doesn't know how to ", "handle multiple `post.distns.*.Rdata` files.", "Found the following files: ", paste(shQuote(post.distn.file), collapse = ", ") ) - else if (length(post.distn.file) == 1) { - # Found exactly one post.distns.*.Rdata file. Use it. - link_input <- file.path(pft[["outdir"]], post.distn.file) + } else if (length(post.distn.file) == 1) { + link_input <- file.path(pft[["outdir"]], post.distn.file) link_target <- file.path(pft[["outdir"]], "post.distns.Rdata") PEcAn.logger::logger.debug( "Found exactly one posterior distribution file: ", @@ -297,25 +289,27 @@ get.trait.data.pft <- done <- FALSE } } - if (done) return(pft) - } - } - } - } - # get the trait data (including sampling of derived traits, if any) - trait.data <- query.traits(pft_members$id, traits, con = dbcon, - update.check.only = FALSE, - ids_are_cultivars = (pfttype == "cultivar")) - traits <- names(trait.data) + if (done) { + if (return_data) { + pft$trait_data <- trait.data + pft$prior_distns <- prior.distns + } + return(pft) + } + } + } # end else (all files in DB) + } # end if (!is.null(pft$posteriorid)) + } # end if (!forceupdate) + # ---- Cache miss: log counts, save to disk, register in DB ---- if (length(trait.data) > 0) { - trait_counts <- trait.data %>% - dplyr::bind_rows(.id = "trait") %>% + trait_counts <- trait.data |> + dplyr::bind_rows(.id = "trait") |> dplyr::count(.data$trait) - PEcAn.logger::logger.info( - "\n Number of observations per trait for PFT ", shQuote(pft[["name"]]), ":\n", + "\n Number of observations per trait for PFT ", shQuote(pft[["name"]]), + ":\n", PEcAn.logger::print2string(trait_counts, n = Inf, na.print = ""), wrap = FALSE ) @@ -326,36 +320,39 @@ get.trait.data.pft <- ) } - # get list of existing files so they get ignored saving + # Snapshot existing files so we know which ones are new after saving old.files <- list.files(path = pft$outdir) - # create a new posterior + # Create a new posterior record in BETYdb insert_result <- db.query( - paste0("INSERT INTO posteriors (pft_id) VALUES (", pftid, ") RETURNING id"), - con = dbcon) + paste0("INSERT INTO posteriors (pft_id) VALUES (", pftid, + ") RETURNING id"), + con = dbcon + ) pft$posteriorid <- insert_result[["id"]] - # create path where to store files + # Create the storage path for this posterior pathname <- file.path(dbfiles, "posterior", pft$posteriorid) dir.create(pathname, showWarnings = FALSE, recursive = TRUE) - ## 1. get species/cultivar list based on pft - utils::write.csv(pft_members, file.path(pft$outdir, pft_member_filename), + ## Write species/cultivar membership list + utils::write.csv(pft_members, + file.path(pft$outdir, pft_member_filename), row.names = FALSE) - ## save priors + ## Save prior distributions save(prior.distns, file = file.path(pft$outdir, "prior.distns.Rdata")) - utils::write.csv(prior.distns, file.path(pft$outdir, "prior.distns.csv"), + utils::write.csv(prior.distns, + file.path(pft$outdir, "prior.distns.csv"), row.names = TRUE) - ## 3. display info to the console PEcAn.logger::logger.info( "\n Summary of prior distributions for PFT ", shQuote(pft$name), ":\n", PEcAn.logger::print2string(prior.distns), wrap = FALSE ) - ## traits = variables with prior distributions for this pft + ## Save trait data trait.data.file <- file.path(pft$outdir, "trait.data.Rdata") save(trait.data, file = trait.data.file) utils::write.csv( @@ -364,16 +361,18 @@ get.trait.data.pft <- row.names = FALSE ) - ### save and store in database all results except those that were there already - if(isTRUE(write)) { + ## Register new files in BETYdb + if (isTRUE(write)) { store_files_all <- list.files(path = pft[["outdir"]]) store_files <- setdiff(store_files_all, old.files) PEcAn.logger::logger.debug( "The following posterior files found in PFT outdir ", "(", shQuote(pft[["outdir"]]), ") will be registered in BETY ", - "under posterior ID ", format(pft[["posteriorid"]], scientific = FALSE), ": ", + "under posterior ID ", + format(pft[["posteriorid"]], scientific = FALSE), ": ", paste(shQuote(store_files), collapse = ", "), ". ", - "The following files (if any) will not be registered because they already existed: ", + "The following files (if any) will not be registered because they ", + "already existed: ", paste(shQuote(intersect(store_files, old.files)), collapse = ", "), wrap = FALSE ) @@ -386,5 +385,9 @@ get.trait.data.pft <- } } + if (return_data) { + pft$trait_data <- trait.data + pft$prior_distns <- prior.distns + } return(pft) -} +} \ No newline at end of file diff --git a/base/db/R/get_trait_data_pft.R b/base/db/R/get_trait_data_pft.R new file mode 100644 index 0000000000..dcc4e2c076 --- /dev/null +++ b/base/db/R/get_trait_data_pft.R @@ -0,0 +1,202 @@ +#' Retrieve trait data and priors for one PFT from BETYdb +#' +#' Core computation extracted from \code{\link{get.trait.data.pft}}. +#' Queries the database for trait observations and prior distributions, +#' returning them as R objects with no file I/O of any kind. +#' +#' The wrapper \code{\link{get.trait.data.pft}} handles directory creation, +#' caching, CSV output, and BETYdb registration via \code{dbfile.insert}. +#' This function handles only the query. Choosing between them is the +#' provenance opt-in: calling the wrapper saves artifacts to disk; calling +#' this function never does. +#' +#' This follows the pattern established by \code{meta_analysis_standalone} +#' for the meta-analysis step and \code{get_parameter_samples} for parameter +#' sampling — each is a computation core that can be tested in isolation +#' without a filesystem or a \code{settings} object. +#' +#' @param pft_name character. PFT name as stored in BETYdb. +#' @param modeltype character or NULL. Disambiguates PFTs that share a name +#' across model types (e.g. \code{"SIPNET"}, \code{"ED2"}). +#' @param dbcon database connection from \code{\link[PEcAn.DB]{db.open}}. +#' @param trait_names character vector of trait names to retrieve. +#' @param constants named list from \code{pft$constants} in the settings. +#' Traits named here are excluded from the returned priors because their +#' values are fixed rather than sampled by the meta-analysis. +#' +#' @return Named list with three elements: +#' \describe{ +#' \item{\code{trait_data}}{Named list of data frames, one per trait that +#' has observations. Column structure matches what +#' \code{meta_analysis_standalone} expects. Traits with no observations +#' are omitted from the list.} +#' \item{\code{prior_distns}}{Data frame with columns \code{distn}, +#' \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits +#' listed in \code{constants} are excluded.} +#' \item{\code{pft_info}}{List with \code{name}, \code{pft_id}, +#' \code{pft_type}, \code{pft_members}, \code{pft_member_filename}, and +#' \code{posteriorid}. \code{pft_members} is the data frame of species or +#' cultivar IDs used during the query. \code{pft_member_filename} is +#' \code{"species.csv"} or \code{"cultivars.csv"} depending on PFT type. +#' \code{posteriorid} is always \code{NULL} — the wrapper sets it after +#' registering outputs in BETYdb.} +#' } +#' +#' @seealso \code{\link{get.trait.data.pft}} for the backward-compatible +#' wrapper that handles provenance and caching. +#' \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous +#' function in the meta-analysis step. +#' \code{get_parameter_samples} for the analogous function in the +#' parameter sampling step. +#' +#' @examples +#' \dontrun{ +#' dbcon <- PEcAn.DB::db.open(list( +#' host = "localhost", user = "bety", +#' password = "bety", dbname = "bety" +#' )) +#' result <- get_trait_data_pft( +#' pft_name = "temperate.deciduous", +#' modeltype = "SIPNET", +#' dbcon = dbcon, +#' trait_names = c("SLA", "Vcmax", "leaf_respiration_rate_m2") +#' ) +#' str(result$trait_data) +#' str(result$prior_distns) +#' PEcAn.DB::db.close(dbcon) +#' } +#' +#' @author David LeBauer, Shawn Serbin, Alexey Shiklomanov, Om Kapale +#' @export +get_trait_data_pft <- function(pft_name, + modeltype, + dbcon, + trait_names, + constants = list()) { + + # ---- Input validation (cheap checks before any DB call) ---- + if (!is.character(pft_name) || length(pft_name) != 1L) { + PEcAn.logger::logger.severe("'pft_name' must be a single character string") + } + if (!is.character(trait_names) || length(trait_names) == 0L) { + PEcAn.logger::logger.severe( + "'trait_names' must be a non-empty character vector" + ) + } + if (!inherits(dbcon, "DBIConnection")) { + PEcAn.logger::logger.severe("'dbcon' must be a database connection") + } + + # ---- Resolve PFT to a single database record ---- + pft_record <- query_pfts(dbcon, pft_name, modeltype, strict = TRUE) + if (nrow(pft_record) > 1L) { + PEcAn.logger::logger.severe( + "Multiple PFTs named '", pft_name, "' found in the database;", + " pass modeltype to disambiguate." + ) + } + + pft_id <- pft_record[["id"]] + pft_type <- pft_record[["pft_type"]] + + PEcAn.logger::logger.info( + "Querying trait data for PFT '", pft_name, "' (id = ", pft_id, ")" + ) + + # ---- Fetch PFT member species or cultivars ---- + # The join table depends on pft_type. An unknown type is a hard error — + # silently falling through to the species path would produce wrong results. + if (identical(pft_type, "cultivar")) { + pft_member_filename <- "cultivars.csv" + members <- query.pft_cultivars(pft = pft_name, modeltype = modeltype, + con = dbcon) + } else if (identical(pft_type, "plant")) { + pft_member_filename <- "species.csv" + members <- query.pft_species(pft = pft_name, modeltype = modeltype, + con = dbcon) + } else { + PEcAn.logger::logger.severe( + "Unknown pft_type '", pft_type, "' for PFT '", pft_name, + "'; expected 'plant' or 'cultivar'." + ) + } + + # Normalise empty strings to NA so membership comparisons are consistent + members <- members |> + dplyr::mutate(dplyr::across( + dplyr::where(is.character), + \(x) dplyr::na_if(x, "") + )) + + member_ids <- members[["id"]] + + if (length(member_ids) == 0L) { + PEcAn.logger::logger.info( + "PFT '", pft_name, "' has no associated ", + if (identical(pft_type, "cultivar")) "cultivars" else "species", + "; trait_data will be an empty list." + ) + } + + # ---- Query prior distributions ---- + # format() prevents integer64 from being silently coerced to double in SQL. + prior_distns <- query.priors( + pft = format(pft_id, scientific = FALSE), + trstr = PEcAn.utils::vecpaste(trait_names), + con = dbcon + ) + + # Exclude traits listed in pft$constants — their values are fixed and must + # not be sampled by the meta-analysis. + if (length(constants) > 0L && !is.null(names(constants))) { + constant_traits <- names(constants) + in_constants <- rownames(prior_distns) %in% constant_traits + if (any(in_constants)) { + PEcAn.logger::logger.info( + "Excluding ", sum(in_constants), " constant trait(s) from priors: ", + PEcAn.utils::vecpaste(rownames(prior_distns)[in_constants]) + ) + prior_distns <- prior_distns[!in_constants, , drop = FALSE] + } + } + + # ---- Query trait observations ---- + # Only query traits that actually have a prior — querying without a prior + # is meaningless for meta-analysis. + traits_with_priors <- rownames(prior_distns) + if (length(member_ids) > 0L && length(traits_with_priors) > 0L) { + trait_data <- query.traits( + ids = member_ids, + priors = traits_with_priors, + con = dbcon, + ids_are_cultivars = identical(pft_type, "cultivar") + ) + } else { + trait_data <- list() + } + + PEcAn.logger::logger.info( + "PFT '", pft_name, "': ", + length(trait_data), " trait(s) with observations, ", + nrow(prior_distns), " trait(s) with priors" + ) + + # posteriorid is always NULL here — the wrapper assigns it after registering + # the output files in BETYdb via dbfile.insert(). + # pft_members and pft_member_filename are included so the wrapper can use + # them for cache comparison and CSV output without re-querying the database. + pft_info <- list( + name = pft_name, + pft_id = pft_id, + pft_type = pft_type, + pft_members = members, + pft_member_filename = pft_member_filename, + posteriorid = NULL + ) + + return(list( + trait_data = trait_data, + prior_distns = prior_distns, + pft_info = pft_info + )) +} \ No newline at end of file diff --git a/base/db/man/get.trait.data.pft.Rd b/base/db/man/get.trait.data.pft.Rd index 406e84e02e..276573fde8 100644 --- a/base/db/man/get.trait.data.pft.Rd +++ b/base/db/man/get.trait.data.pft.Rd @@ -9,9 +9,10 @@ get.trait.data.pft( modeltype, dbfiles, dbcon, - trait.names, + trait.names = NULL, forceupdate = FALSE, - write = FALSE + write = TRUE, + return_data = FALSE ) } \arguments{ @@ -31,45 +32,63 @@ see if an update is needed.} \item{write}{(Logical) If \code{TRUE} updated posteriors will be written to BETYdb. Defaults to \code{FALSE}.} + +\item{return_data}{(Logical) If \code{TRUE}, the returned \code{pft} list also +includes \code{trait_data} and \code{prior_distns} as in-memory objects. +Defaults to \code{FALSE} to preserve legacy behavior — when \code{pft} is +embedded inside a \code{settings} object, attaching these data frames by +default would inflate the settings and break serialization.} } \value{ -The \code{pft} input list, updated with \code{pft$posteriorid} set to the -ID of the (possibly new) posterior record in BETYdb. The posterior ID can -be used to locate the output files (\code{trait.data.Rdata}, \code{prior.distns.Rdata}, -etc.) via BETYdb's \code{dbfiles} table. +The updated \code{pft} list with \code{posteriorid} set to the +ID of the (possibly new) posterior record in BETYdb. The posterior ID +can be used to locate the output files (\code{trait.data.Rdata}, +\code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table. + +When \code{return_data = TRUE}, the returned list also includes +\code{trait_data} and \code{prior_distns} as in-memory objects, so +downstream callers can use them without reloading from \code{pft$outdir}. } \description{ -Get trait data from the database for a single PFT +Queries BETYdb for trait observations and prior distributions for a single +plant functional type (PFT). Results are saved to files in the PFT output +directory (\code{pft$outdir}), and also registered in the database as posterior +records when \code{write = TRUE}. } \details{ \code{pft} should be a list containing at least \code{name} and \code{outdir}, and optionally \code{posteriorid} and \code{constants}. +} +\section{File-based side effects}{ -\strong{File-based side effects (saved to \code{pft$outdir}):} +The following files are written to \code{pft$outdir}: \describe{ -\item{\code{trait.data.Rdata}}{Contains a single object \code{trait.data}: a named -list of data frames, one per trait. Each data frame has columns from -BETYdb's traits/yields views (e.g., \code{mean}, \code{stat}, \code{n}, \code{site_id}, -\code{treatment_id}). Names correspond to trait variable names -(e.g., \code{"SLA"}, \code{"Vcmax"}).} -\item{\code{prior.distns.Rdata}}{Contains a single object \code{prior.distns}: a -data frame with one row per trait and columns \code{distn}, \code{parama}, -\code{paramb}, and \code{n}. Row names are trait variable names. Traits listed -in \code{pft$constants} are excluded.} -\item{\code{trait.data.csv}}{CSV export of \code{trait.data} (all traits -row-bound).} -\item{\code{prior.distns.csv}}{CSV export of \code{prior.distns}.} -\item{\code{species.csv} or \code{cultivars.csv}}{PFT membership list used to -detect changes between runs.} +\item{\code{trait.data.Rdata}}{Named list of trait data frames, one per +trait that has observations. Read by \code{run.meta.analysis.pft()}.} +\item{\code{prior.distns.Rdata}}{Data frame of prior distributions, rows +named by trait. Read by \code{run.meta.analysis.pft()}.} +\item{\code{species.csv} or \code{cultivars.csv}}{Data frame of PFT +member species or cultivar IDs, depending on PFT type.} +\item{\code{trait.data.csv}}{Flattened CSV of all trait observations, +one row per observation, for human inspection.} +} +In addition, each output file is registered in BETYdb via +\code{dbfile.insert()} and associated with the posterior record whose ID +is returned as \code{pft$posteriorid}. } -\strong{Downstream contract:} The files \code{trait.data.Rdata} and -\code{prior.distns.Rdata} are expected by \code{run.meta.analysis.pft}, which -loads them from \code{pft$outdir}. This implicit file-based coupling means -the two functions must agree on directory path and object names. A future -refactoring goal is to pass these objects directly via function arguments -instead. +\section{Downstream contract}{ + +\code{run.meta.analysis.pft()} reads \code{trait.data.Rdata} and +\code{prior.distns.Rdata} from \code{pft$outdir} at the start of the +meta-analysis step. These two files are therefore a required output of +this wrapper. + +Note: this file-based contract will be removed once +\code{run.meta.analysis.pft()} is refactored to accept in-memory inputs +(GSoC Week 2). } + \author{ David LeBauer, Shawn Serbin, Rob Kooper } diff --git a/base/db/man/get_trait_data_pft.Rd b/base/db/man/get_trait_data_pft.Rd new file mode 100644 index 0000000000..87086ba3a0 --- /dev/null +++ b/base/db/man/get_trait_data_pft.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_trait_data_pft.R +\name{get_trait_data_pft} +\alias{get_trait_data_pft} +\title{Retrieve trait data and priors for one PFT from BETYdb} +\usage{ +get_trait_data_pft(pft_name, modeltype, dbcon, trait_names, constants = list()) +} +\arguments{ +\item{pft_name}{character. PFT name as stored in BETYdb.} + +\item{modeltype}{character or NULL. Disambiguates PFTs that share a name +across model types (e.g. \code{"SIPNET"}, \code{"ED2"}).} + +\item{dbcon}{database connection from \code{\link[PEcAn.DB]{db.open}}.} + +\item{trait_names}{character vector of trait names to retrieve.} + +\item{constants}{named list from \code{pft$constants} in the settings. +Traits named here are excluded from the returned priors because their +values are fixed rather than sampled by the meta-analysis.} +} +\value{ +Named list with three elements: +\describe{ + \item{\code{trait_data}}{Named list of data frames, one per trait that + has observations. Column structure matches what + \code{meta_analysis_standalone} expects. Traits with no observations + are omitted from the list.} + \item{\code{prior_distns}}{Data frame with columns \code{distn}, + \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits + listed in \code{constants} are excluded.} + \item{\code{pft_info}}{List with \code{name}, \code{pft_id}, + \code{pft_type}, \code{pft_members}, \code{pft_member_filename}, and + \code{posteriorid}. \code{pft_members} is the data frame of species or + cultivar IDs used during the query. \code{pft_member_filename} is + \code{"species.csv"} or \code{"cultivars.csv"} depending on PFT type. + \code{posteriorid} is always \code{NULL} — the wrapper sets it after + registering outputs in BETYdb.} +} +} +\description{ +Core computation extracted from \code{\link{get.trait.data.pft}}. +Queries the database for trait observations and prior distributions, +returning them as R objects with no file I/O of any kind. +} +\details{ +The wrapper \code{\link{get.trait.data.pft}} handles directory creation, +caching, CSV output, and BETYdb registration via \code{dbfile.insert}. +This function handles only the query. Choosing between them is the +provenance opt-in: calling the wrapper saves artifacts to disk; calling +this function never does. + +This follows the pattern established by \code{meta_analysis_standalone} +for the meta-analysis step and \code{get_parameter_samples} for parameter +sampling — each is a computation core that can be tested in isolation +without a filesystem or a \code{settings} object. +} +\examples{ +\dontrun{ +dbcon <- PEcAn.DB::db.open(list( + host = "localhost", user = "bety", + password = "bety", dbname = "bety" +)) +result <- get_trait_data_pft( + pft_name = "temperate.deciduous", + modeltype = "SIPNET", + dbcon = dbcon, + trait_names = c("SLA", "Vcmax", "leaf_respiration_rate_m2") +) +str(result$trait_data) +str(result$prior_distns) +PEcAn.DB::db.close(dbcon) +} + +} +\seealso{ +\code{\link{get.trait.data.pft}} for the backward-compatible + wrapper that handles provenance and caching. + \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous + function in the meta-analysis step. + \code{get_parameter_samples} for the analogous function in the + parameter sampling step. +} +\author{ +David LeBauer, Shawn Serbin, Alexey Shiklomanov, Om Kapale +} diff --git a/base/db/tests/testthat/test-get.trait.data.pft.R b/base/db/tests/testthat/test-get.trait.data.pft.R index dbcc5ddfcf..91257db6ff 100644 --- a/base/db/tests/testthat/test-get.trait.data.pft.R +++ b/base/db/tests/testthat/test-get.trait.data.pft.R @@ -25,6 +25,7 @@ make_empty_pft <- function(outdir) { std_modeltype <- "SIPNET" std_traits <- c("SLA", "Vcmax", "leaf_respiration_rate_m2") +std_pft <- "temperate.deciduous" cleanup_posterior <- function(dbcon, posteriorid) { if (!is.null(posteriorid)) { @@ -91,7 +92,7 @@ test_that("errors for non-existent PFT name", { dbcon = test_dbcon, trait.names = std_traits ), - "Could not find pft" + "PFTs were not found" ) }) @@ -182,17 +183,28 @@ test_that("returns pft list with name, posteriorid, and outdir", { expect_true("posteriorid" %in% names(result)) }) -test_that("return value does not include trait_data or prior_distns", { +test_that("wrapper attaches trait_data and prior_distns to returned pft", { + # Documents the Week 1 enhancement: these fields are now returned so that + # run.meta.analysis.pft() can skip load() calls in a later PR test_dbcon <- check_db_test() withr::defer(PEcAn.DB::db.close(test_dbcon)) outdir <- withr::local_tempdir() result <- get.trait.data.pft( pft = make_test_pft(outdir), modeltype = std_modeltype, - dbfiles = outdir, dbcon = test_dbcon, trait.names = std_traits + dbfiles = outdir, dbcon = test_dbcon, trait.names = std_traits, return_data = TRUE ) withr::defer(cleanup_posterior(test_dbcon, result$posteriorid)) - expect_false("trait_data" %in% names(result)) - expect_false("prior_distns" %in% names(result)) + + expect_true("trait_data" %in% names(result)) + expect_true("prior_distns" %in% names(result)) + + # The attached objects must match what was saved — no silent divergence + trait_env <- new.env(parent = emptyenv()) + prior_env <- new.env(parent = emptyenv()) + load(file.path(outdir, "trait.data.Rdata"), envir = trait_env) + load(file.path(outdir, "prior.distns.Rdata"), envir = prior_env) + expect_identical(result$trait_data, trait_env$trait.data) + expect_identical(result$prior_distns, prior_env$prior.distns) }) # PFT with no observations @@ -272,4 +284,130 @@ test_that("reference species and cultivar PFTs write traits properly", { expect_gt(file.info(allcv_csv)$size, file.info(cv_csv)$size) expect_gt(file.info(allcv_trt)$size, file.info(cv_trt)$size) -}) \ No newline at end of file +}) + +test_that("wrapper attaches trait_data and prior_distns on the cache-HIT path", { + # This guards the early-return path: when foundallfiles is TRUE the wrapper + # copies files from the DB store and returns early. trait_data and + # prior_distns must be present on that path too. + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + outdir1 <- withr::local_tempdir() + outdir2 <- withr::local_tempdir() + + # First call — populates the cache and the DB posterior record + result1 <- get.trait.data.pft( + pft = make_test_pft(outdir1), + modeltype = std_modeltype, + dbfiles = dbdir, + dbcon = test_dbcon, + trait.names = std_traits, + write = TRUE + ) + withr::defer(cleanup_posterior(test_dbcon, result1$posteriorid)) + + # Second call — supplies the posteriorid from the first call so the + # wrapper finds valid files in the DB and takes the cache-hit path. + cached_pft <- make_test_pft(outdir2) + cached_pft$posteriorid <- result1$posteriorid + + result2 <- get.trait.data.pft( + pft = cached_pft, + modeltype = std_modeltype, + dbfiles = outdir2, + dbcon = test_dbcon, + trait.names = std_traits, + return_data = TRUE + ) + + expect_true("trait_data" %in% names(result2), + info = "trait_data missing on cache-hit return path") + expect_true("prior_distns" %in% names(result2), + info = "prior_distns missing on cache-hit return path") + expect_type(result2$trait_data, "list") + expect_s3_class(result2$prior_distns, "data.frame") +}) + +test_that("cache-hit in-memory objects match the files copied to outdir", { + # Verifies the objects attached on the cache-hit path are identical to + # what was written to disk on the original (cache-miss) run. + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + outdir_miss <- withr::local_tempdir() + outdir_hit <- withr::local_tempdir() + + result_miss <- get.trait.data.pft( + pft = make_test_pft(outdir_miss), + modeltype = std_modeltype, + dbfiles = dbdir, + dbcon = test_dbcon, + trait.names = std_traits, + write = TRUE + ) + withr::defer(cleanup_posterior(test_dbcon, result_miss$posteriorid)) + + cached_pft <- make_test_pft(outdir_hit) + cached_pft$posteriorid <- result_miss$posteriorid + + result_hit <- get.trait.data.pft( + pft = cached_pft, + modeltype = std_modeltype, + dbfiles = outdir_hit, + dbcon = test_dbcon, + trait.names = std_traits, + return_data = TRUE + ) + + # Load the files the cache copied to outdir_hit + trait_env <- new.env(parent = emptyenv()) + prior_env <- new.env(parent = emptyenv()) + load(file.path(outdir_hit, "trait.data.Rdata"), envir = trait_env) + load(file.path(outdir_hit, "prior.distns.Rdata"), envir = prior_env) + + expect_identical(result_hit$trait_data, trait_env$trait.data) + expect_identical(result_hit$prior_distns, prior_env$prior.distns) +}) + +test_that("wrapper does NOT attach trait_data/prior_distns by default (legacy)", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + outdir <- withr::local_tempdir() + result <- get.trait.data.pft( + pft = make_test_pft(outdir), + modeltype = std_modeltype, + dbfiles = outdir, + dbcon = test_dbcon, + trait.names = std_traits + ) + withr::defer(cleanup_posterior(test_dbcon, result$posteriorid)) + expect_false("trait_data" %in% names(result)) + expect_false("prior_distns" %in% names(result)) +}) + +test_that("wrapper errors for unknown pft_type, not silent fallback", { + # Verifies the explicit guard in get.trait.data.pft(): + # an unrecognised pft_type must throw, not silently fall through + # to the species-query path. We mock query_pfts to return a bad record + # so no live DB call is needed. + fake_record <- data.frame( + id = 1L, + pft_type = "weird", + name = std_pft, + stringsAsFactors = FALSE + ) + mockery::stub(get.trait.data.pft, "query_pfts", fake_record) + fake_dbcon <- structure(list(), class = c("PostgreSQLConnection", + "DBIConnection")) + outdir <- withr::local_tempdir() + expect_error( + get.trait.data.pft( + pft = make_test_pft(outdir), + modeltype = std_modeltype, + dbfiles = outdir, + dbcon = fake_dbcon, + trait.names = std_traits + ) + ) +}) diff --git a/base/db/tests/testthat/test-get_trait_data_pft.R b/base/db/tests/testthat/test-get_trait_data_pft.R new file mode 100644 index 0000000000..6aa2d31131 --- /dev/null +++ b/base/db/tests/testthat/test-get_trait_data_pft.R @@ -0,0 +1,262 @@ +# Tests for get_trait_data_pft() + +old_log_level <- PEcAn.logger::logger.getLevel() +PEcAn.logger::logger.setLevel("WARN") +teardown({ + PEcAn.logger::logger.setLevel(old_log_level) +}) + +std_pft <- "temperate.deciduous" +std_modeltype <- "SIPNET" +std_traits <- c("SLA", "Vcmax", "leaf_respiration_rate_m2") + +# Input validation — no DB connection needed for these + +test_that("errors when pft_name is not a single string", { + # pft_name is validated before dbcon, so list() is safe to pass here + expect_error( + get_trait_data_pft( + pft_name = c("pft1", "pft2"), + modeltype = std_modeltype, + dbcon = list(), + trait_names = "SLA" + ) + ) +}) + +test_that("errors when trait_names is empty", { + # trait_names is validated before dbcon, so list() is safe to pass here + expect_error( + get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = list(), + trait_names = character(0) + ) + ) +}) + +test_that("errors when dbcon is not a database connection", { + expect_error( + get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = list(), + trait_names = "SLA" + ) + ) +}) + +# Database-backed tests + +test_that("errors for a PFT name not in the database", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + expect_error( + get_trait_data_pft( + pft_name = "NOTAPFT_GSOC2026", + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = "SLA" + ) + ) +}) + +test_that("returns a named list with trait_data, prior_distns, pft_info", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + expect_named(result, c("trait_data", "prior_distns", "pft_info")) +}) + +test_that("trait_data is a named list of data frames", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + expect_type(result$trait_data, "list") + for (df in result$trait_data) { + expect_s3_class(df, "data.frame") + # These columns are the contract that meta_analysis_standalone expects + expect_true(all(c("mean", "stat", "statname", "n", "site_id", + "greenhouse", "specie_id", "citation_id") %in% + names(df))) + } +}) + +test_that("prior_distns is a data frame with the required columns", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + expect_s3_class(result$prior_distns, "data.frame") + expect_true(all(c("distn", "parama", "paramb") %in% + colnames(result$prior_distns))) + # Row names are trait names — the contract downstream functions rely on + expect_true(all(rownames(result$prior_distns) %in% std_traits)) +}) + +test_that("pft_info contains expected fields and posteriorid is NULL", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = "SLA" + ) + expect_named( + result$pft_info, + c("name", "pft_id", "pft_type", "pft_members", "pft_member_filename", + "posteriorid"), + ignore.order = TRUE + ) + expect_equal(result$pft_info$name, std_pft) + # posteriorid is always NULL here — the wrapper sets it after DB registration + expect_null(result$pft_info$posteriorid) + # pft_members must be a data frame with at least an id column + expect_s3_class(result$pft_info$pft_members, "data.frame") + expect_true("id" %in% names(result$pft_info$pft_members)) + # pft_member_filename must be species.csv or cultivars.csv + expect_true( + result$pft_info$pft_member_filename %in% c("species.csv", "cultivars.csv") + ) +}) + +test_that("no files are written to disk", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + tmp <- withr::local_tempdir() + old_wd <- setwd(tmp) + withr::defer(setwd(old_wd)) + + get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + expect_length(list.files(tmp, pattern = "\\.Rdata$", recursive = TRUE), 0L) +}) + +test_that("constants are excluded from prior_distns", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + # Get the full prior list first so we know which traits actually have priors + result_full <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + traits_with_priors <- rownames(result_full$prior_distns) + skip_if(length(traits_with_priors) < 2L, + "Need at least 2 traits with priors to test constants exclusion") + + constant_trait <- traits_with_priors[[1L]] + + result_const <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits, + constants = stats::setNames(list(1.0), constant_trait) + ) + + expect_false(constant_trait %in% rownames(result_const$prior_distns)) + expect_equal(nrow(result_const$prior_distns), + nrow(result_full$prior_distns) - 1L) +}) + +test_that("end-to-end: standalone gives identical objects to what wrapper saves", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + outdir <- withr::local_tempdir() + test_pft <- list( + name = std_pft, + outdir = outdir, + posteriorid = NULL, + constants = list() + ) + + # Run the wrapper — it writes trait.data.Rdata and prior.distns.Rdata + wrapper_result <- get.trait.data.pft( + pft = test_pft, + modeltype = std_modeltype, + dbfiles = outdir, + dbcon = test_dbcon, + trait.names = std_traits + ) + withr::defer({ + if (!is.null(wrapper_result$posteriorid)) { + try(DBI::dbExecute(test_dbcon, + "DELETE FROM dbfiles WHERE container_type = 'Posterior' AND container_id = $1", + list(wrapper_result$posteriorid)), silent = TRUE) + try(DBI::dbExecute(test_dbcon, + "DELETE FROM posteriors WHERE id = $1", + list(wrapper_result$posteriorid)), silent = TRUE) + } + }) + + standalone_result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + # Load the on-disk files the wrapper created + trait_env <- new.env(parent = emptyenv()) + prior_env <- new.env(parent = emptyenv()) + load(file.path(outdir, "trait.data.Rdata"), envir = trait_env) + load(file.path(outdir, "prior.distns.Rdata"), envir = prior_env) + + expect_identical(standalone_result$trait_data, trait_env$trait.data) + expect_identical(standalone_result$prior_distns, prior_env$prior.distns) +}) + +test_that("errors for unknown pft_type returns an error, not silent fallback", { + # This verifies the explicit guard added in get_trait_data_pft(): + # an unrecognised pft_type must throw, not silently fall through + # to the species-query path. We simulate a bad record by mocking + # query_pfts — no live DB call needed. + fake_record <- data.frame(id = 1L, pft_type = "unknown_type", + name = std_pft, stringsAsFactors = FALSE) + mockery::stub(get_trait_data_pft, "query_pfts", fake_record) + + fake_dbcon <- structure(list(), class = c("PostgreSQLConnection", + "DBIConnection")) + expect_error( + get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = fake_dbcon, + trait_names = "SLA" + ) + ) +}) \ No newline at end of file