From ef6f12dfa901e9d153fb87fb191cba99b1f68a28 Mon Sep 17 00:00:00 2001 From: Om Kapale Date: Mon, 11 May 2026 19:35:05 +0530 Subject: [PATCH 1/4] feat: extract get_trait_data_pft() standalone + enhance wrapper (Modularity Part 2) --- base/db/R/get.trait.data.pft.R | 4 + base/db/R/get_trait_data_pft.R | 182 ++++++++++++++ .../tests/testthat/test-get.trait.data.pft.R | 17 +- .../tests/testthat/test-get_trait_data_pft.R | 235 ++++++++++++++++++ 4 files changed, 435 insertions(+), 3 deletions(-) create mode 100644 base/db/R/get_trait_data_pft.R create mode 100644 base/db/tests/testthat/test-get_trait_data_pft.R diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R index 30c4005915..ee2418a777 100644 --- a/base/db/R/get.trait.data.pft.R +++ b/base/db/R/get.trait.data.pft.R @@ -386,5 +386,9 @@ get.trait.data.pft <- } } + # caching — is unchanged. Callers that only use pft$name / pft$outdir / + # pft$posteriorid are unaffected. + pft$trait_data <- trait.data + pft$prior_distns <- prior.distns return(pft) } diff --git a/base/db/R/get_trait_data_pft.R b/base/db/R/get_trait_data_pft.R new file mode 100644 index 0000000000..e380ac086e --- /dev/null +++ b/base/db/R/get_trait_data_pft.R @@ -0,0 +1,182 @@ +#' Retrieve trait data and priors for one PFT from BETYdb +#' +#' Core computation extracted from \code{\link{get.trait.data.pft}}. +#' Queries the database for trait observations and prior distributions, +#' returning them as R objects with no file I/O of any kind. +#' +#' The wrapper \code{\link{get.trait.data.pft}} handles directory creation, +#' caching, CSV output, and BETYdb registration via \code{dbfile.insert}. +#' This function handles only the query. Choosing between them is the +#' provenance opt-in: calling the wrapper saves artifacts to disk; calling +#' this function never does. +#' +#' This follows the pattern established by \code{meta_analysis_standalone} +#' for the meta-analysis step and \code{\link{get_parameter_samples}} for +#' parameter sampling — each is a computation core that can be tested in +#' isolation without a filesystem or a \code{settings} object. +#' +#' @param pft_name character. PFT name as stored in BETYdb. +#' @param modeltype character or NULL. Disambiguates PFTs that share a name +#' across model types (e.g. \code{"SIPNET"}, \code{"ED2"}). +#' @param dbcon database connection from \code{\link[PEcAn.DB]{db.open}}. +#' @param trait_names character vector of trait names to retrieve. +#' @param constants named list from \code{pft$constants} in the settings. +#' Traits named here are excluded from the returned priors because their +#' values are fixed rather than sampled by the meta-analysis. +#' +#' @return Named list with three elements: +#' \describe{ +#' \item{\code{trait_data}}{Named list of data frames, one per trait that +#' has observations. Column structure matches what +#' \code{meta_analysis_standalone} expects. Traits with no observations +#' are omitted from the list.} +#' \item{\code{prior_distns}}{Data frame with columns \code{distn}, +#' \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits +#' listed in \code{constants} are excluded.} +#' \item{\code{pft_info}}{List with \code{name}, \code{pft_id}, +#' \code{pft_type}, and \code{posteriorid}. \code{posteriorid} is always +#' \code{NULL} — the wrapper sets it after registering outputs in BETYdb.} +#' } +#' +#' @seealso \code{\link{get.trait.data.pft}} for the backward-compatible +#' wrapper that handles provenance and caching. +#' \code{\link[PEcAn.MA]{meta_analysis_standalone}} for the analogous +#' function in the meta-analysis step. +#' \code{\link{get_parameter_samples}} for the analogous function in the +#' parameter sampling step. +#' +#' @examples +#' \dontrun{ +#' dbcon <- PEcAn.DB::db.open(list( +#' host = "localhost", user = "bety", +#' password = "bety", dbname = "bety" +#' )) +#' result <- get_trait_data_pft( +#' pft_name = "temperate.deciduous", +#' modeltype = "SIPNET", +#' dbcon = dbcon, +#' trait_names = c("SLA", "Vcmax", "leaf_respiration_rate_m2") +#' ) +#' str(result$trait_data) +#' str(result$prior_distns) +#' PEcAn.DB::db.close(dbcon) +#' } +#' +#' @author David LeBauer, Shawn Serbin, Alexey Shiklomanov, Om Kapale +#' @export +get_trait_data_pft <- function(pft_name, + modeltype, + dbcon, + trait_names, + constants = list()) { + + # Validate the cheap arguments before making any database calls + if (!is.character(pft_name) || length(pft_name) != 1L) { + PEcAn.logger::logger.severe("'pft_name' must be a single character string") + } + if (!is.character(trait_names) || length(trait_names) == 0L) { + PEcAn.logger::logger.severe( + "'trait_names' must be a non-empty character vector" + ) + } + if (!inherits(dbcon, "DBIConnection")) { + PEcAn.logger::logger.severe("'dbcon' must be a database connection") + } + + # Resolve PFT name to a single database record. + # strict = TRUE gives a clear error when the PFT is not found rather than + # returning an empty data frame silently. + pft_record <- query_pfts(dbcon, pft_name, modeltype, strict = TRUE) + + if (nrow(pft_record) > 1L) { + PEcAn.logger::logger.severe( + "Multiple PFTs named '", pft_name, "' found in the database;", + " pass modeltype to disambiguate." + ) + } + + pft_id <- pft_record[["id"]] + pft_type <- pft_record[["pft_type"]] + + PEcAn.logger::logger.info( + "Querying trait data for PFT '", pft_name, "' (id = ", pft_id, ")" + ) + + # Which join table holds the member IDs depends on pft_type + ids_are_cultivars <- identical(pft_type, "cultivar") + + if (ids_are_cultivars) { + members <- query.pft_cultivars(pft = pft_name, modeltype = modeltype, + con = dbcon) + } else { + members <- query.pft_species(pft = pft_name, modeltype = modeltype, + con = dbcon) + } + member_ids <- members[["id"]] + + if (length(member_ids) == 0L) { + PEcAn.logger::logger.info( + "PFT '", pft_name, "' has no associated ", + if (ids_are_cultivars) "cultivars" else "species", + "; trait_data will be an empty list." + ) + } + + # format() prevents integer64 from being silently coerced in the SQL query + # (same approach used in get.trait.data()) + prior_distns <- query.priors( + pft = format(pft_id, scientific = FALSE), + trstr = trait_names, + con = dbcon + ) + + # Traits in pft$constants have fixed values and are never sampled, so they + # should not appear in the prior distributions returned to callers + if (length(constants) > 0L && !is.null(names(constants))) { + constant_traits <- names(constants) + in_constants <- rownames(prior_distns) %in% constant_traits + if (any(in_constants)) { + PEcAn.logger::logger.info( + "Excluding ", sum(in_constants), " constant trait(s) from priors: ", + PEcAn.utils::vecpaste(rownames(prior_distns)[in_constants]) + ) + prior_distns <- prior_distns[!in_constants, , drop = FALSE] + } + } + + # Only query traits that have a prior — querying for traits with no prior + # is meaningless for meta-analysis + traits_with_priors <- rownames(prior_distns) + + if (length(member_ids) > 0L && length(traits_with_priors) > 0L) { + trait_data <- query.traits( + ids = member_ids, + priors = traits_with_priors, + con = dbcon, + ids_are_cultivars = ids_are_cultivars + ) + } else { + trait_data <- list() + } + + PEcAn.logger::logger.info( + "PFT '", pft_name, "': ", + length(trait_data), " trait(s) with observations, ", + nrow(prior_distns), " trait(s) with priors" + ) + + # posteriorid is NULL here — the wrapper sets it after registering the + # output files in BETYdb via dbfile.insert() + pft_info <- list( + name = pft_name, + pft_id = pft_id, + pft_type = pft_type, + posteriorid = NULL + ) + + return(list( + trait_data = trait_data, + prior_distns = prior_distns, + pft_info = pft_info + )) +} \ No newline at end of file diff --git a/base/db/tests/testthat/test-get.trait.data.pft.R b/base/db/tests/testthat/test-get.trait.data.pft.R index dbcc5ddfcf..dfe41af630 100644 --- a/base/db/tests/testthat/test-get.trait.data.pft.R +++ b/base/db/tests/testthat/test-get.trait.data.pft.R @@ -182,7 +182,9 @@ test_that("returns pft list with name, posteriorid, and outdir", { expect_true("posteriorid" %in% names(result)) }) -test_that("return value does not include trait_data or prior_distns", { +test_that("wrapper attaches trait_data and prior_distns to returned pft", { + # Documents the Week 1 enhancement: these fields are now returned so that + # run.meta.analysis.pft() can skip load() calls in a later PR test_dbcon <- check_db_test() withr::defer(PEcAn.DB::db.close(test_dbcon)) outdir <- withr::local_tempdir() @@ -191,8 +193,17 @@ test_that("return value does not include trait_data or prior_distns", { dbfiles = outdir, dbcon = test_dbcon, trait.names = std_traits ) withr::defer(cleanup_posterior(test_dbcon, result$posteriorid)) - expect_false("trait_data" %in% names(result)) - expect_false("prior_distns" %in% names(result)) + + expect_true("trait_data" %in% names(result)) + expect_true("prior_distns" %in% names(result)) + + # The attached objects must match what was saved — no silent divergence + trait_env <- new.env(parent = emptyenv()) + prior_env <- new.env(parent = emptyenv()) + load(file.path(outdir, "trait.data.Rdata"), envir = trait_env) + load(file.path(outdir, "prior.distns.Rdata"), envir = prior_env) + expect_identical(result$trait_data, trait_env$trait.data) + expect_identical(result$prior_distns, prior_env$prior.distns) }) # PFT with no observations diff --git a/base/db/tests/testthat/test-get_trait_data_pft.R b/base/db/tests/testthat/test-get_trait_data_pft.R new file mode 100644 index 0000000000..e778f4b525 --- /dev/null +++ b/base/db/tests/testthat/test-get_trait_data_pft.R @@ -0,0 +1,235 @@ +# Tests for get_trait_data_pft() +# +# Validation tests fire before any DB call and never skip. +# DB-dependent tests call check_db_test() inside each test_that block +# and skip automatically when no connection is available. + +old_log_level <- PEcAn.logger::logger.getLevel() +PEcAn.logger::logger.setLevel("WARN") +teardown({ + PEcAn.logger::logger.setLevel(old_log_level) +}) + +std_pft <- "temperate.deciduous" +std_modeltype <- "SIPNET" +std_traits <- c("SLA", "Vcmax", "leaf_respiration_rate_m2") + +# Input validation — no DB connection needed for these + +test_that("errors when pft_name is not a single string", { + # pft_name is validated before dbcon, so list() is safe to pass here + expect_error( + get_trait_data_pft( + pft_name = c("pft1", "pft2"), + modeltype = std_modeltype, + dbcon = list(), + trait_names = "SLA" + ) + ) +}) + +test_that("errors when trait_names is empty", { + # trait_names is validated before dbcon, so list() is safe to pass here + expect_error( + get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = list(), + trait_names = character(0) + ) + ) +}) + +test_that("errors when dbcon is not a database connection", { + expect_error( + get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = list(), + trait_names = "SLA" + ) + ) +}) + +# Database-backed tests + +test_that("errors for a PFT name not in the database", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + expect_error( + get_trait_data_pft( + pft_name = "NOTAPFT_GSOC2026", + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = "SLA" + ) + ) +}) + +test_that("returns a named list with trait_data, prior_distns, pft_info", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + expect_named(result, c("trait_data", "prior_distns", "pft_info")) +}) + +test_that("trait_data is a named list of data frames", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + expect_type(result$trait_data, "list") + for (df in result$trait_data) { + expect_s3_class(df, "data.frame") + # These columns are the contract that meta_analysis_standalone expects + expect_true(all(c("mean", "stat", "statname", "n", "site_id", + "greenhouse", "specie_id", "citation_id") %in% + names(df))) + } +}) + +test_that("prior_distns is a data frame with the required columns", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + expect_s3_class(result$prior_distns, "data.frame") + expect_true(all(c("distn", "parama", "paramb") %in% + colnames(result$prior_distns))) + # Row names are trait names — the contract downstream functions rely on + expect_true(all(rownames(result$prior_distns) %in% std_traits)) +}) + +test_that("pft_info contains expected fields and posteriorid is NULL", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = "SLA" + ) + + expect_named(result$pft_info, c("name", "pft_id", "pft_type", "posteriorid")) + expect_equal(result$pft_info$name, std_pft) + # posteriorid is always NULL here — the wrapper sets it after DB registration + expect_null(result$pft_info$posteriorid) +}) + +test_that("no files are written to disk", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + tmp <- withr::local_tempdir() + old_wd <- setwd(tmp) + withr::defer(setwd(old_wd)) + + get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + expect_length(list.files(tmp, pattern = "\\.Rdata$", recursive = TRUE), 0L) +}) + +test_that("constants are excluded from prior_distns", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + # Get the full prior list first so we know which traits actually have priors + result_full <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + traits_with_priors <- rownames(result_full$prior_distns) + skip_if(length(traits_with_priors) < 2L, + "Need at least 2 traits with priors to test constants exclusion") + + constant_trait <- traits_with_priors[[1L]] + + result_const <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits, + constants = stats::setNames(list(1.0), constant_trait) + ) + + expect_false(constant_trait %in% rownames(result_const$prior_distns)) + expect_equal(nrow(result_const$prior_distns), + nrow(result_full$prior_distns) - 1L) +}) + +test_that("end-to-end: standalone gives identical objects to what wrapper saves", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + outdir <- withr::local_tempdir() + test_pft <- list( + name = std_pft, + outdir = outdir, + posteriorid = NULL, + constants = list() + ) + + # Run the wrapper — it writes trait.data.Rdata and prior.distns.Rdata + wrapper_result <- get.trait.data.pft( + pft = test_pft, + modeltype = std_modeltype, + dbfiles = outdir, + dbcon = test_dbcon, + trait.names = std_traits + ) + withr::defer({ + if (!is.null(wrapper_result$posteriorid)) { + try(DBI::dbExecute(test_dbcon, + "DELETE FROM dbfiles WHERE container_type = 'Posterior' AND container_id = $1", + list(wrapper_result$posteriorid)), silent = TRUE) + try(DBI::dbExecute(test_dbcon, + "DELETE FROM posteriors WHERE id = $1", + list(wrapper_result$posteriorid)), silent = TRUE) + } + }) + + standalone_result <- get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = test_dbcon, + trait_names = std_traits + ) + + # Load the on-disk files the wrapper created + trait_env <- new.env(parent = emptyenv()) + prior_env <- new.env(parent = emptyenv()) + load(file.path(outdir, "trait.data.Rdata"), envir = trait_env) + load(file.path(outdir, "prior.distns.Rdata"), envir = prior_env) + + expect_identical(standalone_result$trait_data, trait_env$trait.data) + expect_identical(standalone_result$prior_distns, prior_env$prior.distns) +}) \ No newline at end of file From c0caa1b700f14832249e3524b1e14352a01ec9aa Mon Sep 17 00:00:00 2001 From: Om Kapale Date: Mon, 11 May 2026 20:09:10 +0530 Subject: [PATCH 2/4] docs: add NAMESPACE export and Rd file for get_trait_data_pft --- base/db/NAMESPACE | 1 + base/db/R/get.trait.data.pft.R | 32 +++++++----- base/db/R/get_trait_data_pft.R | 10 ++-- base/db/man/get.trait.data.pft.Rd | 7 +-- base/db/man/get_trait_data_pft.Rd | 83 +++++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+), 19 deletions(-) create mode 100644 base/db/man/get_trait_data_pft.Rd diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index c9d6c10b2d..e88ff29962 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -33,6 +33,7 @@ export(get.trait.data) export(get.trait.data.pft) export(get_postgres_envvars) export(get_run_ids) +export(get_trait_data_pft) export(get_users) export(get_var_names) export(get_workflow_ids) diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R index ee2418a777..2e2da9ae21 100644 --- a/base/db/R/get.trait.data.pft.R +++ b/base/db/R/get.trait.data.pft.R @@ -46,9 +46,10 @@ ##' BETYdb. Defaults to `FALSE`. ##' @param trait.names list of trait names to retrieve ##' @return The `pft` input list, updated with `pft$posteriorid` set to the -##' ID of the (possibly new) posterior record in BETYdb. The posterior ID can -##' be used to locate the output files (`trait.data.Rdata`, `prior.distns.Rdata`, -##' etc.) via BETYdb's `dbfiles` table. +##' ID of the (possibly new) posterior record in BETYdb. Also contains +##' `pft$trait_data` and `pft$prior_distns` for in-memory chaining. The +##' posterior ID can be used to locate the output files (`trait.data.Rdata`, +##' `prior.distns.Rdata`, etc.) via BETYdb's `dbfiles` table. ##' @author David LeBauer, Shawn Serbin, Rob Kooper ##' @export get.trait.data.pft <- @@ -59,7 +60,6 @@ get.trait.data.pft <- trait.names, forceupdate = FALSE, write = FALSE) { - # Create directory if necessary if (!file.exists(pft$outdir) && !dir.create(pft$outdir, recursive = TRUE)) { @@ -303,11 +303,19 @@ get.trait.data.pft <- } } - # get the trait data (including sampling of derived traits, if any) - trait.data <- query.traits(pft_members$id, traits, con = dbcon, - update.check.only = FALSE, - ids_are_cultivars = (pfttype == "cultivar")) - traits <- names(trait.data) + ## Cache miss: delegate the full data query to the standalone function so + ## that query logic lives in one place. This wrapper handles only the + ## file I/O and database registration that follows. + computed <- get_trait_data_pft( + pft_name = pft[["name"]], + modeltype = modeltype, + dbcon = dbcon, + trait_names = trait.names, + constants = if (!is.null(pft$constants)) pft$constants else list() + ) + trait.data <- computed$trait_data + prior.distns <- computed$prior_distns + traits <- names(trait.data) if (length(trait.data) > 0) { trait_counts <- trait.data %>% @@ -386,9 +394,9 @@ get.trait.data.pft <- } } - # caching — is unchanged. Callers that only use pft$name / pft$outdir / - # pft$posteriorid are unaffected. + ## Attach computed objects so downstream callers can chain in-memory + ## without loading files from pft$outdir. pft$trait_data <- trait.data pft$prior_distns <- prior.distns return(pft) -} +} \ No newline at end of file diff --git a/base/db/R/get_trait_data_pft.R b/base/db/R/get_trait_data_pft.R index e380ac086e..be5e9f96b6 100644 --- a/base/db/R/get_trait_data_pft.R +++ b/base/db/R/get_trait_data_pft.R @@ -11,7 +11,7 @@ #' this function never does. #' #' This follows the pattern established by \code{meta_analysis_standalone} -#' for the meta-analysis step and \code{\link{get_parameter_samples}} for +#' for the meta-analysis step and \code{get_parameter_samples} (in PEcAn.DB) for #' parameter sampling — each is a computation core that can be tested in #' isolation without a filesystem or a \code{settings} object. #' @@ -40,9 +40,9 @@ #' #' @seealso \code{\link{get.trait.data.pft}} for the backward-compatible #' wrapper that handles provenance and caching. -#' \code{\link[PEcAn.MA]{meta_analysis_standalone}} for the analogous +#' \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous #' function in the meta-analysis step. -#' \code{\link{get_parameter_samples}} for the analogous function in the +#' \code{get_parameter_samples} (in PEcAn.DB) for the analogous function in the #' parameter sampling step. #' #' @examples @@ -112,6 +112,8 @@ get_trait_data_pft <- function(pft_name, members <- query.pft_species(pft = pft_name, modeltype = modeltype, con = dbcon) } + members <- members %>% + dplyr::mutate_if(is.character, ~dplyr::na_if(., "")) member_ids <- members[["id"]] if (length(member_ids) == 0L) { @@ -126,7 +128,7 @@ get_trait_data_pft <- function(pft_name, # (same approach used in get.trait.data()) prior_distns <- query.priors( pft = format(pft_id, scientific = FALSE), - trstr = trait_names, + trstr = PEcAn.utils::vecpaste(trait_names), con = dbcon ) diff --git a/base/db/man/get.trait.data.pft.Rd b/base/db/man/get.trait.data.pft.Rd index 406e84e02e..bba7aa6dcd 100644 --- a/base/db/man/get.trait.data.pft.Rd +++ b/base/db/man/get.trait.data.pft.Rd @@ -34,9 +34,10 @@ BETYdb. Defaults to \code{FALSE}.} } \value{ The \code{pft} input list, updated with \code{pft$posteriorid} set to the -ID of the (possibly new) posterior record in BETYdb. The posterior ID can -be used to locate the output files (\code{trait.data.Rdata}, \code{prior.distns.Rdata}, -etc.) via BETYdb's \code{dbfiles} table. +ID of the (possibly new) posterior record in BETYdb. Also contains +\code{pft$trait_data} and \code{pft$prior_distns} for in-memory chaining. The +posterior ID can be used to locate the output files (\code{trait.data.Rdata}, +\code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table. } \description{ Get trait data from the database for a single PFT diff --git a/base/db/man/get_trait_data_pft.Rd b/base/db/man/get_trait_data_pft.Rd new file mode 100644 index 0000000000..31252af3ee --- /dev/null +++ b/base/db/man/get_trait_data_pft.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_trait_data_pft.R +\name{get_trait_data_pft} +\alias{get_trait_data_pft} +\title{Retrieve trait data and priors for one PFT from BETYdb} +\usage{ +get_trait_data_pft(pft_name, modeltype, dbcon, trait_names, constants = list()) +} +\arguments{ +\item{pft_name}{character. PFT name as stored in BETYdb.} + +\item{modeltype}{character or NULL. Disambiguates PFTs that share a name +across model types (e.g. \code{"SIPNET"}, \code{"ED2"}).} + +\item{dbcon}{database connection from \code{\link[PEcAn.DB]{db.open}}.} + +\item{trait_names}{character vector of trait names to retrieve.} + +\item{constants}{named list from \code{pft$constants} in the settings. +Traits named here are excluded from the returned priors because their +values are fixed rather than sampled by the meta-analysis.} +} +\value{ +Named list with three elements: +\describe{ + \item{\code{trait_data}}{Named list of data frames, one per trait that + has observations. Column structure matches what + \code{meta_analysis_standalone} expects. Traits with no observations + are omitted from the list.} + \item{\code{prior_distns}}{Data frame with columns \code{distn}, + \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits + listed in \code{constants} are excluded.} + \item{\code{pft_info}}{List with \code{name}, \code{pft_id}, + \code{pft_type}, and \code{posteriorid}. \code{posteriorid} is always + \code{NULL} — the wrapper sets it after registering outputs in BETYdb.} +} +} +\description{ +Core computation extracted from \code{\link{get.trait.data.pft}}. +Queries the database for trait observations and prior distributions, +returning them as R objects with no file I/O of any kind. +} +\details{ +The wrapper \code{\link{get.trait.data.pft}} handles directory creation, +caching, CSV output, and BETYdb registration via \code{dbfile.insert}. +This function handles only the query. Choosing between them is the +provenance opt-in: calling the wrapper saves artifacts to disk; calling +this function never does. + +This follows the pattern established by \code{meta_analysis_standalone} +for the meta-analysis step and \code{get_parameter_samples} (in PEcAn.DB) for +parameter sampling — each is a computation core that can be tested in +isolation without a filesystem or a \code{settings} object. +} +\examples{ +\dontrun{ +dbcon <- PEcAn.DB::db.open(list( + host = "localhost", user = "bety", + password = "bety", dbname = "bety" +)) +result <- get_trait_data_pft( + pft_name = "temperate.deciduous", + modeltype = "SIPNET", + dbcon = dbcon, + trait_names = c("SLA", "Vcmax", "leaf_respiration_rate_m2") +) +str(result$trait_data) +str(result$prior_distns) +PEcAn.DB::db.close(dbcon) +} + +} +\seealso{ +\code{\link{get.trait.data.pft}} for the backward-compatible + wrapper that handles provenance and caching. + \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous + function in the meta-analysis step. + \code{get_parameter_samples} (in PEcAn.DB) for the analogous function in the + parameter sampling step. +} +\author{ +David LeBauer, Shawn Serbin, Alexey Shiklomanov, Om Kapale +} From 3fd20cf4c5f30ed4a8330edf0707f991fd7363a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Om=20Kapale=20=F0=9F=90=99?= Date: Tue, 12 May 2026 03:09:35 +0000 Subject: [PATCH 3/4] fix: remove @md tag conflict, fix cross-references, sync Rd files with roxygen2 7.3.3 --- base/db/DESCRIPTION | 3 +- base/db/R/get.trait.data.pft.R | 268 ++++++++---------- base/db/R/get_trait_data_pft.R | 84 +++--- base/db/man/get.trait.data.pft.Rd | 36 +-- base/db/man/get_trait_data_pft.Rd | 16 +- .../tests/testthat/test-get.trait.data.pft.R | 84 +++++- .../tests/testthat/test-get_trait_data_pft.R | 41 ++- 7 files changed, 305 insertions(+), 227 deletions(-) diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION index b2590784b5..cfb3f9fd52 100644 --- a/base/db/DESCRIPTION +++ b/base/db/DESCRIPTION @@ -80,5 +80,6 @@ Copyright: Authors LazyLoad: yes LazyData: FALSE Encoding: UTF-8 -RoxygenNote: 7.3.3 X-schema.org-keywords: PEcAn, database +Config/roxygen2/version: 8.0.0 +RoxygenNote: 7.3.3 diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R index 2e2da9ae21..a6b5afe36b 100644 --- a/base/db/R/get.trait.data.pft.R +++ b/base/db/R/get.trait.data.pft.R @@ -2,38 +2,19 @@ ##' ##' @md ##' Queries BETYdb for trait observations and prior distributions for a single -##' plant functional type (PFT). Results are saved to files -##' in the PFT output directory (`pft$outdir`), and also registered in the -##' database as posterior records when `write = TRUE`. +##' plant functional type (PFT). Results are saved to files in the PFT output +##' directory (`pft$outdir`), and also registered in the database as posterior +##' records when `write = TRUE`. ##' ##' @details ##' `pft` should be a list containing at least `name` and `outdir`, and ##' optionally `posteriorid` and `constants`. ##' -##' **File-based side effects (saved to `pft$outdir`):** -##' \describe{ -##' \item{`trait.data.Rdata`}{Contains a single object `trait.data`: a named -##' list of data frames, one per trait. Each data frame has columns from -##' BETYdb's traits/yields views (e.g., `mean`, `stat`, `n`, `site_id`, -##' `treatment_id`). Names correspond to trait variable names -##' (e.g., `"SLA"`, `"Vcmax"`).} -##' \item{`prior.distns.Rdata`}{Contains a single object `prior.distns`: a -##' data frame with one row per trait and columns `distn`, `parama`, -##' `paramb`, and `n`. Row names are trait variable names. Traits listed -##' in `pft$constants` are excluded.} -##' \item{`trait.data.csv`}{CSV export of `trait.data` (all traits -##' row-bound).} -##' \item{`prior.distns.csv`}{CSV export of `prior.distns`.} -##' \item{`species.csv` or `cultivars.csv`}{PFT membership list used to -##' detect changes between runs.} -##' } -##' -##' **Downstream contract:** The files `trait.data.Rdata` and -##' `prior.distns.Rdata` are expected by \code{run.meta.analysis.pft}, which -##' loads them from `pft$outdir`. This implicit file-based coupling means -##' the two functions must agree on directory path and object names. A future -##' refactoring goal is to pass these objects directly via function arguments -##' instead. +##' Internally this wrapper delegates all database queries to +##' \code{\link{get_trait_data_pft}} exactly once. The returned objects are +##' used for both the cache-staleness check and the save step, so the database +##' is never queried more than once per call regardless of whether the cache +##' hits or misses. ##' ##' @param pft list of settings for the pft whose traits to retrieve. See details. ##' @param modeltype type of model that is used, this is used to distinguish @@ -47,9 +28,10 @@ ##' @param trait.names list of trait names to retrieve ##' @return The `pft` input list, updated with `pft$posteriorid` set to the ##' ID of the (possibly new) posterior record in BETYdb. Also contains -##' `pft$trait_data` and `pft$prior_distns` for in-memory chaining. The -##' posterior ID can be used to locate the output files (`trait.data.Rdata`, -##' `prior.distns.Rdata`, etc.) via BETYdb's `dbfiles` table. +##' `pft$trait_data` and `pft$prior_distns` for in-memory chaining on both +##' the cache-hit and cache-miss paths. The posterior ID can be used to +##' locate the output files (`trait.data.Rdata`, `prior.distns.Rdata`, +##' etc.) via BETYdb's `dbfiles` table. ##' @author David LeBauer, Shawn Serbin, Rob Kooper ##' @export get.trait.data.pft <- @@ -66,82 +48,56 @@ get.trait.data.pft <- PEcAn.logger::logger.error(paste0("Couldn't create PFT output directory: ", pft$outdir)) } - # find appropriate pft - pftres <- query_pfts(dbcon, pft[["name"]], modeltype) - pfttype <- pftres[["pft_type"]] - pftid <- pftres[["id"]] - - if (nrow(pftres) > 1) { - PEcAn.logger::logger.severe( - "Multiple PFTs named", pft[["name"]], "found,", - "with ids", PEcAn.utils::vecpaste(pftres[["id"]]), ".", - "Specify modeltype to fix this.") - } - - if (nrow(pftres) == 0) { - PEcAn.logger::logger.severe("Could not find pft", pft[["name"]]) - return(NA) - } - - # get the member species/cultivars, we need to check if anything changed - if (pfttype == "plant") { - pft_member_filename = "species.csv" - pft_members <- PEcAn.DB::query.pft_species(pft$name, modeltype, dbcon) - } else if (pfttype == "cultivar") { - pft_member_filename = "cultivars.csv" - pft_members <- PEcAn.DB::query.pft_cultivars(pft$name, modeltype, dbcon) - } else { - PEcAn.logger::logger.severe("Unknown pft type! Expected 'plant' or 'cultivar', got", pfttype) - } - - # ANS: Need to do this conversion for the check against existing - # membership later on. Otherwise, `NA` from the CSV is interpreted - # as different from `""` returned here, even though they are really - # the same thing. - pft_members <- pft_members %>% - dplyr::mutate_if(is.character, ~dplyr::na_if(., "")) - - # get the priors - prior.distns <- PEcAn.DB::query.priors( - pft = pftid, - trstr = PEcAn.utils::vecpaste(trait.names), - con = dbcon) - prior.distns <- prior.distns[which(!rownames(prior.distns) %in% names(pft$constants)),] - traits <- rownames(prior.distns) + # ---- Single DB round-trip via standalone ---- + # All query logic lives in get_trait_data_pft(). The objects it returns + # are used below for both the cache check and the save step — the database + # is never queried a second time. + computed <- get_trait_data_pft( + pft_name = pft[["name"]], + modeltype = modeltype, + dbcon = dbcon, + trait_names = trait.names, + constants = if (!is.null(pft$constants)) pft$constants else list() + ) - # get the trait data (don't bother sampling derived traits until after update check) - trait.data.check <- PEcAn.DB::query.traits(ids = pft_members$id, priors = traits, con = dbcon, update.check.only = TRUE, ids_are_cultivars = (pfttype=="cultivar")) - traits <- names(trait.data.check) + trait.data <- computed$trait_data + prior.distns <- computed$prior_distns + pftid <- computed$pft_info$pft_id + pfttype <- computed$pft_info$pft_type + pft_members <- computed$pft_info$pft_members + pft_member_filename <- computed$pft_info$pft_member_filename - # Set forceupdate FALSE if it's a string (backwards compatible with 'AUTO' flag used in the past) + # Set forceupdate FALSE if it's a string (backwards compatible with 'AUTO') forceupdate <- isTRUE(as.logical(forceupdate)) - # check to see if we need to update + # ---- Cache staleness check ---- if (!forceupdate) { if (is.null(pft$posteriorid)) { - recent_posterior <- dplyr::tbl(dbcon, "posteriors") %>% - dplyr::filter(.data$pft_id == !!pftid) %>% + recent_posterior <- dplyr::tbl(dbcon, "posteriors") |> + dplyr::filter(.data$pft_id == !!pftid) |> dplyr::collect() if (length(recent_posterior) > 0) { - pft$posteriorid <- dplyr::tbl(dbcon, "posteriors") %>% - dplyr::filter(.data$pft_id == !!pftid) %>% - dplyr::arrange(dplyr::desc(.data$created_at)) %>% - utils::head(1) %>% + pft$posteriorid <- dplyr::tbl(dbcon, "posteriors") |> + dplyr::filter(.data$pft_id == !!pftid) |> + dplyr::arrange(dplyr::desc(.data$created_at)) |> + utils::head(1) |> dplyr::pull("id") } else { PEcAn.logger::logger.info("No previous posterior found. Forcing update") } } + if (!is.null(pft$posteriorid)) { - files <- dbfile.check(type = "Posterior", container.id = pft$posteriorid, con = dbcon, - return.all = TRUE) + files <- dbfile.check(type = "Posterior", container.id = pft$posteriorid, + con = dbcon, return.all = TRUE) need_files <- c( - trait_data = "trait.data.Rdata", - priors = "prior.distns.Rdata", + trait_data = "trait.data.Rdata", + priors = "prior.distns.Rdata", pft_membership = pft_member_filename ) ids <- match(need_files, files$file_name) names(ids) <- names(need_files) + if (any(is.na(ids))) { missing_files <- need_files[is.na(ids)] PEcAn.logger::logger.info(paste0( @@ -159,11 +115,13 @@ get.trait.data.pft <- "All posterior files are present. Performing additional checks ", "to determine if meta-analysis needs to be updated." ) - # check if all files exist + + # Check all required files exist on disk need_paths <- file.path(files$file_path[ids], need_files) names(need_paths) <- names(need_files) files_exist <- file.exists(need_paths) foundallfiles <- all(files_exist) + if (!foundallfiles) { PEcAn.logger::logger.warn( "The following files are in database but not found on disk: ", @@ -174,25 +132,23 @@ get.trait.data.pft <- # Check if PFT membership has changed PEcAn.logger::logger.debug("Checking if PFT membership has changed.") if (pfttype == "plant") { - # Columns are: id, genus, species, scientificname - colClass = c("double", "character", "character", "character") + colClass <- c("double", "character", "character", "character") } else if (pfttype == "cultivar") { - # Columns are: id, specie_id, genus, species, scientificname, cultivar - colClass = c("double", "double", "character", "character", "character", "character") - } + colClass <- c("double", "double", "character", "character", + "character", "character") + } existing_membership <- utils::read.csv( need_paths[["pft_membership"]], - # Need this so NA values are formatted consistently - colClasses = colClass, + colClasses = colClass, stringsAsFactors = FALSE, - na.strings = c("", "NA") - ) + na.strings = c("", "NA") + ) diff_membership <- symmetric_setdiff( existing_membership, pft_members, xname = "existing", yname = "current" - ) + ) if (nrow(diff_membership) > 0) { PEcAn.logger::logger.error( "\n PFT membership has changed. \n", @@ -205,7 +161,9 @@ get.trait.data.pft <- # Check if priors have changed PEcAn.logger::logger.debug("Checking if priors have changed") - existing_prior <- PEcAn.utils::load_local(need_paths[["priors"]])[["prior.distns"]] + existing_prior <- PEcAn.utils::load_local( + need_paths[["priors"]] + )[["prior.distns"]] diff_prior <- symmetric_setdiff( dplyr::as_tibble(prior.distns, rownames = "trait"), dplyr::as_tibble(existing_prior, rownames = "trait") @@ -225,25 +183,28 @@ get.trait.data.pft <- existing_trait_data <- PEcAn.utils::load_local( need_paths[["trait_data"]] )[["trait.data"]] - if (length(trait.data.check) != length(existing_trait_data)) { + if (length(trait.data) != length(existing_trait_data)) { PEcAn.logger::logger.warn( "Lengths of new and existing `trait.data` differ. ", "Re-running meta-analysis." ) foundallfiles <- FALSE - } else if (length(trait.data.check) == 0) { - PEcAn.logger::logger.warn("New and existing trait data are both empty. Skipping this check.") + } else if (length(trait.data) == 0) { + PEcAn.logger::logger.warn( + "New and existing trait data are both empty. Skipping this check." + ) } else { - current_traits <- dplyr::bind_rows(trait.data.check, .id = "trait") %>% + current_traits <- dplyr::bind_rows(trait.data, .id = "trait") |> dplyr::select(-mean, -"stat") - existing_traits <- dplyr::bind_rows(existing_trait_data, .id = "trait") %>% + existing_traits <- dplyr::bind_rows(existing_trait_data, + .id = "trait") |> dplyr::select(-mean, -"stat") diff_traits <- symmetric_setdiff(current_traits, existing_traits) if (nrow(diff_traits) > 0) { - diff_summary <- diff_traits %>% + diff_summary <- diff_traits |> dplyr::count(source, .data$trait) PEcAn.logger::logger.error( - "\n Prior has changed. \n", + "\n Trait data has changed. \n", "Here are the number of differing trait records by trait:\n", PEcAn.logger::print2string(diff_summary), wrap = FALSE @@ -251,8 +212,7 @@ get.trait.data.pft <- foundallfiles <- FALSE } } - } - + } # end else (all files on disk) if (foundallfiles) { PEcAn.logger::logger.info( @@ -260,26 +220,29 @@ get.trait.data.pft <- "for PFT", shQuote(pft$name) ) for (id in seq_len(nrow(files))) { - file.copy(from = file.path(files[[id, "file_path"]], files[[id, "file_name"]]), - to = file.path(pft$outdir, files[[id, "file_name"]])) + file.copy( + from = file.path(files[[id, "file_path"]], files[[id, "file_name"]]), + to = file.path(pft$outdir, files[[id, "file_name"]]) + ) } done <- TRUE - # May need to symlink the generic post.distns.Rdata to a specific post.distns.*.Rdata file. + # May need to symlink the generic post.distns.Rdata to the + # model-specific post.distns.*.Rdata file. if (length(list.files(pft$outdir, "post.distns.Rdata")) == 0) { all.files <- list.files(pft$outdir) - post.distn.file <- all.files[grep("post\\.distns\\..*\\.Rdata", all.files)] - if (length(post.distn.file) > 1) + post.distn.file <- all.files[grep("post\\.distns\\..*\\.Rdata", + all.files)] + if (length(post.distn.file) > 1) { PEcAn.logger::logger.severe( "get.trait.data.pft() doesn't know how to ", "handle multiple `post.distns.*.Rdata` files.", "Found the following files: ", paste(shQuote(post.distn.file), collapse = ", ") ) - else if (length(post.distn.file) == 1) { - # Found exactly one post.distns.*.Rdata file. Use it. - link_input <- file.path(pft[["outdir"]], post.distn.file) + } else if (length(post.distn.file) == 1) { + link_input <- file.path(pft[["outdir"]], post.distn.file) link_target <- file.path(pft[["outdir"]], "post.distns.Rdata") PEcAn.logger::logger.debug( "Found exactly one posterior distribution file: ", @@ -297,33 +260,29 @@ get.trait.data.pft <- done <- FALSE } } - if (done) return(pft) + + if (done) { + # Attach computed objects so downstream callers can chain + # in-memory on the cache-HIT path — same guarantee as the + # cache-miss path below. + pft$trait_data <- trait.data + pft$prior_distns <- prior.distns + return(pft) + } } - } - } - } + } # end else (all files in DB) + } # end if (!is.null(pft$posteriorid)) + } # end if (!forceupdate) - ## Cache miss: delegate the full data query to the standalone function so - ## that query logic lives in one place. This wrapper handles only the - ## file I/O and database registration that follows. - computed <- get_trait_data_pft( - pft_name = pft[["name"]], - modeltype = modeltype, - dbcon = dbcon, - trait_names = trait.names, - constants = if (!is.null(pft$constants)) pft$constants else list() - ) - trait.data <- computed$trait_data - prior.distns <- computed$prior_distns - traits <- names(trait.data) + # ---- Cache miss: log counts, save to disk, register in DB ---- if (length(trait.data) > 0) { - trait_counts <- trait.data %>% - dplyr::bind_rows(.id = "trait") %>% + trait_counts <- trait.data |> + dplyr::bind_rows(.id = "trait") |> dplyr::count(.data$trait) - PEcAn.logger::logger.info( - "\n Number of observations per trait for PFT ", shQuote(pft[["name"]]), ":\n", + "\n Number of observations per trait for PFT ", shQuote(pft[["name"]]), + ":\n", PEcAn.logger::print2string(trait_counts, n = Inf, na.print = ""), wrap = FALSE ) @@ -334,36 +293,39 @@ get.trait.data.pft <- ) } - # get list of existing files so they get ignored saving + # Snapshot existing files so we know which ones are new after saving old.files <- list.files(path = pft$outdir) - # create a new posterior + # Create a new posterior record in BETYdb insert_result <- db.query( - paste0("INSERT INTO posteriors (pft_id) VALUES (", pftid, ") RETURNING id"), - con = dbcon) + paste0("INSERT INTO posteriors (pft_id) VALUES (", pftid, + ") RETURNING id"), + con = dbcon + ) pft$posteriorid <- insert_result[["id"]] - # create path where to store files + # Create the storage path for this posterior pathname <- file.path(dbfiles, "posterior", pft$posteriorid) dir.create(pathname, showWarnings = FALSE, recursive = TRUE) - ## 1. get species/cultivar list based on pft - utils::write.csv(pft_members, file.path(pft$outdir, pft_member_filename), + ## Write species/cultivar membership list + utils::write.csv(pft_members, + file.path(pft$outdir, pft_member_filename), row.names = FALSE) - ## save priors + ## Save prior distributions save(prior.distns, file = file.path(pft$outdir, "prior.distns.Rdata")) - utils::write.csv(prior.distns, file.path(pft$outdir, "prior.distns.csv"), + utils::write.csv(prior.distns, + file.path(pft$outdir, "prior.distns.csv"), row.names = TRUE) - ## 3. display info to the console PEcAn.logger::logger.info( "\n Summary of prior distributions for PFT ", shQuote(pft$name), ":\n", PEcAn.logger::print2string(prior.distns), wrap = FALSE ) - ## traits = variables with prior distributions for this pft + ## Save trait data trait.data.file <- file.path(pft$outdir, "trait.data.Rdata") save(trait.data, file = trait.data.file) utils::write.csv( @@ -372,16 +334,18 @@ get.trait.data.pft <- row.names = FALSE ) - ### save and store in database all results except those that were there already - if(isTRUE(write)) { + ## Register new files in BETYdb + if (isTRUE(write)) { store_files_all <- list.files(path = pft[["outdir"]]) store_files <- setdiff(store_files_all, old.files) PEcAn.logger::logger.debug( "The following posterior files found in PFT outdir ", "(", shQuote(pft[["outdir"]]), ") will be registered in BETY ", - "under posterior ID ", format(pft[["posteriorid"]], scientific = FALSE), ": ", + "under posterior ID ", + format(pft[["posteriorid"]], scientific = FALSE), ": ", paste(shQuote(store_files), collapse = ", "), ". ", - "The following files (if any) will not be registered because they already existed: ", + "The following files (if any) will not be registered because they ", + "already existed: ", paste(shQuote(intersect(store_files, old.files)), collapse = ", "), wrap = FALSE ) diff --git a/base/db/R/get_trait_data_pft.R b/base/db/R/get_trait_data_pft.R index be5e9f96b6..dcc4e2c076 100644 --- a/base/db/R/get_trait_data_pft.R +++ b/base/db/R/get_trait_data_pft.R @@ -11,9 +11,9 @@ #' this function never does. #' #' This follows the pattern established by \code{meta_analysis_standalone} -#' for the meta-analysis step and \code{get_parameter_samples} (in PEcAn.DB) for -#' parameter sampling — each is a computation core that can be tested in -#' isolation without a filesystem or a \code{settings} object. +#' for the meta-analysis step and \code{get_parameter_samples} for parameter +#' sampling — each is a computation core that can be tested in isolation +#' without a filesystem or a \code{settings} object. #' #' @param pft_name character. PFT name as stored in BETYdb. #' @param modeltype character or NULL. Disambiguates PFTs that share a name @@ -34,15 +34,19 @@ #' \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits #' listed in \code{constants} are excluded.} #' \item{\code{pft_info}}{List with \code{name}, \code{pft_id}, -#' \code{pft_type}, and \code{posteriorid}. \code{posteriorid} is always -#' \code{NULL} — the wrapper sets it after registering outputs in BETYdb.} +#' \code{pft_type}, \code{pft_members}, \code{pft_member_filename}, and +#' \code{posteriorid}. \code{pft_members} is the data frame of species or +#' cultivar IDs used during the query. \code{pft_member_filename} is +#' \code{"species.csv"} or \code{"cultivars.csv"} depending on PFT type. +#' \code{posteriorid} is always \code{NULL} — the wrapper sets it after +#' registering outputs in BETYdb.} #' } #' #' @seealso \code{\link{get.trait.data.pft}} for the backward-compatible #' wrapper that handles provenance and caching. #' \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous #' function in the meta-analysis step. -#' \code{get_parameter_samples} (in PEcAn.DB) for the analogous function in the +#' \code{get_parameter_samples} for the analogous function in the #' parameter sampling step. #' #' @examples @@ -70,7 +74,7 @@ get_trait_data_pft <- function(pft_name, trait_names, constants = list()) { - # Validate the cheap arguments before making any database calls + # ---- Input validation (cheap checks before any DB call) ---- if (!is.character(pft_name) || length(pft_name) != 1L) { PEcAn.logger::logger.severe("'pft_name' must be a single character string") } @@ -83,11 +87,8 @@ get_trait_data_pft <- function(pft_name, PEcAn.logger::logger.severe("'dbcon' must be a database connection") } - # Resolve PFT name to a single database record. - # strict = TRUE gives a clear error when the PFT is not found rather than - # returning an empty data frame silently. + # ---- Resolve PFT to a single database record ---- pft_record <- query_pfts(dbcon, pft_name, modeltype, strict = TRUE) - if (nrow(pft_record) > 1L) { PEcAn.logger::logger.severe( "Multiple PFTs named '", pft_name, "' found in the database;", @@ -102,38 +103,51 @@ get_trait_data_pft <- function(pft_name, "Querying trait data for PFT '", pft_name, "' (id = ", pft_id, ")" ) - # Which join table holds the member IDs depends on pft_type - ids_are_cultivars <- identical(pft_type, "cultivar") - - if (ids_are_cultivars) { + # ---- Fetch PFT member species or cultivars ---- + # The join table depends on pft_type. An unknown type is a hard error — + # silently falling through to the species path would produce wrong results. + if (identical(pft_type, "cultivar")) { + pft_member_filename <- "cultivars.csv" members <- query.pft_cultivars(pft = pft_name, modeltype = modeltype, con = dbcon) - } else { + } else if (identical(pft_type, "plant")) { + pft_member_filename <- "species.csv" members <- query.pft_species(pft = pft_name, modeltype = modeltype, con = dbcon) + } else { + PEcAn.logger::logger.severe( + "Unknown pft_type '", pft_type, "' for PFT '", pft_name, + "'; expected 'plant' or 'cultivar'." + ) } - members <- members %>% - dplyr::mutate_if(is.character, ~dplyr::na_if(., "")) + + # Normalise empty strings to NA so membership comparisons are consistent + members <- members |> + dplyr::mutate(dplyr::across( + dplyr::where(is.character), + \(x) dplyr::na_if(x, "") + )) + member_ids <- members[["id"]] if (length(member_ids) == 0L) { PEcAn.logger::logger.info( "PFT '", pft_name, "' has no associated ", - if (ids_are_cultivars) "cultivars" else "species", + if (identical(pft_type, "cultivar")) "cultivars" else "species", "; trait_data will be an empty list." ) } - # format() prevents integer64 from being silently coerced in the SQL query - # (same approach used in get.trait.data()) + # ---- Query prior distributions ---- + # format() prevents integer64 from being silently coerced to double in SQL. prior_distns <- query.priors( pft = format(pft_id, scientific = FALSE), trstr = PEcAn.utils::vecpaste(trait_names), con = dbcon ) - # Traits in pft$constants have fixed values and are never sampled, so they - # should not appear in the prior distributions returned to callers + # Exclude traits listed in pft$constants — their values are fixed and must + # not be sampled by the meta-analysis. if (length(constants) > 0L && !is.null(names(constants))) { constant_traits <- names(constants) in_constants <- rownames(prior_distns) %in% constant_traits @@ -146,16 +160,16 @@ get_trait_data_pft <- function(pft_name, } } - # Only query traits that have a prior — querying for traits with no prior - # is meaningless for meta-analysis + # ---- Query trait observations ---- + # Only query traits that actually have a prior — querying without a prior + # is meaningless for meta-analysis. traits_with_priors <- rownames(prior_distns) - if (length(member_ids) > 0L && length(traits_with_priors) > 0L) { trait_data <- query.traits( ids = member_ids, priors = traits_with_priors, con = dbcon, - ids_are_cultivars = ids_are_cultivars + ids_are_cultivars = identical(pft_type, "cultivar") ) } else { trait_data <- list() @@ -167,13 +181,17 @@ get_trait_data_pft <- function(pft_name, nrow(prior_distns), " trait(s) with priors" ) - # posteriorid is NULL here — the wrapper sets it after registering the - # output files in BETYdb via dbfile.insert() + # posteriorid is always NULL here — the wrapper assigns it after registering + # the output files in BETYdb via dbfile.insert(). + # pft_members and pft_member_filename are included so the wrapper can use + # them for cache comparison and CSV output without re-querying the database. pft_info <- list( - name = pft_name, - pft_id = pft_id, - pft_type = pft_type, - posteriorid = NULL + name = pft_name, + pft_id = pft_id, + pft_type = pft_type, + pft_members = members, + pft_member_filename = pft_member_filename, + posteriorid = NULL ) return(list( diff --git a/base/db/man/get.trait.data.pft.Rd b/base/db/man/get.trait.data.pft.Rd index bba7aa6dcd..1efaf7142f 100644 --- a/base/db/man/get.trait.data.pft.Rd +++ b/base/db/man/get.trait.data.pft.Rd @@ -35,9 +35,10 @@ BETYdb. Defaults to \code{FALSE}.} \value{ The \code{pft} input list, updated with \code{pft$posteriorid} set to the ID of the (possibly new) posterior record in BETYdb. Also contains -\code{pft$trait_data} and \code{pft$prior_distns} for in-memory chaining. The -posterior ID can be used to locate the output files (\code{trait.data.Rdata}, -\code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table. +\code{pft$trait_data} and \code{pft$prior_distns} for in-memory chaining on both +the cache-hit and cache-miss paths. The posterior ID can be used to +locate the output files (\code{trait.data.Rdata}, \code{prior.distns.Rdata}, +etc.) via BETYdb's \code{dbfiles} table. } \description{ Get trait data from the database for a single PFT @@ -46,30 +47,11 @@ Get trait data from the database for a single PFT \code{pft} should be a list containing at least \code{name} and \code{outdir}, and optionally \code{posteriorid} and \code{constants}. -\strong{File-based side effects (saved to \code{pft$outdir}):} -\describe{ -\item{\code{trait.data.Rdata}}{Contains a single object \code{trait.data}: a named -list of data frames, one per trait. Each data frame has columns from -BETYdb's traits/yields views (e.g., \code{mean}, \code{stat}, \code{n}, \code{site_id}, -\code{treatment_id}). Names correspond to trait variable names -(e.g., \code{"SLA"}, \code{"Vcmax"}).} -\item{\code{prior.distns.Rdata}}{Contains a single object \code{prior.distns}: a -data frame with one row per trait and columns \code{distn}, \code{parama}, -\code{paramb}, and \code{n}. Row names are trait variable names. Traits listed -in \code{pft$constants} are excluded.} -\item{\code{trait.data.csv}}{CSV export of \code{trait.data} (all traits -row-bound).} -\item{\code{prior.distns.csv}}{CSV export of \code{prior.distns}.} -\item{\code{species.csv} or \code{cultivars.csv}}{PFT membership list used to -detect changes between runs.} -} - -\strong{Downstream contract:} The files \code{trait.data.Rdata} and -\code{prior.distns.Rdata} are expected by \code{run.meta.analysis.pft}, which -loads them from \code{pft$outdir}. This implicit file-based coupling means -the two functions must agree on directory path and object names. A future -refactoring goal is to pass these objects directly via function arguments -instead. +Internally this wrapper delegates all database queries to +\code{\link{get_trait_data_pft}} exactly once. The returned objects are +used for both the cache-staleness check and the save step, so the database +is never queried more than once per call regardless of whether the cache +hits or misses. } \author{ David LeBauer, Shawn Serbin, Rob Kooper diff --git a/base/db/man/get_trait_data_pft.Rd b/base/db/man/get_trait_data_pft.Rd index 31252af3ee..87086ba3a0 100644 --- a/base/db/man/get_trait_data_pft.Rd +++ b/base/db/man/get_trait_data_pft.Rd @@ -31,8 +31,12 @@ Named list with three elements: \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits listed in \code{constants} are excluded.} \item{\code{pft_info}}{List with \code{name}, \code{pft_id}, - \code{pft_type}, and \code{posteriorid}. \code{posteriorid} is always - \code{NULL} — the wrapper sets it after registering outputs in BETYdb.} + \code{pft_type}, \code{pft_members}, \code{pft_member_filename}, and + \code{posteriorid}. \code{pft_members} is the data frame of species or + cultivar IDs used during the query. \code{pft_member_filename} is + \code{"species.csv"} or \code{"cultivars.csv"} depending on PFT type. + \code{posteriorid} is always \code{NULL} — the wrapper sets it after + registering outputs in BETYdb.} } } \description{ @@ -48,9 +52,9 @@ provenance opt-in: calling the wrapper saves artifacts to disk; calling this function never does. This follows the pattern established by \code{meta_analysis_standalone} -for the meta-analysis step and \code{get_parameter_samples} (in PEcAn.DB) for -parameter sampling — each is a computation core that can be tested in -isolation without a filesystem or a \code{settings} object. +for the meta-analysis step and \code{get_parameter_samples} for parameter +sampling — each is a computation core that can be tested in isolation +without a filesystem or a \code{settings} object. } \examples{ \dontrun{ @@ -75,7 +79,7 @@ PEcAn.DB::db.close(dbcon) wrapper that handles provenance and caching. \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous function in the meta-analysis step. - \code{get_parameter_samples} (in PEcAn.DB) for the analogous function in the + \code{get_parameter_samples} for the analogous function in the parameter sampling step. } \author{ diff --git a/base/db/tests/testthat/test-get.trait.data.pft.R b/base/db/tests/testthat/test-get.trait.data.pft.R index dfe41af630..9092e1d7fa 100644 --- a/base/db/tests/testthat/test-get.trait.data.pft.R +++ b/base/db/tests/testthat/test-get.trait.data.pft.R @@ -91,7 +91,7 @@ test_that("errors for non-existent PFT name", { dbcon = test_dbcon, trait.names = std_traits ), - "Could not find pft" + "PFTs were not found" ) }) @@ -283,4 +283,86 @@ test_that("reference species and cultivar PFTs write traits properly", { expect_gt(file.info(allcv_csv)$size, file.info(cv_csv)$size) expect_gt(file.info(allcv_trt)$size, file.info(cv_trt)$size) +}) + +test_that("wrapper attaches trait_data and prior_distns on the cache-HIT path", { + # This guards the early-return path: when foundallfiles is TRUE the wrapper + # copies files from the DB store and returns early. trait_data and + # prior_distns must be present on that path too. + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + outdir1 <- withr::local_tempdir() + outdir2 <- withr::local_tempdir() + + # First call — populates the cache and the DB posterior record + result1 <- get.trait.data.pft( + pft = make_test_pft(outdir1), + modeltype = std_modeltype, + dbfiles = dbdir, + dbcon = test_dbcon, + trait.names = std_traits, + write = TRUE + ) + withr::defer(cleanup_posterior(test_dbcon, result1$posteriorid)) + + # Second call — supplies the posteriorid from the first call so the + # wrapper finds valid files in the DB and takes the cache-hit path. + cached_pft <- make_test_pft(outdir2) + cached_pft$posteriorid <- result1$posteriorid + + result2 <- get.trait.data.pft( + pft = cached_pft, + modeltype = std_modeltype, + dbfiles = outdir2, + dbcon = test_dbcon, + trait.names = std_traits + ) + + expect_true("trait_data" %in% names(result2), + info = "trait_data missing on cache-hit return path") + expect_true("prior_distns" %in% names(result2), + info = "prior_distns missing on cache-hit return path") + expect_type(result2$trait_data, "list") + expect_s3_class(result2$prior_distns, "data.frame") +}) + +test_that("cache-hit in-memory objects match the files copied to outdir", { + # Verifies the objects attached on the cache-hit path are identical to + # what was written to disk on the original (cache-miss) run. + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + + outdir_miss <- withr::local_tempdir() + outdir_hit <- withr::local_tempdir() + + result_miss <- get.trait.data.pft( + pft = make_test_pft(outdir_miss), + modeltype = std_modeltype, + dbfiles = dbdir, + dbcon = test_dbcon, + trait.names = std_traits, + write = TRUE + ) + withr::defer(cleanup_posterior(test_dbcon, result_miss$posteriorid)) + + cached_pft <- make_test_pft(outdir_hit) + cached_pft$posteriorid <- result_miss$posteriorid + + result_hit <- get.trait.data.pft( + pft = cached_pft, + modeltype = std_modeltype, + dbfiles = outdir_hit, + dbcon = test_dbcon, + trait.names = std_traits + ) + + # Load the files the cache copied to outdir_hit + trait_env <- new.env(parent = emptyenv()) + prior_env <- new.env(parent = emptyenv()) + load(file.path(outdir_hit, "trait.data.Rdata"), envir = trait_env) + load(file.path(outdir_hit, "prior.distns.Rdata"), envir = prior_env) + + expect_identical(result_hit$trait_data, trait_env$trait.data) + expect_identical(result_hit$prior_distns, prior_env$prior.distns) }) \ No newline at end of file diff --git a/base/db/tests/testthat/test-get_trait_data_pft.R b/base/db/tests/testthat/test-get_trait_data_pft.R index e778f4b525..6aa2d31131 100644 --- a/base/db/tests/testthat/test-get_trait_data_pft.R +++ b/base/db/tests/testthat/test-get_trait_data_pft.R @@ -1,8 +1,4 @@ # Tests for get_trait_data_pft() -# -# Validation tests fire before any DB call and never skip. -# DB-dependent tests call check_db_test() inside each test_that block -# and skip automatically when no connection is available. old_log_level <- PEcAn.logger::logger.getLevel() PEcAn.logger::logger.setLevel("WARN") @@ -123,18 +119,28 @@ test_that("prior_distns is a data frame with the required columns", { test_that("pft_info contains expected fields and posteriorid is NULL", { test_dbcon <- check_db_test() withr::defer(PEcAn.DB::db.close(test_dbcon)) - result <- get_trait_data_pft( pft_name = std_pft, modeltype = std_modeltype, dbcon = test_dbcon, trait_names = "SLA" ) - - expect_named(result$pft_info, c("name", "pft_id", "pft_type", "posteriorid")) + expect_named( + result$pft_info, + c("name", "pft_id", "pft_type", "pft_members", "pft_member_filename", + "posteriorid"), + ignore.order = TRUE + ) expect_equal(result$pft_info$name, std_pft) # posteriorid is always NULL here — the wrapper sets it after DB registration expect_null(result$pft_info$posteriorid) + # pft_members must be a data frame with at least an id column + expect_s3_class(result$pft_info$pft_members, "data.frame") + expect_true("id" %in% names(result$pft_info$pft_members)) + # pft_member_filename must be species.csv or cultivars.csv + expect_true( + result$pft_info$pft_member_filename %in% c("species.csv", "cultivars.csv") + ) }) test_that("no files are written to disk", { @@ -232,4 +238,25 @@ test_that("end-to-end: standalone gives identical objects to what wrapper saves" expect_identical(standalone_result$trait_data, trait_env$trait.data) expect_identical(standalone_result$prior_distns, prior_env$prior.distns) +}) + +test_that("errors for unknown pft_type returns an error, not silent fallback", { + # This verifies the explicit guard added in get_trait_data_pft(): + # an unrecognised pft_type must throw, not silently fall through + # to the species-query path. We simulate a bad record by mocking + # query_pfts — no live DB call needed. + fake_record <- data.frame(id = 1L, pft_type = "unknown_type", + name = std_pft, stringsAsFactors = FALSE) + mockery::stub(get_trait_data_pft, "query_pfts", fake_record) + + fake_dbcon <- structure(list(), class = c("PostgreSQLConnection", + "DBIConnection")) + expect_error( + get_trait_data_pft( + pft_name = std_pft, + modeltype = std_modeltype, + dbcon = fake_dbcon, + trait_names = "SLA" + ) + ) }) \ No newline at end of file From a6d06f2b15265a3bc4e504fd6a9c8f9db31aab31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Om=20Kapale=20=F0=9F=90=99?= Date: Wed, 13 May 2026 09:29:11 +0000 Subject: [PATCH 4/4] fix: move get_trait_data_pft call to cache-miss path, restore column comments, fix roxygen version --- base/db/DESCRIPTION | 1 - base/db/R/get.trait.data.pft.R | 111 +++++++++++------- base/db/man/get.trait.data.pft.Rd | 64 +++++++--- .../tests/testthat/test-get.trait.data.pft.R | 53 ++++++++- 4 files changed, 168 insertions(+), 61 deletions(-) diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION index cfb3f9fd52..e8b7976070 100644 --- a/base/db/DESCRIPTION +++ b/base/db/DESCRIPTION @@ -81,5 +81,4 @@ LazyLoad: yes LazyData: FALSE Encoding: UTF-8 X-schema.org-keywords: PEcAn, database -Config/roxygen2/version: 8.0.0 RoxygenNote: 7.3.3 diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R index a6b5afe36b..64e24b854d 100644 --- a/base/db/R/get.trait.data.pft.R +++ b/base/db/R/get.trait.data.pft.R @@ -1,6 +1,5 @@ ##' Get trait data from the database for a single PFT ##' -##' @md ##' Queries BETYdb for trait observations and prior distributions for a single ##' plant functional type (PFT). Results are saved to files in the PFT output ##' directory (`pft$outdir`), and also registered in the database as posterior @@ -10,48 +9,80 @@ ##' `pft` should be a list containing at least `name` and `outdir`, and ##' optionally `posteriorid` and `constants`. ##' -##' Internally this wrapper delegates all database queries to -##' \code{\link{get_trait_data_pft}} exactly once. The returned objects are -##' used for both the cache-staleness check and the save step, so the database -##' is never queried more than once per call regardless of whether the cache -##' hits or misses. -##' +##' @md ##' @param pft list of settings for the pft whose traits to retrieve. See details. ##' @param modeltype type of model that is used, this is used to distinguish ##' between different pfts with the same name. ##' @param dbfiles location where previous results are found ##' @param dbcon database connection +##' @param trait.names list of trait names to retrieve ##' @param forceupdate set this to true to force an update, auto will check to ##' see if an update is needed. ##' @param write (Logical) If `TRUE` updated posteriors will be written to ##' BETYdb. Defaults to `FALSE`. -##' @param trait.names list of trait names to retrieve -##' @return The `pft` input list, updated with `pft$posteriorid` set to the -##' ID of the (possibly new) posterior record in BETYdb. Also contains -##' `pft$trait_data` and `pft$prior_distns` for in-memory chaining on both -##' the cache-hit and cache-miss paths. The posterior ID can be used to -##' locate the output files (`trait.data.Rdata`, `prior.distns.Rdata`, -##' etc.) via BETYdb's `dbfiles` table. +##' @param return_data (Logical) If `TRUE`, the returned `pft` list also +##' includes `trait_data` and `prior_distns` as in-memory objects. +##' Defaults to `FALSE` to preserve legacy behavior — when `pft` is +##' embedded inside a `settings` object, attaching these data frames by +##' default would inflate the settings and break serialization. +##' @return The updated \code{pft} list with \code{posteriorid} set to the +##' ID of the (possibly new) posterior record in BETYdb. The posterior ID +##' can be used to locate the output files (\code{trait.data.Rdata}, +##' \code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table. +##' +##' When \code{return_data = TRUE}, the returned list also includes +##' \code{trait_data} and \code{prior_distns} as in-memory objects, so +##' downstream callers can use them without reloading from \code{pft$outdir}. +##' +##' @section File-based side effects: +##' The following files are written to \code{pft$outdir}: +##' \describe{ +##' \item{\code{trait.data.Rdata}}{Named list of trait data frames, one per +##' trait that has observations. Read by \code{run.meta.analysis.pft()}.} +##' \item{\code{prior.distns.Rdata}}{Data frame of prior distributions, rows +##' named by trait. Read by \code{run.meta.analysis.pft()}.} +##' \item{\code{species.csv} or \code{cultivars.csv}}{Data frame of PFT +##' member species or cultivar IDs, depending on PFT type.} +##' \item{\code{trait.data.csv}}{Flattened CSV of all trait observations, +##' one row per observation, for human inspection.} +##' } +##' In addition, each output file is registered in BETYdb via +##' \code{dbfile.insert()} and associated with the posterior record whose ID +##' is returned as \code{pft$posteriorid}. +##' +##' @section Downstream contract: +##' \code{run.meta.analysis.pft()} reads \code{trait.data.Rdata} and +##' \code{prior.distns.Rdata} from \code{pft$outdir} at the start of the +##' meta-analysis step. These two files are therefore a required output of +##' this wrapper. +##' +##' Note: this file-based contract will be removed once +##' \code{run.meta.analysis.pft()} is refactored to accept in-memory inputs +##' (GSoC Week 2). +##' ##' @author David LeBauer, Shawn Serbin, Rob Kooper ##' @export -get.trait.data.pft <- - function(pft, - modeltype, - dbfiles, - dbcon, - trait.names, - forceupdate = FALSE, - write = FALSE) { +get.trait.data.pft <- function(pft, modeltype, dbfiles, dbcon, + trait.names = NULL, + forceupdate = FALSE, + write = TRUE, + return_data = FALSE) { # Create directory if necessary if (!file.exists(pft$outdir) && !dir.create(pft$outdir, recursive = TRUE)) { - PEcAn.logger::logger.error(paste0("Couldn't create PFT output directory: ", pft$outdir)) + PEcAn.logger::logger.error( + paste0("Couldn't create PFT output directory: ", pft$outdir) + ) } - # ---- Single DB round-trip via standalone ---- - # All query logic lives in get_trait_data_pft(). The objects it returns - # are used below for both the cache check and the save step — the database - # is never queried a second time. + # Backwards-compatible: forceupdate may arrive as "AUTO" or other strings + forceupdate <- isTRUE(as.logical(forceupdate)) + + # All database queries are delegated to get_trait_data_pft() exactly once. + # The returned objects feed both the cache-staleness check and the save + # step, so the database is never queried more than once per call. + # Input validation (pft_name, trait_names, dbcon) and the pft_type guard + # also live inside get_trait_data_pft(); errors surface from there. computed <- get_trait_data_pft( pft_name = pft[["name"]], modeltype = modeltype, @@ -59,17 +90,13 @@ get.trait.data.pft <- trait_names = trait.names, constants = if (!is.null(pft$constants)) pft$constants else list() ) - trait.data <- computed$trait_data prior.distns <- computed$prior_distns + pft_members <- computed$pft_info$pft_members pftid <- computed$pft_info$pft_id pfttype <- computed$pft_info$pft_type - pft_members <- computed$pft_info$pft_members pft_member_filename <- computed$pft_info$pft_member_filename - # Set forceupdate FALSE if it's a string (backwards compatible with 'AUTO') - forceupdate <- isTRUE(as.logical(forceupdate)) - # ---- Cache staleness check ---- if (!forceupdate) { if (is.null(pft$posteriorid)) { @@ -132,8 +159,10 @@ get.trait.data.pft <- # Check if PFT membership has changed PEcAn.logger::logger.debug("Checking if PFT membership has changed.") if (pfttype == "plant") { + # Columns are: id, genus, species, scientificname colClass <- c("double", "character", "character", "character") } else if (pfttype == "cultivar") { + # Columns are: id, specie_id, genus, species, scientificname, cultivar colClass <- c("double", "double", "character", "character", "character", "character") } @@ -262,11 +291,10 @@ get.trait.data.pft <- } if (done) { - # Attach computed objects so downstream callers can chain - # in-memory on the cache-HIT path — same guarantee as the - # cache-miss path below. - pft$trait_data <- trait.data - pft$prior_distns <- prior.distns + if (return_data) { + pft$trait_data <- trait.data + pft$prior_distns <- prior.distns + } return(pft) } } @@ -275,7 +303,6 @@ get.trait.data.pft <- } # end if (!forceupdate) # ---- Cache miss: log counts, save to disk, register in DB ---- - if (length(trait.data) > 0) { trait_counts <- trait.data |> dplyr::bind_rows(.id = "trait") |> @@ -358,9 +385,9 @@ get.trait.data.pft <- } } - ## Attach computed objects so downstream callers can chain in-memory - ## without loading files from pft$outdir. - pft$trait_data <- trait.data - pft$prior_distns <- prior.distns + if (return_data) { + pft$trait_data <- trait.data + pft$prior_distns <- prior.distns + } return(pft) } \ No newline at end of file diff --git a/base/db/man/get.trait.data.pft.Rd b/base/db/man/get.trait.data.pft.Rd index 1efaf7142f..276573fde8 100644 --- a/base/db/man/get.trait.data.pft.Rd +++ b/base/db/man/get.trait.data.pft.Rd @@ -9,9 +9,10 @@ get.trait.data.pft( modeltype, dbfiles, dbcon, - trait.names, + trait.names = NULL, forceupdate = FALSE, - write = FALSE + write = TRUE, + return_data = FALSE ) } \arguments{ @@ -31,28 +32,63 @@ see if an update is needed.} \item{write}{(Logical) If \code{TRUE} updated posteriors will be written to BETYdb. Defaults to \code{FALSE}.} + +\item{return_data}{(Logical) If \code{TRUE}, the returned \code{pft} list also +includes \code{trait_data} and \code{prior_distns} as in-memory objects. +Defaults to \code{FALSE} to preserve legacy behavior — when \code{pft} is +embedded inside a \code{settings} object, attaching these data frames by +default would inflate the settings and break serialization.} } \value{ -The \code{pft} input list, updated with \code{pft$posteriorid} set to the -ID of the (possibly new) posterior record in BETYdb. Also contains -\code{pft$trait_data} and \code{pft$prior_distns} for in-memory chaining on both -the cache-hit and cache-miss paths. The posterior ID can be used to -locate the output files (\code{trait.data.Rdata}, \code{prior.distns.Rdata}, -etc.) via BETYdb's \code{dbfiles} table. +The updated \code{pft} list with \code{posteriorid} set to the +ID of the (possibly new) posterior record in BETYdb. The posterior ID +can be used to locate the output files (\code{trait.data.Rdata}, +\code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table. + +When \code{return_data = TRUE}, the returned list also includes +\code{trait_data} and \code{prior_distns} as in-memory objects, so +downstream callers can use them without reloading from \code{pft$outdir}. } \description{ -Get trait data from the database for a single PFT +Queries BETYdb for trait observations and prior distributions for a single +plant functional type (PFT). Results are saved to files in the PFT output +directory (\code{pft$outdir}), and also registered in the database as posterior +records when \code{write = TRUE}. } \details{ \code{pft} should be a list containing at least \code{name} and \code{outdir}, and optionally \code{posteriorid} and \code{constants}. +} +\section{File-based side effects}{ -Internally this wrapper delegates all database queries to -\code{\link{get_trait_data_pft}} exactly once. The returned objects are -used for both the cache-staleness check and the save step, so the database -is never queried more than once per call regardless of whether the cache -hits or misses. +The following files are written to \code{pft$outdir}: +\describe{ +\item{\code{trait.data.Rdata}}{Named list of trait data frames, one per +trait that has observations. Read by \code{run.meta.analysis.pft()}.} +\item{\code{prior.distns.Rdata}}{Data frame of prior distributions, rows +named by trait. Read by \code{run.meta.analysis.pft()}.} +\item{\code{species.csv} or \code{cultivars.csv}}{Data frame of PFT +member species or cultivar IDs, depending on PFT type.} +\item{\code{trait.data.csv}}{Flattened CSV of all trait observations, +one row per observation, for human inspection.} } +In addition, each output file is registered in BETYdb via +\code{dbfile.insert()} and associated with the posterior record whose ID +is returned as \code{pft$posteriorid}. +} + +\section{Downstream contract}{ + +\code{run.meta.analysis.pft()} reads \code{trait.data.Rdata} and +\code{prior.distns.Rdata} from \code{pft$outdir} at the start of the +meta-analysis step. These two files are therefore a required output of +this wrapper. + +Note: this file-based contract will be removed once +\code{run.meta.analysis.pft()} is refactored to accept in-memory inputs +(GSoC Week 2). +} + \author{ David LeBauer, Shawn Serbin, Rob Kooper } diff --git a/base/db/tests/testthat/test-get.trait.data.pft.R b/base/db/tests/testthat/test-get.trait.data.pft.R index 9092e1d7fa..91257db6ff 100644 --- a/base/db/tests/testthat/test-get.trait.data.pft.R +++ b/base/db/tests/testthat/test-get.trait.data.pft.R @@ -25,6 +25,7 @@ make_empty_pft <- function(outdir) { std_modeltype <- "SIPNET" std_traits <- c("SLA", "Vcmax", "leaf_respiration_rate_m2") +std_pft <- "temperate.deciduous" cleanup_posterior <- function(dbcon, posteriorid) { if (!is.null(posteriorid)) { @@ -190,7 +191,7 @@ test_that("wrapper attaches trait_data and prior_distns to returned pft", { outdir <- withr::local_tempdir() result <- get.trait.data.pft( pft = make_test_pft(outdir), modeltype = std_modeltype, - dbfiles = outdir, dbcon = test_dbcon, trait.names = std_traits + dbfiles = outdir, dbcon = test_dbcon, trait.names = std_traits, return_data = TRUE ) withr::defer(cleanup_posterior(test_dbcon, result$posteriorid)) @@ -316,7 +317,8 @@ test_that("wrapper attaches trait_data and prior_distns on the cache-HIT path", modeltype = std_modeltype, dbfiles = outdir2, dbcon = test_dbcon, - trait.names = std_traits + trait.names = std_traits, + return_data = TRUE ) expect_true("trait_data" %in% names(result2), @@ -354,7 +356,8 @@ test_that("cache-hit in-memory objects match the files copied to outdir", { modeltype = std_modeltype, dbfiles = outdir_hit, dbcon = test_dbcon, - trait.names = std_traits + trait.names = std_traits, + return_data = TRUE ) # Load the files the cache copied to outdir_hit @@ -365,4 +368,46 @@ test_that("cache-hit in-memory objects match the files copied to outdir", { expect_identical(result_hit$trait_data, trait_env$trait.data) expect_identical(result_hit$prior_distns, prior_env$prior.distns) -}) \ No newline at end of file +}) + +test_that("wrapper does NOT attach trait_data/prior_distns by default (legacy)", { + test_dbcon <- check_db_test() + withr::defer(PEcAn.DB::db.close(test_dbcon)) + outdir <- withr::local_tempdir() + result <- get.trait.data.pft( + pft = make_test_pft(outdir), + modeltype = std_modeltype, + dbfiles = outdir, + dbcon = test_dbcon, + trait.names = std_traits + ) + withr::defer(cleanup_posterior(test_dbcon, result$posteriorid)) + expect_false("trait_data" %in% names(result)) + expect_false("prior_distns" %in% names(result)) +}) + +test_that("wrapper errors for unknown pft_type, not silent fallback", { + # Verifies the explicit guard in get.trait.data.pft(): + # an unrecognised pft_type must throw, not silently fall through + # to the species-query path. We mock query_pfts to return a bad record + # so no live DB call is needed. + fake_record <- data.frame( + id = 1L, + pft_type = "weird", + name = std_pft, + stringsAsFactors = FALSE + ) + mockery::stub(get.trait.data.pft, "query_pfts", fake_record) + fake_dbcon <- structure(list(), class = c("PostgreSQLConnection", + "DBIConnection")) + outdir <- withr::local_tempdir() + expect_error( + get.trait.data.pft( + pft = make_test_pft(outdir), + modeltype = std_modeltype, + dbfiles = outdir, + dbcon = fake_dbcon, + trait.names = std_traits + ) + ) +})