From ef6f12dfa901e9d153fb87fb191cba99b1f68a28 Mon Sep 17 00:00:00 2001
From: Om Kapale <omshripadkapale@gmail.com>
Date: Mon, 11 May 2026 19:35:05 +0530
Subject: [PATCH 1/4] feat: extract get_trait_data_pft() standalone + enhance
 wrapper (Modularity Part 2)

---
 base/db/R/get.trait.data.pft.R                |   4 +
 base/db/R/get_trait_data_pft.R                | 182 ++++++++++++++
 .../tests/testthat/test-get.trait.data.pft.R  |  17 +-
 .../tests/testthat/test-get_trait_data_pft.R  | 235 ++++++++++++++++++
 4 files changed, 435 insertions(+), 3 deletions(-)
 create mode 100644 base/db/R/get_trait_data_pft.R
 create mode 100644 base/db/tests/testthat/test-get_trait_data_pft.R

diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R
index 30c4005915..ee2418a777 100644
--- a/base/db/R/get.trait.data.pft.R
+++ b/base/db/R/get.trait.data.pft.R
@@ -386,5 +386,9 @@ get.trait.data.pft <-
     }
   }
 
+  # caching — is unchanged. Callers that only use pft$name / pft$outdir /
+  # pft$posteriorid are unaffected.
+  pft$trait_data   <- trait.data
+  pft$prior_distns <- prior.distns
   return(pft)
 }
diff --git a/base/db/R/get_trait_data_pft.R b/base/db/R/get_trait_data_pft.R
new file mode 100644
index 0000000000..e380ac086e
--- /dev/null
+++ b/base/db/R/get_trait_data_pft.R
@@ -0,0 +1,182 @@
+#' Retrieve trait data and priors for one PFT from BETYdb
+#'
+#' Core computation extracted from \code{\link{get.trait.data.pft}}.
+#' Queries the database for trait observations and prior distributions,
+#' returning them as R objects with no file I/O of any kind.
+#'
+#' The wrapper \code{\link{get.trait.data.pft}} handles directory creation,
+#' caching, CSV output, and BETYdb registration via \code{dbfile.insert}.
+#' This function handles only the query. Choosing between them is the
+#' provenance opt-in: calling the wrapper saves artifacts to disk; calling
+#' this function never does.
+#'
+#' This follows the pattern established by \code{meta_analysis_standalone}
+#' for the meta-analysis step and \code{\link{get_parameter_samples}} for
+#' parameter sampling — each is a computation core that can be tested in
+#' isolation without a filesystem or a \code{settings} object.
+#'
+#' @param pft_name character. PFT name as stored in BETYdb.
+#' @param modeltype character or NULL. Disambiguates PFTs that share a name
+#'   across model types (e.g. \code{"SIPNET"}, \code{"ED2"}).
+#' @param dbcon database connection from \code{\link[PEcAn.DB]{db.open}}.
+#' @param trait_names character vector of trait names to retrieve.
+#' @param constants named list from \code{pft$constants} in the settings.
+#'   Traits named here are excluded from the returned priors because their
+#'   values are fixed rather than sampled by the meta-analysis.
+#'
+#' @return Named list with three elements:
+#' \describe{
+#'   \item{\code{trait_data}}{Named list of data frames, one per trait that
+#'     has observations. Column structure matches what
+#'     \code{meta_analysis_standalone} expects. Traits with no observations
+#'     are omitted from the list.}
+#'   \item{\code{prior_distns}}{Data frame with columns \code{distn},
+#'     \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits
+#'     listed in \code{constants} are excluded.}
+#'   \item{\code{pft_info}}{List with \code{name}, \code{pft_id},
+#'     \code{pft_type}, and \code{posteriorid}. \code{posteriorid} is always
+#'     \code{NULL} — the wrapper sets it after registering outputs in BETYdb.}
+#' }
+#'
+#' @seealso \code{\link{get.trait.data.pft}} for the backward-compatible
+#'   wrapper that handles provenance and caching.
+#'   \code{\link[PEcAn.MA]{meta_analysis_standalone}} for the analogous
+#'   function in the meta-analysis step.
+#'   \code{\link{get_parameter_samples}} for the analogous function in the
+#'   parameter sampling step.
+#'
+#' @examples
+#' \dontrun{
+#' dbcon <- PEcAn.DB::db.open(list(
+#'   host = "localhost", user = "bety",
+#'   password = "bety",  dbname = "bety"
+#' ))
+#' result <- get_trait_data_pft(
+#'   pft_name    = "temperate.deciduous",
+#'   modeltype   = "SIPNET",
+#'   dbcon       = dbcon,
+#'   trait_names = c("SLA", "Vcmax", "leaf_respiration_rate_m2")
+#' )
+#' str(result$trait_data)
+#' str(result$prior_distns)
+#' PEcAn.DB::db.close(dbcon)
+#' }
+#'
+#' @author David LeBauer, Shawn Serbin, Alexey Shiklomanov, Om Kapale
+#' @export
+get_trait_data_pft <- function(pft_name,
+                               modeltype,
+                               dbcon,
+                               trait_names,
+                               constants = list()) {
+
+  # Validate the cheap arguments before making any database calls
+  if (!is.character(pft_name) || length(pft_name) != 1L) {
+    PEcAn.logger::logger.severe("'pft_name' must be a single character string")
+  }
+  if (!is.character(trait_names) || length(trait_names) == 0L) {
+    PEcAn.logger::logger.severe(
+      "'trait_names' must be a non-empty character vector"
+    )
+  }
+  if (!inherits(dbcon, "DBIConnection")) {
+    PEcAn.logger::logger.severe("'dbcon' must be a database connection")
+  }
+
+  # Resolve PFT name to a single database record.
+  # strict = TRUE gives a clear error when the PFT is not found rather than
+  # returning an empty data frame silently.
+  pft_record <- query_pfts(dbcon, pft_name, modeltype, strict = TRUE)
+
+  if (nrow(pft_record) > 1L) {
+    PEcAn.logger::logger.severe(
+      "Multiple PFTs named '", pft_name, "' found in the database;",
+      " pass modeltype to disambiguate."
+    )
+  }
+
+  pft_id   <- pft_record[["id"]]
+  pft_type <- pft_record[["pft_type"]]
+
+  PEcAn.logger::logger.info(
+    "Querying trait data for PFT '", pft_name, "' (id = ", pft_id, ")"
+  )
+
+  # Which join table holds the member IDs depends on pft_type
+  ids_are_cultivars <- identical(pft_type, "cultivar")
+
+  if (ids_are_cultivars) {
+    members <- query.pft_cultivars(pft = pft_name, modeltype = modeltype,
+                                   con = dbcon)
+  } else {
+    members <- query.pft_species(pft = pft_name, modeltype = modeltype,
+                                 con = dbcon)
+  }
+  member_ids <- members[["id"]]
+
+  if (length(member_ids) == 0L) {
+    PEcAn.logger::logger.info(
+      "PFT '", pft_name, "' has no associated ",
+      if (ids_are_cultivars) "cultivars" else "species",
+      "; trait_data will be an empty list."
+    )
+  }
+
+  # format() prevents integer64 from being silently coerced in the SQL query
+  # (same approach used in get.trait.data())
+  prior_distns <- query.priors(
+    pft   = format(pft_id, scientific = FALSE),
+    trstr = trait_names,
+    con   = dbcon
+  )
+
+  # Traits in pft$constants have fixed values and are never sampled, so they
+  # should not appear in the prior distributions returned to callers
+  if (length(constants) > 0L && !is.null(names(constants))) {
+    constant_traits <- names(constants)
+    in_constants    <- rownames(prior_distns) %in% constant_traits
+    if (any(in_constants)) {
+      PEcAn.logger::logger.info(
+        "Excluding ", sum(in_constants), " constant trait(s) from priors: ",
+        PEcAn.utils::vecpaste(rownames(prior_distns)[in_constants])
+      )
+      prior_distns <- prior_distns[!in_constants, , drop = FALSE]
+    }
+  }
+
+  # Only query traits that have a prior — querying for traits with no prior
+  # is meaningless for meta-analysis
+  traits_with_priors <- rownames(prior_distns)
+
+  if (length(member_ids) > 0L && length(traits_with_priors) > 0L) {
+    trait_data <- query.traits(
+      ids               = member_ids,
+      priors            = traits_with_priors,
+      con               = dbcon,
+      ids_are_cultivars = ids_are_cultivars
+    )
+  } else {
+    trait_data <- list()
+  }
+
+  PEcAn.logger::logger.info(
+    "PFT '", pft_name, "': ",
+    length(trait_data), " trait(s) with observations, ",
+    nrow(prior_distns), " trait(s) with priors"
+  )
+
+  # posteriorid is NULL here — the wrapper sets it after registering the
+  # output files in BETYdb via dbfile.insert()
+  pft_info <- list(
+    name        = pft_name,
+    pft_id      = pft_id,
+    pft_type    = pft_type,
+    posteriorid = NULL
+  )
+
+  return(list(
+    trait_data   = trait_data,
+    prior_distns = prior_distns,
+    pft_info     = pft_info
+  ))
+}
\ No newline at end of file
diff --git a/base/db/tests/testthat/test-get.trait.data.pft.R b/base/db/tests/testthat/test-get.trait.data.pft.R
index dbcc5ddfcf..dfe41af630 100644
--- a/base/db/tests/testthat/test-get.trait.data.pft.R
+++ b/base/db/tests/testthat/test-get.trait.data.pft.R
@@ -182,7 +182,9 @@ test_that("returns pft list with name, posteriorid, and outdir", {
   expect_true("posteriorid" %in% names(result))
 })
 
-test_that("return value does not include trait_data or prior_distns", {
+test_that("wrapper attaches trait_data and prior_distns to returned pft", {
+  # Documents the Week 1 enhancement: these fields are now returned so that
+  # run.meta.analysis.pft() can skip load() calls in a later PR
   test_dbcon <- check_db_test()
   withr::defer(PEcAn.DB::db.close(test_dbcon))
   outdir <- withr::local_tempdir()
@@ -191,8 +193,17 @@ test_that("return value does not include trait_data or prior_distns", {
     dbfiles = outdir, dbcon = test_dbcon, trait.names = std_traits
   )
   withr::defer(cleanup_posterior(test_dbcon, result$posteriorid))
-  expect_false("trait_data"   %in% names(result))
-  expect_false("prior_distns" %in% names(result))
+
+  expect_true("trait_data"   %in% names(result))
+  expect_true("prior_distns" %in% names(result))
+
+  # The attached objects must match what was saved — no silent divergence
+  trait_env <- new.env(parent = emptyenv())
+  prior_env <- new.env(parent = emptyenv())
+  load(file.path(outdir, "trait.data.Rdata"),   envir = trait_env)
+  load(file.path(outdir, "prior.distns.Rdata"), envir = prior_env)
+  expect_identical(result$trait_data,   trait_env$trait.data)
+  expect_identical(result$prior_distns, prior_env$prior.distns)
 })
 
 # PFT with no observations
diff --git a/base/db/tests/testthat/test-get_trait_data_pft.R b/base/db/tests/testthat/test-get_trait_data_pft.R
new file mode 100644
index 0000000000..e778f4b525
--- /dev/null
+++ b/base/db/tests/testthat/test-get_trait_data_pft.R
@@ -0,0 +1,235 @@
+# Tests for get_trait_data_pft()
+#
+# Validation tests fire before any DB call and never skip.
+# DB-dependent tests call check_db_test() inside each test_that block
+# and skip automatically when no connection is available.
+
+old_log_level <- PEcAn.logger::logger.getLevel()
+PEcAn.logger::logger.setLevel("WARN")
+teardown({
+  PEcAn.logger::logger.setLevel(old_log_level)
+})
+
+std_pft       <- "temperate.deciduous"
+std_modeltype <- "SIPNET"
+std_traits    <- c("SLA", "Vcmax", "leaf_respiration_rate_m2")
+
+# Input validation — no DB connection needed for these
+
+test_that("errors when pft_name is not a single string", {
+  # pft_name is validated before dbcon, so list() is safe to pass here
+  expect_error(
+    get_trait_data_pft(
+      pft_name    = c("pft1", "pft2"),
+      modeltype   = std_modeltype,
+      dbcon       = list(),
+      trait_names = "SLA"
+    )
+  )
+})
+
+test_that("errors when trait_names is empty", {
+  # trait_names is validated before dbcon, so list() is safe to pass here
+  expect_error(
+    get_trait_data_pft(
+      pft_name    = std_pft,
+      modeltype   = std_modeltype,
+      dbcon       = list(),
+      trait_names = character(0)
+    )
+  )
+})
+
+test_that("errors when dbcon is not a database connection", {
+  expect_error(
+    get_trait_data_pft(
+      pft_name    = std_pft,
+      modeltype   = std_modeltype,
+      dbcon       = list(),
+      trait_names = "SLA"
+    )
+  )
+})
+
+# Database-backed tests
+
+test_that("errors for a PFT name not in the database", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  expect_error(
+    get_trait_data_pft(
+      pft_name    = "NOTAPFT_GSOC2026",
+      modeltype   = std_modeltype,
+      dbcon       = test_dbcon,
+      trait_names = "SLA"
+    )
+  )
+})
+
+test_that("returns a named list with trait_data, prior_distns, pft_info", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  result <- get_trait_data_pft(
+    pft_name    = std_pft,
+    modeltype   = std_modeltype,
+    dbcon       = test_dbcon,
+    trait_names = std_traits
+  )
+
+  expect_named(result, c("trait_data", "prior_distns", "pft_info"))
+})
+
+test_that("trait_data is a named list of data frames", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  result <- get_trait_data_pft(
+    pft_name    = std_pft,
+    modeltype   = std_modeltype,
+    dbcon       = test_dbcon,
+    trait_names = std_traits
+  )
+
+  expect_type(result$trait_data, "list")
+  for (df in result$trait_data) {
+    expect_s3_class(df, "data.frame")
+    # These columns are the contract that meta_analysis_standalone expects
+    expect_true(all(c("mean", "stat", "statname", "n", "site_id",
+                       "greenhouse", "specie_id", "citation_id") %in%
+                      names(df)))
+  }
+})
+
+test_that("prior_distns is a data frame with the required columns", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  result <- get_trait_data_pft(
+    pft_name    = std_pft,
+    modeltype   = std_modeltype,
+    dbcon       = test_dbcon,
+    trait_names = std_traits
+  )
+
+  expect_s3_class(result$prior_distns, "data.frame")
+  expect_true(all(c("distn", "parama", "paramb") %in%
+                    colnames(result$prior_distns)))
+  # Row names are trait names — the contract downstream functions rely on
+  expect_true(all(rownames(result$prior_distns) %in% std_traits))
+})
+
+test_that("pft_info contains expected fields and posteriorid is NULL", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  result <- get_trait_data_pft(
+    pft_name    = std_pft,
+    modeltype   = std_modeltype,
+    dbcon       = test_dbcon,
+    trait_names = "SLA"
+  )
+
+  expect_named(result$pft_info, c("name", "pft_id", "pft_type", "posteriorid"))
+  expect_equal(result$pft_info$name, std_pft)
+  # posteriorid is always NULL here — the wrapper sets it after DB registration
+  expect_null(result$pft_info$posteriorid)
+})
+
+test_that("no files are written to disk", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  tmp    <- withr::local_tempdir()
+  old_wd <- setwd(tmp)
+  withr::defer(setwd(old_wd))
+
+  get_trait_data_pft(
+    pft_name    = std_pft,
+    modeltype   = std_modeltype,
+    dbcon       = test_dbcon,
+    trait_names = std_traits
+  )
+
+  expect_length(list.files(tmp, pattern = "\\.Rdata$", recursive = TRUE), 0L)
+})
+
+test_that("constants are excluded from prior_distns", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  # Get the full prior list first so we know which traits actually have priors
+  result_full <- get_trait_data_pft(
+    pft_name    = std_pft,
+    modeltype   = std_modeltype,
+    dbcon       = test_dbcon,
+    trait_names = std_traits
+  )
+
+  traits_with_priors <- rownames(result_full$prior_distns)
+  skip_if(length(traits_with_priors) < 2L,
+          "Need at least 2 traits with priors to test constants exclusion")
+
+  constant_trait <- traits_with_priors[[1L]]
+
+  result_const <- get_trait_data_pft(
+    pft_name    = std_pft,
+    modeltype   = std_modeltype,
+    dbcon       = test_dbcon,
+    trait_names = std_traits,
+    constants   = stats::setNames(list(1.0), constant_trait)
+  )
+
+  expect_false(constant_trait %in% rownames(result_const$prior_distns))
+  expect_equal(nrow(result_const$prior_distns),
+               nrow(result_full$prior_distns) - 1L)
+})
+
+test_that("end-to-end: standalone gives identical objects to what wrapper saves", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  outdir   <- withr::local_tempdir()
+  test_pft <- list(
+    name        = std_pft,
+    outdir      = outdir,
+    posteriorid = NULL,
+    constants   = list()
+  )
+
+  # Run the wrapper — it writes trait.data.Rdata and prior.distns.Rdata
+  wrapper_result <- get.trait.data.pft(
+    pft         = test_pft,
+    modeltype   = std_modeltype,
+    dbfiles     = outdir,
+    dbcon       = test_dbcon,
+    trait.names = std_traits
+  )
+  withr::defer({
+    if (!is.null(wrapper_result$posteriorid)) {
+      try(DBI::dbExecute(test_dbcon,
+        "DELETE FROM dbfiles WHERE container_type = 'Posterior' AND container_id = $1",
+        list(wrapper_result$posteriorid)), silent = TRUE)
+      try(DBI::dbExecute(test_dbcon,
+        "DELETE FROM posteriors WHERE id = $1",
+        list(wrapper_result$posteriorid)), silent = TRUE)
+    }
+  })
+
+  standalone_result <- get_trait_data_pft(
+    pft_name    = std_pft,
+    modeltype   = std_modeltype,
+    dbcon       = test_dbcon,
+    trait_names = std_traits
+  )
+
+  # Load the on-disk files the wrapper created
+  trait_env <- new.env(parent = emptyenv())
+  prior_env <- new.env(parent = emptyenv())
+  load(file.path(outdir, "trait.data.Rdata"),   envir = trait_env)
+  load(file.path(outdir, "prior.distns.Rdata"), envir = prior_env)
+
+  expect_identical(standalone_result$trait_data,   trait_env$trait.data)
+  expect_identical(standalone_result$prior_distns, prior_env$prior.distns)
+})
\ No newline at end of file

From c0caa1b700f14832249e3524b1e14352a01ec9aa Mon Sep 17 00:00:00 2001
From: Om Kapale <omshripadkapale@gmail.com>
Date: Mon, 11 May 2026 20:09:10 +0530
Subject: [PATCH 2/4] docs: add NAMESPACE export and Rd file for
 get_trait_data_pft

---
 base/db/NAMESPACE                 |  1 +
 base/db/R/get.trait.data.pft.R    | 32 +++++++-----
 base/db/R/get_trait_data_pft.R    | 10 ++--
 base/db/man/get.trait.data.pft.Rd |  7 +--
 base/db/man/get_trait_data_pft.Rd | 83 +++++++++++++++++++++++++++++++
 5 files changed, 114 insertions(+), 19 deletions(-)
 create mode 100644 base/db/man/get_trait_data_pft.Rd

diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE
index c9d6c10b2d..e88ff29962 100644
--- a/base/db/NAMESPACE
+++ b/base/db/NAMESPACE
@@ -33,6 +33,7 @@ export(get.trait.data)
 export(get.trait.data.pft)
 export(get_postgres_envvars)
 export(get_run_ids)
+export(get_trait_data_pft)
 export(get_users)
 export(get_var_names)
 export(get_workflow_ids)
diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R
index ee2418a777..2e2da9ae21 100644
--- a/base/db/R/get.trait.data.pft.R
+++ b/base/db/R/get.trait.data.pft.R
@@ -46,9 +46,10 @@
 ##'   BETYdb.  Defaults to `FALSE`.
 ##' @param trait.names list of trait names to retrieve
 ##' @return The `pft` input list, updated with `pft$posteriorid` set to the
-##'   ID of the (possibly new) posterior record in BETYdb. The posterior ID can
-##'   be used to locate the output files (`trait.data.Rdata`, `prior.distns.Rdata`,
-##'   etc.) via BETYdb's `dbfiles` table.
+##'   ID of the (possibly new) posterior record in BETYdb. Also contains
+##'   `pft$trait_data` and `pft$prior_distns` for in-memory chaining. The
+##'   posterior ID can be used to locate the output files (`trait.data.Rdata`,
+##'   `prior.distns.Rdata`, etc.) via BETYdb's `dbfiles` table.
 ##' @author David LeBauer, Shawn Serbin, Rob Kooper
 ##' @export
 get.trait.data.pft <-
@@ -59,7 +60,6 @@ get.trait.data.pft <-
            trait.names,
            forceupdate = FALSE,
            write = FALSE) {
-    
 
   # Create directory if necessary
   if (!file.exists(pft$outdir) && !dir.create(pft$outdir, recursive = TRUE)) {
@@ -303,11 +303,19 @@ get.trait.data.pft <-
     }
   }
 
-  # get the trait data (including sampling of derived traits, if any)
-  trait.data <- query.traits(pft_members$id, traits, con = dbcon,
-                             update.check.only = FALSE,
-                             ids_are_cultivars = (pfttype == "cultivar"))
-  traits <- names(trait.data)
+  ## Cache miss: delegate the full data query to the standalone function so
+  ## that query logic lives in one place. This wrapper handles only the
+  ## file I/O and database registration that follows.
+  computed     <- get_trait_data_pft(
+    pft_name    = pft[["name"]],
+    modeltype   = modeltype,
+    dbcon       = dbcon,
+    trait_names = trait.names,
+    constants   = if (!is.null(pft$constants)) pft$constants else list()
+  )
+  trait.data   <- computed$trait_data
+  prior.distns <- computed$prior_distns
+  traits       <- names(trait.data)
 
   if (length(trait.data) > 0) {
     trait_counts <- trait.data %>%
@@ -386,9 +394,9 @@ get.trait.data.pft <-
     }
   }
 
-  # caching — is unchanged. Callers that only use pft$name / pft$outdir /
-  # pft$posteriorid are unaffected.
+  ## Attach computed objects so downstream callers can chain in-memory
+  ## without loading files from pft$outdir.
   pft$trait_data   <- trait.data
   pft$prior_distns <- prior.distns
   return(pft)
-}
+}
\ No newline at end of file
diff --git a/base/db/R/get_trait_data_pft.R b/base/db/R/get_trait_data_pft.R
index e380ac086e..be5e9f96b6 100644
--- a/base/db/R/get_trait_data_pft.R
+++ b/base/db/R/get_trait_data_pft.R
@@ -11,7 +11,7 @@
 #' this function never does.
 #'
 #' This follows the pattern established by \code{meta_analysis_standalone}
-#' for the meta-analysis step and \code{\link{get_parameter_samples}} for
+#' for the meta-analysis step and \code{get_parameter_samples} (in PEcAn.DB) for
 #' parameter sampling — each is a computation core that can be tested in
 #' isolation without a filesystem or a \code{settings} object.
 #'
@@ -40,9 +40,9 @@
 #'
 #' @seealso \code{\link{get.trait.data.pft}} for the backward-compatible
 #'   wrapper that handles provenance and caching.
-#'   \code{\link[PEcAn.MA]{meta_analysis_standalone}} for the analogous
+#'   \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous
 #'   function in the meta-analysis step.
-#'   \code{\link{get_parameter_samples}} for the analogous function in the
+#'   \code{get_parameter_samples} (in PEcAn.DB) for the analogous function in the
 #'   parameter sampling step.
 #'
 #' @examples
@@ -112,6 +112,8 @@ get_trait_data_pft <- function(pft_name,
     members <- query.pft_species(pft = pft_name, modeltype = modeltype,
                                  con = dbcon)
   }
+  members <- members %>%
+    dplyr::mutate_if(is.character, ~dplyr::na_if(., ""))
   member_ids <- members[["id"]]
 
   if (length(member_ids) == 0L) {
@@ -126,7 +128,7 @@ get_trait_data_pft <- function(pft_name,
   # (same approach used in get.trait.data())
   prior_distns <- query.priors(
     pft   = format(pft_id, scientific = FALSE),
-    trstr = trait_names,
+    trstr = PEcAn.utils::vecpaste(trait_names),
     con   = dbcon
   )
 
diff --git a/base/db/man/get.trait.data.pft.Rd b/base/db/man/get.trait.data.pft.Rd
index 406e84e02e..bba7aa6dcd 100644
--- a/base/db/man/get.trait.data.pft.Rd
+++ b/base/db/man/get.trait.data.pft.Rd
@@ -34,9 +34,10 @@ BETYdb.  Defaults to \code{FALSE}.}
 }
 \value{
 The \code{pft} input list, updated with \code{pft$posteriorid} set to the
-ID of the (possibly new) posterior record in BETYdb. The posterior ID can
-be used to locate the output files (\code{trait.data.Rdata}, \code{prior.distns.Rdata},
-etc.) via BETYdb's \code{dbfiles} table.
+ID of the (possibly new) posterior record in BETYdb. Also contains
+\code{pft$trait_data} and \code{pft$prior_distns} for in-memory chaining. The
+posterior ID can be used to locate the output files (\code{trait.data.Rdata},
+\code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table.
 }
 \description{
 Get trait data from the database for a single PFT
diff --git a/base/db/man/get_trait_data_pft.Rd b/base/db/man/get_trait_data_pft.Rd
new file mode 100644
index 0000000000..31252af3ee
--- /dev/null
+++ b/base/db/man/get_trait_data_pft.Rd
@@ -0,0 +1,83 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_trait_data_pft.R
+\name{get_trait_data_pft}
+\alias{get_trait_data_pft}
+\title{Retrieve trait data and priors for one PFT from BETYdb}
+\usage{
+get_trait_data_pft(pft_name, modeltype, dbcon, trait_names, constants = list())
+}
+\arguments{
+\item{pft_name}{character. PFT name as stored in BETYdb.}
+
+\item{modeltype}{character or NULL. Disambiguates PFTs that share a name
+across model types (e.g. \code{"SIPNET"}, \code{"ED2"}).}
+
+\item{dbcon}{database connection from \code{\link[PEcAn.DB]{db.open}}.}
+
+\item{trait_names}{character vector of trait names to retrieve.}
+
+\item{constants}{named list from \code{pft$constants} in the settings.
+Traits named here are excluded from the returned priors because their
+values are fixed rather than sampled by the meta-analysis.}
+}
+\value{
+Named list with three elements:
+\describe{
+  \item{\code{trait_data}}{Named list of data frames, one per trait that
+    has observations. Column structure matches what
+    \code{meta_analysis_standalone} expects. Traits with no observations
+    are omitted from the list.}
+  \item{\code{prior_distns}}{Data frame with columns \code{distn},
+    \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits
+    listed in \code{constants} are excluded.}
+  \item{\code{pft_info}}{List with \code{name}, \code{pft_id},
+    \code{pft_type}, and \code{posteriorid}. \code{posteriorid} is always
+    \code{NULL} — the wrapper sets it after registering outputs in BETYdb.}
+}
+}
+\description{
+Core computation extracted from \code{\link{get.trait.data.pft}}.
+Queries the database for trait observations and prior distributions,
+returning them as R objects with no file I/O of any kind.
+}
+\details{
+The wrapper \code{\link{get.trait.data.pft}} handles directory creation,
+caching, CSV output, and BETYdb registration via \code{dbfile.insert}.
+This function handles only the query. Choosing between them is the
+provenance opt-in: calling the wrapper saves artifacts to disk; calling
+this function never does.
+
+This follows the pattern established by \code{meta_analysis_standalone}
+for the meta-analysis step and \code{get_parameter_samples} (in PEcAn.DB) for
+parameter sampling — each is a computation core that can be tested in
+isolation without a filesystem or a \code{settings} object.
+}
+\examples{
+\dontrun{
+dbcon <- PEcAn.DB::db.open(list(
+  host = "localhost", user = "bety",
+  password = "bety",  dbname = "bety"
+))
+result <- get_trait_data_pft(
+  pft_name    = "temperate.deciduous",
+  modeltype   = "SIPNET",
+  dbcon       = dbcon,
+  trait_names = c("SLA", "Vcmax", "leaf_respiration_rate_m2")
+)
+str(result$trait_data)
+str(result$prior_distns)
+PEcAn.DB::db.close(dbcon)
+}
+
+}
+\seealso{
+\code{\link{get.trait.data.pft}} for the backward-compatible
+  wrapper that handles provenance and caching.
+  \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous
+  function in the meta-analysis step.
+  \code{get_parameter_samples} (in PEcAn.DB) for the analogous function in the
+  parameter sampling step.
+}
+\author{
+David LeBauer, Shawn Serbin, Alexey Shiklomanov, Om Kapale
+}

From 3fd20cf4c5f30ed4a8330edf0707f991fd7363a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Om=20Kapale=20=F0=9F=90=99?= <omshripadkapale@gmail.com>
Date: Tue, 12 May 2026 03:09:35 +0000
Subject: [PATCH 3/4] fix: remove @md tag conflict, fix cross-references, sync
 Rd files with roxygen2 7.3.3

---
 base/db/DESCRIPTION                           |   3 +-
 base/db/R/get.trait.data.pft.R                | 268 ++++++++----------
 base/db/R/get_trait_data_pft.R                |  84 +++---
 base/db/man/get.trait.data.pft.Rd             |  36 +--
 base/db/man/get_trait_data_pft.Rd             |  16 +-
 .../tests/testthat/test-get.trait.data.pft.R  |  84 +++++-
 .../tests/testthat/test-get_trait_data_pft.R  |  41 ++-
 7 files changed, 305 insertions(+), 227 deletions(-)

diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION
index b2590784b5..cfb3f9fd52 100644
--- a/base/db/DESCRIPTION
+++ b/base/db/DESCRIPTION
@@ -80,5 +80,6 @@ Copyright: Authors
 LazyLoad: yes
 LazyData: FALSE
 Encoding: UTF-8
-RoxygenNote: 7.3.3
 X-schema.org-keywords: PEcAn, database
+Config/roxygen2/version: 8.0.0
+RoxygenNote: 7.3.3
diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R
index 2e2da9ae21..a6b5afe36b 100644
--- a/base/db/R/get.trait.data.pft.R
+++ b/base/db/R/get.trait.data.pft.R
@@ -2,38 +2,19 @@
 ##'
 ##' @md
 ##' Queries BETYdb for trait observations and prior distributions for a single
-##' plant functional type (PFT). Results are saved to files
-##' in the PFT output directory (`pft$outdir`), and also registered in the
-##' database as posterior records when `write = TRUE`.
+##' plant functional type (PFT). Results are saved to files in the PFT output
+##' directory (`pft$outdir`), and also registered in the database as posterior
+##' records when `write = TRUE`.
 ##'
 ##' @details
 ##' `pft` should be a list containing at least `name` and `outdir`, and
 ##' optionally `posteriorid` and `constants`.
 ##'
-##' **File-based side effects (saved to `pft$outdir`):**
-##' \describe{
-##'   \item{`trait.data.Rdata`}{Contains a single object `trait.data`: a named
-##'     list of data frames, one per trait. Each data frame has columns from
-##'     BETYdb's traits/yields views (e.g., `mean`, `stat`, `n`, `site_id`,
-##'     `treatment_id`). Names correspond to trait variable names
-##'     (e.g., `"SLA"`, `"Vcmax"`).}
-##'   \item{`prior.distns.Rdata`}{Contains a single object `prior.distns`: a
-##'     data frame with one row per trait and columns `distn`, `parama`,
-##'     `paramb`, and `n`. Row names are trait variable names. Traits listed
-##'     in `pft$constants` are excluded.}
-##'   \item{`trait.data.csv`}{CSV export of `trait.data` (all traits
-##'     row-bound).}
-##'   \item{`prior.distns.csv`}{CSV export of `prior.distns`.}
-##'   \item{`species.csv` or `cultivars.csv`}{PFT membership list used to
-##'     detect changes between runs.}
-##' }
-##'
-##' **Downstream contract:** The files `trait.data.Rdata` and
-##' `prior.distns.Rdata` are expected by \code{run.meta.analysis.pft}, which
-##' loads them from `pft$outdir`. This implicit file-based coupling means
-##' the two functions must agree on directory path and object names. A future
-##' refactoring goal is to pass these objects directly via function arguments
-##' instead.
+##' Internally this wrapper delegates all database queries to
+##' \code{\link{get_trait_data_pft}} exactly once.  The returned objects are
+##' used for both the cache-staleness check and the save step, so the database
+##' is never queried more than once per call regardless of whether the cache
+##' hits or misses.
 ##'
 ##' @param pft list of settings for the pft whose traits to retrieve. See details.
 ##' @param modeltype type of model that is used, this is used to distinguish
@@ -47,9 +28,10 @@
 ##' @param trait.names list of trait names to retrieve
 ##' @return The `pft` input list, updated with `pft$posteriorid` set to the
 ##'   ID of the (possibly new) posterior record in BETYdb. Also contains
-##'   `pft$trait_data` and `pft$prior_distns` for in-memory chaining. The
-##'   posterior ID can be used to locate the output files (`trait.data.Rdata`,
-##'   `prior.distns.Rdata`, etc.) via BETYdb's `dbfiles` table.
+##'   `pft$trait_data` and `pft$prior_distns` for in-memory chaining on both
+##'   the cache-hit and cache-miss paths. The posterior ID can be used to
+##'   locate the output files (`trait.data.Rdata`, `prior.distns.Rdata`,
+##'   etc.) via BETYdb's `dbfiles` table.
 ##' @author David LeBauer, Shawn Serbin, Rob Kooper
 ##' @export
 get.trait.data.pft <-
@@ -66,82 +48,56 @@ get.trait.data.pft <-
     PEcAn.logger::logger.error(paste0("Couldn't create PFT output directory: ", pft$outdir))
   }
 
-  # find appropriate pft
-  pftres <- query_pfts(dbcon, pft[["name"]], modeltype)
-  pfttype <- pftres[["pft_type"]]
-  pftid <- pftres[["id"]]
-
-  if (nrow(pftres) > 1) {
-    PEcAn.logger::logger.severe(
-      "Multiple PFTs named", pft[["name"]], "found,",
-      "with ids", PEcAn.utils::vecpaste(pftres[["id"]]), ".",
-      "Specify modeltype to fix this.")
-  }
-
-  if (nrow(pftres) == 0) {
-    PEcAn.logger::logger.severe("Could not find pft", pft[["name"]])
-    return(NA)
-  }
-
-  # get the member species/cultivars, we need to check if anything changed
-  if (pfttype == "plant") {
-    pft_member_filename = "species.csv"
-    pft_members <- PEcAn.DB::query.pft_species(pft$name, modeltype, dbcon)
-  } else if (pfttype == "cultivar") {
-    pft_member_filename = "cultivars.csv"
-    pft_members <- PEcAn.DB::query.pft_cultivars(pft$name, modeltype, dbcon)
-  } else {
-    PEcAn.logger::logger.severe("Unknown pft type! Expected 'plant' or 'cultivar', got", pfttype)
-  }
-
-  # ANS: Need to do this conversion for the check against existing
-  # membership later on. Otherwise, `NA` from the CSV is interpreted
-  # as different from `""` returned here, even though they are really
-  # the same thing.
-  pft_members <- pft_members %>%
-    dplyr::mutate_if(is.character, ~dplyr::na_if(., ""))
-
-  # get the priors
-  prior.distns <- PEcAn.DB::query.priors(
-    pft = pftid,
-    trstr = PEcAn.utils::vecpaste(trait.names),
-    con = dbcon)
-  prior.distns <- prior.distns[which(!rownames(prior.distns) %in% names(pft$constants)),]
-  traits <- rownames(prior.distns)
+  # ---- Single DB round-trip via standalone ----
+  # All query logic lives in get_trait_data_pft().  The objects it returns
+  # are used below for both the cache check and the save step — the database
+  # is never queried a second time.
+  computed <- get_trait_data_pft(
+    pft_name    = pft[["name"]],
+    modeltype   = modeltype,
+    dbcon       = dbcon,
+    trait_names = trait.names,
+    constants   = if (!is.null(pft$constants)) pft$constants else list()
+  )
 
-  # get the trait data (don't bother sampling derived traits until after update check)
-  trait.data.check <- PEcAn.DB::query.traits(ids = pft_members$id, priors = traits, con = dbcon, update.check.only = TRUE, ids_are_cultivars = (pfttype=="cultivar"))
-  traits <- names(trait.data.check)
+  trait.data          <- computed$trait_data
+  prior.distns        <- computed$prior_distns
+  pftid               <- computed$pft_info$pft_id
+  pfttype             <- computed$pft_info$pft_type
+  pft_members         <- computed$pft_info$pft_members
+  pft_member_filename <- computed$pft_info$pft_member_filename
 
-  # Set forceupdate FALSE if it's a string (backwards compatible with 'AUTO' flag used in the past)
+  # Set forceupdate FALSE if it's a string (backwards compatible with 'AUTO')
   forceupdate <- isTRUE(as.logical(forceupdate))
 
-  # check to see if we need to update
+  # ---- Cache staleness check ----
   if (!forceupdate) {
     if (is.null(pft$posteriorid)) {
-      recent_posterior <- dplyr::tbl(dbcon, "posteriors") %>%
-        dplyr::filter(.data$pft_id == !!pftid) %>%
+      recent_posterior <- dplyr::tbl(dbcon, "posteriors") |>
+        dplyr::filter(.data$pft_id == !!pftid) |>
         dplyr::collect()
       if (length(recent_posterior) > 0) {
-        pft$posteriorid <- dplyr::tbl(dbcon, "posteriors") %>%
-          dplyr::filter(.data$pft_id == !!pftid) %>%
-          dplyr::arrange(dplyr::desc(.data$created_at)) %>%
-          utils::head(1) %>%
+        pft$posteriorid <- dplyr::tbl(dbcon, "posteriors") |>
+          dplyr::filter(.data$pft_id == !!pftid) |>
+          dplyr::arrange(dplyr::desc(.data$created_at)) |>
+          utils::head(1) |>
           dplyr::pull("id")
       } else {
         PEcAn.logger::logger.info("No previous posterior found. Forcing update")
       }
     }
+
     if (!is.null(pft$posteriorid)) {
-      files <- dbfile.check(type = "Posterior", container.id = pft$posteriorid, con = dbcon,
-                            return.all = TRUE)
+      files <- dbfile.check(type = "Posterior", container.id = pft$posteriorid,
+                            con = dbcon, return.all = TRUE)
       need_files <- c(
-        trait_data = "trait.data.Rdata",
-        priors = "prior.distns.Rdata",
+        trait_data     = "trait.data.Rdata",
+        priors         = "prior.distns.Rdata",
         pft_membership = pft_member_filename
       )
       ids <- match(need_files, files$file_name)
       names(ids) <- names(need_files)
+
       if (any(is.na(ids))) {
         missing_files <- need_files[is.na(ids)]
         PEcAn.logger::logger.info(paste0(
@@ -159,11 +115,13 @@ get.trait.data.pft <-
           "All posterior files are present. Performing additional checks ",
           "to determine if meta-analysis needs to be updated."
         )
-        # check if all files exist
+
+        # Check all required files exist on disk
         need_paths <- file.path(files$file_path[ids], need_files)
         names(need_paths) <- names(need_files)
         files_exist <- file.exists(need_paths)
         foundallfiles <- all(files_exist)
+
         if (!foundallfiles) {
           PEcAn.logger::logger.warn(
             "The following files are in database but not found on disk: ",
@@ -174,25 +132,23 @@ get.trait.data.pft <-
           # Check if PFT membership has changed
           PEcAn.logger::logger.debug("Checking if PFT membership has changed.")
           if (pfttype == "plant") {
-            # Columns are: id, genus, species, scientificname
-            colClass = c("double", "character", "character", "character")
+            colClass <- c("double", "character", "character", "character")
           } else if (pfttype == "cultivar") {
-            # Columns are: id, specie_id, genus, species, scientificname, cultivar
-            colClass = c("double", "double", "character", "character", "character", "character")
-            }
+            colClass <- c("double", "double", "character", "character",
+                          "character", "character")
+          }
           existing_membership <- utils::read.csv(
             need_paths[["pft_membership"]],
-            # Need this so NA values are formatted consistently
-            colClasses = colClass,
+            colClasses       = colClass,
             stringsAsFactors = FALSE,
-            na.strings = c("", "NA")
-            )
+            na.strings       = c("", "NA")
+          )
           diff_membership <- symmetric_setdiff(
             existing_membership,
             pft_members,
             xname = "existing",
             yname = "current"
-            )
+          )
           if (nrow(diff_membership) > 0) {
             PEcAn.logger::logger.error(
               "\n PFT membership has changed. \n",
@@ -205,7 +161,9 @@ get.trait.data.pft <-
 
           # Check if priors have changed
           PEcAn.logger::logger.debug("Checking if priors have changed")
-          existing_prior <- PEcAn.utils::load_local(need_paths[["priors"]])[["prior.distns"]]
+          existing_prior <- PEcAn.utils::load_local(
+            need_paths[["priors"]]
+          )[["prior.distns"]]
           diff_prior <- symmetric_setdiff(
             dplyr::as_tibble(prior.distns, rownames = "trait"),
             dplyr::as_tibble(existing_prior, rownames = "trait")
@@ -225,25 +183,28 @@ get.trait.data.pft <-
           existing_trait_data <- PEcAn.utils::load_local(
             need_paths[["trait_data"]]
           )[["trait.data"]]
-          if (length(trait.data.check) != length(existing_trait_data)) {
+          if (length(trait.data) != length(existing_trait_data)) {
             PEcAn.logger::logger.warn(
               "Lengths of new and existing `trait.data` differ. ",
               "Re-running meta-analysis."
             )
             foundallfiles <- FALSE
-          } else if (length(trait.data.check) == 0) {
-            PEcAn.logger::logger.warn("New and existing trait data are both empty. Skipping this check.")
+          } else if (length(trait.data) == 0) {
+            PEcAn.logger::logger.warn(
+              "New and existing trait data are both empty. Skipping this check."
+            )
           } else {
-            current_traits <- dplyr::bind_rows(trait.data.check, .id = "trait") %>%
+            current_traits <- dplyr::bind_rows(trait.data, .id = "trait") |>
               dplyr::select(-mean, -"stat")
-            existing_traits <- dplyr::bind_rows(existing_trait_data, .id = "trait") %>%
+            existing_traits <- dplyr::bind_rows(existing_trait_data,
+                                                .id = "trait") |>
               dplyr::select(-mean, -"stat")
             diff_traits <- symmetric_setdiff(current_traits, existing_traits)
             if (nrow(diff_traits) > 0) {
-              diff_summary <- diff_traits %>%
+              diff_summary <- diff_traits |>
                 dplyr::count(source, .data$trait)
               PEcAn.logger::logger.error(
-                "\n Prior has changed. \n",
+                "\n Trait data has changed. \n",
                 "Here are the number of differing trait records by trait:\n",
                 PEcAn.logger::print2string(diff_summary),
                 wrap = FALSE
@@ -251,8 +212,7 @@ get.trait.data.pft <-
               foundallfiles <- FALSE
             }
           }
-        }
-
+        } # end else (all files on disk)
 
         if (foundallfiles) {
           PEcAn.logger::logger.info(
@@ -260,26 +220,29 @@ get.trait.data.pft <-
             "for PFT", shQuote(pft$name)
           )
           for (id in seq_len(nrow(files))) {
-            file.copy(from = file.path(files[[id, "file_path"]], files[[id, "file_name"]]),
-                      to = file.path(pft$outdir, files[[id, "file_name"]]))
+            file.copy(
+              from = file.path(files[[id, "file_path"]], files[[id, "file_name"]]),
+              to   = file.path(pft$outdir, files[[id, "file_name"]])
+            )
           }
 
           done <- TRUE
 
-          # May need to symlink the generic post.distns.Rdata to a specific post.distns.*.Rdata file.
+          # May need to symlink the generic post.distns.Rdata to the
+          # model-specific post.distns.*.Rdata file.
           if (length(list.files(pft$outdir, "post.distns.Rdata")) == 0) {
             all.files <- list.files(pft$outdir)
-            post.distn.file <- all.files[grep("post\\.distns\\..*\\.Rdata", all.files)]
-            if (length(post.distn.file) > 1)
+            post.distn.file <- all.files[grep("post\\.distns\\..*\\.Rdata",
+                                              all.files)]
+            if (length(post.distn.file) > 1) {
               PEcAn.logger::logger.severe(
                 "get.trait.data.pft() doesn't know how to ",
                 "handle multiple `post.distns.*.Rdata` files.",
                 "Found the following files: ",
                 paste(shQuote(post.distn.file), collapse = ", ")
               )
-            else if (length(post.distn.file) == 1) {
-              # Found exactly one post.distns.*.Rdata file. Use it.
-              link_input <- file.path(pft[["outdir"]], post.distn.file)
+            } else if (length(post.distn.file) == 1) {
+              link_input  <- file.path(pft[["outdir"]], post.distn.file)
               link_target <- file.path(pft[["outdir"]], "post.distns.Rdata")
               PEcAn.logger::logger.debug(
                 "Found exactly one posterior distribution file: ",
@@ -297,33 +260,29 @@ get.trait.data.pft <-
               done <- FALSE
             }
           }
-          if (done) return(pft)
+
+          if (done) {
+            # Attach computed objects so downstream callers can chain
+            # in-memory on the cache-HIT path — same guarantee as the
+            # cache-miss path below.
+            pft$trait_data   <- trait.data
+            pft$prior_distns <- prior.distns
+            return(pft)
+          }
         }
-      }
-    }
-  }
+      } # end else (all files in DB)
+    } # end if (!is.null(pft$posteriorid))
+  } # end if (!forceupdate)
 
-  ## Cache miss: delegate the full data query to the standalone function so
-  ## that query logic lives in one place. This wrapper handles only the
-  ## file I/O and database registration that follows.
-  computed     <- get_trait_data_pft(
-    pft_name    = pft[["name"]],
-    modeltype   = modeltype,
-    dbcon       = dbcon,
-    trait_names = trait.names,
-    constants   = if (!is.null(pft$constants)) pft$constants else list()
-  )
-  trait.data   <- computed$trait_data
-  prior.distns <- computed$prior_distns
-  traits       <- names(trait.data)
+  # ---- Cache miss: log counts, save to disk, register in DB ----
 
   if (length(trait.data) > 0) {
-    trait_counts <- trait.data %>%
-      dplyr::bind_rows(.id = "trait") %>%
+    trait_counts <- trait.data |>
+      dplyr::bind_rows(.id = "trait") |>
       dplyr::count(.data$trait)
-
     PEcAn.logger::logger.info(
-      "\n Number of observations per trait for PFT ", shQuote(pft[["name"]]), ":\n",
+      "\n Number of observations per trait for PFT ", shQuote(pft[["name"]]),
+      ":\n",
       PEcAn.logger::print2string(trait_counts, n = Inf, na.print = ""),
       wrap = FALSE
     )
@@ -334,36 +293,39 @@ get.trait.data.pft <-
     )
   }
 
-  # get list of existing files so they get ignored saving
+  # Snapshot existing files so we know which ones are new after saving
   old.files <- list.files(path = pft$outdir)
 
-  # create a new posterior
+  # Create a new posterior record in BETYdb
   insert_result <- db.query(
-    paste0("INSERT INTO posteriors (pft_id) VALUES (", pftid, ") RETURNING id"),
-    con = dbcon)
+    paste0("INSERT INTO posteriors (pft_id) VALUES (", pftid,
+           ") RETURNING id"),
+    con = dbcon
+  )
   pft$posteriorid <- insert_result[["id"]]
 
-  # create path where to store files
+  # Create the storage path for this posterior
   pathname <- file.path(dbfiles, "posterior", pft$posteriorid)
   dir.create(pathname, showWarnings = FALSE, recursive = TRUE)
 
-  ## 1. get species/cultivar list based on pft
-  utils::write.csv(pft_members, file.path(pft$outdir, pft_member_filename),
+  ## Write species/cultivar membership list
+  utils::write.csv(pft_members,
+                   file.path(pft$outdir, pft_member_filename),
                    row.names = FALSE)
 
-  ## save priors
+  ## Save prior distributions
   save(prior.distns, file = file.path(pft$outdir, "prior.distns.Rdata"))
-  utils::write.csv(prior.distns, file.path(pft$outdir, "prior.distns.csv"),
+  utils::write.csv(prior.distns,
+                   file.path(pft$outdir, "prior.distns.csv"),
                    row.names = TRUE)
 
-  ## 3. display info to the console
   PEcAn.logger::logger.info(
     "\n Summary of prior distributions for PFT ", shQuote(pft$name), ":\n",
     PEcAn.logger::print2string(prior.distns),
     wrap = FALSE
   )
 
-  ## traits = variables with prior distributions for this pft
+  ## Save trait data
   trait.data.file <- file.path(pft$outdir, "trait.data.Rdata")
   save(trait.data, file = trait.data.file)
   utils::write.csv(
@@ -372,16 +334,18 @@ get.trait.data.pft <-
     row.names = FALSE
   )
 
-  ### save and store in database all results except those that were there already
-  if(isTRUE(write)) {
+  ## Register new files in BETYdb
+  if (isTRUE(write)) {
     store_files_all <- list.files(path = pft[["outdir"]])
     store_files <- setdiff(store_files_all, old.files)
     PEcAn.logger::logger.debug(
       "The following posterior files found in PFT outdir ",
       "(", shQuote(pft[["outdir"]]), ") will be registered in BETY ",
-      "under posterior ID ", format(pft[["posteriorid"]], scientific = FALSE), ": ",
+      "under posterior ID ",
+      format(pft[["posteriorid"]], scientific = FALSE), ": ",
       paste(shQuote(store_files), collapse = ", "), ". ",
-      "The following files (if any) will not be registered because they already existed: ",
+      "The following files (if any) will not be registered because they ",
+      "already existed: ",
       paste(shQuote(intersect(store_files, old.files)), collapse = ", "),
       wrap = FALSE
     )
diff --git a/base/db/R/get_trait_data_pft.R b/base/db/R/get_trait_data_pft.R
index be5e9f96b6..dcc4e2c076 100644
--- a/base/db/R/get_trait_data_pft.R
+++ b/base/db/R/get_trait_data_pft.R
@@ -11,9 +11,9 @@
 #' this function never does.
 #'
 #' This follows the pattern established by \code{meta_analysis_standalone}
-#' for the meta-analysis step and \code{get_parameter_samples} (in PEcAn.DB) for
-#' parameter sampling — each is a computation core that can be tested in
-#' isolation without a filesystem or a \code{settings} object.
+#' for the meta-analysis step and \code{get_parameter_samples} for parameter
+#' sampling — each is a computation core that can be tested in isolation
+#' without a filesystem or a \code{settings} object.
 #'
 #' @param pft_name character. PFT name as stored in BETYdb.
 #' @param modeltype character or NULL. Disambiguates PFTs that share a name
@@ -34,15 +34,19 @@
 #'     \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits
 #'     listed in \code{constants} are excluded.}
 #'   \item{\code{pft_info}}{List with \code{name}, \code{pft_id},
-#'     \code{pft_type}, and \code{posteriorid}. \code{posteriorid} is always
-#'     \code{NULL} — the wrapper sets it after registering outputs in BETYdb.}
+#'     \code{pft_type}, \code{pft_members}, \code{pft_member_filename}, and
+#'     \code{posteriorid}. \code{pft_members} is the data frame of species or
+#'     cultivar IDs used during the query. \code{pft_member_filename} is
+#'     \code{"species.csv"} or \code{"cultivars.csv"} depending on PFT type.
+#'     \code{posteriorid} is always \code{NULL} — the wrapper sets it after
+#'     registering outputs in BETYdb.}
 #' }
 #'
 #' @seealso \code{\link{get.trait.data.pft}} for the backward-compatible
 #'   wrapper that handles provenance and caching.
 #'   \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous
 #'   function in the meta-analysis step.
-#'   \code{get_parameter_samples} (in PEcAn.DB) for the analogous function in the
+#'   \code{get_parameter_samples} for the analogous function in the
 #'   parameter sampling step.
 #'
 #' @examples
@@ -70,7 +74,7 @@ get_trait_data_pft <- function(pft_name,
                                trait_names,
                                constants = list()) {
 
-  # Validate the cheap arguments before making any database calls
+  # ---- Input validation (cheap checks before any DB call) ----
   if (!is.character(pft_name) || length(pft_name) != 1L) {
     PEcAn.logger::logger.severe("'pft_name' must be a single character string")
   }
@@ -83,11 +87,8 @@ get_trait_data_pft <- function(pft_name,
     PEcAn.logger::logger.severe("'dbcon' must be a database connection")
   }
 
-  # Resolve PFT name to a single database record.
-  # strict = TRUE gives a clear error when the PFT is not found rather than
-  # returning an empty data frame silently.
+  # ---- Resolve PFT to a single database record ----
   pft_record <- query_pfts(dbcon, pft_name, modeltype, strict = TRUE)
-
   if (nrow(pft_record) > 1L) {
     PEcAn.logger::logger.severe(
       "Multiple PFTs named '", pft_name, "' found in the database;",
@@ -102,38 +103,51 @@ get_trait_data_pft <- function(pft_name,
     "Querying trait data for PFT '", pft_name, "' (id = ", pft_id, ")"
   )
 
-  # Which join table holds the member IDs depends on pft_type
-  ids_are_cultivars <- identical(pft_type, "cultivar")
-
-  if (ids_are_cultivars) {
+  # ---- Fetch PFT member species or cultivars ----
+  # The join table depends on pft_type.  An unknown type is a hard error —
+  # silently falling through to the species path would produce wrong results.
+  if (identical(pft_type, "cultivar")) {
+    pft_member_filename <- "cultivars.csv"
     members <- query.pft_cultivars(pft = pft_name, modeltype = modeltype,
                                    con = dbcon)
-  } else {
+  } else if (identical(pft_type, "plant")) {
+    pft_member_filename <- "species.csv"
     members <- query.pft_species(pft = pft_name, modeltype = modeltype,
                                  con = dbcon)
+  } else {
+    PEcAn.logger::logger.severe(
+      "Unknown pft_type '", pft_type, "' for PFT '", pft_name,
+      "'; expected 'plant' or 'cultivar'."
+    )
   }
-  members <- members %>%
-    dplyr::mutate_if(is.character, ~dplyr::na_if(., ""))
+
+  # Normalise empty strings to NA so membership comparisons are consistent
+  members <- members |>
+    dplyr::mutate(dplyr::across(
+        dplyr::where(is.character),
+        \(x) dplyr::na_if(x, "")
+    ))
+
   member_ids <- members[["id"]]
 
   if (length(member_ids) == 0L) {
     PEcAn.logger::logger.info(
       "PFT '", pft_name, "' has no associated ",
-      if (ids_are_cultivars) "cultivars" else "species",
+      if (identical(pft_type, "cultivar")) "cultivars" else "species",
       "; trait_data will be an empty list."
     )
   }
 
-  # format() prevents integer64 from being silently coerced in the SQL query
-  # (same approach used in get.trait.data())
+  # ---- Query prior distributions ----
+  # format() prevents integer64 from being silently coerced to double in SQL.
   prior_distns <- query.priors(
     pft   = format(pft_id, scientific = FALSE),
     trstr = PEcAn.utils::vecpaste(trait_names),
     con   = dbcon
   )
 
-  # Traits in pft$constants have fixed values and are never sampled, so they
-  # should not appear in the prior distributions returned to callers
+  # Exclude traits listed in pft$constants — their values are fixed and must
+  # not be sampled by the meta-analysis.
   if (length(constants) > 0L && !is.null(names(constants))) {
     constant_traits <- names(constants)
     in_constants    <- rownames(prior_distns) %in% constant_traits
@@ -146,16 +160,16 @@ get_trait_data_pft <- function(pft_name,
     }
   }
 
-  # Only query traits that have a prior — querying for traits with no prior
-  # is meaningless for meta-analysis
+  # ---- Query trait observations ----
+  # Only query traits that actually have a prior — querying without a prior
+  # is meaningless for meta-analysis.
   traits_with_priors <- rownames(prior_distns)
-
   if (length(member_ids) > 0L && length(traits_with_priors) > 0L) {
     trait_data <- query.traits(
       ids               = member_ids,
       priors            = traits_with_priors,
       con               = dbcon,
-      ids_are_cultivars = ids_are_cultivars
+      ids_are_cultivars = identical(pft_type, "cultivar")
     )
   } else {
     trait_data <- list()
@@ -167,13 +181,17 @@ get_trait_data_pft <- function(pft_name,
     nrow(prior_distns), " trait(s) with priors"
   )
 
-  # posteriorid is NULL here — the wrapper sets it after registering the
-  # output files in BETYdb via dbfile.insert()
+  # posteriorid is always NULL here — the wrapper assigns it after registering
+  # the output files in BETYdb via dbfile.insert().
+  # pft_members and pft_member_filename are included so the wrapper can use
+  # them for cache comparison and CSV output without re-querying the database.
   pft_info <- list(
-    name        = pft_name,
-    pft_id      = pft_id,
-    pft_type    = pft_type,
-    posteriorid = NULL
+    name                = pft_name,
+    pft_id              = pft_id,
+    pft_type            = pft_type,
+    pft_members         = members,
+    pft_member_filename = pft_member_filename,
+    posteriorid         = NULL
   )
 
   return(list(
diff --git a/base/db/man/get.trait.data.pft.Rd b/base/db/man/get.trait.data.pft.Rd
index bba7aa6dcd..1efaf7142f 100644
--- a/base/db/man/get.trait.data.pft.Rd
+++ b/base/db/man/get.trait.data.pft.Rd
@@ -35,9 +35,10 @@ BETYdb.  Defaults to \code{FALSE}.}
 \value{
 The \code{pft} input list, updated with \code{pft$posteriorid} set to the
 ID of the (possibly new) posterior record in BETYdb. Also contains
-\code{pft$trait_data} and \code{pft$prior_distns} for in-memory chaining. The
-posterior ID can be used to locate the output files (\code{trait.data.Rdata},
-\code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table.
+\code{pft$trait_data} and \code{pft$prior_distns} for in-memory chaining on both
+the cache-hit and cache-miss paths. The posterior ID can be used to
+locate the output files (\code{trait.data.Rdata}, \code{prior.distns.Rdata},
+etc.) via BETYdb's \code{dbfiles} table.
 }
 \description{
 Get trait data from the database for a single PFT
@@ -46,30 +47,11 @@ Get trait data from the database for a single PFT
 \code{pft} should be a list containing at least \code{name} and \code{outdir}, and
 optionally \code{posteriorid} and \code{constants}.
 
-\strong{File-based side effects (saved to \code{pft$outdir}):}
-\describe{
-\item{\code{trait.data.Rdata}}{Contains a single object \code{trait.data}: a named
-list of data frames, one per trait. Each data frame has columns from
-BETYdb's traits/yields views (e.g., \code{mean}, \code{stat}, \code{n}, \code{site_id},
-\code{treatment_id}). Names correspond to trait variable names
-(e.g., \code{"SLA"}, \code{"Vcmax"}).}
-\item{\code{prior.distns.Rdata}}{Contains a single object \code{prior.distns}: a
-data frame with one row per trait and columns \code{distn}, \code{parama},
-\code{paramb}, and \code{n}. Row names are trait variable names. Traits listed
-in \code{pft$constants} are excluded.}
-\item{\code{trait.data.csv}}{CSV export of \code{trait.data} (all traits
-row-bound).}
-\item{\code{prior.distns.csv}}{CSV export of \code{prior.distns}.}
-\item{\code{species.csv} or \code{cultivars.csv}}{PFT membership list used to
-detect changes between runs.}
-}
-
-\strong{Downstream contract:} The files \code{trait.data.Rdata} and
-\code{prior.distns.Rdata} are expected by \code{run.meta.analysis.pft}, which
-loads them from \code{pft$outdir}. This implicit file-based coupling means
-the two functions must agree on directory path and object names. A future
-refactoring goal is to pass these objects directly via function arguments
-instead.
+Internally this wrapper delegates all database queries to
+\code{\link{get_trait_data_pft}} exactly once.  The returned objects are
+used for both the cache-staleness check and the save step, so the database
+is never queried more than once per call regardless of whether the cache
+hits or misses.
 }
 \author{
 David LeBauer, Shawn Serbin, Rob Kooper
diff --git a/base/db/man/get_trait_data_pft.Rd b/base/db/man/get_trait_data_pft.Rd
index 31252af3ee..87086ba3a0 100644
--- a/base/db/man/get_trait_data_pft.Rd
+++ b/base/db/man/get_trait_data_pft.Rd
@@ -31,8 +31,12 @@ Named list with three elements:
     \code{parama}, \code{paramb}, \code{n}; rows named by trait. Traits
     listed in \code{constants} are excluded.}
   \item{\code{pft_info}}{List with \code{name}, \code{pft_id},
-    \code{pft_type}, and \code{posteriorid}. \code{posteriorid} is always
-    \code{NULL} — the wrapper sets it after registering outputs in BETYdb.}
+    \code{pft_type}, \code{pft_members}, \code{pft_member_filename}, and
+    \code{posteriorid}. \code{pft_members} is the data frame of species or
+    cultivar IDs used during the query. \code{pft_member_filename} is
+    \code{"species.csv"} or \code{"cultivars.csv"} depending on PFT type.
+    \code{posteriorid} is always \code{NULL} — the wrapper sets it after
+    registering outputs in BETYdb.}
 }
 }
 \description{
@@ -48,9 +52,9 @@ provenance opt-in: calling the wrapper saves artifacts to disk; calling
 this function never does.
 
 This follows the pattern established by \code{meta_analysis_standalone}
-for the meta-analysis step and \code{get_parameter_samples} (in PEcAn.DB) for
-parameter sampling — each is a computation core that can be tested in
-isolation without a filesystem or a \code{settings} object.
+for the meta-analysis step and \code{get_parameter_samples} for parameter
+sampling — each is a computation core that can be tested in isolation
+without a filesystem or a \code{settings} object.
 }
 \examples{
 \dontrun{
@@ -75,7 +79,7 @@ PEcAn.DB::db.close(dbcon)
   wrapper that handles provenance and caching.
   \code{meta_analysis_standalone} (in PEcAn.MA) for the analogous
   function in the meta-analysis step.
-  \code{get_parameter_samples} (in PEcAn.DB) for the analogous function in the
+  \code{get_parameter_samples} for the analogous function in the
   parameter sampling step.
 }
 \author{
diff --git a/base/db/tests/testthat/test-get.trait.data.pft.R b/base/db/tests/testthat/test-get.trait.data.pft.R
index dfe41af630..9092e1d7fa 100644
--- a/base/db/tests/testthat/test-get.trait.data.pft.R
+++ b/base/db/tests/testthat/test-get.trait.data.pft.R
@@ -91,7 +91,7 @@ test_that("errors for non-existent PFT name", {
       dbcon     = test_dbcon,
       trait.names = std_traits
     ),
-    "Could not find pft"
+    "PFTs were not found"
   )
 })
 
@@ -283,4 +283,86 @@ test_that("reference species and cultivar PFTs write traits properly", {
 
   expect_gt(file.info(allcv_csv)$size, file.info(cv_csv)$size)
   expect_gt(file.info(allcv_trt)$size, file.info(cv_trt)$size)
+})
+
+test_that("wrapper attaches trait_data and prior_distns on the cache-HIT path", {
+  # This guards the early-return path: when foundallfiles is TRUE the wrapper
+  # copies files from the DB store and returns early.  trait_data and
+  # prior_distns must be present on that path too.
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  outdir1 <- withr::local_tempdir()
+  outdir2 <- withr::local_tempdir()
+
+  # First call — populates the cache and the DB posterior record
+  result1 <- get.trait.data.pft(
+    pft     = make_test_pft(outdir1),
+    modeltype   = std_modeltype,
+    dbfiles = dbdir,          
+    dbcon   = test_dbcon,
+    trait.names = std_traits,
+    write   = TRUE            
+  )
+  withr::defer(cleanup_posterior(test_dbcon, result1$posteriorid))
+
+  # Second call — supplies the posteriorid from the first call so the
+  # wrapper finds valid files in the DB and takes the cache-hit path.
+  cached_pft          <- make_test_pft(outdir2)
+  cached_pft$posteriorid <- result1$posteriorid
+
+  result2 <- get.trait.data.pft(
+    pft         = cached_pft,
+    modeltype   = std_modeltype,
+    dbfiles     = outdir2,
+    dbcon       = test_dbcon,
+    trait.names = std_traits
+  )
+
+  expect_true("trait_data"   %in% names(result2),
+              info = "trait_data missing on cache-hit return path")
+  expect_true("prior_distns" %in% names(result2),
+              info = "prior_distns missing on cache-hit return path")
+  expect_type(result2$trait_data,   "list")
+  expect_s3_class(result2$prior_distns, "data.frame")
+})
+
+test_that("cache-hit in-memory objects match the files copied to outdir", {
+  # Verifies the objects attached on the cache-hit path are identical to
+  # what was written to disk on the original (cache-miss) run.
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+
+  outdir_miss <- withr::local_tempdir()
+  outdir_hit  <- withr::local_tempdir()
+
+  result_miss <- get.trait.data.pft(
+    pft         = make_test_pft(outdir_miss),
+    modeltype   = std_modeltype,
+    dbfiles     = dbdir,        
+    dbcon       = test_dbcon,
+    trait.names = std_traits,
+    write       = TRUE            
+  )
+  withr::defer(cleanup_posterior(test_dbcon, result_miss$posteriorid))
+
+  cached_pft             <- make_test_pft(outdir_hit)
+  cached_pft$posteriorid <- result_miss$posteriorid
+
+  result_hit <- get.trait.data.pft(
+    pft         = cached_pft,
+    modeltype   = std_modeltype,
+    dbfiles     = outdir_hit,
+    dbcon       = test_dbcon,
+    trait.names = std_traits
+  )
+
+  # Load the files the cache copied to outdir_hit
+  trait_env <- new.env(parent = emptyenv())
+  prior_env <- new.env(parent = emptyenv())
+  load(file.path(outdir_hit, "trait.data.Rdata"),   envir = trait_env)
+  load(file.path(outdir_hit, "prior.distns.Rdata"), envir = prior_env)
+
+  expect_identical(result_hit$trait_data,   trait_env$trait.data)
+  expect_identical(result_hit$prior_distns, prior_env$prior.distns)
 })
\ No newline at end of file
diff --git a/base/db/tests/testthat/test-get_trait_data_pft.R b/base/db/tests/testthat/test-get_trait_data_pft.R
index e778f4b525..6aa2d31131 100644
--- a/base/db/tests/testthat/test-get_trait_data_pft.R
+++ b/base/db/tests/testthat/test-get_trait_data_pft.R
@@ -1,8 +1,4 @@
 # Tests for get_trait_data_pft()
-#
-# Validation tests fire before any DB call and never skip.
-# DB-dependent tests call check_db_test() inside each test_that block
-# and skip automatically when no connection is available.
 
 old_log_level <- PEcAn.logger::logger.getLevel()
 PEcAn.logger::logger.setLevel("WARN")
@@ -123,18 +119,28 @@ test_that("prior_distns is a data frame with the required columns", {
 test_that("pft_info contains expected fields and posteriorid is NULL", {
   test_dbcon <- check_db_test()
   withr::defer(PEcAn.DB::db.close(test_dbcon))
-
   result <- get_trait_data_pft(
     pft_name    = std_pft,
     modeltype   = std_modeltype,
     dbcon       = test_dbcon,
     trait_names = "SLA"
   )
-
-  expect_named(result$pft_info, c("name", "pft_id", "pft_type", "posteriorid"))
+  expect_named(
+    result$pft_info,
+    c("name", "pft_id", "pft_type", "pft_members", "pft_member_filename",
+      "posteriorid"),
+    ignore.order = TRUE
+  )
   expect_equal(result$pft_info$name, std_pft)
   # posteriorid is always NULL here — the wrapper sets it after DB registration
   expect_null(result$pft_info$posteriorid)
+  # pft_members must be a data frame with at least an id column
+  expect_s3_class(result$pft_info$pft_members, "data.frame")
+  expect_true("id" %in% names(result$pft_info$pft_members))
+  # pft_member_filename must be species.csv or cultivars.csv
+  expect_true(
+    result$pft_info$pft_member_filename %in% c("species.csv", "cultivars.csv")
+  )
 })
 
 test_that("no files are written to disk", {
@@ -232,4 +238,25 @@ test_that("end-to-end: standalone gives identical objects to what wrapper saves"
 
   expect_identical(standalone_result$trait_data,   trait_env$trait.data)
   expect_identical(standalone_result$prior_distns, prior_env$prior.distns)
+})
+
+test_that("errors for unknown pft_type returns an error, not silent fallback", {
+  # This verifies the explicit guard added in get_trait_data_pft():
+  # an unrecognised pft_type must throw, not silently fall through
+  # to the species-query path.  We simulate a bad record by mocking
+  # query_pfts — no live DB call needed.
+  fake_record <- data.frame(id = 1L, pft_type = "unknown_type",
+                            name = std_pft, stringsAsFactors = FALSE)
+  mockery::stub(get_trait_data_pft, "query_pfts", fake_record)
+
+  fake_dbcon <- structure(list(), class = c("PostgreSQLConnection",
+                                            "DBIConnection"))
+  expect_error(
+    get_trait_data_pft(
+      pft_name    = std_pft,
+      modeltype   = std_modeltype,
+      dbcon       = fake_dbcon,
+      trait_names = "SLA"
+    )
+  )
 })
\ No newline at end of file

From a6d06f2b15265a3bc4e504fd6a9c8f9db31aab31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Om=20Kapale=20=F0=9F=90=99?= <omshripadkapale@gmail.com>
Date: Wed, 13 May 2026 09:29:11 +0000
Subject: [PATCH 4/4] fix: move get_trait_data_pft call to cache-miss path,
 restore column comments, fix roxygen version

---
 base/db/DESCRIPTION                           |   1 -
 base/db/R/get.trait.data.pft.R                | 111 +++++++++++-------
 base/db/man/get.trait.data.pft.Rd             |  64 +++++++---
 .../tests/testthat/test-get.trait.data.pft.R  |  53 ++++++++-
 4 files changed, 168 insertions(+), 61 deletions(-)

diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION
index cfb3f9fd52..e8b7976070 100644
--- a/base/db/DESCRIPTION
+++ b/base/db/DESCRIPTION
@@ -81,5 +81,4 @@ LazyLoad: yes
 LazyData: FALSE
 Encoding: UTF-8
 X-schema.org-keywords: PEcAn, database
-Config/roxygen2/version: 8.0.0
 RoxygenNote: 7.3.3
diff --git a/base/db/R/get.trait.data.pft.R b/base/db/R/get.trait.data.pft.R
index a6b5afe36b..64e24b854d 100644
--- a/base/db/R/get.trait.data.pft.R
+++ b/base/db/R/get.trait.data.pft.R
@@ -1,6 +1,5 @@
 ##' Get trait data from the database for a single PFT
 ##'
-##' @md
 ##' Queries BETYdb for trait observations and prior distributions for a single
 ##' plant functional type (PFT). Results are saved to files in the PFT output
 ##' directory (`pft$outdir`), and also registered in the database as posterior
@@ -10,48 +9,80 @@
 ##' `pft` should be a list containing at least `name` and `outdir`, and
 ##' optionally `posteriorid` and `constants`.
 ##'
-##' Internally this wrapper delegates all database queries to
-##' \code{\link{get_trait_data_pft}} exactly once.  The returned objects are
-##' used for both the cache-staleness check and the save step, so the database
-##' is never queried more than once per call regardless of whether the cache
-##' hits or misses.
-##'
+##' @md
 ##' @param pft list of settings for the pft whose traits to retrieve. See details.
 ##' @param modeltype type of model that is used, this is used to distinguish
 ##'   between different pfts with the same name.
 ##' @param dbfiles location where previous results are found
 ##' @param dbcon database connection
+##' @param trait.names list of trait names to retrieve
 ##' @param forceupdate set this to true to force an update, auto will check to
 ##'   see if an update is needed.
 ##' @param write (Logical) If `TRUE` updated posteriors will be written to
 ##'   BETYdb.  Defaults to `FALSE`.
-##' @param trait.names list of trait names to retrieve
-##' @return The `pft` input list, updated with `pft$posteriorid` set to the
-##'   ID of the (possibly new) posterior record in BETYdb. Also contains
-##'   `pft$trait_data` and `pft$prior_distns` for in-memory chaining on both
-##'   the cache-hit and cache-miss paths. The posterior ID can be used to
-##'   locate the output files (`trait.data.Rdata`, `prior.distns.Rdata`,
-##'   etc.) via BETYdb's `dbfiles` table.
+##' @param return_data (Logical) If `TRUE`, the returned `pft` list also
+##'   includes `trait_data` and `prior_distns` as in-memory objects.
+##'   Defaults to `FALSE` to preserve legacy behavior — when `pft` is
+##'   embedded inside a `settings` object, attaching these data frames by
+##'   default would inflate the settings and break serialization.
+##' @return The updated \code{pft} list with \code{posteriorid} set to the
+##'   ID of the (possibly new) posterior record in BETYdb. The posterior ID
+##'   can be used to locate the output files (\code{trait.data.Rdata},
+##'   \code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table.
+##'
+##'   When \code{return_data = TRUE}, the returned list also includes
+##'   \code{trait_data} and \code{prior_distns} as in-memory objects, so
+##'   downstream callers can use them without reloading from \code{pft$outdir}.
+##'
+##' @section File-based side effects:
+##' The following files are written to \code{pft$outdir}:
+##' \describe{
+##'   \item{\code{trait.data.Rdata}}{Named list of trait data frames, one per
+##'     trait that has observations. Read by \code{run.meta.analysis.pft()}.}
+##'   \item{\code{prior.distns.Rdata}}{Data frame of prior distributions, rows
+##'     named by trait. Read by \code{run.meta.analysis.pft()}.}
+##'   \item{\code{species.csv} or \code{cultivars.csv}}{Data frame of PFT
+##'     member species or cultivar IDs, depending on PFT type.}
+##'   \item{\code{trait.data.csv}}{Flattened CSV of all trait observations,
+##'     one row per observation, for human inspection.}
+##' }
+##' In addition, each output file is registered in BETYdb via
+##' \code{dbfile.insert()} and associated with the posterior record whose ID
+##' is returned as \code{pft$posteriorid}.
+##'
+##' @section Downstream contract:
+##' \code{run.meta.analysis.pft()} reads \code{trait.data.Rdata} and
+##' \code{prior.distns.Rdata} from \code{pft$outdir} at the start of the
+##' meta-analysis step. These two files are therefore a required output of
+##' this wrapper.
+##'
+##' Note: this file-based contract will be removed once
+##' \code{run.meta.analysis.pft()} is refactored to accept in-memory inputs
+##' (GSoC Week 2).
+##'
 ##' @author David LeBauer, Shawn Serbin, Rob Kooper
 ##' @export
-get.trait.data.pft <-
-  function(pft,
-           modeltype,
-           dbfiles,
-           dbcon,
-           trait.names,
-           forceupdate = FALSE,
-           write = FALSE) {
+get.trait.data.pft <- function(pft, modeltype, dbfiles, dbcon,
+                               trait.names = NULL,
+                               forceupdate = FALSE,
+                               write = TRUE,
+                               return_data = FALSE) {
 
   # Create directory if necessary
   if (!file.exists(pft$outdir) && !dir.create(pft$outdir, recursive = TRUE)) {
-    PEcAn.logger::logger.error(paste0("Couldn't create PFT output directory: ", pft$outdir))
+    PEcAn.logger::logger.error(
+      paste0("Couldn't create PFT output directory: ", pft$outdir)
+    )
   }
 
-  # ---- Single DB round-trip via standalone ----
-  # All query logic lives in get_trait_data_pft().  The objects it returns
-  # are used below for both the cache check and the save step — the database
-  # is never queried a second time.
+  # Backwards-compatible: forceupdate may arrive as "AUTO" or other strings
+  forceupdate <- isTRUE(as.logical(forceupdate))
+
+  # All database queries are delegated to get_trait_data_pft() exactly once.
+  # The returned objects feed both the cache-staleness check and the save
+  # step, so the database is never queried more than once per call.
+  # Input validation (pft_name, trait_names, dbcon) and the pft_type guard
+  # also live inside get_trait_data_pft(); errors surface from there.
   computed <- get_trait_data_pft(
     pft_name    = pft[["name"]],
     modeltype   = modeltype,
@@ -59,17 +90,13 @@ get.trait.data.pft <-
     trait_names = trait.names,
     constants   = if (!is.null(pft$constants)) pft$constants else list()
   )
-
   trait.data          <- computed$trait_data
   prior.distns        <- computed$prior_distns
+  pft_members         <- computed$pft_info$pft_members
   pftid               <- computed$pft_info$pft_id
   pfttype             <- computed$pft_info$pft_type
-  pft_members         <- computed$pft_info$pft_members
   pft_member_filename <- computed$pft_info$pft_member_filename
 
-  # Set forceupdate FALSE if it's a string (backwards compatible with 'AUTO')
-  forceupdate <- isTRUE(as.logical(forceupdate))
-
   # ---- Cache staleness check ----
   if (!forceupdate) {
     if (is.null(pft$posteriorid)) {
@@ -132,8 +159,10 @@ get.trait.data.pft <-
           # Check if PFT membership has changed
           PEcAn.logger::logger.debug("Checking if PFT membership has changed.")
           if (pfttype == "plant") {
+            # Columns are: id, genus, species, scientificname
             colClass <- c("double", "character", "character", "character")
           } else if (pfttype == "cultivar") {
+            # Columns are: id, specie_id, genus, species, scientificname, cultivar
             colClass <- c("double", "double", "character", "character",
                           "character", "character")
           }
@@ -262,11 +291,10 @@ get.trait.data.pft <-
           }
 
           if (done) {
-            # Attach computed objects so downstream callers can chain
-            # in-memory on the cache-HIT path — same guarantee as the
-            # cache-miss path below.
-            pft$trait_data   <- trait.data
-            pft$prior_distns <- prior.distns
+            if (return_data) {
+              pft$trait_data   <- trait.data
+              pft$prior_distns <- prior.distns
+            }
             return(pft)
           }
         }
@@ -275,7 +303,6 @@ get.trait.data.pft <-
   } # end if (!forceupdate)
 
   # ---- Cache miss: log counts, save to disk, register in DB ----
-
   if (length(trait.data) > 0) {
     trait_counts <- trait.data |>
       dplyr::bind_rows(.id = "trait") |>
@@ -358,9 +385,9 @@ get.trait.data.pft <-
     }
   }
 
-  ## Attach computed objects so downstream callers can chain in-memory
-  ## without loading files from pft$outdir.
-  pft$trait_data   <- trait.data
-  pft$prior_distns <- prior.distns
+  if (return_data) {
+    pft$trait_data   <- trait.data
+    pft$prior_distns <- prior.distns
+  }
   return(pft)
 }
\ No newline at end of file
diff --git a/base/db/man/get.trait.data.pft.Rd b/base/db/man/get.trait.data.pft.Rd
index 1efaf7142f..276573fde8 100644
--- a/base/db/man/get.trait.data.pft.Rd
+++ b/base/db/man/get.trait.data.pft.Rd
@@ -9,9 +9,10 @@ get.trait.data.pft(
   modeltype,
   dbfiles,
   dbcon,
-  trait.names,
+  trait.names = NULL,
   forceupdate = FALSE,
-  write = FALSE
+  write = TRUE,
+  return_data = FALSE
 )
 }
 \arguments{
@@ -31,28 +32,63 @@ see if an update is needed.}
 
 \item{write}{(Logical) If \code{TRUE} updated posteriors will be written to
 BETYdb.  Defaults to \code{FALSE}.}
+
+\item{return_data}{(Logical) If \code{TRUE}, the returned \code{pft} list also
+includes \code{trait_data} and \code{prior_distns} as in-memory objects.
+Defaults to \code{FALSE} to preserve legacy behavior — when \code{pft} is
+embedded inside a \code{settings} object, attaching these data frames by
+default would inflate the settings and break serialization.}
 }
 \value{
-The \code{pft} input list, updated with \code{pft$posteriorid} set to the
-ID of the (possibly new) posterior record in BETYdb. Also contains
-\code{pft$trait_data} and \code{pft$prior_distns} for in-memory chaining on both
-the cache-hit and cache-miss paths. The posterior ID can be used to
-locate the output files (\code{trait.data.Rdata}, \code{prior.distns.Rdata},
-etc.) via BETYdb's \code{dbfiles} table.
+The updated \code{pft} list with \code{posteriorid} set to the
+ID of the (possibly new) posterior record in BETYdb. The posterior ID
+can be used to locate the output files (\code{trait.data.Rdata},
+\code{prior.distns.Rdata}, etc.) via BETYdb's \code{dbfiles} table.
+
+When \code{return_data = TRUE}, the returned list also includes
+\code{trait_data} and \code{prior_distns} as in-memory objects, so
+downstream callers can use them without reloading from \code{pft$outdir}.
 }
 \description{
-Get trait data from the database for a single PFT
+Queries BETYdb for trait observations and prior distributions for a single
+plant functional type (PFT). Results are saved to files in the PFT output
+directory (\code{pft$outdir}), and also registered in the database as posterior
+records when \code{write = TRUE}.
 }
 \details{
 \code{pft} should be a list containing at least \code{name} and \code{outdir}, and
 optionally \code{posteriorid} and \code{constants}.
+}
+\section{File-based side effects}{
 
-Internally this wrapper delegates all database queries to
-\code{\link{get_trait_data_pft}} exactly once.  The returned objects are
-used for both the cache-staleness check and the save step, so the database
-is never queried more than once per call regardless of whether the cache
-hits or misses.
+The following files are written to \code{pft$outdir}:
+\describe{
+\item{\code{trait.data.Rdata}}{Named list of trait data frames, one per
+trait that has observations. Read by \code{run.meta.analysis.pft()}.}
+\item{\code{prior.distns.Rdata}}{Data frame of prior distributions, rows
+named by trait. Read by \code{run.meta.analysis.pft()}.}
+\item{\code{species.csv} or \code{cultivars.csv}}{Data frame of PFT
+member species or cultivar IDs, depending on PFT type.}
+\item{\code{trait.data.csv}}{Flattened CSV of all trait observations,
+one row per observation, for human inspection.}
 }
+In addition, each output file is registered in BETYdb via
+\code{dbfile.insert()} and associated with the posterior record whose ID
+is returned as \code{pft$posteriorid}.
+}
+
+\section{Downstream contract}{
+
+\code{run.meta.analysis.pft()} reads \code{trait.data.Rdata} and
+\code{prior.distns.Rdata} from \code{pft$outdir} at the start of the
+meta-analysis step. These two files are therefore a required output of
+this wrapper.
+
+Note: this file-based contract will be removed once
+\code{run.meta.analysis.pft()} is refactored to accept in-memory inputs
+(GSoC Week 2).
+}
+
 \author{
 David LeBauer, Shawn Serbin, Rob Kooper
 }
diff --git a/base/db/tests/testthat/test-get.trait.data.pft.R b/base/db/tests/testthat/test-get.trait.data.pft.R
index 9092e1d7fa..91257db6ff 100644
--- a/base/db/tests/testthat/test-get.trait.data.pft.R
+++ b/base/db/tests/testthat/test-get.trait.data.pft.R
@@ -25,6 +25,7 @@ make_empty_pft <- function(outdir) {
 
 std_modeltype <- "SIPNET"
 std_traits    <- c("SLA", "Vcmax", "leaf_respiration_rate_m2")
+std_pft       <- "temperate.deciduous"
 
 cleanup_posterior <- function(dbcon, posteriorid) {
   if (!is.null(posteriorid)) {
@@ -190,7 +191,7 @@ test_that("wrapper attaches trait_data and prior_distns to returned pft", {
   outdir <- withr::local_tempdir()
   result <- get.trait.data.pft(
     pft = make_test_pft(outdir), modeltype = std_modeltype,
-    dbfiles = outdir, dbcon = test_dbcon, trait.names = std_traits
+    dbfiles = outdir, dbcon = test_dbcon, trait.names = std_traits, return_data = TRUE
   )
   withr::defer(cleanup_posterior(test_dbcon, result$posteriorid))
 
@@ -316,7 +317,8 @@ test_that("wrapper attaches trait_data and prior_distns on the cache-HIT path",
     modeltype   = std_modeltype,
     dbfiles     = outdir2,
     dbcon       = test_dbcon,
-    trait.names = std_traits
+    trait.names = std_traits,
+    return_data = TRUE
   )
 
   expect_true("trait_data"   %in% names(result2),
@@ -354,7 +356,8 @@ test_that("cache-hit in-memory objects match the files copied to outdir", {
     modeltype   = std_modeltype,
     dbfiles     = outdir_hit,
     dbcon       = test_dbcon,
-    trait.names = std_traits
+    trait.names = std_traits,
+    return_data = TRUE
   )
 
   # Load the files the cache copied to outdir_hit
@@ -365,4 +368,46 @@ test_that("cache-hit in-memory objects match the files copied to outdir", {
 
   expect_identical(result_hit$trait_data,   trait_env$trait.data)
   expect_identical(result_hit$prior_distns, prior_env$prior.distns)
-})
\ No newline at end of file
+})
+
+test_that("wrapper does NOT attach trait_data/prior_distns by default (legacy)", {
+  test_dbcon <- check_db_test()
+  withr::defer(PEcAn.DB::db.close(test_dbcon))
+  outdir <- withr::local_tempdir()
+  result <- get.trait.data.pft(
+    pft         = make_test_pft(outdir),
+    modeltype   = std_modeltype,
+    dbfiles     = outdir,
+    dbcon       = test_dbcon,
+    trait.names = std_traits
+  )
+  withr::defer(cleanup_posterior(test_dbcon, result$posteriorid))
+  expect_false("trait_data"   %in% names(result))
+  expect_false("prior_distns" %in% names(result))
+})
+
+test_that("wrapper errors for unknown pft_type, not silent fallback", {
+  # Verifies the explicit guard in get.trait.data.pft():
+  # an unrecognised pft_type must throw, not silently fall through
+  # to the species-query path. We mock query_pfts to return a bad record
+  # so no live DB call is needed.
+  fake_record <- data.frame(
+    id       = 1L,
+    pft_type = "weird",
+    name     = std_pft,
+    stringsAsFactors = FALSE
+  )
+  mockery::stub(get.trait.data.pft, "query_pfts", fake_record)
+  fake_dbcon <- structure(list(), class = c("PostgreSQLConnection",
+                                            "DBIConnection"))
+  outdir <- withr::local_tempdir()
+  expect_error(
+    get.trait.data.pft(
+      pft         = make_test_pft(outdir),
+      modeltype   = std_modeltype,
+      dbfiles     = outdir,
+      dbcon       = fake_dbcon,
+      trait.names = std_traits
+    )
+  )
+})