From 77f3c93473b2847fe86717744956a8f5953c738b Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-kd1.scc.bu.edu>
Date: Wed, 18 Mar 2026 23:23:06 -0400
Subject: [PATCH 01/13] Upload NEE and LE gap-fill functions

---
 modules/data.land/DESCRIPTION             |   6 +-
 modules/data.land/R/SIPNET_flux_gapfill.R | 588 ++++++++++++++++++++++
 2 files changed, 593 insertions(+), 1 deletion(-)
 create mode 100644 modules/data.land/R/SIPNET_flux_gapfill.R

diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION
index 13691a4327..ba5ad75b22 100644
--- a/modules/data.land/DESCRIPTION
+++ b/modules/data.land/DESCRIPTION
@@ -46,7 +46,11 @@ Imports:
     terra,
     tidyr,
     tidyselect,
-    XML (>= 3.98-1.4)
+    XML (>= 3.98-1.4),
+    data.table,
+    tibble,
+    xgboost,
+    doParallel
 Suggests:
     dataone,
     datapack,
diff --git a/modules/data.land/R/SIPNET_flux_gapfill.R b/modules/data.land/R/SIPNET_flux_gapfill.R
new file mode 100644
index 0000000000..3997513616
--- /dev/null
+++ b/modules/data.land/R/SIPNET_flux_gapfill.R
@@ -0,0 +1,588 @@
+#' @description
+#' This function reads the site information table and returns the selected
+#' site IDs for downstream modeling.
+#'
+#' @title read_site_ids
+#'
+#' @param site.dir character: path to the site information table.
+#' @param n_sites numeric: number of sites to keep from the site table,
+#' the total number of sites is 241.
+#'
+#' @return character vector of selected site IDs. Example: 'Site ID' CA-ARB
+#'
+#' @author Yang Gu
+read_site_ids <- function(site.dir, n_sites = 40) {
+  fluxnet.df <- data.table::fread(site.dir) |>
+    tibble::as_tibble()
+  
+  site_ids <- fluxnet.df$`Site ID`[1:n_sites] |>
+    unique()
+  
+  return(site_ids)
+}
+
+#' @description
+#' This function reads the predictor/flux dataset and keeps only rows
+#' corresponding to the selected site IDs.
+#'
+#' @title read_pred_data
+#'
+#' @param pred.var.dir character: path to the predictor/flux data file.
+#' @param site_ids character: vector of site IDs to retain.
+#'
+#' @return data.frame containing filtered predictor/flux data.
+#'
+#' @author Yang Gu
+read_pred_data <- function(pred.var.dir, site_ids) {
+  resimet.df <- data.table::fread(pred.var.dir) |>
+    tibble::as_tibble() |>
+    dplyr::mutate(is_night_suncalc = as.integer(is_night_suncalc))
+  
+  resimet.df <- resimet.df |>
+    dplyr::filter(Site_ID %in% site_ids)
+  
+  return(resimet.df)
+}
+
+#' @description
+#' This function calculates site-specific missing-value percentages for
+#' candidate predictor variables and returns a long-format table of retained
+#' predictors for each site.
+#'
+#' @title build_sitecov_df
+#'
+#' @param resimet.df data.frame: predictor/flux dataset filtered to target sites.
+#' @param vars_to_check character: vector of candidate predictor variable names.
+#' @param outdir character: optional output directory to save the missing summary.
+#' Default is NULL.
+#' @param missing_threshold numeric: maximum allowed missing percentage for a
+#' predictor to be retained at a site. Default is 40.
+#'
+#' @return list containing:
+#' \describe{
+#'   \item{missing_summary}{data frame of missing-value summary by site and variable.}
+#'   \item{sitecov.df}{long-format data frame of retained predictors by site.}
+#' }
+#'
+#' @author Yang Gu
+build_sitecov_df <- function(resimet.df,
+                             vars_to_check,
+                             outdir = NULL,
+                             missing_threshold = 40) {
+  missing_summary <- resimet.df |>
+    dplyr::select(Site_ID, dplyr::all_of(vars_to_check)) |>
+    tidyr::pivot_longer(
+      cols = -Site_ID,
+      names_to = "variable",
+      values_to = "value"
+    ) |>
+    dplyr::group_by(Site_ID, variable) |>
+    dplyr::summarise(
+      total_obs = dplyr::n(),
+      missing_count = sum(is.na(value)),
+      missing_pct = missing_count / total_obs * 100,
+      .groups = "drop"
+    ) |>
+    dplyr::arrange(Site_ID, dplyr::desc(missing_pct))
+  
+  if (!is.null(outdir)) {
+    if (!file.exists(outdir)) {
+      dir.create(outdir, recursive = TRUE)
+    }
+    data.table::fwrite(
+      missing_summary,
+      file.path(outdir, "summary.csv")
+    )
+  }
+  
+  high_staying_vars <- missing_summary |>
+    dplyr::filter(missing_pct < missing_threshold) |>
+    dplyr::group_by(Site_ID) |>
+    dplyr::summarise(
+      filtered_variables = list(variable),
+      .groups = "drop"
+    )
+  
+  sitecov.df <- high_staying_vars |>
+    tidyr::unnest(filtered_variables)
+  
+  return(list(
+    missing_summary = missing_summary,
+    sitecov.df = sitecov.df
+  ))
+}
+
+#' @description
+#' This internal helper function fits a site-specific XGBoost model using
+#' k-fold CV. It evaluates model performance with fold-level
+#' training and validation R2, and fits a final model using the full
+#' site training dataset if the average validation R2 exceeds the
+#' specified threshold.
+#'
+#' @title fit_site_xgb_cv
+#'
+#' @param subdf data.frame: a site-specific training data frame containing
+#' the response variable and predictor variables.
+#' @param feats character: a character vector of predictor variable names used
+#' for model fitting.
+#' @param site_name character: site identifier used for progress reporting.
+#' @param response_var character: the name of the response variable column in
+#' `subdf`.
+#' @param nfolds numeric: number of folds used in CV.
+#' @param params list: a list of XGBoost model parameters passed to
+#' `xgboost::xgb.train`.
+#' @param nrounds numeric: number of boosting rounds used in XGBoost training.
+#' @param r2_threshold numeric: minimum average validation R2 required
+#' to fit the final model.
+#'
+#' @return list containing:
+#' \describe{
+#'   \item{r2}{average validation R2 across folds.}
+#'   \item{model}{the final fitted XGBoost model, or NULL if the site does not meet the threshold.}
+#'   \item{folds_df}{a data frame of fold-level training and validation R2.}
+#' }
+#'
+#' @author Yang Gu
+fit_site_xgb_cv <- function(subdf,
+                            feats,
+                            site_name,
+                            response_var,
+                            nfolds,
+                            params,
+                            nrounds,
+                            r2_threshold) {
+  
+  # Return empty results if the site has too few samples or no predictors
+  if (nrow(subdf) < nfolds || length(feats) == 0) {
+    return(list(
+      r2 = NA_real_,
+      model = NULL,
+      folds_df = tibble::tibble()
+    ))
+  }
+  
+  # Randomly assign observations to CV folds
+  set.seed(123)
+  folds <- sample(rep(seq_len(nfolds), length.out = nrow(subdf)))
+  
+  # Initialize containers for fold-level performance metrics
+  r2s <- numeric(nfolds)
+  r2s_train <- numeric(nfolds)
+  folds_df <- vector("list", nfolds)
+  
+  # Loop over folds
+  for (i in seq_len(nfolds)) {
+    # Split training and validation subsets
+    tr <- subdf[folds != i, , drop = FALSE]
+    te <- subdf[folds == i, , drop = FALSE]
+    
+    # Construct XGBoost training matrix
+    dtrain <- xgboost::xgb.DMatrix(
+      data = as.matrix(dplyr::select(tr, dplyr::all_of(feats))),
+      label = tr[[response_var]],
+      missing = NA
+    )
+    
+    # Fit fold-specific XGBoost model
+    mdl <- xgboost::xgb.train(
+      params = params,
+      data = dtrain,
+      nrounds = nrounds,
+      verbose = 0
+    )
+    
+    # Construct validation matrix
+    dvalid <- xgboost::xgb.DMatrix(
+      data = as.matrix(dplyr::select(te, dplyr::all_of(feats))),
+      missing = NA
+    )
+    
+    # Predict on validation and training data
+    pred_valid <- stats::predict(mdl, dvalid)
+    pred_train <- stats::predict(mdl, dtrain)
+    
+    # Compute denominator for R2
+    denom_valid <- sum((te[[response_var]] - mean(te[[response_var]], na.rm = TRUE))^2, na.rm = TRUE)
+    denom_train <- sum((tr[[response_var]] - mean(tr[[response_var]], na.rm = TRUE))^2, na.rm = TRUE)
+    
+    # Compute validation R2
+    r2_valid <- if (denom_valid > 0) {
+      1 - sum((te[[response_var]] - pred_valid)^2, na.rm = TRUE) / denom_valid
+    } else {
+      NA_real_
+    }
+    
+    # Compute training R2
+    r2_train <- if (denom_train > 0) {
+      1 - sum((tr[[response_var]] - pred_train)^2, na.rm = TRUE) / denom_train
+    } else {
+      NA_real_
+    }
+    
+    # Store fold-level metrics
+    r2s[i] <- r2_valid
+    r2s_train[i] <- r2_train
+    
+    folds_df[[i]] <- tibble::tibble(
+      fold = i,
+      R2_train = r2_train,
+      R2_valid = r2_valid
+    )
+  }
+  
+  # Print fold-level validation performance for the current site
+  message(sprintf(
+    "[Site %s] Fold Valid R2: %s",
+    site_name,
+    paste(round(r2s, 3), collapse = ", ")
+  ))
+  
+  # Compute average validation R2 across folds
+  avg_r2 <- mean(r2s, na.rm = TRUE)
+  final_mdl <- NULL
+  
+  # Fit final model using all site data if the site passes the threshold
+  if (!is.na(avg_r2) && avg_r2 >= r2_threshold) {
+    final_mdl <- xgboost::xgb.train(
+      params = params,
+      data = xgboost::xgb.DMatrix(
+        data = as.matrix(dplyr::select(subdf, dplyr::all_of(feats))),
+        label = subdf[[response_var]],
+        missing = NA
+      ),
+      nrounds = nrounds,
+      verbose = 0
+    )
+  }
+  
+  # Return site-level summary results
+  list(
+    r2 = avg_r2,
+    model = final_mdl,
+    folds_df = dplyr::bind_rows(folds_df)
+  )
+}
+
+#########Need to add sub-functions to add site_ids, resimet.df, and sitecov.df
+#' @description
+#' This function performs site-level XGBoost gap-filling with CV
+#' for eddy-covariance flux data. For each ensemble iteration, it samples
+#' observed data(QC=0) within each site, fits site-specific XGBoost models
+#' using k-fold CV, predicts missing or low-quality flux values,
+#' and exports both gap-filled outputs and fold-level CV diagnostics.
+#'
+#' @title xgb_gapfill
+#'
+#' @param site_ids character: a character vector of site IDs to be modeled.
+#' @param resimet.df data.frame: a data frame containing eddy-covariance
+#' observations, QC flags, timestamps, and predictor variables.
+#' @param sitecov.df data.frame: a data frame containing site-specific selected
+#' predictor names, with columns including `Site_ID` and `filtered_variables`.
+#' @param flux_var character: the target flux type to be gap-filled. Currently
+#' supported values are `"LE"` and `"NEE"`.
+#' @param outdir character: the output directory where the exported CSV files
+#' will be written.
+#' @param nfolds numeric: number of folds used in CV, default is 10.
+#' @param nrounds numeric: number of boosting rounds used in XGBoost training,
+#' default is 200.
+#' @param nens numeric: number of ensemble iterations, default is 25.
+#' @param nsamp numeric: proportion of high-quality observations sampled within
+#' each site for training in each ensemble, default is 0.95.
+#' @param r2_threshold numeric: minimum average validation R2 required to
+#' fit the final model and retain a site in the exported gap-filled output
+#' @param cores numeric: how many CPUs to be used in the calculation
+#' @param overwrite boolean: decide if existing output files should be overwritten,
+#' the default is TRUE.
+#'
+#' @return list containing ensemble-level outputs. Each element corresponds to
+#' one ensemble iteration and includes:
+#' \describe{
+#'   \item{cv_results}{a data frame of site-level average CV R2.}
+#'   \item{filled_data}{a data frame of the full input dataset with merged gap-filled values.}
+#'   \item{exported_data}{a data frame of filtered gap-filled results for well-performing sites only.}
+#'   \item{folds_all}{a data frame of fold-level CV diagnostics across sites.}
+#' }
+#'
+#' @author Yang Gu
+#' @importFrom foreach %dopar%
+#' @export
+xgb_gapfill <- function(site.dir,
+                        pred.var.dir,
+                        flux_var,
+                        outdir,
+                        vars_to_check = c(
+                          "TA_F","SW_IN_F","SW_DIF","LW_IN_JSB","VPD_F","PA_F","P_F",
+                          "RH","NETRAD","PPFD_IN","PPFD_DIF","TS_F_MDS_1","LW_IN_F",
+                          "TS_F_MDS_3","TS_F_MDS_5","SWC_F_MDS_1","NDVI","EVI",
+                          "sin_doy","cos_doy","SWC_F_MDS_3","SWC_F_MDS_5"
+                        ),
+                        n_sites = 40,
+                        missing_threshold = 40,
+                        nfolds = 10,
+                        nrounds = 200,
+                        nens = 25,
+                        nsamp = 0.95,
+                        params = list(
+                          objective = "reg:squarederror",
+                          eta = 0.13,
+                          max_depth = 13,
+                          subsample = 0.9,
+                          colsample_bytree = 0.9
+                        ),
+                        r2_threshold = 0.6,
+                        cores = max(1L, parallel::detectCores() - 1L),
+                        overwrite = TRUE) {
+  
+  if (!requireNamespace("xgboost", quietly = TRUE)) {
+    stop("Package 'xgboost' is required for cv_xgb_site_cv_gapfill() but is not installed.")
+  }
+  if (!requireNamespace("doParallel", quietly = TRUE)) {
+    stop("Package 'doParallel' is required for parallel execution but is not installed.")
+  }
+  if (!requireNamespace("foreach", quietly = TRUE)) {
+    stop("Package 'foreach' is required for parallel execution but is not installed.")
+  }
+  
+  # Read target site IDs
+  site_ids <- read_site_ids(
+    site.dir = site.dir,
+    n_sites = n_sites
+  )
+  
+  # Read predictor and flux data for selected sites
+  resimet.df <- read_pred_data(
+    pred.var.dir = pred.var.dir,
+    site_ids = site_ids
+  )
+  
+  # Build site-specific predictor table
+  sitecov_res <- build_sitecov_df(
+    resimet.df = resimet.df,
+    vars_to_check = vars_to_check,
+    outdir = outdir,
+    missing_threshold = missing_threshold
+  )
+  sitecov.df <- sitecov_res$sitecov.df
+  
+  # Map flux type to variable names
+  flux_var <- toupper(flux_var)
+  if (flux_var == "LE") {
+    response_var <- "LE_F_MDS"
+    qc_var <- "LE_F_MDS_QC"
+    gapfill_var <- "LE_gapfill"
+  } else if (flux_var == "NEE") {
+    response_var <- "NEE_CUT_USTAR50"
+    qc_var <- "NEE_CUT_USTAR50_QC"
+    gapfill_var <- "NEE_gapfill"
+  } else {
+    stop("Unsupported flux_var. Currently only 'LE' and 'NEE' are supported.")
+  }
+  
+  # Validate required columns in input data
+  required_cols <- c("Site_ID", "TIMESTAMP_START", response_var, qc_var)
+  missing_cols <- setdiff(required_cols, names(resimet.df))
+  if (length(missing_cols) > 0) {
+    stop(
+      sprintf(
+        "Missing required columns in resimet.df: %s",
+        paste(missing_cols, collapse = ", ")
+      )
+    )
+  }
+  if (!file.exists(outdir)) {
+    dir.create(outdir, recursive = TRUE)
+  }
+  
+  # Filter high-quality observations for training (QC = 0)
+  data_good <- resimet.df |>
+    dplyr::filter(.data[[qc_var]] == 0)
+  
+  ensemble_results <- vector("list", nens)
+  
+  # Loop over ensemble members
+  for (m in seq_len(nens)) {
+    message(sprintf("Ensemble %d/%d", m, nens))
+    
+    # Stratified Sampling: Sample 95% from within each site for training.
+    train_df <- data_good |>
+      dplyr::group_by(Site_ID) |>
+      dplyr::slice_sample(prop = nsamp) |>
+      dplyr::ungroup()
+    
+    # Build site-specific training datasets
+    site_dfs <- lapply(site_ids, function(site) {
+      feats0 <- sitecov.df |>
+        dplyr::filter(Site_ID == site) |>
+        dplyr::pull(filtered_variables) |>
+        unique()
+      
+      feats <- unique(c(feats0, "is_night_suncalc"))
+      feats <- intersect(feats, names(train_df))
+      
+      df <- train_df |>
+        dplyr::filter(Site_ID == site) |>
+        dplyr::select(
+          TIMESTAMP_START,
+          Site_ID,
+          dplyr::all_of(response_var),
+          dplyr::all_of(feats)
+        ) |>
+        tibble::as_tibble()
+      
+      attr(df, "features") <- feats
+      df
+    })
+    names(site_dfs) <- site_ids
+    
+    # Parallel model fitting and gap-filling by site   
+    use_cores <- min(as.numeric(cores), length(site_ids))
+    cl <- parallel::makeCluster(use_cores, type = "PSOCK")
+    doParallel::registerDoParallel(cl)
+    results <- tryCatch({
+      foreach::foreach(
+        site = site_ids,
+        .packages = c("dplyr", "xgboost", "tibble"),
+        .combine = dplyr::bind_rows,
+        .errorhandling = "pass"
+      ) %dopar% {
+        tryCatch({
+          df <- site_dfs[[site]]
+          feats <- attr(df, "features")
+          
+          # Fit XGBoost model with CV
+          fit <- fit_site_xgb_cv(
+            subdf = df,
+            feats = feats,
+            site_name = site,
+            response_var = response_var,
+            nfolds = nfolds,
+            params = params,
+            nrounds = nrounds,
+            r2_threshold = r2_threshold
+          )
+          
+          # Identify gap-fill candidates
+          gap_cands <- resimet.df |>
+            dplyr::filter(
+              Site_ID == site,
+              is.na(.data[[response_var]]) | .data[[qc_var]] != 0
+            ) |>
+            dplyr::select(TIMESTAMP_START, Site_ID, dplyr::all_of(feats)) |>
+            tibble::as_tibble()
+          
+          pd <- gap_cands
+          
+          # Gap fill flux if model is valid
+          if (!is.null(fit$model) && nrow(pd) > 0) {
+            dtest <- xgboost::xgb.DMatrix(
+              data = as.matrix(dplyr::select(pd, dplyr::all_of(feats))),
+              missing = NA
+            )
+            pd[[gapfill_var]] <- stats::predict(fit$model, dtest)
+            pd$GF_QC <- 0L
+          } else if (nrow(pd) > 0) {
+            pd[[gapfill_var]] <- NA_real_
+            pd$GF_QC <- 1L
+          }
+          
+          # Return site-level results
+          tibble::tibble(
+            Site_ID = site,
+            Avg_R2 = fit$r2,
+            pred_df = list(pd),
+            folds_df = list(dplyr::mutate(fit$folds_df, Site_ID = site))
+          )
+        }, error = function(e) {
+          message(sprintf("[Site %s] ERROR: %s", site, conditionMessage(e)))
+          tibble::tibble(
+            Site_ID = site,
+            Avg_R2 = NA_real_,
+            pred_df = list(NULL),
+            folds_df = list(tibble::tibble())
+          )
+        })
+      }
+    }, error = function(e) {
+      message(sprintf("Parallel loop error in ensemble %d: %s", m, conditionMessage(e)))
+      NULL
+    }, finally = {
+      parallel::stopCluster(cl)
+      foreach::registerDoSEQ()
+    })
+    
+    # Skip ensemble if no results
+    if (is.null(results) || nrow(results) == 0) {
+      ensemble_results[[m]] <- NULL
+      next
+    }
+    
+    # Combine gap-fill predictions across sites
+    pred_list <- results$pred_df
+    gap_all <- dplyr::bind_rows(pred_list)
+    
+    # Merge predictions back to full dataset
+    resimet_filled <- resimet.df |>
+      dplyr::left_join(gap_all, by = c("Site_ID", "TIMESTAMP_START")) |>
+      dplyr::mutate(
+        !!gapfill_var := dplyr::if_else(
+          !is.na(.data[[gapfill_var]]),
+          .data[[gapfill_var]],
+          .data[[response_var]]
+        ),
+        GF_QC = dplyr::if_else(is.na(GF_QC), 1L, GF_QC)
+      )
+    
+    # Extract CV performance
+    cv_results <- results |>
+      dplyr::select(Site_ID, Avg_R2)
+    
+    # Select well-performing sites
+    good_sites <- cv_results |>
+      dplyr::filter(!is.na(Avg_R2) & Avg_R2 >= r2_threshold) |>
+      dplyr::pull(Site_ID)
+    
+    # Subset final output for good sites
+    sub.df <- resimet_filled |>
+      dplyr::filter(Site_ID %in% good_sites) |>
+      dplyr::select(
+        Site_ID,
+        TIMESTAMP_START,
+        dplyr::all_of(response_var),
+        dplyr::all_of(gapfill_var),
+        GF_QC,
+        dplyr::all_of(qc_var)
+      )
+    
+    # Aggregate fold-level CV diagnostics
+    folds_all <- dplyr::bind_rows(results$folds_df) |>
+      dplyr::mutate(ensemble_id = m)
+    
+    ens_tag <- sprintf("ens%02d", m)
+    flux_tag <- tolower(flux_var)
+    
+    data_path <- file.path(outdir, sprintf("%s_gfed_dat_%s.csv", flux_tag, ens_tag))
+    folds_path <- file.path(outdir, sprintf("%s_gfed_cv_folds_%s.csv", flux_tag, ens_tag))
+    
+    if (overwrite || !file.exists(data_path)) {
+      data.table::fwrite(sub.df, data_path)
+    }
+    
+    if (overwrite || !file.exists(folds_path)) {
+      data.table::fwrite(folds_all, folds_path)
+    }
+    
+    # Store ensemble results
+    ensemble_results[[m]] <- list(
+      cv_results = cv_results,
+      filled_data = resimet_filled,
+      exported_data = sub.df,
+      folds_all = folds_all
+    )
+    
+    rm(results, pred_list, gap_all, resimet_filled, cv_results, good_sites,
+       sub.df, folds_all, site_dfs, train_df)
+    gc(FALSE)
+  }
+  
+  return(ensemble_results)
+}
\ No newline at end of file

From 61981715cd53ea7f5a96e76c29ff7946bba107c6 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-eb4.scc.bu.edu>
Date: Thu, 19 Mar 2026 14:25:59 -0400
Subject: [PATCH 02/13] make roxygen files fot gap-filling scripts

---
 modules/data.land/NAMESPACE               |  1 +
 modules/data.land/man/build_sitecov_df.Rd | 39 +++++++++++
 modules/data.land/man/fit_site_xgb_cv.Rd  | 57 ++++++++++++++++
 modules/data.land/man/read_pred_data.Rd   | 23 +++++++
 modules/data.land/man/xgb_gapfill.Rd      | 81 +++++++++++++++++++++++
 5 files changed, 201 insertions(+)
 create mode 100644 modules/data.land/man/build_sitecov_df.Rd
 create mode 100644 modules/data.land/man/fit_site_xgb_cv.Rd
 create mode 100644 modules/data.land/man/read_pred_data.Rd
 create mode 100644 modules/data.land/man/xgb_gapfill.Rd

diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index dc864627af..ec0d690a37 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -71,6 +71,7 @@ export(to.TreeCode)
 export(validate_events_json)
 export(write_ic)
 export(write_veg)
+export(xgb_gapfill)
 importFrom(dplyr,"%>%")
 importFrom(foreach,"%dopar%")
 importFrom(magrittr,"%>%")
diff --git a/modules/data.land/man/build_sitecov_df.Rd b/modules/data.land/man/build_sitecov_df.Rd
new file mode 100644
index 0000000000..52388c7be9
--- /dev/null
+++ b/modules/data.land/man/build_sitecov_df.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SIPNET_flux_gapfill.R
+\name{build_sitecov_df}
+\alias{build_sitecov_df}
+\title{build_sitecov_df}
+\usage{
+build_sitecov_df(
+  resimet.df,
+  vars_to_check,
+  outdir = NULL,
+  missing_threshold = 40
+)
+}
+\arguments{
+\item{resimet.df}{data.frame: predictor/flux dataset filtered to target sites.}
+
+\item{vars_to_check}{character: vector of candidate predictor variable names.}
+
+\item{outdir}{character: optional output directory to save the missing summary.
+Default is NULL.}
+
+\item{missing_threshold}{numeric: maximum allowed missing percentage for a
+predictor to be retained at a site. Default is 40.}
+}
+\value{
+list containing:
+\describe{
+  \item{missing_summary}{data frame of missing-value summary by site and variable.}
+  \item{sitecov.df}{long-format data frame of retained predictors by site.}
+}
+}
+\description{
+This function calculates site-specific missing-value percentages for
+candidate predictor variables and returns a long-format table of retained
+predictors for each site.
+}
+\author{
+Yang Gu
+}
diff --git a/modules/data.land/man/fit_site_xgb_cv.Rd b/modules/data.land/man/fit_site_xgb_cv.Rd
new file mode 100644
index 0000000000..97998c2a82
--- /dev/null
+++ b/modules/data.land/man/fit_site_xgb_cv.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SIPNET_flux_gapfill.R
+\name{fit_site_xgb_cv}
+\alias{fit_site_xgb_cv}
+\title{fit_site_xgb_cv}
+\usage{
+fit_site_xgb_cv(
+  subdf,
+  feats,
+  site_name,
+  response_var,
+  nfolds,
+  params,
+  nrounds,
+  r2_threshold
+)
+}
+\arguments{
+\item{subdf}{data.frame: a site-specific training data frame containing
+the response variable and predictor variables.}
+
+\item{feats}{character: a character vector of predictor variable names used
+for model fitting.}
+
+\item{site_name}{character: site identifier used for progress reporting.}
+
+\item{response_var}{character: the name of the response variable column in
+`subdf`.}
+
+\item{nfolds}{numeric: number of folds used in CV.}
+
+\item{params}{list: a list of XGBoost model parameters passed to
+`xgboost::xgb.train`.}
+
+\item{nrounds}{numeric: number of boosting rounds used in XGBoost training.}
+
+\item{r2_threshold}{numeric: minimum average validation R2 required
+to fit the final model.}
+}
+\value{
+list containing:
+\describe{
+  \item{r2}{average validation R2 across folds.}
+  \item{model}{the final fitted XGBoost model, or NULL if the site does not meet the threshold.}
+  \item{folds_df}{a data frame of fold-level training and validation R2.}
+}
+}
+\description{
+This internal helper function fits a site-specific XGBoost model using
+k-fold CV. It evaluates model performance with fold-level
+training and validation R2, and fits a final model using the full
+site training dataset if the average validation R2 exceeds the
+specified threshold.
+}
+\author{
+Yang Gu
+}
diff --git a/modules/data.land/man/read_pred_data.Rd b/modules/data.land/man/read_pred_data.Rd
new file mode 100644
index 0000000000..603c071738
--- /dev/null
+++ b/modules/data.land/man/read_pred_data.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SIPNET_flux_gapfill.R
+\name{read_pred_data}
+\alias{read_pred_data}
+\title{read_pred_data}
+\usage{
+read_pred_data(pred.var.dir, site_ids)
+}
+\arguments{
+\item{pred.var.dir}{character: path to the predictor/flux data file.}
+
+\item{site_ids}{character: vector of site IDs to retain.}
+}
+\value{
+data.frame containing filtered predictor/flux data.
+}
+\description{
+This function reads the predictor/flux dataset and keeps only rows
+corresponding to the selected site IDs.
+}
+\author{
+Yang Gu
+}
diff --git a/modules/data.land/man/xgb_gapfill.Rd b/modules/data.land/man/xgb_gapfill.Rd
new file mode 100644
index 0000000000..7a66779f1f
--- /dev/null
+++ b/modules/data.land/man/xgb_gapfill.Rd
@@ -0,0 +1,81 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SIPNET_flux_gapfill.R
+\name{xgb_gapfill}
+\alias{xgb_gapfill}
+\title{xgb_gapfill}
+\usage{
+xgb_gapfill(
+  site.dir,
+  pred.var.dir,
+  flux_var,
+  outdir,
+  vars_to_check = c("TA_F", "SW_IN_F", "SW_DIF", "LW_IN_JSB", "VPD_F", "PA_F", "P_F",
+    "RH", "NETRAD", "PPFD_IN", "PPFD_DIF", "TS_F_MDS_1", "LW_IN_F", "TS_F_MDS_3",
+    "TS_F_MDS_5", "SWC_F_MDS_1", "NDVI", "EVI", "sin_doy", "cos_doy", "SWC_F_MDS_3",
+    "SWC_F_MDS_5"),
+  n_sites = 40,
+  missing_threshold = 40,
+  nfolds = 10,
+  nrounds = 200,
+  nens = 25,
+  nsamp = 0.95,
+  params = list(objective = "reg:squarederror", eta = 0.13, max_depth = 13, subsample =
+    0.9, colsample_bytree = 0.9),
+  r2_threshold = 0.6,
+  cores = max(1L, parallel::detectCores() - 1L),
+  overwrite = TRUE
+)
+}
+\arguments{
+\item{flux_var}{character: the target flux type to be gap-filled. Currently
+supported values are `"LE"` and `"NEE"`.}
+
+\item{outdir}{character: the output directory where the exported CSV files
+will be written.}
+
+\item{nfolds}{numeric: number of folds used in CV, default is 10.}
+
+\item{nrounds}{numeric: number of boosting rounds used in XGBoost training,
+default is 200.}
+
+\item{nens}{numeric: number of ensemble iterations, default is 25.}
+
+\item{nsamp}{numeric: proportion of high-quality observations sampled within
+each site for training in each ensemble, default is 0.95.}
+
+\item{r2_threshold}{numeric: minimum average validation R2 required to
+fit the final model and retain a site in the exported gap-filled output}
+
+\item{cores}{numeric: how many CPUs to be used in the calculation}
+
+\item{overwrite}{boolean: decide if existing output files should be overwritten,
+the default is TRUE.}
+
+\item{site_ids}{character: a character vector of site IDs to be modeled.}
+
+\item{resimet.df}{data.frame: a data frame containing eddy-covariance
+observations, QC flags, timestamps, and predictor variables.}
+
+\item{sitecov.df}{data.frame: a data frame containing site-specific selected
+predictor names, with columns including `Site_ID` and `filtered_variables`.}
+}
+\value{
+list containing ensemble-level outputs. Each element corresponds to
+one ensemble iteration and includes:
+\describe{
+  \item{cv_results}{a data frame of site-level average CV R2.}
+  \item{filled_data}{a data frame of the full input dataset with merged gap-filled values.}
+  \item{exported_data}{a data frame of filtered gap-filled results for well-performing sites only.}
+  \item{folds_all}{a data frame of fold-level CV diagnostics across sites.}
+}
+}
+\description{
+This function performs site-level XGBoost gap-filling with CV
+for eddy-covariance flux data. For each ensemble iteration, it samples
+observed data(QC=0) within each site, fits site-specific XGBoost models
+using k-fold CV, predicts missing or low-quality flux values,
+and exports both gap-filled outputs and fold-level CV diagnostics.
+}
+\author{
+Yang Gu
+}

From 2dce279d131407a21a3eb94dccdc3a953506aa73 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-eb4.scc.bu.edu>
Date: Thu, 19 Mar 2026 14:26:57 -0400
Subject: [PATCH 03/13] commit rosxygen files for read_site_ids

---
 modules/data.land/man/read_site_ids.Rd | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 modules/data.land/man/read_site_ids.Rd

diff --git a/modules/data.land/man/read_site_ids.Rd b/modules/data.land/man/read_site_ids.Rd
new file mode 100644
index 0000000000..8c4bf35143
--- /dev/null
+++ b/modules/data.land/man/read_site_ids.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SIPNET_flux_gapfill.R
+\name{read_site_ids}
+\alias{read_site_ids}
+\title{read_site_ids}
+\usage{
+read_site_ids(site.dir, n_sites = 40)
+}
+\arguments{
+\item{site.dir}{character: path to the site information table.}
+
+\item{n_sites}{numeric: number of sites to keep from the site table,
+the total number of sites is 241.}
+}
+\value{
+character vector of selected site IDs. Example: 'Site ID' CA-ARB
+}
+\description{
+This function reads the site information table and returns the selected
+site IDs for downstream modeling.
+}
+\author{
+Yang Gu
+}

From dd5367886f06f3eaad399ae303077d5584e2d0d0 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-eb4.scc.bu.edu>
Date: Thu, 19 Mar 2026 16:37:19 -0400
Subject: [PATCH 04/13] update description

---
 docker/depends/pecan_package_dependencies.csv |  4 ++
 modules/data.land/R/SIPNET_flux_gapfill.R     | 65 ++++++++++---------
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv
index 1c64dd7120..0fbef5edb4 100644
--- a/docker/depends/pecan_package_dependencies.csv
+++ b/docker/depends/pecan_package_dependencies.csv
@@ -33,6 +33,7 @@
 "data.table","*","base/utils","Suggests",FALSE
 "data.table","*","base/visualization","Imports",FALSE
 "data.table","*","models/biocro","Imports",FALSE
+"data.table","*","modules/data.land","Imports",FALSE
 "data.table","*","modules/data.remote","Suggests",FALSE
 "dataone","*","modules/data.land","Suggests",FALSE
 "datapack","*","modules/data.land","Suggests",FALSE
@@ -41,6 +42,7 @@
 "dbplyr",">= 2.4.0","base/db","Imports",FALSE
 "devtools","*","models/ed","Suggests",FALSE
 "doParallel","*","modules/data.atmosphere","Suggests",FALSE
+"doParallel","*","modules/data.land","Imports",FALSE
 "doParallel","*","modules/data.remote","Imports",FALSE
 "doSNOW","*","base/remote","Suggests",FALSE
 "doSNOW","*","base/utils","Suggests",FALSE
@@ -667,6 +669,7 @@
 "tibble","*","models/fates","Imports",FALSE
 "tibble","*","models/lpjguess","Imports",FALSE
 "tibble","*","modules/data.atmosphere","Imports",FALSE
+"tibble","*","modules/data.land","Imports",FALSE
 "tibble","*","modules/data.remote","Suggests",FALSE
 "tibble","*","modules/meta.analysis","Suggests",FALSE
 "tictoc","*","modules/assim.sequential","Suggests",FALSE
@@ -722,6 +725,7 @@
 "withr","*","modules/data.land","Suggests",FALSE
 "withr","*","modules/uncertainty","Suggests",FALSE
 "xgboost","*","modules/assim.sequential","Suggests",FALSE
+"xgboost","*","modules/data.land","Imports",FALSE
 "XML","*","base/workflow","Imports",FALSE
 "XML","*","models/biocro","Imports",FALSE
 "XML","*","models/maat","Imports",FALSE
diff --git a/modules/data.land/R/SIPNET_flux_gapfill.R b/modules/data.land/R/SIPNET_flux_gapfill.R
index 3997513616..17f33823ec 100644
--- a/modules/data.land/R/SIPNET_flux_gapfill.R
+++ b/modules/data.land/R/SIPNET_flux_gapfill.R
@@ -263,44 +263,45 @@ fit_site_xgb_cv <- function(subdf,
   )
 }
 
-#########Need to add sub-functions to add site_ids, resimet.df, and sitecov.df
 #' @description
-#' This function performs site-level XGBoost gap-filling with CV
-#' for eddy-covariance flux data. For each ensemble iteration, it samples
-#' observed data(QC=0) within each site, fits site-specific XGBoost models
-#' using k-fold CV, predicts missing or low-quality flux values,
-#' and exports both gap-filled outputs and fold-level CV diagnostics.
+#' Perform site-level XGBoost gap-filling with cross-validation for
+#' eddy-covariance flux data. For each ensemble iteration, the function
+#' samples high-quality observations (QC = 0) within each site, fits a
+#' site-specific XGBoost model using k-fold cross-validation, predicts
+#' missing or low-quality flux values, and exports both gap-filled outputs
+#' and fold-level CV diagnostics.
 #'
 #' @title xgb_gapfill
 #'
-#' @param site_ids character: a character vector of site IDs to be modeled.
-#' @param resimet.df data.frame: a data frame containing eddy-covariance
-#' observations, QC flags, timestamps, and predictor variables.
-#' @param sitecov.df data.frame: a data frame containing site-specific selected
-#' predictor names, with columns including `Site_ID` and `filtered_variables`.
-#' @param flux_var character: the target flux type to be gap-filled. Currently
-#' supported values are `"LE"` and `"NEE"`.
-#' @param outdir character: the output directory where the exported CSV files
-#' will be written.
-#' @param nfolds numeric: number of folds used in CV, default is 10.
-#' @param nrounds numeric: number of boosting rounds used in XGBoost training,
-#' default is 200.
-#' @param nens numeric: number of ensemble iterations, default is 25.
-#' @param nsamp numeric: proportion of high-quality observations sampled within
-#' each site for training in each ensemble, default is 0.95.
-#' @param r2_threshold numeric: minimum average validation R2 required to
-#' fit the final model and retain a site in the exported gap-filled output
-#' @param cores numeric: how many CPUs to be used in the calculation
-#' @param overwrite boolean: decide if existing output files should be overwritten,
-#' the default is TRUE.
+#' @param site.dir character. Path to the site information table.
+#' @param pred.var.dir character. Path to the predictor/flux data file.
+#' @param flux_var character. Flux type to be gap-filled. Currently supports
+#' `"LE"` and `"NEE"`.
+#' @param outdir character. Output directory for exported CSV files.
+#' @param vars_to_check character vector. Candidate predictor variables used
+#' to build site-specific predictor sets.
+#' @param n_sites numeric. Number of sites to keep from the site table.
+#' @param missing_threshold numeric. Maximum allowed percentage of missingness
+#' when selecting predictors for each site.
+#' @param nfolds numeric. Number of folds used in cross-validation. Default is 10.
+#' @param nrounds numeric. Number of boosting rounds for XGBoost training.
+#' Default is 200.
+#' @param nens numeric. Number of ensemble iterations. Default is 25.
+#' @param nsamp numeric. Proportion of QC = 0 observations sampled within each
+#' site for training in each ensemble. Default is 0.95.
+#' @param params list. XGBoost model parameters.
+#' @param r2_threshold numeric. Minimum average validation R2 required to keep
+#' a site in the exported gap-filled output.
+#' @param cores numeric. Number of CPUs used for parallel site-level fitting.
+#' @param overwrite logical. Whether existing output files should be overwritten.
 #'
-#' @return list containing ensemble-level outputs. Each element corresponds to
-#' one ensemble iteration and includes:
+#' @return A list of length `nens`. Each element corresponds to one ensemble
+#' iteration and contains:
 #' \describe{
-#'   \item{cv_results}{a data frame of site-level average CV R2.}
-#'   \item{filled_data}{a data frame of the full input dataset with merged gap-filled values.}
-#'   \item{exported_data}{a data frame of filtered gap-filled results for well-performing sites only.}
-#'   \item{folds_all}{a data frame of fold-level CV diagnostics across sites.}
+#'   \item{cv_results}{data frame of site-level average CV R2.}
+#'   \item{filled_data}{data frame of the full input dataset with merged gap-filled values.}
+#'   \item{exported_data}{data frame of filtered gap-filled results for well-performing sites only.}
+#'   \item{folds_all}{data frame of fold-level CV diagnostics across sites.}
 #' }
 #'
 #' @author Yang Gu

From 7c5a76b3513b2cd6cff7690f034ae5c446b9ed26 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-eb4.scc.bu.edu>
Date: Thu, 19 Mar 2026 17:47:25 -0400
Subject: [PATCH 05/13] update Rd

---
 modules/data.land/man/xgb_gapfill.Rd | 67 +++++++++++++++-------------
 1 file changed, 36 insertions(+), 31 deletions(-)

diff --git a/modules/data.land/man/xgb_gapfill.Rd b/modules/data.land/man/xgb_gapfill.Rd
index 7a66779f1f..d473940280 100644
--- a/modules/data.land/man/xgb_gapfill.Rd
+++ b/modules/data.land/man/xgb_gapfill.Rd
@@ -27,54 +27,59 @@ xgb_gapfill(
 )
 }
 \arguments{
-\item{flux_var}{character: the target flux type to be gap-filled. Currently
-supported values are `"LE"` and `"NEE"`.}
+\item{site.dir}{character. Path to the site information table.}
 
-\item{outdir}{character: the output directory where the exported CSV files
-will be written.}
+\item{pred.var.dir}{character. Path to the predictor/flux data file.}
 
-\item{nfolds}{numeric: number of folds used in CV, default is 10.}
+\item{flux_var}{character. Flux type to be gap-filled. Currently supports
+`"LE"` and `"NEE"`.}
 
-\item{nrounds}{numeric: number of boosting rounds used in XGBoost training,
-default is 200.}
+\item{outdir}{character. Output directory for exported CSV files.}
 
-\item{nens}{numeric: number of ensemble iterations, default is 25.}
+\item{vars_to_check}{character vector. Candidate predictor variables used
+to build site-specific predictor sets.}
 
-\item{nsamp}{numeric: proportion of high-quality observations sampled within
-each site for training in each ensemble, default is 0.95.}
+\item{n_sites}{numeric. Number of sites to keep from the site table.}
 
-\item{r2_threshold}{numeric: minimum average validation R2 required to
-fit the final model and retain a site in the exported gap-filled output}
+\item{missing_threshold}{numeric. Maximum allowed percentage of missingness
+when selecting predictors for each site.}
 
-\item{cores}{numeric: how many CPUs to be used in the calculation}
+\item{nfolds}{numeric. Number of folds used in cross-validation. Default is 10.}
 
-\item{overwrite}{boolean: decide if existing output files should be overwritten,
-the default is TRUE.}
+\item{nrounds}{numeric. Number of boosting rounds for XGBoost training.
+Default is 200.}
 
-\item{site_ids}{character: a character vector of site IDs to be modeled.}
+\item{nens}{numeric. Number of ensemble iterations. Default is 25.}
 
-\item{resimet.df}{data.frame: a data frame containing eddy-covariance
-observations, QC flags, timestamps, and predictor variables.}
+\item{nsamp}{numeric. Proportion of QC = 0 observations sampled within each
+site for training in each ensemble. Default is 0.95.}
 
-\item{sitecov.df}{data.frame: a data frame containing site-specific selected
-predictor names, with columns including `Site_ID` and `filtered_variables`.}
+\item{params}{list. XGBoost model parameters.}
+
+\item{r2_threshold}{numeric. Minimum average validation R2 required to keep
+a site in the exported gap-filled output.}
+
+\item{cores}{numeric. Number of CPUs used for parallel site-level fitting.}
+
+\item{overwrite}{logical. Whether existing output files should be overwritten.}
 }
 \value{
-list containing ensemble-level outputs. Each element corresponds to
-one ensemble iteration and includes:
+A list of length `nens`. Each element corresponds to one ensemble
+iteration and contains:
 \describe{
-  \item{cv_results}{a data frame of site-level average CV R2.}
-  \item{filled_data}{a data frame of the full input dataset with merged gap-filled values.}
-  \item{exported_data}{a data frame of filtered gap-filled results for well-performing sites only.}
-  \item{folds_all}{a data frame of fold-level CV diagnostics across sites.}
+  \item{cv_results}{data frame of site-level average CV R2.}
+  \item{filled_data}{data frame of the full input dataset with merged gap-filled values.}
+  \item{exported_data}{data frame of filtered gap-filled results for well-performing sites only.}
+  \item{folds_all}{data frame of fold-level CV diagnostics across sites.}
 }
 }
 \description{
-This function performs site-level XGBoost gap-filling with CV
-for eddy-covariance flux data. For each ensemble iteration, it samples
-observed data(QC=0) within each site, fits site-specific XGBoost models
-using k-fold CV, predicts missing or low-quality flux values,
-and exports both gap-filled outputs and fold-level CV diagnostics.
+Perform site-level XGBoost gap-filling with cross-validation for
+eddy-covariance flux data. For each ensemble iteration, the function
+samples high-quality observations (QC = 0) within each site, fits a
+site-specific XGBoost model using k-fold cross-validation, predicts
+missing or low-quality flux values, and exports both gap-filled outputs
+and fold-level CV diagnostics.
 }
 \author{
 Yang Gu

From 5e0c31e42a74d31ecd6b0a49ce91edf1f7dbef13 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-eb4.scc.bu.edu>
Date: Thu, 19 Mar 2026 19:37:45 -0400
Subject: [PATCH 06/13] Fix NSE notes in xgb_gapfill

---
 modules/data.land/R/SIPNET_flux_gapfill.R | 278 +++++++++++-----------
 1 file changed, 139 insertions(+), 139 deletions(-)

diff --git a/modules/data.land/R/SIPNET_flux_gapfill.R b/modules/data.land/R/SIPNET_flux_gapfill.R
index 17f33823ec..2543658981 100644
--- a/modules/data.land/R/SIPNET_flux_gapfill.R
+++ b/modules/data.land/R/SIPNET_flux_gapfill.R
@@ -1,3 +1,9 @@
+utils::globalVariables(c(
+  "Site_ID", "variable", "value", "missing_count", "total_obs",
+  "missing_pct", "filtered_variables", "is_night_suncalc",
+  "TIMESTAMP_START", "GF_QC", "Avg_R2", "site", ":="
+))
+
 #' @description
 #' This function reads the site information table and returns the selected
 #' site IDs for downstream modeling.
@@ -39,7 +45,7 @@ read_pred_data <- function(pred.var.dir, site_ids) {
     dplyr::mutate(is_night_suncalc = as.integer(is_night_suncalc))
   
   resimet.df <- resimet.df |>
-    dplyr::filter(Site_ID %in% site_ids)
+    dplyr::filter(.data$Site_ID %in% site_ids)
   
   return(resimet.df)
 }
@@ -70,20 +76,20 @@ build_sitecov_df <- function(resimet.df,
                              outdir = NULL,
                              missing_threshold = 40) {
   missing_summary <- resimet.df |>
-    dplyr::select(Site_ID, dplyr::all_of(vars_to_check)) |>
+    dplyr::select(.data$Site_ID, dplyr::all_of(vars_to_check)) |>
     tidyr::pivot_longer(
-      cols = -Site_ID,
+      cols = -.data$Site_ID,
       names_to = "variable",
       values_to = "value"
     ) |>
-    dplyr::group_by(Site_ID, variable) |>
+    dplyr::group_by(.data$Site_ID, .data$variable) |>
     dplyr::summarise(
       total_obs = dplyr::n(),
-      missing_count = sum(is.na(value)),
-      missing_pct = missing_count / total_obs * 100,
+      missing_count = sum(is.na(.data$value)),
+      missing_pct = .data$missing_count / .data$total_obs * 100,
       .groups = "drop"
     ) |>
-    dplyr::arrange(Site_ID, dplyr::desc(missing_pct))
+    dplyr::arrange(.data$Site_ID, dplyr::desc(.data$missing_pct))
   
   if (!is.null(outdir)) {
     if (!file.exists(outdir)) {
@@ -96,15 +102,15 @@ build_sitecov_df <- function(resimet.df,
   }
   
   high_staying_vars <- missing_summary |>
-    dplyr::filter(missing_pct < missing_threshold) |>
-    dplyr::group_by(Site_ID) |>
+    dplyr::filter(.data$missing_pct < missing_threshold) |>
+    dplyr::group_by(.data$Site_ID) |>
     dplyr::summarise(
-      filtered_variables = list(variable),
+      filtered_variables = list(.data$variable),
       .groups = "drop"
     )
   
   sitecov.df <- high_staying_vars |>
-    tidyr::unnest(filtered_variables)
+    tidyr::unnest(.data$filtered_variables)
   
   return(list(
     missing_summary = missing_summary,
@@ -334,29 +340,32 @@ xgb_gapfill <- function(site.dir,
                         cores = max(1L, parallel::detectCores() - 1L),
                         overwrite = TRUE) {
   
+  # ------------------------------------------------------------
+  # Check required packages
+  # ------------------------------------------------------------
   if (!requireNamespace("xgboost", quietly = TRUE)) {
-    stop("Package 'xgboost' is required for cv_xgb_site_cv_gapfill() but is not installed.")
+    stop("Package 'xgboost' is required but not installed.")
   }
   if (!requireNamespace("doParallel", quietly = TRUE)) {
-    stop("Package 'doParallel' is required for parallel execution but is not installed.")
+    stop("Package 'doParallel' is required but not installed.")
   }
   if (!requireNamespace("foreach", quietly = TRUE)) {
-    stop("Package 'foreach' is required for parallel execution but is not installed.")
+    stop("Package 'foreach' is required but not installed.")
   }
   
-  # Read target site IDs
-  site_ids <- read_site_ids(
-    site.dir = site.dir,
-    n_sites = n_sites
-  )
+  # ------------------------------------------------------------
+  # Read site IDs and input data
+  # ------------------------------------------------------------
+  site_ids <- read_site_ids(site.dir = site.dir, n_sites = n_sites)
   
-  # Read predictor and flux data for selected sites
   resimet.df <- read_pred_data(
     pred.var.dir = pred.var.dir,
     site_ids = site_ids
   )
   
-  # Build site-specific predictor table
+  # ------------------------------------------------------------
+  # Build site-specific predictor sets
+  # ------------------------------------------------------------
   sitecov_res <- build_sitecov_df(
     resimet.df = resimet.df,
     vars_to_check = vars_to_check,
@@ -365,7 +374,9 @@ xgb_gapfill <- function(site.dir,
   )
   sitecov.df <- sitecov_res$sitecov.df
   
-  # Map flux type to variable names
+  # ------------------------------------------------------------
+  # Define response and QC variables
+  # ------------------------------------------------------------
   flux_var <- toupper(flux_var)
   if (flux_var == "LE") {
     response_var <- "LE_F_MDS"
@@ -376,55 +387,60 @@ xgb_gapfill <- function(site.dir,
     qc_var <- "NEE_CUT_USTAR50_QC"
     gapfill_var <- "NEE_gapfill"
   } else {
-    stop("Unsupported flux_var. Currently only 'LE' and 'NEE' are supported.")
+    stop("flux_var must be 'LE' or 'NEE'")
   }
   
-  # Validate required columns in input data
+  # ------------------------------------------------------------
+  # Check required columns
+  # ------------------------------------------------------------
   required_cols <- c("Site_ID", "TIMESTAMP_START", response_var, qc_var)
   missing_cols <- setdiff(required_cols, names(resimet.df))
   if (length(missing_cols) > 0) {
-    stop(
-      sprintf(
-        "Missing required columns in resimet.df: %s",
-        paste(missing_cols, collapse = ", ")
-      )
-    )
-  }
-  if (!file.exists(outdir)) {
-    dir.create(outdir, recursive = TRUE)
+    stop(sprintf("Missing columns: %s", paste(missing_cols, collapse = ", ")))
   }
   
-  # Filter high-quality observations for training (QC = 0)
+  if (!file.exists(outdir)) dir.create(outdir, recursive = TRUE)
+  
+  # ------------------------------------------------------------
+  # Filter QC=0 data for training
+  # ------------------------------------------------------------
   data_good <- resimet.df |>
     dplyr::filter(.data[[qc_var]] == 0)
   
   ensemble_results <- vector("list", nens)
   
-  # Loop over ensemble members
+  # ------------------------------------------------------------
+  # Ensemble loop
+  # ------------------------------------------------------------
   for (m in seq_len(nens)) {
     message(sprintf("Ensemble %d/%d", m, nens))
     
-    # Stratified Sampling: Sample 95% from within each site for training.
+    # Sample within each site
     train_df <- data_good |>
-      dplyr::group_by(Site_ID) |>
+      dplyr::group_by(.data$Site_ID) |>
       dplyr::slice_sample(prop = nsamp) |>
       dplyr::ungroup()
     
-    # Build site-specific training datasets
+    # ------------------------------------------------------------
+    # Build site-level datasets
+    # ------------------------------------------------------------
     site_dfs <- lapply(site_ids, function(site) {
+      
+      # Select predictors for this site
       feats0 <- sitecov.df |>
-        dplyr::filter(Site_ID == site) |>
-        dplyr::pull(filtered_variables) |>
+        dplyr::filter(.data$Site_ID == .env$site) |>
+        dplyr::pull(.data$filtered_variables) |>
         unique()
       
       feats <- unique(c(feats0, "is_night_suncalc"))
       feats <- intersect(feats, names(train_df))
       
+      # Subset training data
       df <- train_df |>
-        dplyr::filter(Site_ID == site) |>
+        dplyr::filter(.data$Site_ID == .env$site) |>
         dplyr::select(
-          TIMESTAMP_START,
-          Site_ID,
+          .data$TIMESTAMP_START,
+          .data$Site_ID,
           dplyr::all_of(response_var),
           dplyr::all_of(feats)
         ) |>
@@ -433,95 +449,90 @@ xgb_gapfill <- function(site.dir,
       attr(df, "features") <- feats
       df
     })
+    
     names(site_dfs) <- site_ids
     
-    # Parallel model fitting and gap-filling by site   
+    # ------------------------------------------------------------
+    # Parallel fitting
+    # ------------------------------------------------------------
     use_cores <- min(as.numeric(cores), length(site_ids))
     cl <- parallel::makeCluster(use_cores, type = "PSOCK")
     doParallel::registerDoParallel(cl)
+    
     results <- tryCatch({
       foreach::foreach(
         site = site_ids,
         .packages = c("dplyr", "xgboost", "tibble"),
-        .combine = dplyr::bind_rows,
-        .errorhandling = "pass"
+        .combine = dplyr::bind_rows
       ) %dopar% {
-        tryCatch({
-          df <- site_dfs[[site]]
-          feats <- attr(df, "features")
-          
-          # Fit XGBoost model with CV
-          fit <- fit_site_xgb_cv(
-            subdf = df,
-            feats = feats,
-            site_name = site,
-            response_var = response_var,
-            nfolds = nfolds,
-            params = params,
-            nrounds = nrounds,
-            r2_threshold = r2_threshold
+        
+        df <- site_dfs[[site]]
+        feats <- attr(df, "features")
+        
+        # Fit model
+        fit <- fit_site_xgb_cv(
+          subdf = df,
+          feats = feats,
+          site_name = site,
+          response_var = response_var,
+          nfolds = nfolds,
+          params = params,
+          nrounds = nrounds,
+          r2_threshold = r2_threshold
+        )
+        
+        # --------------------------------------------------------
+        # Gap-fill candidates
+        # --------------------------------------------------------
+        gap_cands <- resimet.df |>
+          dplyr::filter(
+            .data$Site_ID == .env$site,
+            is.na(.data[[response_var]]) | .data[[qc_var]] != 0
+          ) |>
+          dplyr::select(
+            .data$TIMESTAMP_START,
+            .data$Site_ID,
+            dplyr::all_of(feats)
+          ) |>
+          tibble::as_tibble()
+        
+        pd <- gap_cands
+        
+        # Predict
+        if (!is.null(fit$model) && nrow(pd) > 0) {
+          dtest <- xgboost::xgb.DMatrix(
+            data = as.matrix(dplyr::select(pd, dplyr::all_of(feats))),
+            missing = NA
           )
-          
-          # Identify gap-fill candidates
-          gap_cands <- resimet.df |>
-            dplyr::filter(
-              Site_ID == site,
-              is.na(.data[[response_var]]) | .data[[qc_var]] != 0
-            ) |>
-            dplyr::select(TIMESTAMP_START, Site_ID, dplyr::all_of(feats)) |>
-            tibble::as_tibble()
-          
-          pd <- gap_cands
-          
-          # Gap fill flux if model is valid
-          if (!is.null(fit$model) && nrow(pd) > 0) {
-            dtest <- xgboost::xgb.DMatrix(
-              data = as.matrix(dplyr::select(pd, dplyr::all_of(feats))),
-              missing = NA
-            )
-            pd[[gapfill_var]] <- stats::predict(fit$model, dtest)
-            pd$GF_QC <- 0L
-          } else if (nrow(pd) > 0) {
-            pd[[gapfill_var]] <- NA_real_
-            pd$GF_QC <- 1L
-          }
-          
-          # Return site-level results
-          tibble::tibble(
-            Site_ID = site,
-            Avg_R2 = fit$r2,
-            pred_df = list(pd),
-            folds_df = list(dplyr::mutate(fit$folds_df, Site_ID = site))
-          )
-        }, error = function(e) {
-          message(sprintf("[Site %s] ERROR: %s", site, conditionMessage(e)))
-          tibble::tibble(
-            Site_ID = site,
-            Avg_R2 = NA_real_,
-            pred_df = list(NULL),
-            folds_df = list(tibble::tibble())
-          )
-        })
+          pd[[gapfill_var]] <- stats::predict(fit$model, dtest)
+          pd$GF_QC <- 0L
+        } else if (nrow(pd) > 0) {
+          pd[[gapfill_var]] <- NA_real_
+          pd$GF_QC <- 1L
+        }
+        
+        tibble::tibble(
+          Site_ID = site,
+          Avg_R2 = fit$r2,
+          pred_df = list(pd),
+          folds_df = list(dplyr::mutate(fit$folds_df, Site_ID = .env$site))
+        )
       }
-    }, error = function(e) {
-      message(sprintf("Parallel loop error in ensemble %d: %s", m, conditionMessage(e)))
-      NULL
     }, finally = {
       parallel::stopCluster(cl)
       foreach::registerDoSEQ()
     })
     
-    # Skip ensemble if no results
     if (is.null(results) || nrow(results) == 0) {
       ensemble_results[[m]] <- NULL
       next
     }
     
-    # Combine gap-fill predictions across sites
-    pred_list <- results$pred_df
-    gap_all <- dplyr::bind_rows(pred_list)
+    # ------------------------------------------------------------
+    # Merge predictions
+    # ------------------------------------------------------------
+    gap_all <- dplyr::bind_rows(results$pred_df)
     
-    # Merge predictions back to full dataset
     resimet_filled <- resimet.df |>
       dplyr::left_join(gap_all, by = c("Site_ID", "TIMESTAMP_START")) |>
       dplyr::mutate(
@@ -530,58 +541,47 @@ xgb_gapfill <- function(site.dir,
           .data[[gapfill_var]],
           .data[[response_var]]
         ),
-        GF_QC = dplyr::if_else(is.na(GF_QC), 1L, GF_QC)
+        GF_QC = dplyr::if_else(is.na(.data$GF_QC), 1L, .data$GF_QC)
       )
     
-    # Extract CV performance
+    # ------------------------------------------------------------
+    # Select good sites
+    # ------------------------------------------------------------
     cv_results <- results |>
-      dplyr::select(Site_ID, Avg_R2)
+      dplyr::select(.data$Site_ID, .data$Avg_R2)
     
-    # Select well-performing sites
     good_sites <- cv_results |>
-      dplyr::filter(!is.na(Avg_R2) & Avg_R2 >= r2_threshold) |>
-      dplyr::pull(Site_ID)
+      dplyr::filter(!is.na(.data$Avg_R2) & .data$Avg_R2 >= r2_threshold) |>
+      dplyr::pull(.data$Site_ID)
     
-    # Subset final output for good sites
     sub.df <- resimet_filled |>
-      dplyr::filter(Site_ID %in% good_sites) |>
+      dplyr::filter(.data$Site_ID %in% good_sites) |>
       dplyr::select(
-        Site_ID,
-        TIMESTAMP_START,
+        .data$Site_ID,
+        .data$TIMESTAMP_START,
         dplyr::all_of(response_var),
         dplyr::all_of(gapfill_var),
-        GF_QC,
+        .data$GF_QC,
         dplyr::all_of(qc_var)
       )
     
-    # Aggregate fold-level CV diagnostics
-    folds_all <- dplyr::bind_rows(results$folds_df) |>
-      dplyr::mutate(ensemble_id = m)
-    
+    # ------------------------------------------------------------
+    # Save outputs
+    # ------------------------------------------------------------
     ens_tag <- sprintf("ens%02d", m)
     flux_tag <- tolower(flux_var)
     
-    data_path <- file.path(outdir, sprintf("%s_gfed_dat_%s.csv", flux_tag, ens_tag))
-    folds_path <- file.path(outdir, sprintf("%s_gfed_cv_folds_%s.csv", flux_tag, ens_tag))
-    
-    if (overwrite || !file.exists(data_path)) {
-      data.table::fwrite(sub.df, data_path)
-    }
-    
-    if (overwrite || !file.exists(folds_path)) {
-      data.table::fwrite(folds_all, folds_path)
-    }
+    data.table::fwrite(
+      sub.df,
+      file.path(outdir, sprintf("%s_gfed_dat_%s.csv", flux_tag, ens_tag))
+    )
     
-    # Store ensemble results
     ensemble_results[[m]] <- list(
       cv_results = cv_results,
       filled_data = resimet_filled,
-      exported_data = sub.df,
-      folds_all = folds_all
+      exported_data = sub.df
     )
     
-    rm(results, pred_list, gap_all, resimet_filled, cv_results, good_sites,
-       sub.df, folds_all, site_dfs, train_df)
     gc(FALSE)
   }
   

From 42d8f0cbd069a20807d167523fbf1f5696c411fe Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-eb4.scc.bu.edu>
Date: Thu, 19 Mar 2026 20:03:47 -0400
Subject: [PATCH 07/13] put packages into Suggests

---
 docker/depends/pecan_package_dependencies.csv |  8 ++++----
 modules/data.land/DESCRIPTION                 | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv
index 0fbef5edb4..230e11c589 100644
--- a/docker/depends/pecan_package_dependencies.csv
+++ b/docker/depends/pecan_package_dependencies.csv
@@ -33,7 +33,7 @@
 "data.table","*","base/utils","Suggests",FALSE
 "data.table","*","base/visualization","Imports",FALSE
 "data.table","*","models/biocro","Imports",FALSE
-"data.table","*","modules/data.land","Imports",FALSE
+"data.table","*","modules/data.land","Suggests",FALSE
 "data.table","*","modules/data.remote","Suggests",FALSE
 "dataone","*","modules/data.land","Suggests",FALSE
 "datapack","*","modules/data.land","Suggests",FALSE
@@ -42,7 +42,7 @@
 "dbplyr",">= 2.4.0","base/db","Imports",FALSE
 "devtools","*","models/ed","Suggests",FALSE
 "doParallel","*","modules/data.atmosphere","Suggests",FALSE
-"doParallel","*","modules/data.land","Imports",FALSE
+"doParallel","*","modules/data.land","Suggests",FALSE
 "doParallel","*","modules/data.remote","Imports",FALSE
 "doSNOW","*","base/remote","Suggests",FALSE
 "doSNOW","*","base/utils","Suggests",FALSE
@@ -669,7 +669,7 @@
 "tibble","*","models/fates","Imports",FALSE
 "tibble","*","models/lpjguess","Imports",FALSE
 "tibble","*","modules/data.atmosphere","Imports",FALSE
-"tibble","*","modules/data.land","Imports",FALSE
+"tibble","*","modules/data.land","Suggests",FALSE
 "tibble","*","modules/data.remote","Suggests",FALSE
 "tibble","*","modules/meta.analysis","Suggests",FALSE
 "tictoc","*","modules/assim.sequential","Suggests",FALSE
@@ -725,7 +725,7 @@
 "withr","*","modules/data.land","Suggests",FALSE
 "withr","*","modules/uncertainty","Suggests",FALSE
 "xgboost","*","modules/assim.sequential","Suggests",FALSE
-"xgboost","*","modules/data.land","Imports",FALSE
+"xgboost","*","modules/data.land","Suggests",FALSE
 "XML","*","base/workflow","Imports",FALSE
 "XML","*","models/biocro","Imports",FALSE
 "XML","*","models/maat","Imports",FALSE
diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION
index ba5ad75b22..28fc5cd7be 100644
--- a/modules/data.land/DESCRIPTION
+++ b/modules/data.land/DESCRIPTION
@@ -46,11 +46,7 @@ Imports:
     terra,
     tidyr,
     tidyselect,
-    XML (>= 3.98-1.4),
-    data.table,
-    tibble,
-    xgboost,
-    doParallel
+    XML (>= 3.98-1.4)
 Suggests:
     dataone,
     datapack,
@@ -77,7 +73,11 @@ Suggests:
     sp,
     testthat (>= 3.1.0),
     traits,
-    withr
+    withr,
+    xgboost,
+    doParallel,
+    data.table,
+    tibble
 Remotes:
     github::ropensci/traits
 License: BSD_3_clause + file LICENSE

From b6bdea587c09a81c5623dd434d9297ef47731559 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-uo3.scc.bu.edu>
Date: Thu, 16 Apr 2026 21:36:01 -0400
Subject: [PATCH 08/13] data prep for downscaling and bcm code

---
 .../assim.sequential/inst/bias_correction.R   | 747 ++++++++++++++++++
 modules/assim.sequential/inst/data_prep.R     | 633 +++++++++++++++
 2 files changed, 1380 insertions(+)
 create mode 100644 modules/assim.sequential/inst/bias_correction.R
 create mode 100644 modules/assim.sequential/inst/data_prep.R

diff --git a/modules/assim.sequential/inst/bias_correction.R b/modules/assim.sequential/inst/bias_correction.R
new file mode 100644
index 0000000000..f714ba8709
--- /dev/null
+++ b/modules/assim.sequential/inst/bias_correction.R
@@ -0,0 +1,747 @@
+## Load packages
+library(data.table)
+library(dplyr)
+library(xgboost)
+library(parallel)
+library(doParallel)
+library(caret)
+
+##' @title parse_bcm_args
+##' @name parse_bcm_args
+##' @author Yang Gu
+##'
+##' @param args Character vector from commandArgs(trailingOnly = TRUE).
+##'
+##' @description
+##' Parse command-line arguments for flux type and response variable.
+##' The first argument is the flux type ("LE" or "NEE"), and the second
+##' argument is the raw ensemble variable name (ens01).
+##'
+##' @return
+##' A list containing `flux_type`, `raw_response_var`, and `response_var`.
+parse_bcm_args <- function(args) {
+  flux_type <- if (length(args) >= 1) toupper(args[1]) else "LE"
+  raw_response_var <- if (length(args) >= 2) args[2] else "ens01"
+  
+  if (!flux_type %in% c("LE", "NEE")) {
+    stop("flux_type must be either 'LE' or 'NEE'.")
+  }
+  
+  response_var <- paste0(raw_response_var, "_residual")
+  
+  return(list(
+    flux_type = flux_type,
+    raw_response_var = raw_response_var,
+    response_var = response_var
+  ))
+}
+
+##' @title get_flux_config
+##' @name get_flux_config
+##' @author Yang Gu
+##'
+##' @param flux_type Character string, either "LE" or "NEE".
+##'
+##' @description
+##' Return flux-specific configuration, including observation column names,
+##' model mean column names, columns to remove, and output suffixes.
+##'
+##' @return
+##' A list of flux-specific settings.
+get_flux_config <- function(flux_type) {
+  if (flux_type == "LE") {
+    return(list(
+      flux_type = "LE",
+      mean_col = "Qle_mean",
+      mean_std_col = "flux_mean",
+      obs_col = "LE_F_MDS",
+      obs_qc_col = "LE_F_MDS_QC",
+      residual_col = "LE_residual",
+      corrected_col = "LE_corrected",
+      pred_residual_col = "LE_resi_pred",
+      gfed_file = "/projectnb/dietzelab/guYANG/Gap_fill/results/le_ens_ec_3h.csv",
+      output_tag = "le"
+    ))
+  }
+  
+  if (flux_type == "NEE") {
+    return(list(
+      flux_type = "NEE",
+      mean_col = "NEE_mean",
+      mean_std_col = "flux_mean",
+      obs_col = "NEE_CUT_USTAR50",
+      obs_qc_col = "NEE_CUT_USTAR50_QC",
+      residual_col = "NEE_residual",
+      corrected_col = "NEE_corrected",
+      pred_residual_col = "NEE_resi_pred",
+      gfed_file = "/projectnb/dietzelab/guYANG/Gap_fill/results/ens_ec_3h.csv",
+      output_tag = "nee"
+    ))
+  }
+  
+  stop("Unsupported flux_type.")
+}
+
+##' @title read_close_points_and_envres
+##' @name read_close_points_and_envres
+##' @author Yang Gu
+##'
+##' @param matched_file Path to matched point-site file.
+##' @param envres_file Path to .RData file containing `envres`.
+##'
+##' @description
+##' Read the nearest matched site-index table and load the integrated envres
+##' dataset from .RData.
+##'
+##' @return
+##' A list containing `close_points_df` and `envres`.
+read_close_points_and_envres <- function(matched_file, envres_file) {
+  close_points_df <- fread(matched_file)
+  setDT(close_points_df)
+  close_points_df <- close_points_df[order(min_dist_m), .SD[1], by = index]
+  
+  load(envres_file)
+  
+  return(list(
+    close_points_df = close_points_df,
+    envres = envres
+  ))
+}
+
+##' @title filter_envres_for_flux
+##' @name filter_envres_for_flux
+##' @author Yang Gu
+##'
+##' @param envres Environmental result table.
+##' @param flux_cfg Flux-specific configuration list.
+##'
+##' @description
+##' Remove flux-specific observed columns, filter land-cover classes,
+##' and restrict the date range used in BCM.
+##'
+##' @return
+##' A filtered data.table.
+filter_envres_for_flux <- function(envres, flux_cfg) {
+  cols_to_drop <- intersect(
+    c(flux_cfg$obs_col, flux_cfg$obs_qc_col, flux_cfg$residual_col),
+    names(envres)
+  )
+  if (length(cols_to_drop) > 0) {
+    envres[, (cols_to_drop) := NULL]
+  }
+  
+  envres <- envres[as.numeric(as.character(LC)) <= 4]
+  
+  cutoff_date <- as.POSIXct("2012-07-15 23:59:59", tz = "UTC")
+  envres <- envres[utc > cutoff_date]
+  
+  return(envres)
+}
+
+##' @title add_gapfill_and_predictors
+##' @name add_gapfill_and_predictors
+##' @author Yang Gu
+##'
+##' @param envres Environmental result table.
+##' @param gfed Gap-fill ensemble result table.
+##' @param close_points_df Nearest site-index table.
+##' @param raw_response_var Raw ensemble variable name, e.g. "ens01".
+##' @param flux_cfg Flux-specific configuration list.
+##'
+##' @description
+##' Merge a selected ensemble gap-fill column into `envres`, add predictor
+##' transformations, standardize the model mean column to `flux_mean`,
+##' and construct the response residual.
+##'
+##' @return
+##' A processed data.table ready for BCM modeling.
+add_gapfill_and_predictors <- function(envres,
+                                       gfed,
+                                       close_points_df,
+                                       raw_response_var,
+                                       flux_cfg) {
+  cols_to_merge <- c(raw_response_var)
+  gfed_sub <- gfed[, c("Site_ID", "utc", cols_to_merge), with = FALSE]
+  
+  setkey(envres, Site_ID, utc)
+  setkey(gfed_sub, Site_ID, utc)
+  envres <- gfed_sub[envres]
+  
+  envres[, c("LC", "is_day", "KGC") := lapply(.SD, as.factor),
+         .SDcols = c("LC", "is_day", "KGC")]
+  
+  envres[, t2m2 := t2m^2]
+  envres[, PPFD2 := PPFD^2]
+  envres[, EVI2 := EVI^2]
+  envres[, VPD_t2m := VPD * t2m]
+  envres[, sincos := sin_doy * cos_doy]
+  
+  envres[, Site_ID := NULL]
+  
+  setkey(envres, index)
+  setkey(close_points_df, index)
+  envres[close_points_df, Site_ID := i.Site_ID]
+  
+  envres[EVI == -3000, EVI := NA_real_]
+  envres[NDVI == -3000, NDVI := NA_real_]
+  
+  if (flux_cfg$mean_col %in% names(envres) &&
+      flux_cfg$mean_col != flux_cfg$mean_std_col) {
+    setnames(envres, old = flux_cfg$mean_col, new = flux_cfg$mean_std_col)
+  }
+  
+  cols_ens <- grep("ens", names(envres), value = TRUE)
+  cols_to_keep <- raw_response_var
+  cols_to_remove <- setdiff(cols_ens, cols_to_keep)
+  if (length(cols_to_remove) > 0) {
+    envres[, (cols_to_remove) := NULL]
+  }
+  
+  response_residual_col <- paste0(raw_response_var, "_residual")
+  envres[, (response_residual_col) := get(flux_cfg$mean_std_col) - get(raw_response_var)]
+  
+  return(envres)
+}
+
+##' @title split_bcm_data
+##' @name split_bcm_data
+##' @author Yang Gu
+##'
+##' @param envres Environmental result table.
+##' @param response_var Response column name.
+##'
+##' @description
+##' Split the data into observed-site training data and unobserved-site
+##' prediction data.
+##'
+##' @return
+##' A list containing `tenvres` and `pdat`.
+split_bcm_data <- function(envres, response_var) {
+  indices_to_keep <- envres[!is.na(Site_ID), unique(index)]
+  
+  tenvres <- envres[index %in% indices_to_keep]
+  setDT(tenvres)
+  tenvres[, LC := factor(as.character(LC))]
+  tenvres <- tenvres[!is.na(get(response_var))]
+  tenvres[, `:=`(
+    LC     = as.factor(LC),
+    is_day = as.factor(is_day),
+    KGC    = as.factor(KGC),
+    index  = as.character(index)
+  )]
+  
+  envres <- envres[!(index %in% indices_to_keep)]
+  pdat <- envres[is.na(get(response_var))]
+  
+  return(list(
+    tenvres = tenvres,
+    pdat = pdat
+  ))
+}
+
+##' @title get_bcm_model_components
+##' @name get_bcm_model_components
+##' @author Yang Gu
+##'
+##' @description
+##' Return the predictor list, formula, and XGBoost hyperparameter settings
+##' used in the BCM workflow.
+##'
+##' @return
+##' A list of modeling components.
+get_bcm_model_components <- function() {
+  predictors_all <- c(
+    "t2m", "sp", "d2m", "tp", "strd", "KGC", "lon",
+    "sin_doy", "cos_doy", "NDVI", "EVI", "twi", "PH", "Sand", "lat",
+    "agb", "SOC", "N", "year_since_disturb", "PPFD", "WindSpeed",
+    "t2m2", "PPFD2", "EVI2", "VPD_t2m", "sincos",
+    "sm_0_10", "sm_10_40", "sm_40_100", "sm_100_200",
+    "VPD", "is_day"
+  )
+  
+  pred_formula <- as.formula(
+    paste0("~ ", paste(predictors_all, collapse = " + "), " - 1")
+  )
+  
+  ### TBD: set the specific settings for different LC type
+  lc_param_list <- list(
+    `10` = list(objective = "reg:squarederror", eta = 0.02,  max_depth = 4,
+                subsample = 0.5,  colsample_bytree = 0.5,
+                min_child_weight = 20, gamma = 2, lambda = 4, alpha = 3),
+    `40` = list(objective = "reg:squarederror", eta = 0.025, max_depth = 5,
+                subsample = 0.55, colsample_bytree = 0.6,
+                min_child_weight = 15, gamma = 1, lambda = 4, alpha = 2),
+    `50` = list(objective = "reg:squarederror", eta = 0.015, max_depth = 3,
+                subsample = 0.35, colsample_bytree = 0.4,
+                min_child_weight = 40, gamma = 4, lambda = 6, alpha = 5),
+    `60` = list(objective = "reg:squarederror", eta = 0.02,  max_depth = 3,
+                subsample = 0.4,  colsample_bytree = 0.5,
+                min_child_weight = 25, gamma = 3, lambda = 4, alpha = 3),
+    `70` = list(objective = "reg:squarederror", eta = 0.02,  max_depth = 3,
+                subsample = 0.4,  colsample_bytree = 0.4,
+                min_child_weight = 30, gamma = 4, lambda = 5, alpha = 4)
+  )
+  
+  default_params <- list(
+    objective = "reg:squarederror", eta = 0.1, max_depth = 6,
+    subsample = 0.8, colsample_bytree = 0.8,
+    min_child_weight = 10, gamma = 0.5, lambda = 3, alpha = 2
+  )
+  
+  return(list(
+    predictors_all = predictors_all,
+    pred_formula = pred_formula,
+    lc_param_list = lc_param_list,
+    default_params = default_params,
+    nrounds_fixed = 200
+  ))
+}
+
+##' @title get_valid_lc_tasks
+##' @name get_valid_lc_tasks
+##' @author Yang Gu
+##'
+##' @param tenvres Training data.table.
+##'
+##' @description
+##' Construct leave-one-site-out CV tasks for LC classes with more than one site.
+##'
+##' @return
+##' A data.frame of CV tasks.
+get_valid_lc_tasks <- function(tenvres) {
+  lc_counts <- tenvres %>%
+    distinct(LC, index) %>%
+    group_by(LC) %>%
+    summarise(n_sites = n(), .groups = "drop")
+  
+  valid_lcs <- lc_counts %>%
+    filter(n_sites > 1) %>%
+    pull(LC)
+  
+  tasks <- tenvres %>%
+    filter(LC %in% valid_lcs) %>%
+    distinct(LC, index) %>%
+    arrange(LC, index)
+  
+  return(tasks)
+}
+
+##' @title run_bcm_cv
+##' @name run_bcm_cv
+##' @author Yang Gu
+##'
+##' @param tenvres Training data.table.
+##' @param tasks CV task table.
+##' @param predictors_all Predictor column names.
+##' @param response_var Response column name.
+##' @param nrounds_fixed Fixed number of boosting rounds.
+##' @param lc_param_list LC-specific XGBoost parameter list.
+##' @param default_params Default XGBoost parameter list.
+##' @param flux_cfg Flux-specific configuration list.
+##'
+##' @description
+##' Run leave-one-site-out cross validation within land-cover classes.
+##'
+##' @return
+##' A data.frame of CV predictions and diagnostics.
+run_bcm_cv <- function(tenvres,
+                       tasks,
+                       predictors_all,
+                       response_var,
+                       nrounds_fixed,
+                       lc_param_list,
+                       default_params,
+                       flux_cfg) {
+  ncores <- max(detectCores() - 1, 1)
+  cl <- makeCluster(ncores)
+  registerDoParallel(cl)
+  
+  cv_results <- foreach(
+    i = seq_len(nrow(tasks)),
+    .packages = c("dplyr", "xgboost", "data.table"),
+    .combine  = rbind,
+    .export   = c("tenvres", "predictors_all", "response_var",
+                  "nrounds_fixed", "tasks", "lc_param_list",
+                  "default_params", "flux_cfg")
+  ) %dopar% {
+    lc <- tasks$LC[i]
+    site <- tasks$index[i]
+    
+    df_lc <- tenvres %>% filter(LC == lc)
+    train_df <- df_lc %>% filter(index != site)
+    test_df  <- df_lc %>% filter(index == site)
+    
+    if (nrow(train_df) == 0 || nrow(test_df) == 0) {
+      return(NULL)
+    }
+    
+    train_x <- data.matrix(train_df[, predictors_all, with = FALSE])
+    test_x  <- data.matrix(test_df[, predictors_all, with = FALSE])
+    
+    train_y <- train_df[[response_var]]
+    test_y  <- test_df[[response_var]]
+    
+    dtrain <- xgb.DMatrix(data = train_x, label = train_y, missing = NA)
+    dtest  <- xgb.DMatrix(data = test_x, label = test_y, missing = NA)
+    watchlist <- list(train = dtrain, eval = dtest)
+    
+    params <- if (as.character(lc) %in% names(lc_param_list)) {
+      lc_param_list[[as.character(lc)]]
+    } else {
+      default_params
+    }
+    
+    model_i <- xgb.train(
+      params = params,
+      data = dtrain,
+      nrounds = nrounds_fixed,
+      watchlist = watchlist,
+      eval_metric = "rmse",
+      early_stopping_rounds = 20,
+      nthread = 1,
+      verbose = 0
+    )
+    
+    preds_i <- predict(model_i, test_x)
+    ev <- as.data.frame(model_i$evaluation_log)
+    
+    final_train_rmse <- ev$train_rmse[nrow(ev)]
+    final_eval_rmse  <- ev$eval_rmse[nrow(ev)]
+    best_iter        <- which.min(ev$eval_rmse)
+    
+    n_check <- 5
+    if (nrow(ev) >= n_check + 1) {
+      train_tail <- tail(ev$train_rmse, n_check + 1)
+      eval_tail  <- tail(ev$eval_rmse, n_check + 1)
+      train_trend <- diff(train_tail)
+      eval_trend  <- diff(eval_tail)
+      has_overfit <- all(train_trend < 0) && all(eval_trend > 0)
+    } else {
+      has_overfit <- FALSE
+    }
+    
+    rmse_gap   <- final_eval_rmse - final_train_rmse
+    rmse_ratio <- final_eval_rmse / final_train_rmse
+    
+    n <- nrow(test_df)
+    
+    data.frame(
+      LC         = rep(lc, n),
+      index      = rep(site, n),
+      utc        = test_df$utc,
+      flux_mean  = test_df[[flux_cfg$mean_std_col]],
+      observed   = test_df[[response_var]],
+      predicted  = preds_i,
+      train_rmse = rep(final_train_rmse, n),
+      eval_rmse  = rep(final_eval_rmse, n),
+      rmse_gap   = rep(rmse_gap, n),
+      rmse_ratio = rep(rmse_ratio, n),
+      best_iter  = rep(best_iter, n),
+      has_overfit = rep(has_overfit, n),
+      stringsAsFactors = FALSE
+    )
+  }
+  
+  stopCluster(cl)
+  registerDoSEQ()
+  
+  cv_results <- cv_results %>%
+    mutate(
+      flux_pred = flux_mean - predicted,
+      flux_obs  = flux_mean - observed
+    )
+  
+  return(cv_results)
+}
+
+##' @title predict_bcm
+##' @name predict_bcm
+##' @author Yang Gu
+##'
+##' @param tenvres Training data.table.
+##' @param pdat Prediction data.table.
+##' @param response_var Response column name.
+##' @param pred_formula Formula used for model.matrix construction.
+##' @param predictors_all Predictor column names.
+##' @param nrounds_fixed Fixed number of boosting rounds.
+##' @param lc_param_list LC-specific XGBoost parameter list.
+##' @param default_params Default XGBoost parameter list.
+##' @param flux_cfg Flux-specific configuration list.
+##'
+##' @description
+##' Fit one model per LC using all observed sites and predict residual
+##' corrections for unobserved points.
+##'
+##' @return
+##' A prediction data.table with flux-corrected outputs.
+predict_bcm <- function(tenvres,
+                                   pdat,
+                                   response_var,
+                                   pred_formula,
+                                   predictors_all,
+                                   nrounds_fixed,
+                                   lc_param_list,
+                                   default_params,
+                                   flux_cfg) {
+  pdat[, flux_resi_pred := NA_real_]
+  
+  for (lc in levels(tenvres$LC)) {
+    df_lc <- tenvres[LC == lc]
+    if (nrow(df_lc) < 10) next
+    
+    mf_train_lc <- model.frame(pred_formula, data = df_lc, na.action = na.pass)
+    mat_train_lc <- model.matrix(pred_formula, data = mf_train_lc)
+    label_train_lc <- df_lc[[response_var]]
+    dtrain_lc <- xgb.DMatrix(data = mat_train_lc, label = label_train_lc, missing = NA)
+    
+    params_lc <- if (lc %in% names(lc_param_list)) lc_param_list[[lc]] else default_params
+    
+    bst_lc <- xgb.train(
+      params = params_lc,
+      data = dtrain_lc,
+      nrounds = nrounds_fixed,
+      verbose = 0
+    )
+    
+    pdat_lc <- pdat[LC == lc]
+    if (nrow(pdat_lc) == 0) next
+    
+    mf_p_lc <- model.frame(pred_formula, data = pdat_lc, na.action = na.pass)
+    mat_p_lc <- model.matrix(pred_formula, data = mf_p_lc)
+    dtest_lc <- xgb.DMatrix(data = mat_p_lc, missing = NA)
+    preds_lc <- predict(bst_lc, dtest_lc)
+    
+    pdat[LC == lc, flux_resi_pred := preds_lc]
+  }
+  
+  pdat[, flux_corrected := get(flux_cfg$mean_std_col) - flux_resi_pred]
+  
+  if (flux_cfg$pred_residual_col != "flux_resi_pred") {
+    pdat[, (flux_cfg$pred_residual_col) := flux_resi_pred]
+  }
+  if (flux_cfg$corrected_col != "flux_corrected") {
+    pdat[, (flux_cfg$corrected_col) := flux_corrected]
+  }
+  
+  return(pdat)
+}
+
+##' @title compute_bcm_metrics
+##' @name compute_bcm_metrics
+##' @author Yang Gu
+##'
+##' @param cv_results Cross-validation result table.
+##'
+##' @description
+##' Compute site-level, LC-level, and corrected-vs-baseline R2 summaries,
+##' as well as overfitting diagnostics.
+##'
+##' @return
+##' A list of summary tables.
+compute_bcm_metrics <- function(cv_results) {
+  r2_by_site <- cv_results %>%
+    group_by(LC, index) %>%
+    summarise(
+      R2 = 1 - sum((observed - predicted)^2) /
+        sum((observed - mean(observed))^2),
+      .groups = "drop"
+    )
+  
+  r2_by_lc <- r2_by_site %>%
+    group_by(LC) %>%
+    summarise(
+      R2 = mean(R2),
+      .groups = "drop"
+    )
+  
+  overfit_table <- cv_results %>%
+    group_by(LC, index) %>%
+    summarise(
+      train_rmse = first(train_rmse),
+      eval_rmse  = first(eval_rmse),
+      rmse_gap   = first(rmse_gap),
+      rmse_ratio = first(rmse_ratio),
+      has_overfit = first(has_overfit),
+      .groups = "drop"
+    )
+  
+  r2_by_index <- cv_results %>%
+    group_by(LC, index) %>%
+    summarise(
+      R2_fused    = cor(flux_pred, flux_obs, use = "complete.obs")^2,
+      R2_baseline = cor(flux_mean, flux_obs, use = "complete.obs")^2,
+      .groups = "drop"
+    )
+  
+  return(list(
+    r2_by_site = r2_by_site,
+    r2_by_lc = r2_by_lc,
+    overfit_table = overfit_table,
+    r2_by_index = r2_by_index
+  ))
+}
+
+##' @title write_bcm_outputs
+##' @name write_bcm_outputs
+##' @author Yang Gu
+##'
+##' @param file_prefix Output file prefix.
+##' @param flux_cfg Flux-specific configuration list.
+##' @param overfit_table Overfitting summary table.
+##' @param pdat Prediction output table.
+##' @param r2_by_site Site-level R2 table.
+##' @param r2_by_lc LC-level R2 table.
+##' @param cv_results Cross-validation output table.
+##' @param r2_by_index Index-level corrected-vs-baseline R2 table.
+##'
+##' @description
+##' Write BCM outputs to disk, using flux-specific file suffixes.
+##'
+##' @return
+##' No return value.
+write_bcm_outputs <- function(file_prefix,
+                              flux_cfg,
+                              overfit_table,
+                              pdat,
+                              r2_by_site,
+                              r2_by_lc,
+                              cv_results,
+                              r2_by_index) {
+  fwrite(overfit_table, paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_overfit.csv"))
+  fwrite(pdat,          paste0(file_prefix, "_pred_lc1_", flux_cfg$output_tag, ".csv"))
+  
+  write.csv(
+    r2_by_site,
+    paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_site.csv"),
+    row.names = FALSE, quote = TRUE
+  )
+  write.csv(
+    r2_by_lc,
+    paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_r2.csv"),
+    row.names = FALSE, quote = TRUE
+  )
+  write.csv(
+    cv_results,
+    paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_cv.csv"),
+    row.names = FALSE, quote = TRUE
+  )
+  write.csv(
+    r2_by_index,
+    paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_index_improve_r2.csv"),
+    row.names = FALSE
+  )
+}
+
+##' @title run_bcm_pipeline
+##' @name run_bcm_pipeline
+##' @author Yang Gu
+##'
+##' @param flux_type Flux type, either "LE" or "NEE".
+##' @param raw_response_var Raw ensemble response variable, e.g. "ens01".
+##' @param envres_file Path to envres .RData file.
+##' @param matched_file Path to matched point-site CSV file.
+##' @param output_dir Directory for writing outputs.
+##'
+##' @description
+##' Run the full BCM workflow for a selected flux type using the integrated
+##' `envres` dataset and ensemble gap-fill predictions.
+##'
+##' @return
+##' A list containing prediction outputs and evaluation summaries.
+run_bcm_pipeline <- function(flux_type,
+                             raw_response_var,
+                             envres_file,
+                             matched_file,
+                             output_dir) {
+  flux_cfg <- get_flux_config(flux_type)
+  response_var <- paste0(raw_response_var, "_residual")
+  
+  data_obj <- read_close_points_and_envres(
+    matched_file = matched_file,
+    envres_file = envres_file
+  )
+  close_points_df <- data_obj$close_points_df
+  envres <- data_obj$envres
+  
+  gfed <- fread(flux_cfg$gfed_file)
+  
+  envres <- filter_envres_for_flux(envres, flux_cfg)
+  envres <- add_gapfill_and_predictors(
+    envres = envres,
+    gfed = gfed,
+    close_points_df = close_points_df,
+    raw_response_var = raw_response_var,
+    flux_cfg = flux_cfg
+  )
+  
+  split_obj <- split_bcm_data(envres, response_var)
+  tenvres <- split_obj$tenvres
+  pdat <- split_obj$pdat
+  rm(envres)
+  gc()
+  
+  model_obj <- get_bcm_model_components()
+  predictors_all <- model_obj$predictors_all
+  pred_formula <- model_obj$pred_formula
+  lc_param_list <- model_obj$lc_param_list
+  default_params <- model_obj$default_params
+  nrounds_fixed <- model_obj$nrounds_fixed
+  
+  tasks <- get_valid_lc_tasks(tenvres)
+  
+  cv_results <- run_bcm_cv(
+    tenvres = tenvres,
+    tasks = tasks,
+    predictors_all = predictors_all,
+    response_var = response_var,
+    nrounds_fixed = nrounds_fixed,
+    lc_param_list = lc_param_list,
+    default_params = default_params,
+    flux_cfg = flux_cfg
+  )
+  
+  pdat <- predict_bcm(
+    tenvres = tenvres,
+    pdat = pdat,
+    response_var = response_var,
+    pred_formula = pred_formula,
+    predictors_all = predictors_all,
+    nrounds_fixed = nrounds_fixed,
+    lc_param_list = lc_param_list,
+    default_params = default_params,
+    flux_cfg = flux_cfg
+  )
+  
+  cols_to_keep <- c(
+    "utc", "index",
+    "flux_resi_pred", "flux_corrected",
+    flux_cfg$pred_residual_col,
+    flux_cfg$corrected_col
+  )
+  cols_to_keep <- intersect(cols_to_keep, names(pdat))
+  pdat <- pdat[, ..cols_to_keep]
+  
+  metric_obj <- compute_bcm_metrics(cv_results)
+  
+  file_prefix <- paste0(output_dir, "/", response_var)
+  
+  write_bcm_outputs(
+    file_prefix = file_prefix,
+    flux_cfg = flux_cfg,
+    overfit_table = metric_obj$overfit_table,
+    pdat = pdat,
+    r2_by_site = metric_obj$r2_by_site,
+    r2_by_lc = metric_obj$r2_by_lc,
+    cv_results = cv_results,
+    r2_by_index = metric_obj$r2_by_index
+  )
+  
+  return(list(
+    pdat = pdat,
+    cv_results = cv_results,
+    r2_by_site = metric_obj$r2_by_site,
+    r2_by_lc = metric_obj$r2_by_lc,
+    overfit_table = metric_obj$overfit_table,
+    r2_by_index = metric_obj$r2_by_index
+  ))
+}
\ No newline at end of file
diff --git a/modules/assim.sequential/inst/data_prep.R b/modules/assim.sequential/inst/data_prep.R
new file mode 100644
index 0000000000..0f8b362cd7
--- /dev/null
+++ b/modules/assim.sequential/inst/data_prep.R
@@ -0,0 +1,633 @@
+## Load packages
+library(ncdf4)
+library(data.table)
+library(dplyr)
+library(future.apply)
+library(tidyr)
+library(lubridate)
+library(purrr)
+library(kgc)
+library(terra)
+library(lutz)
+
+##' @title read_annual_csv_series
+##' @name read_annual_csv_series
+##' @author Yang Gu
+##'
+##' @param years Integer vector of years.
+##' @param path_template A character string template containing one `%d`
+##'   placeholder for year, e.g. "/path/to/file_%d.csv".
+##'
+##' @description
+##' Read a series of annual CSV files and return them as a list of data.tables.
+##'
+##' @return
+##' A named list of data.tables, one per year.
+read_annual_csv_series <- function(years, path_template) {
+  out <- lapply(years, function(y) {
+    fread(sprintf(path_template, y))
+  })
+  names(out) <- as.character(years)
+  return(out)
+}
+
+##' @title bind_annual_tables
+##' @name bind_annual_tables
+##' @author Yang Gu
+##'
+##' @param dt_list A list of data.tables.
+##' @param drop_year Logical; if TRUE and a `year` column exists, remove it.
+##'
+##' @description
+##' Standardize and row-bind annual tables into one combined data.table.
+##'
+##' @return
+##' A single combined data.table.
+bind_annual_tables <- function(dt_list, drop_year = TRUE) {
+  cleaned <- lapply(dt_list, function(df) {
+    df <- as.data.table(df)
+    if (drop_year && "year" %in% names(df)) {
+      df[, year := NULL]
+    }
+    return(df)
+  })
+  out <- rbindlist(cleaned, use.names = TRUE, fill = TRUE)
+  return(out)
+}
+
+##' @title extract_kgc_for_points
+##' @name extract_kgc_for_points
+##' @author Yang Gu
+##'
+##' @param coords_dt A data.table containing columns `lon`, `lat`, and `Y`.
+##' @param kg_file Path to Köppen-Geiger raster.
+##' @param kg_lookup Character vector mapping raster values to KGC classes.
+##'
+##' @description
+##' Extract Köppen-Geiger climate classes for each coordinate point.
+##'
+##' @return
+##' A data.table with columns `index`, `lon`, `lat`, and `KGC`.
+extract_kgc_for_points <- function(coords_dt, kg_file, kg_lookup) {
+  kg_raster <- rast(kg_file)
+  coords_vect <- vect(coords_dt, geom = c("lon", "lat"), crs = "EPSG:4326")
+  ex <- terra::extract(kg_raster, coords_vect, ID = FALSE)
+  
+  vals <- as.integer(ex[[1]])
+  valid <- !is.na(vals) & vals >= 1 & vals <= length(kg_lookup)
+  
+  KGC <- rep(NA_character_, length(vals))
+  KGC[valid] <- kg_lookup[vals[valid]]
+  
+  points_meta <- data.table(
+    index = coords_dt$Y,
+    lon   = coords_dt$lon,
+    lat   = coords_dt$lat,
+    KGC   = KGC
+  )
+  points_meta[, KGC := factor(KGC, levels = kg_lookup)]
+  return(points_meta)
+}
+
+##' @title add_point_metadata_to_era
+##' @name add_point_metadata_to_era
+##' @author Yang Gu
+##'
+##' @param era_dt ERA5 data.table with `index`.
+##' @param coords_file Path to coordinate CSV file.
+##' @param kg_file Path to Köppen-Geiger raster.
+##'
+##' @description
+##' Add point-level metadata, including lon/lat and KGC, to ERA5 records.
+##'
+##' @return
+##' A list with:
+##' \itemize{
+##'   \item `era_all`: ERA data.table merged with metadata
+##'   \item `points_meta`: point metadata table
+##' }
+add_point_metadata_to_era <- function(era_dt, coords_file, kg_file) {
+  test_coords <- fread(coords_file)[, .(lon, lat, Y)]
+  
+  kg_lookup <- c(
+    "Af","Am","Aw","BWh","BWk","BSh","BSk","Csa","Csb","Csc",
+    "Cwa","Cwb","Cwc","Cfa","Cfb","Cfc","Dsa","Dsb","Dsc",
+    "Dsd","Dwa","Dwb","Dwc","Dwd","Dfa","Dfb","Dfc","Dfd","ET","EF"
+  )
+  
+  points_meta <- extract_kgc_for_points(
+    coords_dt = test_coords,
+    kg_file = kg_file,
+    kg_lookup = kg_lookup
+  )
+  
+  era_dt <- merge(
+    era_dt,
+    points_meta,
+    by = "index",
+    all.x = TRUE,
+    allow.cartesian = TRUE
+  )
+  
+  return(list(
+    era_all = era_dt,
+    points_meta = points_meta
+  ))
+}
+
+##' @title expand_tiff_daily
+##' @name expand_tiff_daily
+##' @author Yang Gu
+##'
+##' @param tiff_file Path to TIFF-derived annual covariate CSV.
+##'
+##' @description
+##' Expand annual/static TIFF-based covariates to daily resolution according
+##' to the study year definition.
+##'
+##' @return
+##' A daily-resolution data.table keyed by `index` and `Date`.
+expand_tiff_daily <- function(tiff_file) {
+  tiff <- fread(tiff_file)
+  tiff <- as.data.table(tiff)
+  
+  tiff[, `:=`(
+    start_date = fifelse(
+      Year == 2012,
+      as.Date("2012-01-01"),
+      as.Date(paste0(Year - 1, "-07-16"))
+    ),
+    end_date = fifelse(
+      Year == 2012,
+      as.Date("2012-07-15"),
+      as.Date(paste0(Year, "-07-15"))
+    )
+  )]
+  
+  tiff_daily <- tiff[, .(
+    Date = seq(start_date[1], end_date[1], by = "day")
+  ), by = .(
+    Year, index, cell, cell_lon, cell_lat, diff_lon, diff_lat, distance,
+    LC, twi, PH, Sand, agb, SOC, N, year_since_disturb
+  )]
+  
+  return(tiff_daily)
+}
+
+##' @title read_soil_moisture_data
+##' @name read_soil_moisture_data
+##' @author Yang Gu
+##'
+##' @param soilm_file Path to soil moisture CSV.
+##'
+##' @description
+##' Read and standardize soil moisture data.
+##'
+##' @return
+##' A data.table with standardized columns `index` and `utc`.
+read_soil_moisture_data <- function(soilm_file) {
+  soilm <- fread(soilm_file)
+  setnames(soilm, c("Y", "datetime"), c("index", "utc"))
+  return(soilm)
+}
+
+##' @title add_daily_tiff_covariates
+##' @name add_daily_tiff_covariates
+##' @author Yang Gu
+##'
+##' @param envres Environmental data.table with `index` and `utc`.
+##' @param tiff_daily Daily TIFF covariate data.table.
+##'
+##' @description
+##' Merge daily TIFF covariates into the environmental table.
+##'
+##' @return
+##' A merged data.table.
+add_daily_tiff_covariates <- function(envres, tiff_daily) {
+  envres[, Date := as.Date(utc)]
+  setDT(envres)
+  setDT(tiff_daily)
+  setkey(envres, index, Date)
+  setkey(tiff_daily, index, Date)
+  
+  out <- merge(
+    envres,
+    tiff_daily,
+    by = c("index", "Date"),
+    all.x = TRUE,
+    sort = FALSE
+  )
+  return(out)
+}
+
+##' @title add_meteorological_derived_variables
+##' @name add_meteorological_derived_variables
+##' @author Yang Gu
+##'
+##' @param dt A data.table containing ERA5 meteorological variables.
+##'
+##' @description
+##' Add derived meteorological variables such as PPFD, wind speed,
+##' temperature in Celsius, relative humidity, VPD, and day/night indicator.
+##'
+##' @return
+##' A data.table with derived variables appended.
+add_meteorological_derived_variables <- function(dt) {
+  dt[, `:=`(
+    PPFD      = 0.45 * ssrd / 4.57 / (3 * 3600),
+    WindSpeed = sqrt(u10^2 + v10^2),
+    t_air_C   = t2m - 273.15,
+    d_air_C   = d2m - 273.15
+  )]
+  
+  dt[, `:=`(
+    RH  = 100 * exp((17.625 * d_air_C) / (243.04 + d_air_C) -
+                      (17.625 * t_air_C) / (243.04 + t_air_C)),
+    e_s = 0.6108 * exp((17.27 * t_air_C) / (t_air_C + 237.3)),
+    e_a = 0.6108 * exp((17.27 * d_air_C) / (d_air_C + 237.3)),
+    is_day = as.integer(ssrd > 10)
+  )]
+  
+  dt[, VPD := pmax(e_s - e_a, 0)]
+  dt[, c("t_air_C", "d_air_C", "e_s", "e_a") := NULL]
+  
+  return(dt)
+}
+
+##' @title add_local_time_variables
+##' @name add_local_time_variables
+##' @author Yang Gu
+##'
+##' @param dt A data.table containing `index`, `lat.x`, `lon.x`, and `utc`.
+##'
+##' @description
+##' Add local timezone, local datetime, and seasonal harmonic variables.
+##'
+##' @return
+##' A data.table with local time variables appended.
+add_local_time_variables <- function(dt) {
+  site_coords <- unique(dt[, .(index, lat.x, lon.x)])
+  site_coords[, tz := lutz::tz_lookup_coords(lat.x, lon.x, method = "accurate")]
+  
+  dt[site_coords, tz := i.tz, on = "index"]
+  dt[, tlocal := with_tz(utc, tz = unique(tz)), by = tz]
+  
+  dt[, year_local := year(tlocal)]
+  dt[, days_in_year := ifelse(leap_year(year_local), 366, 365)]
+  dt[, doy := yday(tlocal) +
+       hour(tlocal) / 24 +
+       minute(tlocal) / 1440 +
+       second(tlocal) / 86400]
+  
+  dt[, `:=`(
+    sin_doy = sin(2 * pi * doy / days_in_year),
+    cos_doy = cos(2 * pi * doy / days_in_year)
+  )]
+  
+  cols_to_drop <- intersect(
+    c("cell", "cell_lon", "cell_lat", "diff_lon", "diff_lat",
+      "distance", "tz", "lat.y", "lon.y", "lon_index",
+      "lat_index", "year_local", "days_in_year"),
+    names(dt)
+  )
+  if (length(cols_to_drop) > 0) {
+    dt[, (cols_to_drop) := NULL]
+  }
+  
+  return(dt)
+}
+
+##' @title read_and_fill_modis_daily
+##' @name read_and_fill_modis_daily
+##' @author Yang Gu
+##'
+##' @param modis_files Character vector of MODIS CSV file paths.
+##' @param start_date Start date.
+##' @param end_date End date.
+##'
+##' @description
+##' Read MODIS NDVI/EVI data and fill to daily resolution using nearest
+##' available observations.
+##'
+##' @return
+##' A data.table with columns `index`, `Date`, `EVI`, and `NDVI`.
+read_and_fill_modis_daily <- function(modis_files,
+                                      start_date = as.Date("2012-01-01"),
+                                      end_date   = as.Date("2024-12-31")) {
+  dat <- rbindlist(lapply(modis_files, fread), use.names = TRUE, fill = TRUE)
+  
+  setnames(
+    dat,
+    old = c("ID", "Latitude", "Longitude",
+            "MYD13Q1_061__250m_16_days_EVI",
+            "MYD13Q1_061__250m_16_days_NDVI"),
+    new = c("index", "lat", "lon", "EVI", "NDVI")
+  )
+  
+  dat <- dat[, .(index, Date, EVI, NDVI)]
+  
+  unique_idx <- unique(dat$index)
+  all_days <- data.table(
+    index = rep(unique_idx, each = as.integer(end_date - start_date + 1)),
+    Date  = rep(seq(start_date, end_date, by = "day"), times = length(unique_idx)),
+    lat   = NA_real_,
+    lon   = NA_real_,
+    EVI   = NA_real_,
+    NDVI  = NA_real_
+  )
+  
+  setkey(dat, index, Date)
+  setkey(all_days, index, Date)
+  
+  all_days[, Date_cal := Date]
+  filled <- dat[all_days, on = .(index, Date), roll = "nearest"]
+  final_df <- filled[, .(index, Date = Date_cal, EVI, NDVI)]
+  
+  return(final_df)
+}
+
+##' @title add_modis_covariates
+##' @name add_modis_covariates
+##' @author Yang Gu
+##'
+##' @param envres Environmental result table.
+##' @param modis_dt Daily MODIS table.
+##'
+##' @description
+##' Merge daily MODIS EVI/NDVI data into the environmental result table.
+##'
+##' @return
+##' A merged data.table.
+add_modis_covariates <- function(envres, modis_dt) {
+  out <- merge(
+    envres,
+    modis_dt,
+    by = c("index", "Date"),
+    all.x = TRUE
+  )
+  return(out)
+}
+
+##' @title read_sipnet_flux_outputs
+##' @name read_sipnet_flux_outputs
+##' @author Yang Gu
+##'
+##' @param years Integer vector of years.
+##' @param path_template File path template for SIPNET annual CSV files.
+##'
+##' @description
+##' Read and combine annual SIPNET flux output files.
+##'
+##' @return
+##' A data.table with columns `utc`, `index`, `NEE_mean`, `Qle_mean`, `GPP_mean`.
+read_sipnet_flux_outputs <- function(years, path_template) {
+  sipnet_list <- read_annual_csv_series(years, path_template)
+  sipnet_all <- bind_annual_tables(sipnet_list, drop_year = TRUE)
+  
+  setnames(sipnet_all, "time", "utc")
+  output.df <- sipnet_all[, .(utc, index, NEE_mean, Qle_mean, GPP_mean)]
+  
+  return(output.df)
+}
+
+##' @title add_sipnet_outputs
+##' @name add_sipnet_outputs
+##' @author Yang Gu
+##'
+##' @param envres Environmental result table.
+##' @param sipnet_dt SIPNET output table.
+##'
+##' @description
+##' Merge SIPNET outputs into the environmental result table and convert
+##' NEE units to observation-compatible units.
+##'
+##' @return
+##' A merged data.table.
+add_sipnet_outputs <- function(envres, sipnet_dt) {
+  out <- merge(
+    envres,
+    sipnet_dt[, .(utc, index, NEE_mean, Qle_mean, GPP_mean)],
+    by = c("utc", "index"),
+    all.x = TRUE,
+    sort = FALSE
+  )
+  
+  out[, NEE_mean := NEE_mean * 1e8 / 1.157407]
+  return(out)
+}
+
+##' @title read_and_match_ec_data
+##' @name read_and_match_ec_data
+##' @author Yang Gu
+##'
+##' @param ec_file Path to eddy covariance data CSV.
+##' @param matched_file Path to matched point-site CSV.
+##'
+##' @description
+##' Read eddy covariance data, fill missing flux variables using fallback
+##' columns, and match site IDs to spatial indices.
+##'
+##' @return
+##' A data.table with columns matched to `utc` and `index`.
+read_and_match_ec_data <- function(ec_file, matched_file) {
+  resimet.df <- fread(ec_file)
+  close_points_df <- fread(matched_file)
+  
+  setDT(close_points_df)
+  close_points_df <- close_points_df[order(min_dist_m), .SD[1], by = index]
+  
+  resimet.df[is.na(NEE_CUT_USTAR50) & !is.na(NEE_VUT_USTAR50),
+             NEE_CUT_USTAR50 := NEE_VUT_USTAR50]
+  resimet.df[is.na(GPP_NT_CUT_USTAR50) & !is.na(GPP_NT_VUT_USTAR50),
+             GPP_NT_CUT_USTAR50 := GPP_NT_VUT_USTAR50]
+  
+  resimet.df <- resimet.df[, .(
+    utc, NEE_CUT_USTAR50, LE_F_MDS, GPP_NT_CUT_USTAR50, Site_ID
+  )]
+  
+  site_index_map <- close_points_df[, .(Site_ID, index)]
+  
+  resimet.df <- merge(
+    resimet.df,
+    site_index_map,
+    by = "Site_ID",
+    all.x = TRUE
+  )
+  
+  resimet.df <- resimet.df[, .SD[1], by = .(utc, index)]
+  return(resimet.df)
+}
+
+##' @title add_ec_observations_and_residuals
+##' @name add_ec_observations_and_residuals
+##' @author Yang Gu
+##'
+##' @param envres Environmental result table.
+##' @param ec_dt Eddy covariance table.
+##'
+##' @description
+##' Merge EC observations into the environmental result table and compute
+##' residuals between SIPNET outputs and EC observations.
+##'
+##' @return
+##' A data.table with residual columns added.
+add_ec_observations_and_residuals <- function(envres, ec_dt) {
+  out <- merge(
+    envres,
+    ec_dt,
+    by = c("utc", "index"),
+    all.x = TRUE,
+    sort = FALSE
+  )
+  
+  out[, `:=`(
+    NEE_residual = NEE_mean - NEE_CUT_USTAR50,
+    LE_residual  = Qle_mean - LE_F_MDS,
+    GPP_residual = GPP_mean - GPP_NT_CUT_USTAR50
+  )]
+  
+  out <- out[, .SD[1], by = .(utc, index)]
+  return(out)
+}
+
+##' @title finalize_envres_columns
+##' @name finalize_envres_columns
+##' @author Yang Gu
+##'
+##' @param dt Final environmental result table.
+##'
+##' @description
+##' Clean up duplicated coordinate columns and standardize final names.
+##'
+##' @return
+##' A cleaned data.table ready for saving.
+finalize_envres_columns <- function(dt) {
+  drop_cols <- intersect(c("lat.y", "lon.y"), names(dt))
+  if (length(drop_cols) > 0) {
+    dt[, (drop_cols) := NULL]
+  }
+  
+  rename_old <- intersect(c("lat.x", "lon.x"), names(dt))
+  rename_new <- c("lat", "lon")[c("lat.x", "lon.x") %in% rename_old]
+  
+  if (length(rename_old) > 0) {
+    setnames(dt, rename_old, rename_new)
+  }
+  
+  return(dt)
+}
+
+##' @title build_envres_dataset
+##' @name build_envres_dataset
+##' @author Yang Gu
+##'
+##' @param years Integer vector of years to process.
+##' @param era_template File path template for ERA annual CSV files.
+##' @param coords_file Path to coordinate CSV.
+##' @param kg_file Path to Köppen-Geiger raster.
+##' @param tiff_file Path to TIFF annual covariate CSV.
+##' @param soilm_file Path to soil moisture CSV.
+##' @param modis_files Character vector of MODIS CSV files.
+##' @param sipnet_template File path template for SIPNET annual CSV files.
+##' @param ec_file Path to eddy covariance CSV.
+##' @param matched_file Path to matched spatial index CSV.
+##'
+##' @description
+##' Build the full environmental modeling dataset by combining ERA5,
+##' soil moisture, TIFF-derived covariates, MODIS, SIPNET outputs, and
+##' eddy covariance observations.
+##'
+##' @return
+##' A final integrated data.table.
+build_envres_dataset <- function(
+    years,
+    era_template,
+    coords_file,
+    kg_file,
+    tiff_file,
+    soilm_file,
+    modis_files,
+    sipnet_template,
+    ec_file,
+    matched_file
+) {
+  ## ERA5
+  era_list <- read_annual_csv_series(years, era_template)
+  era_all <- bind_annual_tables(era_list, drop_year = TRUE)
+  rm(era_list)
+  gc()
+  
+  era_meta <- add_point_metadata_to_era(
+    era_dt = era_all,
+    coords_file = coords_file,
+    kg_file = kg_file
+  )
+  era_all <- era_meta$era_all
+  points_meta <- era_meta$points_meta
+  gc()
+  
+  ## TIFF
+  tiff_daily <- expand_tiff_daily(tiff_file)
+  gc()
+  
+  ## Soil moisture
+  soilm <- read_soil_moisture_data(soilm_file)
+  
+  ## Merge ERA + soil moisture
+  envres <- merge(era_all, soilm, by = c("index", "utc"), all.x = TRUE)
+  rm(era_all, soilm)
+  gc()
+  
+  ## Add daily TIFF covariates
+  envres <- add_daily_tiff_covariates(envres, tiff_daily)
+  rm(tiff_daily)
+  gc()
+  
+  ## Derived meteorological variables
+  envres <- add_meteorological_derived_variables(envres)
+  gc()
+  
+  ## Local time and seasonal harmonics
+  envres <- add_local_time_variables(envres)
+  gc()
+  
+  ## MODIS
+  modis_dt <- read_and_fill_modis_daily(modis_files)
+  envres <- add_modis_covariates(envres, modis_dt)
+  rm(modis_dt)
+  gc()
+  
+  ## SIPNET
+  sipnet_dt <- read_sipnet_flux_outputs(years, sipnet_template)
+  envres <- add_sipnet_outputs(envres, sipnet_dt)
+  rm(sipnet_dt)
+  gc()
+  
+  ## Eddy covariance
+  ec_dt <- read_and_match_ec_data(ec_file, matched_file)
+  envres <- add_ec_observations_and_residuals(envres, ec_dt)
+  rm(ec_dt, points_meta)
+  gc()
+  
+  ## Final cleanup
+  envres <- finalize_envres_columns(envres)
+  gc()
+  
+  return(envres)
+}
+
+##' @title save_envres_dataset
+##' @name save_envres_dataset
+##' @author Yang Gu
+##'
+##' @param envres Final environmental result data.table.
+##' @param output_file Output `.RData` file path.
+##'
+##' @description
+##' Save the final dataset to an `.RData` file.
+##'
+##' @return
+##' No return value. Saves `envres` to disk.
+save_envres_dataset <- function(envres, output_file) {
+  save(envres, file = output_file)
+}
\ No newline at end of file

From a8b9dc94fc091688998948664050254b81cc1870 Mon Sep 17 00:00:00 2001
From: ygu <yanggu@bu.edu>
Date: Thu, 16 Apr 2026 22:17:31 -0400
Subject: [PATCH 09/13] Update modules/data.land/R/SIPNET_flux_gapfill.R

Co-authored-by: Michael Dietze <dietze@bu.edu>
---
 modules/data.land/R/SIPNET_flux_gapfill.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/data.land/R/SIPNET_flux_gapfill.R b/modules/data.land/R/SIPNET_flux_gapfill.R
index 2543658981..1960134311 100644
--- a/modules/data.land/R/SIPNET_flux_gapfill.R
+++ b/modules/data.land/R/SIPNET_flux_gapfill.R
@@ -313,7 +313,7 @@ fit_site_xgb_cv <- function(subdf,
 #' @author Yang Gu
 #' @importFrom foreach %dopar%
 #' @export
-xgb_gapfill <- function(site.dir,
+xgb_flux_gapfill <- function(site.dir,
                         pred.var.dir,
                         flux_var,
                         outdir,

From 706252a86860e457e215eeaf4a8d85a4ec263492 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-kd2.scc.bu.edu>
Date: Tue, 28 Apr 2026 16:49:17 -0400
Subject: [PATCH 10/13] upload the revised version to repo

---
 local/sda_runner.R                            | 154 ++++++++++++++
 models/sipnet/R/read_restart.SIPNET.R         | 198 ++++++++++--------
 .../assim.sequential/R/Analysis_sda_block.R   |  32 ++-
 .../assim.sequential/R/sda.enkf_parallel.R    |  34 ++-
 modules/data.land/NAMESPACE                   |   2 +-
 5 files changed, 319 insertions(+), 101 deletions(-)
 create mode 100644 local/sda_runner.R

diff --git a/local/sda_runner.R b/local/sda_runner.R
new file mode 100644
index 0000000000..5ff798610d
--- /dev/null
+++ b/local/sda_runner.R
@@ -0,0 +1,154 @@
+# loading libraries
+library(dplyr)
+library(xts)
+library(PEcAn.all)
+library(purrr)
+library(furrr)
+library(lubridate)
+library(nimble)
+library(ncdf4)
+library(PEcAnAssimSequential)
+library(dplyr)
+library(sp)
+library(raster)
+library(zoo)
+library(ggplot2)
+library(mnormt)
+library(sjmisc)
+library(stringr)
+library(doParallel)
+library(doSNOW)
+library(data.table)
+library(Kendall)
+library(lgarch)
+library(parallel)
+library(foreach)
+library(terra)
+setwd("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/")
+## read settings xml file.
+load("/projectnb/dietzelab/guYANG/pecan/runners/test10/pecan_flux.RData")
+# Change dir name and settings
+settings$outdir      <- "/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/output_inter_q_2/"
+settings$rundir      <- file.path(settings$outdir, "run")
+settings$modeloutdir <- file.path(settings$outdir, "out")
+settings$host$rundir <- file.path(settings$outdir, "run")
+settings$host$outdir <- file.path(settings$outdir, "out")
+settings$host$folder <- file.path(settings$outdir, "out")
+settings$ensemble$size <- 5
+settings$state.data.assimilation$adjustment <- "TRUE"
+settings$host$prerun <- "module load R/4.4.0"
+###### Change Q type
+settings$state.data.assimilation$q.type <- "wishart"
+## Fix the multi output in one timestep bug
+settings$model$jobtemplate <- "/projectnb/dietzelab/guYANG/pecan/runners/test7/sipnet_template.job"
+
+# Load the selected sites
+load("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/sda_idx.Rdata")
+all_ids  <- vapply(settings, \(s) as.character(s$run$site$id), "")
+# Sub settings for this run
+settings <- settings[all_ids %in% keep_ids]
+settings <- PEcAn.settings::as.MultiSettings(settings)
+
+# setup the batch job settings.
+general.job <- list(cores = 28, folder.num = 80)
+batch.settings = structure(list(
+  general.job = general.job,
+  qsub.cmd = "qsub -l h_rt=24:00:00 -l mem_per_core=4G -l buyin -pe omp @CORES@ -V -N @NAME@ -o @STDOUT@ -e @STDERR@ -S /bin/bash"
+))
+settings$state.data.assimilation$batch.settings <- batch.settings
+
+# update settings with the actual PFTs.
+settings <- PEcAn.settings::prepare.settings(settings)
+
+# load 6 obs
+load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.mean.RData")
+load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.cov.RData")
+
+# load 4 obs
+# load("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/observation/Rdata_with_attributes/obs.mean.LandTrendr_GEDI.Rdata")
+# load("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/observation/Rdata_with_attributes/obs.cov.LandTrendr_GEDI.Rdata")
+
+sub_obs <- function(L, keep) setNames(lapply(L, \(l) l[names(l) %in% keep]), names(L))
+obs.mean <- sub_obs(obs.mean, keep_ids)
+obs.cov  <- sub_obs(obs.cov,  keep_ids)
+
+# replace zero observations and variances with small numbers.
+for (i in 1:length(obs.mean)) {
+  if(is.null(obs.mean[[i]][[1]])){
+    next
+  }
+  for (j in 1:length(obs.mean[[i]])) {
+    if (length(obs.mean[[i]][[j]])==0) {
+      next
+    }
+    obs.mean[[i]][[j]][which(obs.mean[[i]][[j]]==0)] <- 0.01
+    if(length(obs.cov[[i]][[j]]) > 1){
+      diag(obs.cov[[i]][[j]])[which(diag(obs.cov[[i]][[j]]<=0.1))] <- 0.1
+    }else{
+      if(obs.cov[[i]][[j]] <= 0.1){
+        obs.cov[[i]][[j]] <- 0.1
+      }
+    }
+  }
+}
+
+if (length(obs.cov[[i]][[j]]) > 1) {
+  d <- diag(obs.cov[[i]][[j]])
+  d[d <= 0.1] <- 0.1
+  diag(obs.cov[[i]][[j]]) <- d
+}
+
+# load PFT parameter file.
+samples_src <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/samples.Rdata"
+samples_dst <- file.path(settings$outdir, "samples.Rdata")
+dir.create(settings$outdir, recursive = TRUE, showWarnings = FALSE)
+if (!file.exists(samples_dst)) file.copy(samples_src, samples_dst, overwrite = TRUE)
+
+control <- list(
+  TimeseriesPlot = FALSE,
+  OutlierDetection = FALSE,
+  send_email = NULL,
+  keepNC = TRUE,
+  forceRun = TRUE,
+  run_parallel = FALSE,
+  MCMC.args = NULL,
+  merge_nc = FALSE,
+  execution = "qsub_parallel"   # or "qsub" / "local"
+)
+
+source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/sda.enkf_local.R")
+source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/read.restart.SIPNET.R")
+source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/analysis_sda_block.R")
+source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/MCMC_block_function.R")
+# source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/sda.enkf_param.R")
+
+###### Be careful: param
+res <- PEcAnAssimSequential:::sda.enkf_local(
+  settings = settings,
+  obs.mean = obs.mean,
+  obs.cov = obs.cov,
+  Q = NULL,
+  pre_enkf_params = NULL,
+  ensemble.samples = NULL,
+  control = control
+)
+
+# job_lines <- c(
+#   "#!/bin/bash",
+#   "module load R/4.4.0",
+#   "Rscript /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/pecan_sda_runner.R"
+# )
+# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.sh")
+
+# qsub -l h_rt=2:00:00 \
+# -l buyin \
+# -l mem_per_core=8G \
+# -pe omp 28 \
+# -V \
+# -N pecan_sda_runner \
+# -o /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.out \
+# -e /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.err \
+# -M yanggu@bu.edu \
+# -m abe \
+# -S /bin/bash \
+# /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.sh
diff --git a/models/sipnet/R/read_restart.SIPNET.R b/models/sipnet/R/read_restart.SIPNET.R
index 4af0aeedbe..ad835fff0f 100755
--- a/models/sipnet/R/read_restart.SIPNET.R
+++ b/models/sipnet/R/read_restart.SIPNET.R
@@ -8,140 +8,172 @@
 ##' 
 ##' @return X.vec      vector of forecasts
 ##' @export
-read_restart.SIPNET <- function(outdir, runid, stop.time, settings, var.names, params) {
-  
-  prior.sla <- params[[which(!names(params) %in% c("soil", "soil_SDA", "restart"))[1]]]$SLA
-  
+read_restart.SIPNET <- function (outdir, runid, stop.time, settings, var.names, params) 
+{
+  prior.sla <- params[[which(!names(params) %in% c("soil", 
+                                                   "soil_SDA", "restart"))[1]]]$SLA
   forecast <- list()
-  params$restart <-c() #state.vars not in var.names will be added here
-  #SIPNET inital states refer to models/sipnet/inst/template.param
-  state.vars <- c("SWE", "SoilMoist", "SoilMoistFrac", "AbvGrndWood", "TotSoilCarb", "LAI", 
-                  "litter_carbon_content", "fine_root_carbon_content", 
+  params$restart <- c()
+  state.vars <- c("SWE", "SoilMoist", "SoilMoistFrac", "AbvGrndWood", "NEE","Qle",
+                  "TotSoilCarb", "LAI", "litter_carbon_content", "fine_root_carbon_content", 
                   "coarse_root_carbon_content", "litter_mass_content_of_water")
-  #when adding new state variables make sure the naming is consistent across read_restart, write_restart and write.configs
-  #pre-populate parsm$restart with NAs so state names can be added
+  
   params$restart <- rep(NA, length(setdiff(state.vars, var.names)))
-  #add states to params$restart NOT in var.names
   names(params$restart) <- setdiff(state.vars, var.names)
-  # Read ensemble output
-  ens <- PEcAn.utils::read.output(runid = runid,
-                                  outdir = file.path(outdir, runid),
-                                  start.year = lubridate::year(stop.time),
-                                  end.year = lubridate::year(stop.time),
-                                  variables = c(state.vars,"time_bounds"))
-  #calculate last
-  start.time <- as.Date(paste0(lubridate::year(stop.time),"-01-01"))
-  time_var <- ens$time_bounds[1,]
-  real_time <- as.POSIXct(time_var*3600*24, origin = start.time)
-  # last <- which(as.Date(real_time)==as.Date(stop.time))[1]
-  last <- which(as.Date(real_time)==as.Date(stop.time))[length(which(as.Date(real_time)==as.Date(stop.time)))]
+  ens <- PEcAn.utils::read.output(runid = runid, outdir = file.path(outdir, 
+                                                                    runid), start.year = lubridate::year(stop.time), end.year = lubridate::year(stop.time), 
+                                  variables = c(state.vars, "time_bounds"))
+  
+  # 4) 若关键变量缺失，直接给出可读的报错信息（避免在 ud_convert 里才崩）
+  must_have <- c("AbvGrndWood","fine_root_carbon_content","coarse_root_carbon_content")
+  missing <- must_have[ sapply(ens[must_have], is.null) ]
+  if (length(missing) > 0) {
+    stop("Missing variables in NetCDF for runid ", runid, ": ",
+         paste(missing, collapse=", "),
+         " | Available: ", paste(names(ens), collapse=", "),
+         call. = FALSE)
+  }
+  
+  start.time <- as.Date(paste0(lubridate::year(stop.time), 
+                               "-01-01"))
+  time_var <- ens$time_bounds[1, ]
+  real_time <- as.POSIXct(time_var * 3600 * 24, origin = start.time)
+  last <- which(as.Date(real_time) == as.Date(stop.time))[length(which(as.Date(real_time) == 
+                                                                         as.Date(stop.time)))]
+  
+  # print("breakpoint3")
+  # print(list(
+  #   last = last,
+  #   has_AbvGrndWood = !is.null(ens$AbvGrndWood),
+  #   len_AbvGrndWood = if (is.null(ens$AbvGrndWood)) NA_integer_ else length(ens$AbvGrndWood),
+  #   has_fine = !is.null(ens$fine_root_carbon_content),
+  #   len_fine = if (is.null(ens$fine_root_carbon_content)) NA_integer_ else length(ens$fine_root_carbon_content),
+  #   has_coarse = !is.null(ens$coarse_root_carbon_content),
+  #   len_coarse = if (is.null(ens$coarse_root_carbon_content)) NA_integer_ else length(ens$coarse_root_carbon_content)
+  # ))
+  
+  if (is.null(ens$AbvGrndWood)) {
+    stop("ens$AbvGrndWood is NULL. Available vars: ", paste(names(ens), collapse=", "), call. = FALSE)
+  }
+  if (is.na(last) || length(ens$AbvGrndWood) < last) {
+    stop("Index 'last' invalid for AbvGrndWood: last=", last,
+         " len=", length(ens$AbvGrndWood), call. = FALSE)
+  }
   
-  #### PEcAn Standard Outputs
   if ("AbvGrndWood" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(ens$AbvGrndWood[last],  "kg/m^2", "Mg/ha")
+    # print("breakpoint3.1")
+    forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(ens$AbvGrndWood[last], 
+                                                                "kg/m^2", "Mg/ha")
     names(forecast[[length(forecast)]]) <- c("AbvGrndWood")
-    
-    wood_total_C    <- ens$AbvGrndWood[last] + ens$fine_root_carbon_content[last] + ens$coarse_root_carbon_content[last]
-    if (wood_total_C<=0) wood_total_C <- 0.0001 # Making sure we are not making Nans in case there is no plant living there.
-    
-    params$restart["abvGrndWoodFrac"] <- ens$AbvGrndWood[last]  / wood_total_C
-    params$restart["coarseRootFrac"]  <- ens$coarse_root_carbon_content[last] / wood_total_C
-    params$restart["fineRootFrac"]    <- ens$fine_root_carbon_content[last]   / wood_total_C
-  }else{
-    params$restart["AbvGrndWood"] <- PEcAn.utils::ud_convert(ens$AbvGrndWood[last],  "kg/m^2", "g/m^2")
-    # calculate fractions, store in params, will use in write_restart
-    wood_total_C    <- ens$AbvGrndWood[last] + ens$fine_root_carbon_content[last] + ens$coarse_root_carbon_content[last]
-    if (wood_total_C<=0) wood_total_C <- 0.0001 # Making sure we are not making Nans in case there is no plant living there.
-    
-    params$restart["abvGrndWoodFrac"] <- ens$AbvGrndWood[last]  / wood_total_C
-    params$restart["coarseRootFrac"]  <- ens$coarse_root_carbon_content[last] / wood_total_C
-    params$restart["fineRootFrac"]    <- ens$fine_root_carbon_content[last]   / wood_total_C
+    wood_total_C <- ens$AbvGrndWood[last] + ens$fine_root_carbon_content[last] + 
+      ens$coarse_root_carbon_content[last]
+    if (wood_total_C <= 0) 
+      wood_total_C <- 1e-04
+    params$restart["abvGrndWoodFrac"] <- ens$AbvGrndWood[last]/wood_total_C
+    params$restart["coarseRootFrac"] <- ens$coarse_root_carbon_content[last]/wood_total_C
+    params$restart["fineRootFrac"] <- ens$fine_root_carbon_content[last]/wood_total_C
+  }
+  else {
+    params$restart["AbvGrndWood"] <- PEcAn.utils::ud_convert(ens$AbvGrndWood[last], 
+                                                             "kg/m^2", "g/m^2")
+    wood_total_C <- ens$AbvGrndWood[last] + ens$fine_root_carbon_content[last] + 
+      ens$coarse_root_carbon_content[last]
+    if (wood_total_C <= 0) 
+      wood_total_C <- 1e-04
+    params$restart["abvGrndWoodFrac"] <- ens$AbvGrndWood[last]/wood_total_C
+    params$restart["coarseRootFrac"] <- ens$coarse_root_carbon_content[last]/wood_total_C
+    params$restart["fineRootFrac"] <- ens$fine_root_carbon_content[last]/wood_total_C
   }
   
   if ("GWBI" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(mean(ens$GWBI),  "kg/m^2/s", "Mg/ha/yr")
+    forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(mean(ens$GWBI), 
+                                                                "kg/m^2/s", "Mg/ha/yr")
     names(forecast[[length(forecast)]]) <- c("GWBI")
   }
-  
-  # Reading in NET Ecosystem Exchange for SDA - unit is kg C m-2 s-1 and the average is estimated
   if ("NEE" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- mean(ens$NEE)  ## 
-    names(forecast[[length(forecast)]]) <- c("NEE")
+    forecast[[length(forecast) + 1]] <-
+      nee_model_to_obs(mean(ens$NEE))
+    names(forecast[[length(forecast)]]) <- "NEE"
   }
   
-  
-  # Reading in Latent heat flux for SDA  - unit is MW m-2
   if ("Qle" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- ens$Qle[last]*1e-6  ##  
+    forecast[[length(forecast) + 1]] <- mean(ens$Qle)
     names(forecast[[length(forecast)]]) <- c("Qle")
   }
-  
   if ("leaf_carbon_content" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- ens$leaf_carbon_content[last]  ## kgC/m2*m2/kg*2kg/kgC
+    forecast[[length(forecast) + 1]] <- ens$leaf_carbon_content[last]
     names(forecast[[length(forecast)]]) <- c("LeafC")
   }
-  
   if ("LAI" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- ens$LAI[last]  ## m2/m2 
+    forecast[[length(forecast) + 1]] <- ens$LAI[last]
     names(forecast[[length(forecast)]]) <- c("LAI")
-  }else{
+  }
+  else {
     params$restart["LAI"] <- ens$LAI[last]
   }
-  
-  litter_carbon_content <- ens$litter_carbon_content[last] %||% NA_real_  ##kgC/m2
   if ("litter_carbon_content" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- litter_carbon_content
+    forecast[[length(forecast) + 1]] <- ens$litter_carbon_content[last]
     names(forecast[[length(forecast)]]) <- c("litter_carbon_content")
-  }else{
-    params$restart["litter_carbon_content"] <- PEcAn.utils::ud_convert(litter_carbon_content, 'kg m-2', 'g m-2')
   }
-  
-  litter_mass_content_of_water <- ens$litter_mass_content_of_water[last] %||% NA_real_  ##kgC/m2
+  else {
+    params$restart["litter_carbon_content"] <- PEcAn.utils::ud_convert(ens$litter_carbon_content[last], 
+                                                                       "kg m-2", "g m-2")
+  }
   if ("litter_mass_content_of_water" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- litter_mass_content_of_water
+    forecast[[length(forecast) + 1]] <- ens$litter_mass_content_of_water[last]
     names(forecast[[length(forecast)]]) <- c("litter_mass_content_of_water")
-  }else{
-    params$restart["litter_mass_content_of_water"] <- litter_mass_content_of_water
   }
-  
+  else {
+    params$restart["litter_mass_content_of_water"] <- ens$litter_mass_content_of_water[last]
+  }
   if ("SoilMoist" %in% var.names) {
     forecast[[length(forecast) + 1]] <- ens$SoilMoist[last]
     names(forecast[[length(forecast)]]) <- c("SoilMoist")
-  }else{
+  }
+  else {
     params$restart["SoilMoist"] <- ens$SoilMoist[last]
   }
-  
   if ("SoilMoistFrac" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- ens$SoilMoistFrac[last]*100  ## here we multiply it by 100 to convert from proportion to percentage.
+    forecast[[length(forecast) + 1]] <- ens$SoilMoistFrac[last] * 
+      100
     names(forecast[[length(forecast)]]) <- c("SoilMoistFrac")
-  }else{
+  }
+  else {
     params$restart["SoilMoistFrac"] <- ens$SoilMoistFrac[last]
   }
-  
-  # This is snow
   if ("SWE" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- ens$SWE[last]  ## kgC/m2
+    forecast[[length(forecast) + 1]] <- ens$SWE[last]
     names(forecast[[length(forecast)]]) <- c("SWE")
-  }else{
+  }
+  else {
     params$restart["SWE"] <- ens$SWE[last]/10
   }
-  
   if ("TotLivBiom" %in% var.names) {
-    forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(ens$TotLivBiom[last],  "kg/m^2", "Mg/ha")
+    forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(ens$TotLivBiom[last], 
+                                                                "kg/m^2", "Mg/ha")
     names(forecast[[length(forecast)]]) <- c("TotLivBiom")
   }
-  
   if ("TotSoilCarb" %in% var.names) {
     forecast[[length(forecast) + 1]] <- ens$TotSoilCarb[last]
     names(forecast[[length(forecast)]]) <- c("TotSoilCarb")
-  }else{
-    params$restart["TotSoilCarb"] <- PEcAn.utils::ud_convert(ens$TotSoilCarb[last], 'kg m-2', 'g m-2') # kgC/m2 -> gC/m2
   }
-  
-  #remove any remaining NAs from params$restart
+  else {
+    params$restart["TotSoilCarb"] <- PEcAn.utils::ud_convert(ens$TotSoilCarb[last], 
+                                                             "kg m-2", "g m-2")
+  }
   params$restart <- stats::na.omit(params$restart)
-  
+  # print(runid)
   X_tmp <- list(X = unlist(forecast), params = params)
-  
+  ### Check
+  # print(X_tmp)
   return(X_tmp)
-} # read_restart.SIPNET
\ No newline at end of file
+  # print("end of write.config.SIPNET")
+}
+
+nee_model_to_obs <- function(x) {
+  x * 1e8 / 1.157407
+}
+
+nee_obs_to_model <- function(x) {
+  x * 1.157407 / 1e8
+}
diff --git a/modules/assim.sequential/R/Analysis_sda_block.R b/modules/assim.sequential/R/Analysis_sda_block.R
index a032ae4be3..7fef2997e6 100644
--- a/modules/assim.sequential/R/Analysis_sda_block.R
+++ b/modules/assim.sequential/R/Analysis_sda_block.R
@@ -461,9 +461,23 @@ MCMC_block_function <- function(block) {
                   control = list(propCov= block$data$pf, adaptScaleOnly = TRUE,
                                  latents = "X", pfOptimizeNparticles = TRUE))
   #add toggle Y sampler.
-  for (i in 1:block$constant$YN) {
-    conf$addSampler(paste0("y.censored[", i, "]"), 'toggle', control=list(type='RW'))
+  # Revise 1: only those who needed is y.censored
+  na_idx <- which(is.na(block$data$y.censored))
+  
+  if (length(na_idx) > 0) {
+    for (i in na_idx) {
+      conf$addSampler(
+        target = paste0("y.censored[", i, "]"),
+        type = "toggle",
+        control = list(type = "RW")
+      )
+    }
   }
+  ## Delete this
+  # for (i in 1:block$constant$YN) {
+  #   conf$addSampler(paste0("y.censored[", i, "]"), 'toggle', control=list(type='RW'))
+  # }
+  ## Finish Revise
   # conf$printSamplers()
   #compile MCMC
   Rmcmc <- nimble::buildMCMC(conf)
@@ -471,12 +485,14 @@ MCMC_block_function <- function(block) {
   Cmcmc <- nimble::compileNimble(Rmcmc, project = model_pred, showCompilerOutput = FALSE)
   
   #if we don't have any NA in the Y.
-  if (!any(is.na(block$data$y.censored))) {
-    #add toggle Y sampler.
-    for(i in 1:block$constant$YN) {
-      valueInCompiledNimbleFunction(Cmcmc$samplerFunctions[[samplerNumberOffset+i]], 'toggle', 0)
-    }
-  }
+  ## Revise 2: Delete this
+  # if (!any(is.na(block$data$y.censored))) {
+  #   #add toggle Y sampler.
+  #   for(i in 1:block$constant$YN) {
+  #     valueInCompiledNimbleFunction(Cmcmc$samplerFunctions[[samplerNumberOffset+i]], 'toggle', 0)
+  #   }
+  # }
+  ## Finish Revise
   
   #run MCMC
   dat <- runMCMC(Cmcmc, niter = block$MCMC$niter, nburnin = block$MCMC$nburnin, thin = block$MCMC$nthin, nchains = block$MCMC$nchain)
diff --git a/modules/assim.sequential/R/sda.enkf_parallel.R b/modules/assim.sequential/R/sda.enkf_parallel.R
index 3d9b073893..e8633d5c2f 100644
--- a/modules/assim.sequential/R/sda.enkf_parallel.R
+++ b/modules/assim.sequential/R/sda.enkf_parallel.R
@@ -169,17 +169,33 @@ sda.enkf_local <- function(settings,
         for (i in 1:length(settings$run$inputs$met$path)) {
           #---------------- model specific split inputs
           ### model specific split inputs
+          ### Revise: delete those
+                    # settings$run$inputs$met$path[[i]] <- do.call(
+          #   my.split_inputs,
+          #   args = list(
+          #     settings = settings,
+          #     start.time = lubridate::ymd_hms(settings$run$site$met.start, truncated = 3), # This depends if we are restart or not
+          #     stop.time = lubridate::ymd_hms(settings$run$site$met.end, truncated = 3),
+          #     inputs =  settings$run$inputs$met$path[[i]],
+          #     outpath = paste0(paste0(settings$outdir, "/Extracted_met/"), settings$run$site$id),
+          #     overwrite =F
+          #   )
+          # )
+          split_args <- list(
+            start.time = lubridate::ymd_hms(settings$run$site$met.start, truncated = 3),
+            stop.time  = lubridate::ymd_hms(settings$run$site$met.end, truncated = 3),
+            inputs     = settings$run$inputs$met$path[[i]],
+            outpath    = file.path(settings$outdir, "Extracted_met", settings$run$site$id),
+            overwrite  = FALSE
+          )
+          if (settings$model$type != "SIPNET") {
+            split_args <- c(list(settings = settings), split_args)
+          }
           settings$run$inputs$met$path[[i]] <- do.call(
             my.split_inputs,
-            args = list(
-              settings = settings,
-              start.time = lubridate::ymd_hms(settings$run$site$met.start, truncated = 3), # This depends if we are restart or not
-              stop.time = lubridate::ymd_hms(settings$run$site$met.end, truncated = 3),
-              inputs =  settings$run$inputs$met$path[[i]],
-              outpath = paste0(paste0(settings$outdir, "/Extracted_met/"), settings$run$site$id),
-              overwrite =F
-            )
+            args = split_args
           )
+          ### Finish Revise
           # changing the start and end date which will be used for model2netcdf.model
           settings$run$start.date <- lubridate::ymd_hms(settings$state.data.assimilation$start.date, truncated = 3)
           settings$run$end.date <- lubridate::ymd_hms(settings$state.data.assimilation$end.date, truncated = 3)
@@ -211,7 +227,7 @@ sda.enkf_local <- function(settings,
     # find a site that has all registered inputs except for the parameter field.
     if (all(names.sampler %in% names.site.input)) {
       input_design <- PEcAn.uncertainty::generate_joint_ensemble_design(settings = settings[[i]], 
-                                                                        ensemble_samples = ensemble.samples, 
+                                                                      
                                                                         ensemble_size = nens)[[1]]
       break
     }
diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE
index 14e120593c..ddfec52ad7 100644
--- a/modules/data.land/NAMESPACE
+++ b/modules/data.land/NAMESPACE
@@ -75,7 +75,7 @@ export(to_co2e)
 export(validate_events_json)
 export(write_ic)
 export(write_veg)
-export(xgb_gapfill)
+export(xgb_flux_gapfill)
 importFrom(dplyr,"%>%")
 importFrom(foreach,"%dopar%")
 importFrom(magrittr,"%>%")

From 5367eb870fd9f7f61ee4ac05f63c06a325cde0d6 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-gc1.scc.bu.edu>
Date: Thu, 30 Apr 2026 14:17:04 -0400
Subject: [PATCH 11/13]  multi chain MCMC config & fix NEE unit

---
 local/sda_runner.R                            | 56 ++++++++-------
 models/sipnet/R/model2netcdf.SIPNET.R         | 66 +++++++++--------
 models/sipnet/man/model2netcdf.SIPNET.Rd      |  2 +-
 .../assim.sequential/R/Analysis_sda_block.R   | 70 ++++++++++++++++++-
 .../assim.sequential/R/sda.enkf_parallel.R    |  1 +
 5 files changed, 131 insertions(+), 64 deletions(-)

diff --git a/local/sda_runner.R b/local/sda_runner.R
index 5ff798610d..cddc255cd8 100644
--- a/local/sda_runner.R
+++ b/local/sda_runner.R
@@ -24,18 +24,18 @@ library(lgarch)
 library(parallel)
 library(foreach)
 library(terra)
-setwd("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/")
+setwd("/projectnb/dietzelab/guYANG/pecan/")
 ## read settings xml file.
-load("/projectnb/dietzelab/guYANG/pecan/runners/test10/pecan_flux.RData")
+load("/projectnb/dietzelab/guYANG/pecan/pecan.Rdata")
 # Change dir name and settings
-settings$outdir      <- "/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/output_inter_q_2/"
+settings$outdir      <- "/projectnb/dietzelab/guYANG/pecan/output/adjust/"
 settings$rundir      <- file.path(settings$outdir, "run")
 settings$modeloutdir <- file.path(settings$outdir, "out")
 settings$host$rundir <- file.path(settings$outdir, "run")
 settings$host$outdir <- file.path(settings$outdir, "out")
 settings$host$folder <- file.path(settings$outdir, "out")
-settings$ensemble$size <- 5
-settings$state.data.assimilation$adjustment <- "TRUE"
+settings$ensemble$size <- 25
+settings$state.data.assimilation$adjustment <- "FALSE"
 settings$host$prerun <- "module load R/4.4.0"
 ###### Change Q type
 settings$state.data.assimilation$q.type <- "wishart"
@@ -64,10 +64,6 @@ settings <- PEcAn.settings::prepare.settings(settings)
 load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.mean.RData")
 load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.cov.RData")
 
-# load 4 obs
-# load("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/observation/Rdata_with_attributes/obs.mean.LandTrendr_GEDI.Rdata")
-# load("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/observation/Rdata_with_attributes/obs.cov.LandTrendr_GEDI.Rdata")
-
 sub_obs <- function(L, keep) setNames(lapply(L, \(l) l[names(l) %in% keep]), names(L))
 obs.mean <- sub_obs(obs.mean, keep_ids)
 obs.cov  <- sub_obs(obs.cov,  keep_ids)
@@ -99,7 +95,7 @@ if (length(obs.cov[[i]][[j]]) > 1) {
 }
 
 # load PFT parameter file.
-samples_src <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/samples.Rdata"
+samples_src <- "/projectnb/dietzelab/guYANG/pecan/cali_samples.Rdata"
 samples_dst <- file.path(settings$outdir, "samples.Rdata")
 dir.create(settings$outdir, recursive = TRUE, showWarnings = FALSE)
 if (!file.exists(samples_dst)) file.copy(samples_src, samples_dst, overwrite = TRUE)
@@ -108,7 +104,7 @@ control <- list(
   TimeseriesPlot = FALSE,
   OutlierDetection = FALSE,
   send_email = NULL,
-  keepNC = TRUE,
+  keepNC = FALSE,
   forceRun = TRUE,
   run_parallel = FALSE,
   MCMC.args = NULL,
@@ -116,13 +112,6 @@ control <- list(
   execution = "qsub_parallel"   # or "qsub" / "local"
 )
 
-source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/sda.enkf_local.R")
-source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/read.restart.SIPNET.R")
-source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/analysis_sda_block.R")
-source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/MCMC_block_function.R")
-# source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/sda.enkf_param.R")
-
-###### Be careful: param
 res <- PEcAnAssimSequential:::sda.enkf_local(
   settings = settings,
   obs.mean = obs.mean,
@@ -130,25 +119,40 @@ res <- PEcAnAssimSequential:::sda.enkf_local(
   Q = NULL,
   pre_enkf_params = NULL,
   ensemble.samples = NULL,
-  control = control
+  control = list(
+    TimeseriesPlot = FALSE,
+    OutlierDetection = FALSE,
+    send_email = NULL,
+    keepNC = TRUE,
+    forceRun = TRUE,
+    run_parallel = FALSE,
+    MCMC.args = list(
+      niter = 200000,   
+      nburnin = 100000,
+      nthin = 5,
+      nchain = 3
+    ),
+    merge_nc = FALSE,
+    execution = "qsub_parallel"   # or "qsub" / "local"
+  )
 )
 
 # job_lines <- c(
 #   "#!/bin/bash",
 #   "module load R/4.4.0",
-#   "Rscript /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/pecan_sda_runner.R"
+#   "Rscript /projectnb/dietzelab/guYANG/pecan/pecan/local/sda_runner.R"
 # )
-# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.sh")
+# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.sh")
 
-# qsub -l h_rt=2:00:00 \
+# qsub -l h_rt=6:00:00 \
 # -l buyin \
 # -l mem_per_core=8G \
 # -pe omp 28 \
 # -V \
-# -N pecan_sda_runner \
-# -o /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.out \
-# -e /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.err \
+# -N sda_runner \
+# -o /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.out \
+# -e /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.err \
 # -M yanggu@bu.edu \
 # -m abe \
 # -S /bin/bash \
-# /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.sh
+# /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.sh
diff --git a/models/sipnet/R/model2netcdf.SIPNET.R b/models/sipnet/R/model2netcdf.SIPNET.R
index 5d96f615fe..8264c8f9d7 100644
--- a/models/sipnet/R/model2netcdf.SIPNET.R
+++ b/models/sipnet/R/model2netcdf.SIPNET.R
@@ -58,7 +58,7 @@ mergeNC <- function(
 #' @export
 #' @author Shawn Serbin, Michael Dietze
 model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, delete.raw = FALSE, revision = NULL, prefix = "sipnet.out",
-                                overwrite = FALSE, conflict = FALSE) {
+                                overwrite = TRUE, conflict = FALSE) {
   ### Read in model output in SIPNET format
   sipnet_out_file <- file.path(outdir, prefix)
   # SIPNET v1 had a "Notes" comment line before the header; v2 removed it.
@@ -119,8 +119,8 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
   
   
   timestep.s <- 86400 / out_day
-
-
+  
+  
   ## Unit conversions
   #
   # CKB 20260407: Not using ud_convert here is intentional!
@@ -134,7 +134,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
   cm_step_to_mm_sec <- function(x) x * 10 / timestep.s
   sipnet_output <- sipnet_output |>
     dplyr::mutate(
-
+      
       # C and N pools
       dplyr::across(
         .cols = c(
@@ -145,7 +145,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
         ),
         .fns = g_to_kg
       ),
-
+      
       # C and N fluxes
       dplyr::across(
         .cols = c(
@@ -154,7 +154,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
         ),
         .fns = g_step_to_kg_sec
       ),
-
+      
       # Water pools
       dplyr::across(
         .cols = c(
@@ -163,7 +163,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
         ),
         .fns = cm_to_mm
       ),
-
+      
       # Water fluxes
       dplyr::across(
         .cols = dplyr::all_of("evapotranspiration"),
@@ -172,12 +172,12 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
       # Water flux special case:
       # Sipnet reports transpiration, and no other variables, in cm/day not cm/timestep.
       fluxestranspiration = cm_to_mm(.data$fluxestranspiration) / 86400, # cm/day -> mm/sec
-
+      
       # Date and time
       datetime = sipnet2datetime(.data$year, .data$day, .data$time)
     )
-
-
+  
+  
   # calculate LAI for standard output
   # LAI = plantLeafC / leafCSpWt
   # both operands are in carbon units (gC/m2 and gC/m2_leaf),
@@ -187,22 +187,20 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
                                        "sipnet.param"), stringsAsFactors = FALSE)
   leafCSpWt <- param[param[, 1] == "leafCSpWt", 2]
   SLA <- 1000 / leafCSpWt  # m2 leaf / kg C
-
+  
   
   ### Loop over years in SIPNET output to create separate netCDF outputs
   for (y in year_seq) {
-    #initialize the conflicted as FALSE
     conflicted <- FALSE
-    conflict <- TRUE    #conflict is set to TRUE to enable the rename of yearly nc file for merging SDA results with sub-annual data
-    #if we have conflicts on this file.
-    if (file.exists(file.path(outdir, paste(y, "nc", sep = "."))) & overwrite == FALSE & conflict == FALSE) {
-      next
-    }else if(file.exists(file.path(outdir, paste(y, "nc", sep = "."))) & conflict){
-      conflicted <- TRUE
-      file.rename(file.path(outdir, paste(y, "nc", sep = ".")), file.path(outdir, "previous.nc"))
+    nc_file <- file.path(outdir, paste(y, "nc", sep = "."))
+    if (file.exists(nc_file)) {
+      ok <- file.remove(nc_file)
+      if (!ok) {
+        stop("Failed to remove existing NetCDF file: ", nc_file)
+      }
     }
-    print(paste("---- Processing year: ", y))  # turn on for debugging
-
+    print(paste("---- Processing year: ", y))  # turn on for debugging  
+    
     ## Subset data for processing
     sub.sipnet.output <- subset(sipnet_output, sipnet_output$year == y)
     
@@ -238,10 +236,10 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
       "coarse_root_carbon_content" = sub.sipnet.output$coarseRootC,
       "LAI" = sub.sipnet.output$plantLeafC * SLA,
       "TotLivBiom" = sub.sipnet.output$plantWoodC + sub.sipnet.output$plantLeafC +
-                       sub.sipnet.output$coarseRootC + sub.sipnet.output$fineRootC,
+        sub.sipnet.output$coarseRootC + sub.sipnet.output$fineRootC,
       "TotSoilCarb" = sub.sipnet.output$soil + sub.sipnet.output$litter,
       "AGB" = sub.sipnet.output$plantWoodC + sub.sipnet.output$plantLeafC,
-
+      
       # Water variables:
       # Liquid water units are cm in Sipnet; in PEcAn they're kg water m-2
       #  (which is equivalent to mm: (water density = 1000 kg m-3) * (1 m/ 1000 mm) = (1 kg m-2)/mm
@@ -255,14 +253,14 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
       "SoilMoistFrac" = sub.sipnet.output$soilWetnessFrac,
       "SWE" = sub.sipnet.output$snow  # Snow Water Equivalent
     )
-
+    
     if ("litterWater" %in% names(sub.sipnet.output)) { # Removed in SIPNET v2; only extract if present
       output[["litter_mass_content_of_water"]] <- sub.sipnet.output$litterWater
     }
     if ("woodCreation" %in% names(sub.sipnet.output)) { # Added in SIPNET v2; only extract if present
       output[["GWBI"]] <- sub.sipnet.output$woodCreation
     }
-
+    
     # columns only present in sipnet >= v2 with N and methane turned on
     if ("minN" %in% names(sub.sipnet.output)) {
       output[["mineral_N"]] <- sub.sipnet.output$minN
@@ -288,7 +286,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
     if ("ch4" %in% names(sub.sipnet.output)) {
       output[["CH4_flux"]] <- sub.sipnet.output$ch4
     }
-
+    
     output[["time_bounds"]] <- c(rbind(bounds[,1], bounds[,2]))
     
     # ******************** Declare netCDF variables ********************#
@@ -344,7 +342,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
                                        longname = "history time interval endpoints", dim=list(time_interval,time = t), 
                                        prec = "double")              
     )
-
+    
     if ("litter_mass_content_of_water" %in% names(output)) {
       nc_var[["litter_mass_content_of_water"]] <- PEcAn.utils::to_ncvar("litter_mass_content_of_water", dims)
     }
@@ -354,30 +352,30 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
     }
     if ("mineral_N" %in% names(output)) {
       nc_var[["mineral_N"]] <- ncdf4::ncvar_def("mineral_N", units = "kg N m-2",
-        dim = list(lon, lat, t), missval = -999, longname = "Soil mineral nitrogen")
+                                                dim = list(lon, lat, t), missval = -999, longname = "Soil mineral nitrogen")
     }
     if ("soil_organic_N" %in% names(output)) {
       nc_var[["soil_organic_N"]] <- ncdf4::ncvar_def("soil_organic_N", units = "kg N m-2",
-        dim = list(lon, lat, t), missval = -999, longname = "Soil organic nitrogen")
+                                                     dim = list(lon, lat, t), missval = -999, longname = "Soil organic nitrogen")
     }
     if ("litter_N" %in% names(output)) {
       nc_var[["litter_N"]] <- ncdf4::ncvar_def("litter_N", units = "kg N m-2",
-        dim = list(lon, lat, t), missval = -999, longname = "Litter nitrogen")
+                                               dim = list(lon, lat, t), missval = -999, longname = "Litter nitrogen")
     }
     if ("N2O_flux" %in% names(output)) {
       nc_var[["N2O_flux"]] <- PEcAn.utils::to_ncvar("N2O_flux", dims)
     }
     if ("N_leaching" %in% names(output)) {
       nc_var[["N_leaching"]] <- ncdf4::ncvar_def("N_leaching", units = "kg N m-2 s-1",
-        dim = list(lon, lat, t), missval = -999, longname = "Nitrogen leaching flux")
+                                                 dim = list(lon, lat, t), missval = -999, longname = "Nitrogen leaching flux")
     }
     if ("N_fixation" %in% names(output)) {
       nc_var[["N_fixation"]] <- ncdf4::ncvar_def("N_fixation", units = "kg N m-2 s-1",
-        dim = list(lon, lat, t), missval = -999, longname = "Nitrogen fixation flux")
+                                                 dim = list(lon, lat, t), missval = -999, longname = "Nitrogen fixation flux")
     }
     if ("N_uptake" %in% names(output)) {
       nc_var[["N_uptake"]] <- ncdf4::ncvar_def("N_uptake", units = "kg N m-2 s-1",
-        dim = list(lon, lat, t), missval = -999, longname = "Plant nitrogen uptake flux")
+                                               dim = list(lon, lat, t), missval = -999, longname = "Plant nitrogen uptake flux")
     }
     if ("CH4_flux" %in% names(output)) {
       nc_var[["CH4_flux"]] <- PEcAn.utils::to_ncvar("CH4_flux", dims)
@@ -449,4 +447,4 @@ sipnet2datetime <- function(year, doy, hour){
   )
   
   as.POSIXct(datetime)
-}
+}
\ No newline at end of file
diff --git a/models/sipnet/man/model2netcdf.SIPNET.Rd b/models/sipnet/man/model2netcdf.SIPNET.Rd
index 5042dbc734..01d55577d4 100644
--- a/models/sipnet/man/model2netcdf.SIPNET.Rd
+++ b/models/sipnet/man/model2netcdf.SIPNET.Rd
@@ -13,7 +13,7 @@ model2netcdf.SIPNET(
   delete.raw = FALSE,
   revision = NULL,
   prefix = "sipnet.out",
-  overwrite = FALSE,
+  overwrite = TRUE,
   conflict = FALSE
 )
 }
diff --git a/modules/assim.sequential/R/Analysis_sda_block.R b/modules/assim.sequential/R/Analysis_sda_block.R
index 7fef2997e6..2d9f2eb16e 100644
--- a/modules/assim.sequential/R/Analysis_sda_block.R
+++ b/modules/assim.sequential/R/Analysis_sda_block.R
@@ -84,6 +84,19 @@ analysis_sda_block <- function (settings, block.list.all, X, obs.mean, obs.cov,
   foreach::registerDoSEQ()
   PEcAn.logger::logger.info("Completed!")
   
+  #### Revise: Add MCMC block diag save
+  mcmc_diag <- purrr::map(
+    block.list.all[[t]],
+    "mcmc_diag"
+  )
+  mcmc_diag <- mcmc_diag[!vapply(mcmc_diag, is.null, logical(1))]
+  if (length(mcmc_diag) > 0) {
+    mcmc_diag <- dplyr::bind_rows(mcmc_diag)
+  } else {
+    mcmc_diag <- NULL
+  }
+  #### Finish Revise
+  
   #convert from block lists to vector values.
   if ("try-error" %in% class(try(V <- block.2.vector(block.list.all[[t]], X, H, settings$state.data.assimilation$adjustment)))) {
     PEcAn.logger::logger.severe("Something wrong within the block.2.vector function.")
@@ -98,6 +111,7 @@ analysis_sda_block <- function (settings, block.list.all, X, obs.mean, obs.cov,
               Pa = V$Pa,
               Y = Y,
               R = R,
+              mcmc_diag = mcmc_diag,
               analysis = V$analysis))
 }
 
@@ -494,10 +508,60 @@ MCMC_block_function <- function(block) {
   # }
   ## Finish Revise
   
-  #run MCMC
-  dat <- runMCMC(Cmcmc, niter = block$MCMC$niter, nburnin = block$MCMC$nburnin, thin = block$MCMC$nthin, nchains = block$MCMC$nchain)
-  #update aq, bq, mua, and pa
+  ## Revise: 
+  # Delete the original MCMC at fits nchain = 1
+  # #run MCMC
+  # dat <- runMCMC(Cmcmc, niter = block$MCMC$niter, nburnin = block$MCMC$nburnin, thin = block$MCMC$nthin, nchains = block$MCMC$nchain)
+  mcmc_diag <- NULL
+  
+   if (block$MCMC$nchain == 1) {
+    dat <- runMCMC(
+      Cmcmc,
+      niter   = block$MCMC$niter,
+      nburnin = block$MCMC$nburnin,
+      thin    = block$MCMC$nthin,
+      nchains = block$MCMC$nchain
+    )
+  } else {
+    ## multi-chain code
+    dat_raw <- runMCMC(
+      Cmcmc,
+      niter   = block$MCMC$niter,
+      nburnin = block$MCMC$nburnin,
+      thin    = block$MCMC$nthin,
+      nchains = block$MCMC$nchain,
+      samplesAsCodaMCMC = TRUE,
+      summary = FALSE
+    )
+    
+    rhat <- coda::gelman.diag(
+      dat_raw,
+      autoburnin = FALSE,
+      multivariate = FALSE
+    )$psrf[, "Point est."]
+    
+    ess <- coda::effectiveSize(dat_raw)
+    
+    mcmc_diag <- data.frame(
+      parameter = names(rhat),
+      rhat = as.numeric(rhat),
+      ess = as.numeric(ess[names(rhat)])
+    )
+
+    ## combine chains
+    dat <- as.matrix(dat_raw)
+  }
+  
+  str(dat_raw); dim(dat); head(colnames(dat))
+  
+  ## save diag into block
+  block$mcmc_diag <- mcmc_diag
+  
+    #update aq, bq, mua, and pa
   M <- colMeans(dat)
+  
+  
+  
   block$update$aq <- block$Inits$q
   if (block$constant$q.type == 3) {
     #if it's a vector q case
diff --git a/modules/assim.sequential/R/sda.enkf_parallel.R b/modules/assim.sequential/R/sda.enkf_parallel.R
index e8633d5c2f..6db15b12c1 100644
--- a/modules/assim.sequential/R/sda.enkf_parallel.R
+++ b/modules/assim.sequential/R/sda.enkf_parallel.R
@@ -479,6 +479,7 @@ sda.enkf_local <- function(settings,
                         analysis = ANALYSIS[[obs.t]],
                         enkf.params = enkf.params[[obs.t]],
                         ens_weights[[obs.t]],
+                        mcmc_diag = enkf.params[[obs.t]]$mcmc_diag,
                         params.list = params.list,
                         restart.list = restart.list,
                         debias.out = debias.out)

From 14b4ff4d8db63bc4dc72192add6b33a97caf5f7b Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-ga1.scc.bu.edu>
Date: Tue, 12 May 2026 14:13:53 -0400
Subject: [PATCH 12/13] update bf byproduct

---
 local/sda_runner.R                            |  58 +++++++---
 models/sipnet/R/model2netcdf.SIPNET.R         |   2 +-
 .../assim.sequential/R/SDA_OBS_Assembler.R    |   3 +-
 .../assim.sequential/R/sda.enkf_parallel.R    |  15 +++
 modules/data.land/R/Soilgrids_SoilC_prep.R    | 105 +++++++++++++-----
 5 files changed, 138 insertions(+), 45 deletions(-)

diff --git a/local/sda_runner.R b/local/sda_runner.R
index cddc255cd8..61e86d0cb4 100644
--- a/local/sda_runner.R
+++ b/local/sda_runner.R
@@ -27,8 +27,13 @@ library(terra)
 setwd("/projectnb/dietzelab/guYANG/pecan/")
 ## read settings xml file.
 load("/projectnb/dietzelab/guYANG/pecan/pecan.Rdata")
+
+### PEcAn Settings
+# settings_dir <- "/projectnb/dietzelab/guYANG/pecan/output/pecan_flux.xml"
+# settings <- PEcAn.settings::read.settings(settings_dir)
+
 # Change dir name and settings
-settings$outdir      <- "/projectnb/dietzelab/guYANG/pecan/output/adjust/"
+settings$outdir      <- "/projectnb/dietzelab/guYANG/pecan/output/5obs_monthly/"
 settings$rundir      <- file.path(settings$outdir, "run")
 settings$modeloutdir <- file.path(settings$outdir, "out")
 settings$host$rundir <- file.path(settings$outdir, "run")
@@ -38,7 +43,9 @@ settings$ensemble$size <- 25
 settings$state.data.assimilation$adjustment <- "FALSE"
 settings$host$prerun <- "module load R/4.4.0"
 ###### Change Q type
-settings$state.data.assimilation$q.type <- "wishart"
+settings$state.data.assimilation$q.type <- "vector"
+settings$state.data.assimilation$aqq.Init <- "1"
+settings$state.data.assimilation$bqq.Init <- "1"
 ## Fix the multi output in one timestep bug
 settings$model$jobtemplate <- "/projectnb/dietzelab/guYANG/pecan/runners/test7/sipnet_template.job"
 
@@ -60,9 +67,12 @@ settings$state.data.assimilation$batch.settings <- batch.settings
 # update settings with the actual PFTs.
 settings <- PEcAn.settings::prepare.settings(settings)
 
-# load 6 obs
-load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.mean.RData")
-load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.cov.RData")
+# load obs
+# load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.mean.RData")
+# load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.cov.RData")
+
+load("/projectnb/dietzelab/guYANG/pecan/output/obs/Rdata/obs.mean.monthly.noSM.Rdata")
+load("/projectnb/dietzelab/guYANG/pecan/output/obs/Rdata/obs.cov.monthly.noSM.Rdata")
 
 sub_obs <- function(L, keep) setNames(lapply(L, \(l) l[names(l) %in% keep]), names(L))
 obs.mean <- sub_obs(obs.mean, keep_ids)
@@ -95,19 +105,25 @@ if (length(obs.cov[[i]][[j]]) > 1) {
 }
 
 # load PFT parameter file.
-samples_src <- "/projectnb/dietzelab/guYANG/pecan/cali_samples.Rdata"
+# samples_src <- "/projectnb/dietzelab/guYANG/pecan/cali_samples.Rdata"
+samples_src <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/samples.Rdata"
 samples_dst <- file.path(settings$outdir, "samples.Rdata")
 dir.create(settings$outdir, recursive = TRUE, showWarnings = FALSE)
 if (!file.exists(samples_dst)) file.copy(samples_src, samples_dst, overwrite = TRUE)
 
-control <- list(
+control = list(
   TimeseriesPlot = FALSE,
   OutlierDetection = FALSE,
   send_email = NULL,
-  keepNC = FALSE,
+  keepNC = TRUE,
   forceRun = TRUE,
   run_parallel = FALSE,
-  MCMC.args = NULL,
+  MCMC.args = list(
+    niter = 200000,   
+    nburnin = 100000,
+    nthin = 5,
+    nchain = 3
+  ),
   merge_nc = FALSE,
   execution = "qsub_parallel"   # or "qsub" / "local"
 )
@@ -137,22 +153,34 @@ res <- PEcAnAssimSequential:::sda.enkf_local(
   )
 )
 
+#### qsub
+# PEcAnAssimSequential:::qsub_sda(
+#   settings = settings,
+#   obs.mean = obs.mean,
+#   obs.cov = obs.cov,
+#   Q = NULL,
+#   pre_enkf_params = NULL,
+#   ensemble.samples = NULL,
+#   outdir = "/projectnb/dietzelab/guYANG/pecan/output/qsub",
+#   control = control
+# )
+
 # job_lines <- c(
 #   "#!/bin/bash",
 #   "module load R/4.4.0",
 #   "Rscript /projectnb/dietzelab/guYANG/pecan/pecan/local/sda_runner.R"
 # )
-# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.sh")
+# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/output/log/sda_runner_1.sh")
 
-# qsub -l h_rt=6:00:00 \
+# qsub -l h_rt=10:00:00 \
 # -l buyin \
 # -l mem_per_core=8G \
 # -pe omp 28 \
 # -V \
-# -N sda_runner \
-# -o /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.out \
-# -e /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.err \
+# -N sda_runner_1 \
+# -o /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner_1.out \
+# -e /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner_1.err \
 # -M yanggu@bu.edu \
 # -m abe \
 # -S /bin/bash \
-# /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.sh
+# /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner_1.sh
diff --git a/models/sipnet/R/model2netcdf.SIPNET.R b/models/sipnet/R/model2netcdf.SIPNET.R
index 8264c8f9d7..aa17e0b9e3 100644
--- a/models/sipnet/R/model2netcdf.SIPNET.R
+++ b/models/sipnet/R/model2netcdf.SIPNET.R
@@ -158,7 +158,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date,
       # Water pools
       dplyr::across(
         .cols = c(
-          dplyr::all_of(c("soilWater", "soilWetnessFrac", "snow")),
+          dplyr::all_of(c("soilWater", "snow")),
           dplyr::any_of("litterWater") # Only present in V1 output
         ),
         .fns = cm_to_mm
diff --git a/modules/assim.sequential/R/SDA_OBS_Assembler.R b/modules/assim.sequential/R/SDA_OBS_Assembler.R
index 315d1fdfb4..53ba5d8066 100644
--- a/modules/assim.sequential/R/SDA_OBS_Assembler.R
+++ b/modules/assim.sequential/R/SDA_OBS_Assembler.R
@@ -44,8 +44,7 @@ SDA_OBS_Assembler <- function(settings){
       site.list$lon <- as.numeric(site.list$lon)
       list(site_id=site.list$id, lat=site.list$lat, lon=site.list$lon, site_name=site.list$name)
     })%>% 
-    dplyr::bind_rows() %>% 
-    as.list()
+    dplyr::bind_rows() 
   
   #convert from timestep to time points
   if (length(Obs_Prep$timestep)>0){
diff --git a/modules/assim.sequential/R/sda.enkf_parallel.R b/modules/assim.sequential/R/sda.enkf_parallel.R
index 6db15b12c1..6cd321422f 100644
--- a/modules/assim.sequential/R/sda.enkf_parallel.R
+++ b/modules/assim.sequential/R/sda.enkf_parallel.R
@@ -596,6 +596,21 @@ qsub_sda <- function(settings,
   if (is.null(outdir)) {
     outdir <- settings$outdir
   }
+  
+  ## Revise 
+  # ensure ensemble.samples is available before splitting batch jobs
+  if (is.null(ensemble.samples)) {
+    sample_file <- file.path(outdir, "samples.Rdata")
+    if (!file.exists(sample_file)) {
+      stop("Cannot find samples.Rdata at: ", sample_file)
+    }
+    load(sample_file)   # expects object named ensemble.samples
+    if (is.null(ensemble.samples)) {
+      stop("samples.Rdata was loaded, but object `ensemble.samples` was not found.")
+    }
+  }
+  ## Stop Revise
+  
   # create folder for storing job outputs.
   batch.folder <- file.path(outdir, prefix)
   # delete the whole folder if it's not empty.
diff --git a/modules/data.land/R/Soilgrids_SoilC_prep.R b/modules/data.land/R/Soilgrids_SoilC_prep.R
index 4fc88cca8c..a808df1c54 100644
--- a/modules/data.land/R/Soilgrids_SoilC_prep.R
+++ b/modules/data.land/R/Soilgrids_SoilC_prep.R
@@ -12,37 +12,88 @@
 #'
 #' @author Dongchen Zhang
 #' @importFrom magrittr %>%
-Soilgrids_SoilC_prep <- function(site_info, start_date, end_date, time_points, 
-                           outdir = NULL, export_csv = FALSE){
-  #if we export CSV but didn't provide any path
-  if(as.logical(export_csv) & is.null(outdir)){
-    PEcAn.logger::logger.info("If you want to export CSV file, please ensure input the outdir!")
+Soilgrids_SoilC_prep <- function(site_info, start_date, end_date, time_points,
+                                 outdir = NULL, export_csv = FALSE) {
+  
+  if (as.logical(export_csv) & is.null(outdir)) {
+    PEcAn.logger::logger.info(
+      "If you want to export CSV file, please ensure input the outdir!"
+    )
     return(0)
-  }else if(as.logical(export_csv) & !file.exists(file.path(outdir, "soilgrids_soilC_data.csv"))){
-    #if we want to export the csv file for soilgrids data.
-    Previous_CSV <- PEcAn.data.land::soilgrids_soilC_extract(site_info, outdir)
-  }else if(!as.logical(export_csv) & !file.exists(file.path(outdir, "soilgrids_soilC_data.csv"))){
-    #if we don't want to export the csv file.
+  }
+  
+  soilc_file <- if (!is.null(outdir)) {
+    file.path(outdir, "soilgrids_soilC_data.csv")
+  } else {
+    NA_character_
+  }
+  
+  if (!is.null(outdir) && !file.exists(soilc_file)) {
+    
+    if (as.logical(export_csv)) {
+      Previous_CSV <- PEcAn.data.land::soilgrids_soilC_extract(site_info, outdir)
+    } else {
+      Previous_CSV <- PEcAn.data.land::soilgrids_soilC_extract(site_info)
+    }
+    
+  } else if (!is.null(outdir) && file.exists(soilc_file)) {
+    
+    Previous_CSV <- as.data.frame(utils::read.csv(soilc_file))
+    
+  } else {
+    
     Previous_CSV <- PEcAn.data.land::soilgrids_soilC_extract(site_info)
-  }else if(file.exists(file.path(outdir, "soilgrids_soilC_data.csv"))){
-    #if we have previous extracted soilgrids csv file.
-    Previous_CSV <- as.data.frame(utils::read.csv(file.path(outdir, "soilgrids_soilC_data.csv")))
-    SoilC_Output <- matrix(NA, length(site_info$site_id), 2*length(time_points)+1) %>% 
-      `colnames<-`(c("site_id", paste0(time_points, "_TotSoilCarb"), paste0(time_points, "_SD"))) %>% as.data.frame()#we need: site_id, agb, sd, target time point.
-    SoilC_Output$site_id <- site_info$site_id
-    #loop over time and site
-    for (i in seq_along(time_points)) {
-      t <- time_points[i]
-      for (id in site_info$site_id) {
-        site_SoilC <- Previous_CSV[which(Previous_CSV$Site_ID == id),]
-        if (dim(site_SoilC)[1] == 0) {
-          next
-        }
-        SoilC_Output[which(SoilC_Output$site_id==id), paste0(t, "_TotSoilCarb")] <- site_SoilC$Total_soilC_0.200cm
-        SoilC_Output[which(SoilC_Output$site_id==id), paste0(t, "_SD")] <- site_SoilC$Std_soilC_0.200cm
+  }
+  
+  if (!is.null(outdir) && file.exists(soilc_file)) {
+    Previous_CSV <- as.data.frame(utils::read.csv(soilc_file))
+  }
+  
+  SoilC_Output <- matrix(
+    NA,
+    length(site_info$site_id),
+    2 * length(time_points) + 1
+  ) %>%
+    `colnames<-`(
+      c(
+        "site_id",
+        paste0(time_points, "_TotSoilCarb"),
+        paste0(time_points, "_SD")
+      )
+    ) %>%
+    as.data.frame()
+  
+  SoilC_Output$site_id <- site_info$site_id
+  
+  for (i in seq_along(time_points)) {
+    t <- time_points[i]
+    
+    for (id in site_info$site_id) {
+      site_SoilC <- Previous_CSV[
+        which(as.character(Previous_CSV$Site_ID) == as.character(id)),
+      ]
+      
+      if (nrow(site_SoilC) == 0) {
+        next
       }
+      
+      SoilC_Output[
+        which(as.character(SoilC_Output$site_id) == as.character(id)),
+        paste0(t, "_TotSoilCarb")
+      ] <- site_SoilC$Total_soilC_0.200cm
+      
+      SoilC_Output[
+        which(as.character(SoilC_Output$site_id) == as.character(id)),
+        paste0(t, "_SD")
+      ] <- site_SoilC$Std_soilC_0.200cm
     }
   }
+  
   PEcAn.logger::logger.info("Soilgrids SoilC Prep Completed!")
-  list(SoilC_Output = SoilC_Output, time_points = time_points, var = "TotSoilCarb")
+  
+  list(
+    SoilC_Output = SoilC_Output,
+    time_points  = time_points,
+    var          = "TotSoilCarb"
+  )
 }
\ No newline at end of file

From 8e2c97d2be7451f5695bb8bb0a65d94a75b58270 Mon Sep 17 00:00:00 2001
From: yanggu <yanggu@scc-ga1.scc.bu.edu>
Date: Tue, 12 May 2026 14:56:55 -0400
Subject: [PATCH 13/13]  monthly sipnet available

---
 base/utils/R/read.output.R                    |  6 ++--
 .../assim.sequential/R/sda.enkf_parallel.R    | 33 ++++++++++++++-----
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/base/utils/R/read.output.R b/base/utils/R/read.output.R
index a641b113aa..1df8fc741d 100644
--- a/base/utils/R/read.output.R
+++ b/base/utils/R/read.output.R
@@ -77,9 +77,9 @@ read.output <- function(runid, outdir,
   }
 
   # create list of *.nc years - look only for files formatted as
-  # YYYY.nc, the default pecan output file name standard
+  # YYYY.nc or YYYY-MM-DD.nc
   if (is.null(ncfiles)) {
-    ncfiles_sub <- list.files(path = outdir, pattern = "^-?[[:digit:]]{4}\\.nc$", full.names = FALSE)
+    ncfiles_sub <- list.files(path = outdir, pattern = "^-?[[:digit:]]{4}(-[[:digit:]]{2}-[[:digit:]]{2})?\\.nc$", full.names = FALSE)
     ncfiles <- file.path(outdir, ncfiles_sub)
   } else {
     # Assume the NetCDF files follow the PEcAn standard format
@@ -128,7 +128,7 @@ read.output <- function(runid, outdir,
 
   # Deduce file years from their names
   nc_years <- suppressWarnings(
-    as.numeric(gsub("^(-?[[:digit:]]{4})\\.nc", "\\1", ncfiles_sub))
+    as.numeric(gsub("^(-?[[:digit:]]{4})(-[[:digit:]]{2}-[[:digit:]]{2})?\\.nc$", "\\1", ncfiles_sub))
   )
   if (any(is.na(nc_years))) {
     PEcAn.logger::logger.debug(
diff --git a/modules/assim.sequential/R/sda.enkf_parallel.R b/modules/assim.sequential/R/sda.enkf_parallel.R
index 6cd321422f..07d69cf203 100644
--- a/modules/assim.sequential/R/sda.enkf_parallel.R
+++ b/modules/assim.sequential/R/sda.enkf_parallel.R
@@ -109,13 +109,20 @@ sda.enkf_local <- function(settings,
   ###-------------------------------------------------------------------###
   ### check dates before data assimilation                              ###
   ###-------------------------------------------------------------------###----  
-  #filtering obs data based on years specifited in setting > state.data.assimilation
+  #Filter observation dates to SDA bounds
   start.cut <- lubridate::ymd_hms(settings$state.data.assimilation$start.date, truncated = 3)
-  Start.year <- (lubridate::year(settings$state.data.assimilation$start.date))
-  End.year <- lubridate::year(settings$state.data.assimilation$end.date) # dates that assimilations will be done for - obs will be subsetted based on this
-  assim.sda <- Start.year:End.year
-  obs.mean <- obs.mean[sapply(lubridate::year(names(obs.mean)), function(obs.year) obs.year %in% (assim.sda))] #checks obs.mean dates against assimyear dates
-  obs.cov <- obs.cov[sapply(lubridate::year(names(obs.cov)), function(obs.year) obs.year %in% (assim.sda))] #checks obs.cov dates against assimyear dates
+  # Start.year <- (lubridate::year(settings$state.data.assimilation$start.date))
+  # End.year <- lubridate::year(settings$state.data.assimilation$end.date) # dates that assimilations will be done for - obs will be subsetted based on this
+  # assim.sda <- Start.year:End.year
+  # obs.mean <- obs.mean[sapply(lubridate::year(names(obs.mean)), function(obs.year) obs.year %in% (assim.sda))] #checks obs.mean dates against assimyear dates
+  # obs.cov <- obs.cov[sapply(lubridate::year(names(obs.cov)), function(obs.year) obs.year %in% (assim.sda))] #checks obs.cov dates against assimyear dates
+  if (is.na(start.cut)) {
+    start.cut <- as.POSIXct(as.Date(settings$state.data.assimilation$start.date), tz = "UTC")
+  }
+  end.cut <- lubridate::ymd_hms(settings$state.data.assimilation$end.date, truncated = 3)
+  if (is.na(end.cut)) {
+    end.cut <- as.POSIXct(as.Date(settings$state.data.assimilation$end.date), tz = "UTC")
+  }
   #checking that there are dates in obs.mean and adding midnight as the time
   obs.times <- names(obs.mean)
   obs.times.POSIX <- lubridate::ymd_hms(obs.times)
@@ -127,11 +134,17 @@ sda.enkf_local <- function(settings,
         ### Data does not have time associated with dates 
         ### Adding 12:59:59PM assuming next time step starts one second later
         # PEcAn.logger::logger.warn("Pumpkin Warning: adding one minute before midnight time assumption to dates associated with data")
-        obs.times.POSIX[i] <- lubridate::ymd_hms(paste(obs.times[i], "23:59:59"))
+        # obs.times.POSIX[i] <- lubridate::ymd_hms(paste(obs.times[i], "23:59:59"))
+        # If dates are provided without times, standardize to 00:00:00 UTC.
+        obs.times.POSIX[i] <- lubridate::ymd_hms(paste(obs.times[i], "00:00:00"))
       }
     }
   }
   obs.times <- obs.times.POSIX
+  keep_obs <- which(obs.times >= start.cut & obs.times <= end.cut)
+  obs.times <- obs.times[keep_obs]
+  obs.mean <- obs.mean[keep_obs]
+  obs.cov <- obs.cov[keep_obs]
   read_restart_times <- c(lubridate::ymd_hms(start.cut, truncated = 3), obs.times)
   nt  <- length(obs.times) #sets length of for loop for Forecast/Analysis
   if (nt==0) PEcAn.logger::logger.severe('There has to be at least one Obs.')
@@ -262,7 +275,8 @@ sda.enkf_local <- function(settings,
             for (i in seq_len(nens)) {
               #---------------- model specific split inputs
               split_args <- list(
-                start.time = (lubridate::ymd_hms(obs.times[t - 1], truncated = 3) + lubridate::second(lubridate::hms("00:00:01"))),
+                # start.time = (lubridate::ymd_hms(obs.times[t - 1], truncated = 3) + lubridate::second(lubridate::hms("00:00:01"))),
+                start.time = lubridate::ymd_hms(obs.times[t - 1], truncated = 3),
                 stop.time = lubridate::ymd_hms(obs.times[t], truncated = 3),
                 inputs = inputs$met$samples[[i]]
               )
@@ -289,7 +303,8 @@ sda.enkf_local <- function(settings,
                              }
                              list(
                                runid = configs$runs$id,
-                               start.time = strptime(obs.times[t -1], format = "%Y-%m-%d %H:%M:%S") + lubridate::second(lubridate::hms("00:00:01")),
+                               # start.time = strptime(obs.times[t -1], format = "%Y-%m-%d %H:%M:%S") + lubridate::second(lubridate::hms("00:00:01")),
+                               start.time = strptime(obs.times[t -1], format = "%Y-%m-%d %H:%M:%S"),
                                stop.time = strptime(obs.times[t], format ="%Y-%m-%d %H:%M:%S"),
                                settings = settings,
                                new.state = new_state_site,