From 77f3c93473b2847fe86717744956a8f5953c738b Mon Sep 17 00:00:00 2001 From: yanggu Date: Wed, 18 Mar 2026 23:23:06 -0400 Subject: [PATCH 01/13] Upload NEE and LE gap-fill functions --- modules/data.land/DESCRIPTION | 6 +- modules/data.land/R/SIPNET_flux_gapfill.R | 588 ++++++++++++++++++++++ 2 files changed, 593 insertions(+), 1 deletion(-) create mode 100644 modules/data.land/R/SIPNET_flux_gapfill.R diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION index 13691a4327..ba5ad75b22 100644 --- a/modules/data.land/DESCRIPTION +++ b/modules/data.land/DESCRIPTION @@ -46,7 +46,11 @@ Imports: terra, tidyr, tidyselect, - XML (>= 3.98-1.4) + XML (>= 3.98-1.4), + data.table, + tibble, + xgboost, + doParallel Suggests: dataone, datapack, diff --git a/modules/data.land/R/SIPNET_flux_gapfill.R b/modules/data.land/R/SIPNET_flux_gapfill.R new file mode 100644 index 0000000000..3997513616 --- /dev/null +++ b/modules/data.land/R/SIPNET_flux_gapfill.R @@ -0,0 +1,588 @@ +#' @description +#' This function reads the site information table and returns the selected +#' site IDs for downstream modeling. +#' +#' @title read_site_ids +#' +#' @param site.dir character: path to the site information table. +#' @param n_sites numeric: number of sites to keep from the site table, +#' the total number of sites is 241. +#' +#' @return character vector of selected site IDs. Example: 'Site ID' CA-ARB +#' +#' @author Yang Gu +read_site_ids <- function(site.dir, n_sites = 40) { + fluxnet.df <- data.table::fread(site.dir) |> + tibble::as_tibble() + + site_ids <- fluxnet.df$`Site ID`[1:n_sites] |> + unique() + + return(site_ids) +} + +#' @description +#' This function reads the predictor/flux dataset and keeps only rows +#' corresponding to the selected site IDs. +#' +#' @title read_pred_data +#' +#' @param pred.var.dir character: path to the predictor/flux data file. +#' @param site_ids character: vector of site IDs to retain. +#' +#' @return data.frame containing filtered predictor/flux data. +#' +#' @author Yang Gu +read_pred_data <- function(pred.var.dir, site_ids) { + resimet.df <- data.table::fread(pred.var.dir) |> + tibble::as_tibble() |> + dplyr::mutate(is_night_suncalc = as.integer(is_night_suncalc)) + + resimet.df <- resimet.df |> + dplyr::filter(Site_ID %in% site_ids) + + return(resimet.df) +} + +#' @description +#' This function calculates site-specific missing-value percentages for +#' candidate predictor variables and returns a long-format table of retained +#' predictors for each site. +#' +#' @title build_sitecov_df +#' +#' @param resimet.df data.frame: predictor/flux dataset filtered to target sites. +#' @param vars_to_check character: vector of candidate predictor variable names. +#' @param outdir character: optional output directory to save the missing summary. +#' Default is NULL. +#' @param missing_threshold numeric: maximum allowed missing percentage for a +#' predictor to be retained at a site. Default is 40. +#' +#' @return list containing: +#' \describe{ +#' \item{missing_summary}{data frame of missing-value summary by site and variable.} +#' \item{sitecov.df}{long-format data frame of retained predictors by site.} +#' } +#' +#' @author Yang Gu +build_sitecov_df <- function(resimet.df, + vars_to_check, + outdir = NULL, + missing_threshold = 40) { + missing_summary <- resimet.df |> + dplyr::select(Site_ID, dplyr::all_of(vars_to_check)) |> + tidyr::pivot_longer( + cols = -Site_ID, + names_to = "variable", + values_to = "value" + ) |> + dplyr::group_by(Site_ID, variable) |> + dplyr::summarise( + total_obs = dplyr::n(), + missing_count = sum(is.na(value)), + missing_pct = missing_count / total_obs * 100, + .groups = "drop" + ) |> + dplyr::arrange(Site_ID, dplyr::desc(missing_pct)) + + if (!is.null(outdir)) { + if (!file.exists(outdir)) { + dir.create(outdir, recursive = TRUE) + } + data.table::fwrite( + missing_summary, + file.path(outdir, "summary.csv") + ) + } + + high_staying_vars <- missing_summary |> + dplyr::filter(missing_pct < missing_threshold) |> + dplyr::group_by(Site_ID) |> + dplyr::summarise( + filtered_variables = list(variable), + .groups = "drop" + ) + + sitecov.df <- high_staying_vars |> + tidyr::unnest(filtered_variables) + + return(list( + missing_summary = missing_summary, + sitecov.df = sitecov.df + )) +} + +#' @description +#' This internal helper function fits a site-specific XGBoost model using +#' k-fold CV. It evaluates model performance with fold-level +#' training and validation R2, and fits a final model using the full +#' site training dataset if the average validation R2 exceeds the +#' specified threshold. +#' +#' @title fit_site_xgb_cv +#' +#' @param subdf data.frame: a site-specific training data frame containing +#' the response variable and predictor variables. +#' @param feats character: a character vector of predictor variable names used +#' for model fitting. +#' @param site_name character: site identifier used for progress reporting. +#' @param response_var character: the name of the response variable column in +#' `subdf`. +#' @param nfolds numeric: number of folds used in CV. +#' @param params list: a list of XGBoost model parameters passed to +#' `xgboost::xgb.train`. +#' @param nrounds numeric: number of boosting rounds used in XGBoost training. +#' @param r2_threshold numeric: minimum average validation R2 required +#' to fit the final model. +#' +#' @return list containing: +#' \describe{ +#' \item{r2}{average validation R2 across folds.} +#' \item{model}{the final fitted XGBoost model, or NULL if the site does not meet the threshold.} +#' \item{folds_df}{a data frame of fold-level training and validation R2.} +#' } +#' +#' @author Yang Gu +fit_site_xgb_cv <- function(subdf, + feats, + site_name, + response_var, + nfolds, + params, + nrounds, + r2_threshold) { + + # Return empty results if the site has too few samples or no predictors + if (nrow(subdf) < nfolds || length(feats) == 0) { + return(list( + r2 = NA_real_, + model = NULL, + folds_df = tibble::tibble() + )) + } + + # Randomly assign observations to CV folds + set.seed(123) + folds <- sample(rep(seq_len(nfolds), length.out = nrow(subdf))) + + # Initialize containers for fold-level performance metrics + r2s <- numeric(nfolds) + r2s_train <- numeric(nfolds) + folds_df <- vector("list", nfolds) + + # Loop over folds + for (i in seq_len(nfolds)) { + # Split training and validation subsets + tr <- subdf[folds != i, , drop = FALSE] + te <- subdf[folds == i, , drop = FALSE] + + # Construct XGBoost training matrix + dtrain <- xgboost::xgb.DMatrix( + data = as.matrix(dplyr::select(tr, dplyr::all_of(feats))), + label = tr[[response_var]], + missing = NA + ) + + # Fit fold-specific XGBoost model + mdl <- xgboost::xgb.train( + params = params, + data = dtrain, + nrounds = nrounds, + verbose = 0 + ) + + # Construct validation matrix + dvalid <- xgboost::xgb.DMatrix( + data = as.matrix(dplyr::select(te, dplyr::all_of(feats))), + missing = NA + ) + + # Predict on validation and training data + pred_valid <- stats::predict(mdl, dvalid) + pred_train <- stats::predict(mdl, dtrain) + + # Compute denominator for R2 + denom_valid <- sum((te[[response_var]] - mean(te[[response_var]], na.rm = TRUE))^2, na.rm = TRUE) + denom_train <- sum((tr[[response_var]] - mean(tr[[response_var]], na.rm = TRUE))^2, na.rm = TRUE) + + # Compute validation R2 + r2_valid <- if (denom_valid > 0) { + 1 - sum((te[[response_var]] - pred_valid)^2, na.rm = TRUE) / denom_valid + } else { + NA_real_ + } + + # Compute training R2 + r2_train <- if (denom_train > 0) { + 1 - sum((tr[[response_var]] - pred_train)^2, na.rm = TRUE) / denom_train + } else { + NA_real_ + } + + # Store fold-level metrics + r2s[i] <- r2_valid + r2s_train[i] <- r2_train + + folds_df[[i]] <- tibble::tibble( + fold = i, + R2_train = r2_train, + R2_valid = r2_valid + ) + } + + # Print fold-level validation performance for the current site + message(sprintf( + "[Site %s] Fold Valid R2: %s", + site_name, + paste(round(r2s, 3), collapse = ", ") + )) + + # Compute average validation R2 across folds + avg_r2 <- mean(r2s, na.rm = TRUE) + final_mdl <- NULL + + # Fit final model using all site data if the site passes the threshold + if (!is.na(avg_r2) && avg_r2 >= r2_threshold) { + final_mdl <- xgboost::xgb.train( + params = params, + data = xgboost::xgb.DMatrix( + data = as.matrix(dplyr::select(subdf, dplyr::all_of(feats))), + label = subdf[[response_var]], + missing = NA + ), + nrounds = nrounds, + verbose = 0 + ) + } + + # Return site-level summary results + list( + r2 = avg_r2, + model = final_mdl, + folds_df = dplyr::bind_rows(folds_df) + ) +} + +#########Need to add sub-functions to add site_ids, resimet.df, and sitecov.df +#' @description +#' This function performs site-level XGBoost gap-filling with CV +#' for eddy-covariance flux data. For each ensemble iteration, it samples +#' observed data(QC=0) within each site, fits site-specific XGBoost models +#' using k-fold CV, predicts missing or low-quality flux values, +#' and exports both gap-filled outputs and fold-level CV diagnostics. +#' +#' @title xgb_gapfill +#' +#' @param site_ids character: a character vector of site IDs to be modeled. +#' @param resimet.df data.frame: a data frame containing eddy-covariance +#' observations, QC flags, timestamps, and predictor variables. +#' @param sitecov.df data.frame: a data frame containing site-specific selected +#' predictor names, with columns including `Site_ID` and `filtered_variables`. +#' @param flux_var character: the target flux type to be gap-filled. Currently +#' supported values are `"LE"` and `"NEE"`. +#' @param outdir character: the output directory where the exported CSV files +#' will be written. +#' @param nfolds numeric: number of folds used in CV, default is 10. +#' @param nrounds numeric: number of boosting rounds used in XGBoost training, +#' default is 200. +#' @param nens numeric: number of ensemble iterations, default is 25. +#' @param nsamp numeric: proportion of high-quality observations sampled within +#' each site for training in each ensemble, default is 0.95. +#' @param r2_threshold numeric: minimum average validation R2 required to +#' fit the final model and retain a site in the exported gap-filled output +#' @param cores numeric: how many CPUs to be used in the calculation +#' @param overwrite boolean: decide if existing output files should be overwritten, +#' the default is TRUE. +#' +#' @return list containing ensemble-level outputs. Each element corresponds to +#' one ensemble iteration and includes: +#' \describe{ +#' \item{cv_results}{a data frame of site-level average CV R2.} +#' \item{filled_data}{a data frame of the full input dataset with merged gap-filled values.} +#' \item{exported_data}{a data frame of filtered gap-filled results for well-performing sites only.} +#' \item{folds_all}{a data frame of fold-level CV diagnostics across sites.} +#' } +#' +#' @author Yang Gu +#' @importFrom foreach %dopar% +#' @export +xgb_gapfill <- function(site.dir, + pred.var.dir, + flux_var, + outdir, + vars_to_check = c( + "TA_F","SW_IN_F","SW_DIF","LW_IN_JSB","VPD_F","PA_F","P_F", + "RH","NETRAD","PPFD_IN","PPFD_DIF","TS_F_MDS_1","LW_IN_F", + "TS_F_MDS_3","TS_F_MDS_5","SWC_F_MDS_1","NDVI","EVI", + "sin_doy","cos_doy","SWC_F_MDS_3","SWC_F_MDS_5" + ), + n_sites = 40, + missing_threshold = 40, + nfolds = 10, + nrounds = 200, + nens = 25, + nsamp = 0.95, + params = list( + objective = "reg:squarederror", + eta = 0.13, + max_depth = 13, + subsample = 0.9, + colsample_bytree = 0.9 + ), + r2_threshold = 0.6, + cores = max(1L, parallel::detectCores() - 1L), + overwrite = TRUE) { + + if (!requireNamespace("xgboost", quietly = TRUE)) { + stop("Package 'xgboost' is required for cv_xgb_site_cv_gapfill() but is not installed.") + } + if (!requireNamespace("doParallel", quietly = TRUE)) { + stop("Package 'doParallel' is required for parallel execution but is not installed.") + } + if (!requireNamespace("foreach", quietly = TRUE)) { + stop("Package 'foreach' is required for parallel execution but is not installed.") + } + + # Read target site IDs + site_ids <- read_site_ids( + site.dir = site.dir, + n_sites = n_sites + ) + + # Read predictor and flux data for selected sites + resimet.df <- read_pred_data( + pred.var.dir = pred.var.dir, + site_ids = site_ids + ) + + # Build site-specific predictor table + sitecov_res <- build_sitecov_df( + resimet.df = resimet.df, + vars_to_check = vars_to_check, + outdir = outdir, + missing_threshold = missing_threshold + ) + sitecov.df <- sitecov_res$sitecov.df + + # Map flux type to variable names + flux_var <- toupper(flux_var) + if (flux_var == "LE") { + response_var <- "LE_F_MDS" + qc_var <- "LE_F_MDS_QC" + gapfill_var <- "LE_gapfill" + } else if (flux_var == "NEE") { + response_var <- "NEE_CUT_USTAR50" + qc_var <- "NEE_CUT_USTAR50_QC" + gapfill_var <- "NEE_gapfill" + } else { + stop("Unsupported flux_var. Currently only 'LE' and 'NEE' are supported.") + } + + # Validate required columns in input data + required_cols <- c("Site_ID", "TIMESTAMP_START", response_var, qc_var) + missing_cols <- setdiff(required_cols, names(resimet.df)) + if (length(missing_cols) > 0) { + stop( + sprintf( + "Missing required columns in resimet.df: %s", + paste(missing_cols, collapse = ", ") + ) + ) + } + if (!file.exists(outdir)) { + dir.create(outdir, recursive = TRUE) + } + + # Filter high-quality observations for training (QC = 0) + data_good <- resimet.df |> + dplyr::filter(.data[[qc_var]] == 0) + + ensemble_results <- vector("list", nens) + + # Loop over ensemble members + for (m in seq_len(nens)) { + message(sprintf("Ensemble %d/%d", m, nens)) + + # Stratified Sampling: Sample 95% from within each site for training. + train_df <- data_good |> + dplyr::group_by(Site_ID) |> + dplyr::slice_sample(prop = nsamp) |> + dplyr::ungroup() + + # Build site-specific training datasets + site_dfs <- lapply(site_ids, function(site) { + feats0 <- sitecov.df |> + dplyr::filter(Site_ID == site) |> + dplyr::pull(filtered_variables) |> + unique() + + feats <- unique(c(feats0, "is_night_suncalc")) + feats <- intersect(feats, names(train_df)) + + df <- train_df |> + dplyr::filter(Site_ID == site) |> + dplyr::select( + TIMESTAMP_START, + Site_ID, + dplyr::all_of(response_var), + dplyr::all_of(feats) + ) |> + tibble::as_tibble() + + attr(df, "features") <- feats + df + }) + names(site_dfs) <- site_ids + + # Parallel model fitting and gap-filling by site + use_cores <- min(as.numeric(cores), length(site_ids)) + cl <- parallel::makeCluster(use_cores, type = "PSOCK") + doParallel::registerDoParallel(cl) + results <- tryCatch({ + foreach::foreach( + site = site_ids, + .packages = c("dplyr", "xgboost", "tibble"), + .combine = dplyr::bind_rows, + .errorhandling = "pass" + ) %dopar% { + tryCatch({ + df <- site_dfs[[site]] + feats <- attr(df, "features") + + # Fit XGBoost model with CV + fit <- fit_site_xgb_cv( + subdf = df, + feats = feats, + site_name = site, + response_var = response_var, + nfolds = nfolds, + params = params, + nrounds = nrounds, + r2_threshold = r2_threshold + ) + + # Identify gap-fill candidates + gap_cands <- resimet.df |> + dplyr::filter( + Site_ID == site, + is.na(.data[[response_var]]) | .data[[qc_var]] != 0 + ) |> + dplyr::select(TIMESTAMP_START, Site_ID, dplyr::all_of(feats)) |> + tibble::as_tibble() + + pd <- gap_cands + + # Gap fill flux if model is valid + if (!is.null(fit$model) && nrow(pd) > 0) { + dtest <- xgboost::xgb.DMatrix( + data = as.matrix(dplyr::select(pd, dplyr::all_of(feats))), + missing = NA + ) + pd[[gapfill_var]] <- stats::predict(fit$model, dtest) + pd$GF_QC <- 0L + } else if (nrow(pd) > 0) { + pd[[gapfill_var]] <- NA_real_ + pd$GF_QC <- 1L + } + + # Return site-level results + tibble::tibble( + Site_ID = site, + Avg_R2 = fit$r2, + pred_df = list(pd), + folds_df = list(dplyr::mutate(fit$folds_df, Site_ID = site)) + ) + }, error = function(e) { + message(sprintf("[Site %s] ERROR: %s", site, conditionMessage(e))) + tibble::tibble( + Site_ID = site, + Avg_R2 = NA_real_, + pred_df = list(NULL), + folds_df = list(tibble::tibble()) + ) + }) + } + }, error = function(e) { + message(sprintf("Parallel loop error in ensemble %d: %s", m, conditionMessage(e))) + NULL + }, finally = { + parallel::stopCluster(cl) + foreach::registerDoSEQ() + }) + + # Skip ensemble if no results + if (is.null(results) || nrow(results) == 0) { + ensemble_results[[m]] <- NULL + next + } + + # Combine gap-fill predictions across sites + pred_list <- results$pred_df + gap_all <- dplyr::bind_rows(pred_list) + + # Merge predictions back to full dataset + resimet_filled <- resimet.df |> + dplyr::left_join(gap_all, by = c("Site_ID", "TIMESTAMP_START")) |> + dplyr::mutate( + !!gapfill_var := dplyr::if_else( + !is.na(.data[[gapfill_var]]), + .data[[gapfill_var]], + .data[[response_var]] + ), + GF_QC = dplyr::if_else(is.na(GF_QC), 1L, GF_QC) + ) + + # Extract CV performance + cv_results <- results |> + dplyr::select(Site_ID, Avg_R2) + + # Select well-performing sites + good_sites <- cv_results |> + dplyr::filter(!is.na(Avg_R2) & Avg_R2 >= r2_threshold) |> + dplyr::pull(Site_ID) + + # Subset final output for good sites + sub.df <- resimet_filled |> + dplyr::filter(Site_ID %in% good_sites) |> + dplyr::select( + Site_ID, + TIMESTAMP_START, + dplyr::all_of(response_var), + dplyr::all_of(gapfill_var), + GF_QC, + dplyr::all_of(qc_var) + ) + + # Aggregate fold-level CV diagnostics + folds_all <- dplyr::bind_rows(results$folds_df) |> + dplyr::mutate(ensemble_id = m) + + ens_tag <- sprintf("ens%02d", m) + flux_tag <- tolower(flux_var) + + data_path <- file.path(outdir, sprintf("%s_gfed_dat_%s.csv", flux_tag, ens_tag)) + folds_path <- file.path(outdir, sprintf("%s_gfed_cv_folds_%s.csv", flux_tag, ens_tag)) + + if (overwrite || !file.exists(data_path)) { + data.table::fwrite(sub.df, data_path) + } + + if (overwrite || !file.exists(folds_path)) { + data.table::fwrite(folds_all, folds_path) + } + + # Store ensemble results + ensemble_results[[m]] <- list( + cv_results = cv_results, + filled_data = resimet_filled, + exported_data = sub.df, + folds_all = folds_all + ) + + rm(results, pred_list, gap_all, resimet_filled, cv_results, good_sites, + sub.df, folds_all, site_dfs, train_df) + gc(FALSE) + } + + return(ensemble_results) +} \ No newline at end of file From 61981715cd53ea7f5a96e76c29ff7946bba107c6 Mon Sep 17 00:00:00 2001 From: yanggu Date: Thu, 19 Mar 2026 14:25:59 -0400 Subject: [PATCH 02/13] make roxygen files fot gap-filling scripts --- modules/data.land/NAMESPACE | 1 + modules/data.land/man/build_sitecov_df.Rd | 39 +++++++++++ modules/data.land/man/fit_site_xgb_cv.Rd | 57 ++++++++++++++++ modules/data.land/man/read_pred_data.Rd | 23 +++++++ modules/data.land/man/xgb_gapfill.Rd | 81 +++++++++++++++++++++++ 5 files changed, 201 insertions(+) create mode 100644 modules/data.land/man/build_sitecov_df.Rd create mode 100644 modules/data.land/man/fit_site_xgb_cv.Rd create mode 100644 modules/data.land/man/read_pred_data.Rd create mode 100644 modules/data.land/man/xgb_gapfill.Rd diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE index dc864627af..ec0d690a37 100644 --- a/modules/data.land/NAMESPACE +++ b/modules/data.land/NAMESPACE @@ -71,6 +71,7 @@ export(to.TreeCode) export(validate_events_json) export(write_ic) export(write_veg) +export(xgb_gapfill) importFrom(dplyr,"%>%") importFrom(foreach,"%dopar%") importFrom(magrittr,"%>%") diff --git a/modules/data.land/man/build_sitecov_df.Rd b/modules/data.land/man/build_sitecov_df.Rd new file mode 100644 index 0000000000..52388c7be9 --- /dev/null +++ b/modules/data.land/man/build_sitecov_df.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SIPNET_flux_gapfill.R +\name{build_sitecov_df} +\alias{build_sitecov_df} +\title{build_sitecov_df} +\usage{ +build_sitecov_df( + resimet.df, + vars_to_check, + outdir = NULL, + missing_threshold = 40 +) +} +\arguments{ +\item{resimet.df}{data.frame: predictor/flux dataset filtered to target sites.} + +\item{vars_to_check}{character: vector of candidate predictor variable names.} + +\item{outdir}{character: optional output directory to save the missing summary. +Default is NULL.} + +\item{missing_threshold}{numeric: maximum allowed missing percentage for a +predictor to be retained at a site. Default is 40.} +} +\value{ +list containing: +\describe{ + \item{missing_summary}{data frame of missing-value summary by site and variable.} + \item{sitecov.df}{long-format data frame of retained predictors by site.} +} +} +\description{ +This function calculates site-specific missing-value percentages for +candidate predictor variables and returns a long-format table of retained +predictors for each site. +} +\author{ +Yang Gu +} diff --git a/modules/data.land/man/fit_site_xgb_cv.Rd b/modules/data.land/man/fit_site_xgb_cv.Rd new file mode 100644 index 0000000000..97998c2a82 --- /dev/null +++ b/modules/data.land/man/fit_site_xgb_cv.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SIPNET_flux_gapfill.R +\name{fit_site_xgb_cv} +\alias{fit_site_xgb_cv} +\title{fit_site_xgb_cv} +\usage{ +fit_site_xgb_cv( + subdf, + feats, + site_name, + response_var, + nfolds, + params, + nrounds, + r2_threshold +) +} +\arguments{ +\item{subdf}{data.frame: a site-specific training data frame containing +the response variable and predictor variables.} + +\item{feats}{character: a character vector of predictor variable names used +for model fitting.} + +\item{site_name}{character: site identifier used for progress reporting.} + +\item{response_var}{character: the name of the response variable column in +`subdf`.} + +\item{nfolds}{numeric: number of folds used in CV.} + +\item{params}{list: a list of XGBoost model parameters passed to +`xgboost::xgb.train`.} + +\item{nrounds}{numeric: number of boosting rounds used in XGBoost training.} + +\item{r2_threshold}{numeric: minimum average validation R2 required +to fit the final model.} +} +\value{ +list containing: +\describe{ + \item{r2}{average validation R2 across folds.} + \item{model}{the final fitted XGBoost model, or NULL if the site does not meet the threshold.} + \item{folds_df}{a data frame of fold-level training and validation R2.} +} +} +\description{ +This internal helper function fits a site-specific XGBoost model using +k-fold CV. It evaluates model performance with fold-level +training and validation R2, and fits a final model using the full +site training dataset if the average validation R2 exceeds the +specified threshold. +} +\author{ +Yang Gu +} diff --git a/modules/data.land/man/read_pred_data.Rd b/modules/data.land/man/read_pred_data.Rd new file mode 100644 index 0000000000..603c071738 --- /dev/null +++ b/modules/data.land/man/read_pred_data.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SIPNET_flux_gapfill.R +\name{read_pred_data} +\alias{read_pred_data} +\title{read_pred_data} +\usage{ +read_pred_data(pred.var.dir, site_ids) +} +\arguments{ +\item{pred.var.dir}{character: path to the predictor/flux data file.} + +\item{site_ids}{character: vector of site IDs to retain.} +} +\value{ +data.frame containing filtered predictor/flux data. +} +\description{ +This function reads the predictor/flux dataset and keeps only rows +corresponding to the selected site IDs. +} +\author{ +Yang Gu +} diff --git a/modules/data.land/man/xgb_gapfill.Rd b/modules/data.land/man/xgb_gapfill.Rd new file mode 100644 index 0000000000..7a66779f1f --- /dev/null +++ b/modules/data.land/man/xgb_gapfill.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SIPNET_flux_gapfill.R +\name{xgb_gapfill} +\alias{xgb_gapfill} +\title{xgb_gapfill} +\usage{ +xgb_gapfill( + site.dir, + pred.var.dir, + flux_var, + outdir, + vars_to_check = c("TA_F", "SW_IN_F", "SW_DIF", "LW_IN_JSB", "VPD_F", "PA_F", "P_F", + "RH", "NETRAD", "PPFD_IN", "PPFD_DIF", "TS_F_MDS_1", "LW_IN_F", "TS_F_MDS_3", + "TS_F_MDS_5", "SWC_F_MDS_1", "NDVI", "EVI", "sin_doy", "cos_doy", "SWC_F_MDS_3", + "SWC_F_MDS_5"), + n_sites = 40, + missing_threshold = 40, + nfolds = 10, + nrounds = 200, + nens = 25, + nsamp = 0.95, + params = list(objective = "reg:squarederror", eta = 0.13, max_depth = 13, subsample = + 0.9, colsample_bytree = 0.9), + r2_threshold = 0.6, + cores = max(1L, parallel::detectCores() - 1L), + overwrite = TRUE +) +} +\arguments{ +\item{flux_var}{character: the target flux type to be gap-filled. Currently +supported values are `"LE"` and `"NEE"`.} + +\item{outdir}{character: the output directory where the exported CSV files +will be written.} + +\item{nfolds}{numeric: number of folds used in CV, default is 10.} + +\item{nrounds}{numeric: number of boosting rounds used in XGBoost training, +default is 200.} + +\item{nens}{numeric: number of ensemble iterations, default is 25.} + +\item{nsamp}{numeric: proportion of high-quality observations sampled within +each site for training in each ensemble, default is 0.95.} + +\item{r2_threshold}{numeric: minimum average validation R2 required to +fit the final model and retain a site in the exported gap-filled output} + +\item{cores}{numeric: how many CPUs to be used in the calculation} + +\item{overwrite}{boolean: decide if existing output files should be overwritten, +the default is TRUE.} + +\item{site_ids}{character: a character vector of site IDs to be modeled.} + +\item{resimet.df}{data.frame: a data frame containing eddy-covariance +observations, QC flags, timestamps, and predictor variables.} + +\item{sitecov.df}{data.frame: a data frame containing site-specific selected +predictor names, with columns including `Site_ID` and `filtered_variables`.} +} +\value{ +list containing ensemble-level outputs. Each element corresponds to +one ensemble iteration and includes: +\describe{ + \item{cv_results}{a data frame of site-level average CV R2.} + \item{filled_data}{a data frame of the full input dataset with merged gap-filled values.} + \item{exported_data}{a data frame of filtered gap-filled results for well-performing sites only.} + \item{folds_all}{a data frame of fold-level CV diagnostics across sites.} +} +} +\description{ +This function performs site-level XGBoost gap-filling with CV +for eddy-covariance flux data. For each ensemble iteration, it samples +observed data(QC=0) within each site, fits site-specific XGBoost models +using k-fold CV, predicts missing or low-quality flux values, +and exports both gap-filled outputs and fold-level CV diagnostics. +} +\author{ +Yang Gu +} From 2dce279d131407a21a3eb94dccdc3a953506aa73 Mon Sep 17 00:00:00 2001 From: yanggu Date: Thu, 19 Mar 2026 14:26:57 -0400 Subject: [PATCH 03/13] commit rosxygen files for read_site_ids --- modules/data.land/man/read_site_ids.Rd | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 modules/data.land/man/read_site_ids.Rd diff --git a/modules/data.land/man/read_site_ids.Rd b/modules/data.land/man/read_site_ids.Rd new file mode 100644 index 0000000000..8c4bf35143 --- /dev/null +++ b/modules/data.land/man/read_site_ids.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SIPNET_flux_gapfill.R +\name{read_site_ids} +\alias{read_site_ids} +\title{read_site_ids} +\usage{ +read_site_ids(site.dir, n_sites = 40) +} +\arguments{ +\item{site.dir}{character: path to the site information table.} + +\item{n_sites}{numeric: number of sites to keep from the site table, +the total number of sites is 241.} +} +\value{ +character vector of selected site IDs. Example: 'Site ID' CA-ARB +} +\description{ +This function reads the site information table and returns the selected +site IDs for downstream modeling. +} +\author{ +Yang Gu +} From dd5367886f06f3eaad399ae303077d5584e2d0d0 Mon Sep 17 00:00:00 2001 From: yanggu Date: Thu, 19 Mar 2026 16:37:19 -0400 Subject: [PATCH 04/13] update description --- docker/depends/pecan_package_dependencies.csv | 4 ++ modules/data.land/R/SIPNET_flux_gapfill.R | 65 ++++++++++--------- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv index 1c64dd7120..0fbef5edb4 100644 --- a/docker/depends/pecan_package_dependencies.csv +++ b/docker/depends/pecan_package_dependencies.csv @@ -33,6 +33,7 @@ "data.table","*","base/utils","Suggests",FALSE "data.table","*","base/visualization","Imports",FALSE "data.table","*","models/biocro","Imports",FALSE +"data.table","*","modules/data.land","Imports",FALSE "data.table","*","modules/data.remote","Suggests",FALSE "dataone","*","modules/data.land","Suggests",FALSE "datapack","*","modules/data.land","Suggests",FALSE @@ -41,6 +42,7 @@ "dbplyr",">= 2.4.0","base/db","Imports",FALSE "devtools","*","models/ed","Suggests",FALSE "doParallel","*","modules/data.atmosphere","Suggests",FALSE +"doParallel","*","modules/data.land","Imports",FALSE "doParallel","*","modules/data.remote","Imports",FALSE "doSNOW","*","base/remote","Suggests",FALSE "doSNOW","*","base/utils","Suggests",FALSE @@ -667,6 +669,7 @@ "tibble","*","models/fates","Imports",FALSE "tibble","*","models/lpjguess","Imports",FALSE "tibble","*","modules/data.atmosphere","Imports",FALSE +"tibble","*","modules/data.land","Imports",FALSE "tibble","*","modules/data.remote","Suggests",FALSE "tibble","*","modules/meta.analysis","Suggests",FALSE "tictoc","*","modules/assim.sequential","Suggests",FALSE @@ -722,6 +725,7 @@ "withr","*","modules/data.land","Suggests",FALSE "withr","*","modules/uncertainty","Suggests",FALSE "xgboost","*","modules/assim.sequential","Suggests",FALSE +"xgboost","*","modules/data.land","Imports",FALSE "XML","*","base/workflow","Imports",FALSE "XML","*","models/biocro","Imports",FALSE "XML","*","models/maat","Imports",FALSE diff --git a/modules/data.land/R/SIPNET_flux_gapfill.R b/modules/data.land/R/SIPNET_flux_gapfill.R index 3997513616..17f33823ec 100644 --- a/modules/data.land/R/SIPNET_flux_gapfill.R +++ b/modules/data.land/R/SIPNET_flux_gapfill.R @@ -263,44 +263,45 @@ fit_site_xgb_cv <- function(subdf, ) } -#########Need to add sub-functions to add site_ids, resimet.df, and sitecov.df #' @description -#' This function performs site-level XGBoost gap-filling with CV -#' for eddy-covariance flux data. For each ensemble iteration, it samples -#' observed data(QC=0) within each site, fits site-specific XGBoost models -#' using k-fold CV, predicts missing or low-quality flux values, -#' and exports both gap-filled outputs and fold-level CV diagnostics. +#' Perform site-level XGBoost gap-filling with cross-validation for +#' eddy-covariance flux data. For each ensemble iteration, the function +#' samples high-quality observations (QC = 0) within each site, fits a +#' site-specific XGBoost model using k-fold cross-validation, predicts +#' missing or low-quality flux values, and exports both gap-filled outputs +#' and fold-level CV diagnostics. #' #' @title xgb_gapfill #' -#' @param site_ids character: a character vector of site IDs to be modeled. -#' @param resimet.df data.frame: a data frame containing eddy-covariance -#' observations, QC flags, timestamps, and predictor variables. -#' @param sitecov.df data.frame: a data frame containing site-specific selected -#' predictor names, with columns including `Site_ID` and `filtered_variables`. -#' @param flux_var character: the target flux type to be gap-filled. Currently -#' supported values are `"LE"` and `"NEE"`. -#' @param outdir character: the output directory where the exported CSV files -#' will be written. -#' @param nfolds numeric: number of folds used in CV, default is 10. -#' @param nrounds numeric: number of boosting rounds used in XGBoost training, -#' default is 200. -#' @param nens numeric: number of ensemble iterations, default is 25. -#' @param nsamp numeric: proportion of high-quality observations sampled within -#' each site for training in each ensemble, default is 0.95. -#' @param r2_threshold numeric: minimum average validation R2 required to -#' fit the final model and retain a site in the exported gap-filled output -#' @param cores numeric: how many CPUs to be used in the calculation -#' @param overwrite boolean: decide if existing output files should be overwritten, -#' the default is TRUE. +#' @param site.dir character. Path to the site information table. +#' @param pred.var.dir character. Path to the predictor/flux data file. +#' @param flux_var character. Flux type to be gap-filled. Currently supports +#' `"LE"` and `"NEE"`. +#' @param outdir character. Output directory for exported CSV files. +#' @param vars_to_check character vector. Candidate predictor variables used +#' to build site-specific predictor sets. +#' @param n_sites numeric. Number of sites to keep from the site table. +#' @param missing_threshold numeric. Maximum allowed percentage of missingness +#' when selecting predictors for each site. +#' @param nfolds numeric. Number of folds used in cross-validation. Default is 10. +#' @param nrounds numeric. Number of boosting rounds for XGBoost training. +#' Default is 200. +#' @param nens numeric. Number of ensemble iterations. Default is 25. +#' @param nsamp numeric. Proportion of QC = 0 observations sampled within each +#' site for training in each ensemble. Default is 0.95. +#' @param params list. XGBoost model parameters. +#' @param r2_threshold numeric. Minimum average validation R2 required to keep +#' a site in the exported gap-filled output. +#' @param cores numeric. Number of CPUs used for parallel site-level fitting. +#' @param overwrite logical. Whether existing output files should be overwritten. #' -#' @return list containing ensemble-level outputs. Each element corresponds to -#' one ensemble iteration and includes: +#' @return A list of length `nens`. Each element corresponds to one ensemble +#' iteration and contains: #' \describe{ -#' \item{cv_results}{a data frame of site-level average CV R2.} -#' \item{filled_data}{a data frame of the full input dataset with merged gap-filled values.} -#' \item{exported_data}{a data frame of filtered gap-filled results for well-performing sites only.} -#' \item{folds_all}{a data frame of fold-level CV diagnostics across sites.} +#' \item{cv_results}{data frame of site-level average CV R2.} +#' \item{filled_data}{data frame of the full input dataset with merged gap-filled values.} +#' \item{exported_data}{data frame of filtered gap-filled results for well-performing sites only.} +#' \item{folds_all}{data frame of fold-level CV diagnostics across sites.} #' } #' #' @author Yang Gu From 7c5a76b3513b2cd6cff7690f034ae5c446b9ed26 Mon Sep 17 00:00:00 2001 From: yanggu Date: Thu, 19 Mar 2026 17:47:25 -0400 Subject: [PATCH 05/13] update Rd --- modules/data.land/man/xgb_gapfill.Rd | 67 +++++++++++++++------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/modules/data.land/man/xgb_gapfill.Rd b/modules/data.land/man/xgb_gapfill.Rd index 7a66779f1f..d473940280 100644 --- a/modules/data.land/man/xgb_gapfill.Rd +++ b/modules/data.land/man/xgb_gapfill.Rd @@ -27,54 +27,59 @@ xgb_gapfill( ) } \arguments{ -\item{flux_var}{character: the target flux type to be gap-filled. Currently -supported values are `"LE"` and `"NEE"`.} +\item{site.dir}{character. Path to the site information table.} -\item{outdir}{character: the output directory where the exported CSV files -will be written.} +\item{pred.var.dir}{character. Path to the predictor/flux data file.} -\item{nfolds}{numeric: number of folds used in CV, default is 10.} +\item{flux_var}{character. Flux type to be gap-filled. Currently supports +`"LE"` and `"NEE"`.} -\item{nrounds}{numeric: number of boosting rounds used in XGBoost training, -default is 200.} +\item{outdir}{character. Output directory for exported CSV files.} -\item{nens}{numeric: number of ensemble iterations, default is 25.} +\item{vars_to_check}{character vector. Candidate predictor variables used +to build site-specific predictor sets.} -\item{nsamp}{numeric: proportion of high-quality observations sampled within -each site for training in each ensemble, default is 0.95.} +\item{n_sites}{numeric. Number of sites to keep from the site table.} -\item{r2_threshold}{numeric: minimum average validation R2 required to -fit the final model and retain a site in the exported gap-filled output} +\item{missing_threshold}{numeric. Maximum allowed percentage of missingness +when selecting predictors for each site.} -\item{cores}{numeric: how many CPUs to be used in the calculation} +\item{nfolds}{numeric. Number of folds used in cross-validation. Default is 10.} -\item{overwrite}{boolean: decide if existing output files should be overwritten, -the default is TRUE.} +\item{nrounds}{numeric. Number of boosting rounds for XGBoost training. +Default is 200.} -\item{site_ids}{character: a character vector of site IDs to be modeled.} +\item{nens}{numeric. Number of ensemble iterations. Default is 25.} -\item{resimet.df}{data.frame: a data frame containing eddy-covariance -observations, QC flags, timestamps, and predictor variables.} +\item{nsamp}{numeric. Proportion of QC = 0 observations sampled within each +site for training in each ensemble. Default is 0.95.} -\item{sitecov.df}{data.frame: a data frame containing site-specific selected -predictor names, with columns including `Site_ID` and `filtered_variables`.} +\item{params}{list. XGBoost model parameters.} + +\item{r2_threshold}{numeric. Minimum average validation R2 required to keep +a site in the exported gap-filled output.} + +\item{cores}{numeric. Number of CPUs used for parallel site-level fitting.} + +\item{overwrite}{logical. Whether existing output files should be overwritten.} } \value{ -list containing ensemble-level outputs. Each element corresponds to -one ensemble iteration and includes: +A list of length `nens`. Each element corresponds to one ensemble +iteration and contains: \describe{ - \item{cv_results}{a data frame of site-level average CV R2.} - \item{filled_data}{a data frame of the full input dataset with merged gap-filled values.} - \item{exported_data}{a data frame of filtered gap-filled results for well-performing sites only.} - \item{folds_all}{a data frame of fold-level CV diagnostics across sites.} + \item{cv_results}{data frame of site-level average CV R2.} + \item{filled_data}{data frame of the full input dataset with merged gap-filled values.} + \item{exported_data}{data frame of filtered gap-filled results for well-performing sites only.} + \item{folds_all}{data frame of fold-level CV diagnostics across sites.} } } \description{ -This function performs site-level XGBoost gap-filling with CV -for eddy-covariance flux data. For each ensemble iteration, it samples -observed data(QC=0) within each site, fits site-specific XGBoost models -using k-fold CV, predicts missing or low-quality flux values, -and exports both gap-filled outputs and fold-level CV diagnostics. +Perform site-level XGBoost gap-filling with cross-validation for +eddy-covariance flux data. For each ensemble iteration, the function +samples high-quality observations (QC = 0) within each site, fits a +site-specific XGBoost model using k-fold cross-validation, predicts +missing or low-quality flux values, and exports both gap-filled outputs +and fold-level CV diagnostics. } \author{ Yang Gu From 5e0c31e42a74d31ecd6b0a49ce91edf1f7dbef13 Mon Sep 17 00:00:00 2001 From: yanggu Date: Thu, 19 Mar 2026 19:37:45 -0400 Subject: [PATCH 06/13] Fix NSE notes in xgb_gapfill --- modules/data.land/R/SIPNET_flux_gapfill.R | 278 +++++++++++----------- 1 file changed, 139 insertions(+), 139 deletions(-) diff --git a/modules/data.land/R/SIPNET_flux_gapfill.R b/modules/data.land/R/SIPNET_flux_gapfill.R index 17f33823ec..2543658981 100644 --- a/modules/data.land/R/SIPNET_flux_gapfill.R +++ b/modules/data.land/R/SIPNET_flux_gapfill.R @@ -1,3 +1,9 @@ +utils::globalVariables(c( + "Site_ID", "variable", "value", "missing_count", "total_obs", + "missing_pct", "filtered_variables", "is_night_suncalc", + "TIMESTAMP_START", "GF_QC", "Avg_R2", "site", ":=" +)) + #' @description #' This function reads the site information table and returns the selected #' site IDs for downstream modeling. @@ -39,7 +45,7 @@ read_pred_data <- function(pred.var.dir, site_ids) { dplyr::mutate(is_night_suncalc = as.integer(is_night_suncalc)) resimet.df <- resimet.df |> - dplyr::filter(Site_ID %in% site_ids) + dplyr::filter(.data$Site_ID %in% site_ids) return(resimet.df) } @@ -70,20 +76,20 @@ build_sitecov_df <- function(resimet.df, outdir = NULL, missing_threshold = 40) { missing_summary <- resimet.df |> - dplyr::select(Site_ID, dplyr::all_of(vars_to_check)) |> + dplyr::select(.data$Site_ID, dplyr::all_of(vars_to_check)) |> tidyr::pivot_longer( - cols = -Site_ID, + cols = -.data$Site_ID, names_to = "variable", values_to = "value" ) |> - dplyr::group_by(Site_ID, variable) |> + dplyr::group_by(.data$Site_ID, .data$variable) |> dplyr::summarise( total_obs = dplyr::n(), - missing_count = sum(is.na(value)), - missing_pct = missing_count / total_obs * 100, + missing_count = sum(is.na(.data$value)), + missing_pct = .data$missing_count / .data$total_obs * 100, .groups = "drop" ) |> - dplyr::arrange(Site_ID, dplyr::desc(missing_pct)) + dplyr::arrange(.data$Site_ID, dplyr::desc(.data$missing_pct)) if (!is.null(outdir)) { if (!file.exists(outdir)) { @@ -96,15 +102,15 @@ build_sitecov_df <- function(resimet.df, } high_staying_vars <- missing_summary |> - dplyr::filter(missing_pct < missing_threshold) |> - dplyr::group_by(Site_ID) |> + dplyr::filter(.data$missing_pct < missing_threshold) |> + dplyr::group_by(.data$Site_ID) |> dplyr::summarise( - filtered_variables = list(variable), + filtered_variables = list(.data$variable), .groups = "drop" ) sitecov.df <- high_staying_vars |> - tidyr::unnest(filtered_variables) + tidyr::unnest(.data$filtered_variables) return(list( missing_summary = missing_summary, @@ -334,29 +340,32 @@ xgb_gapfill <- function(site.dir, cores = max(1L, parallel::detectCores() - 1L), overwrite = TRUE) { + # ------------------------------------------------------------ + # Check required packages + # ------------------------------------------------------------ if (!requireNamespace("xgboost", quietly = TRUE)) { - stop("Package 'xgboost' is required for cv_xgb_site_cv_gapfill() but is not installed.") + stop("Package 'xgboost' is required but not installed.") } if (!requireNamespace("doParallel", quietly = TRUE)) { - stop("Package 'doParallel' is required for parallel execution but is not installed.") + stop("Package 'doParallel' is required but not installed.") } if (!requireNamespace("foreach", quietly = TRUE)) { - stop("Package 'foreach' is required for parallel execution but is not installed.") + stop("Package 'foreach' is required but not installed.") } - # Read target site IDs - site_ids <- read_site_ids( - site.dir = site.dir, - n_sites = n_sites - ) + # ------------------------------------------------------------ + # Read site IDs and input data + # ------------------------------------------------------------ + site_ids <- read_site_ids(site.dir = site.dir, n_sites = n_sites) - # Read predictor and flux data for selected sites resimet.df <- read_pred_data( pred.var.dir = pred.var.dir, site_ids = site_ids ) - # Build site-specific predictor table + # ------------------------------------------------------------ + # Build site-specific predictor sets + # ------------------------------------------------------------ sitecov_res <- build_sitecov_df( resimet.df = resimet.df, vars_to_check = vars_to_check, @@ -365,7 +374,9 @@ xgb_gapfill <- function(site.dir, ) sitecov.df <- sitecov_res$sitecov.df - # Map flux type to variable names + # ------------------------------------------------------------ + # Define response and QC variables + # ------------------------------------------------------------ flux_var <- toupper(flux_var) if (flux_var == "LE") { response_var <- "LE_F_MDS" @@ -376,55 +387,60 @@ xgb_gapfill <- function(site.dir, qc_var <- "NEE_CUT_USTAR50_QC" gapfill_var <- "NEE_gapfill" } else { - stop("Unsupported flux_var. Currently only 'LE' and 'NEE' are supported.") + stop("flux_var must be 'LE' or 'NEE'") } - # Validate required columns in input data + # ------------------------------------------------------------ + # Check required columns + # ------------------------------------------------------------ required_cols <- c("Site_ID", "TIMESTAMP_START", response_var, qc_var) missing_cols <- setdiff(required_cols, names(resimet.df)) if (length(missing_cols) > 0) { - stop( - sprintf( - "Missing required columns in resimet.df: %s", - paste(missing_cols, collapse = ", ") - ) - ) - } - if (!file.exists(outdir)) { - dir.create(outdir, recursive = TRUE) + stop(sprintf("Missing columns: %s", paste(missing_cols, collapse = ", "))) } - # Filter high-quality observations for training (QC = 0) + if (!file.exists(outdir)) dir.create(outdir, recursive = TRUE) + + # ------------------------------------------------------------ + # Filter QC=0 data for training + # ------------------------------------------------------------ data_good <- resimet.df |> dplyr::filter(.data[[qc_var]] == 0) ensemble_results <- vector("list", nens) - # Loop over ensemble members + # ------------------------------------------------------------ + # Ensemble loop + # ------------------------------------------------------------ for (m in seq_len(nens)) { message(sprintf("Ensemble %d/%d", m, nens)) - # Stratified Sampling: Sample 95% from within each site for training. + # Sample within each site train_df <- data_good |> - dplyr::group_by(Site_ID) |> + dplyr::group_by(.data$Site_ID) |> dplyr::slice_sample(prop = nsamp) |> dplyr::ungroup() - # Build site-specific training datasets + # ------------------------------------------------------------ + # Build site-level datasets + # ------------------------------------------------------------ site_dfs <- lapply(site_ids, function(site) { + + # Select predictors for this site feats0 <- sitecov.df |> - dplyr::filter(Site_ID == site) |> - dplyr::pull(filtered_variables) |> + dplyr::filter(.data$Site_ID == .env$site) |> + dplyr::pull(.data$filtered_variables) |> unique() feats <- unique(c(feats0, "is_night_suncalc")) feats <- intersect(feats, names(train_df)) + # Subset training data df <- train_df |> - dplyr::filter(Site_ID == site) |> + dplyr::filter(.data$Site_ID == .env$site) |> dplyr::select( - TIMESTAMP_START, - Site_ID, + .data$TIMESTAMP_START, + .data$Site_ID, dplyr::all_of(response_var), dplyr::all_of(feats) ) |> @@ -433,95 +449,90 @@ xgb_gapfill <- function(site.dir, attr(df, "features") <- feats df }) + names(site_dfs) <- site_ids - # Parallel model fitting and gap-filling by site + # ------------------------------------------------------------ + # Parallel fitting + # ------------------------------------------------------------ use_cores <- min(as.numeric(cores), length(site_ids)) cl <- parallel::makeCluster(use_cores, type = "PSOCK") doParallel::registerDoParallel(cl) + results <- tryCatch({ foreach::foreach( site = site_ids, .packages = c("dplyr", "xgboost", "tibble"), - .combine = dplyr::bind_rows, - .errorhandling = "pass" + .combine = dplyr::bind_rows ) %dopar% { - tryCatch({ - df <- site_dfs[[site]] - feats <- attr(df, "features") - - # Fit XGBoost model with CV - fit <- fit_site_xgb_cv( - subdf = df, - feats = feats, - site_name = site, - response_var = response_var, - nfolds = nfolds, - params = params, - nrounds = nrounds, - r2_threshold = r2_threshold + + df <- site_dfs[[site]] + feats <- attr(df, "features") + + # Fit model + fit <- fit_site_xgb_cv( + subdf = df, + feats = feats, + site_name = site, + response_var = response_var, + nfolds = nfolds, + params = params, + nrounds = nrounds, + r2_threshold = r2_threshold + ) + + # -------------------------------------------------------- + # Gap-fill candidates + # -------------------------------------------------------- + gap_cands <- resimet.df |> + dplyr::filter( + .data$Site_ID == .env$site, + is.na(.data[[response_var]]) | .data[[qc_var]] != 0 + ) |> + dplyr::select( + .data$TIMESTAMP_START, + .data$Site_ID, + dplyr::all_of(feats) + ) |> + tibble::as_tibble() + + pd <- gap_cands + + # Predict + if (!is.null(fit$model) && nrow(pd) > 0) { + dtest <- xgboost::xgb.DMatrix( + data = as.matrix(dplyr::select(pd, dplyr::all_of(feats))), + missing = NA ) - - # Identify gap-fill candidates - gap_cands <- resimet.df |> - dplyr::filter( - Site_ID == site, - is.na(.data[[response_var]]) | .data[[qc_var]] != 0 - ) |> - dplyr::select(TIMESTAMP_START, Site_ID, dplyr::all_of(feats)) |> - tibble::as_tibble() - - pd <- gap_cands - - # Gap fill flux if model is valid - if (!is.null(fit$model) && nrow(pd) > 0) { - dtest <- xgboost::xgb.DMatrix( - data = as.matrix(dplyr::select(pd, dplyr::all_of(feats))), - missing = NA - ) - pd[[gapfill_var]] <- stats::predict(fit$model, dtest) - pd$GF_QC <- 0L - } else if (nrow(pd) > 0) { - pd[[gapfill_var]] <- NA_real_ - pd$GF_QC <- 1L - } - - # Return site-level results - tibble::tibble( - Site_ID = site, - Avg_R2 = fit$r2, - pred_df = list(pd), - folds_df = list(dplyr::mutate(fit$folds_df, Site_ID = site)) - ) - }, error = function(e) { - message(sprintf("[Site %s] ERROR: %s", site, conditionMessage(e))) - tibble::tibble( - Site_ID = site, - Avg_R2 = NA_real_, - pred_df = list(NULL), - folds_df = list(tibble::tibble()) - ) - }) + pd[[gapfill_var]] <- stats::predict(fit$model, dtest) + pd$GF_QC <- 0L + } else if (nrow(pd) > 0) { + pd[[gapfill_var]] <- NA_real_ + pd$GF_QC <- 1L + } + + tibble::tibble( + Site_ID = site, + Avg_R2 = fit$r2, + pred_df = list(pd), + folds_df = list(dplyr::mutate(fit$folds_df, Site_ID = .env$site)) + ) } - }, error = function(e) { - message(sprintf("Parallel loop error in ensemble %d: %s", m, conditionMessage(e))) - NULL }, finally = { parallel::stopCluster(cl) foreach::registerDoSEQ() }) - # Skip ensemble if no results if (is.null(results) || nrow(results) == 0) { ensemble_results[[m]] <- NULL next } - # Combine gap-fill predictions across sites - pred_list <- results$pred_df - gap_all <- dplyr::bind_rows(pred_list) + # ------------------------------------------------------------ + # Merge predictions + # ------------------------------------------------------------ + gap_all <- dplyr::bind_rows(results$pred_df) - # Merge predictions back to full dataset resimet_filled <- resimet.df |> dplyr::left_join(gap_all, by = c("Site_ID", "TIMESTAMP_START")) |> dplyr::mutate( @@ -530,58 +541,47 @@ xgb_gapfill <- function(site.dir, .data[[gapfill_var]], .data[[response_var]] ), - GF_QC = dplyr::if_else(is.na(GF_QC), 1L, GF_QC) + GF_QC = dplyr::if_else(is.na(.data$GF_QC), 1L, .data$GF_QC) ) - # Extract CV performance + # ------------------------------------------------------------ + # Select good sites + # ------------------------------------------------------------ cv_results <- results |> - dplyr::select(Site_ID, Avg_R2) + dplyr::select(.data$Site_ID, .data$Avg_R2) - # Select well-performing sites good_sites <- cv_results |> - dplyr::filter(!is.na(Avg_R2) & Avg_R2 >= r2_threshold) |> - dplyr::pull(Site_ID) + dplyr::filter(!is.na(.data$Avg_R2) & .data$Avg_R2 >= r2_threshold) |> + dplyr::pull(.data$Site_ID) - # Subset final output for good sites sub.df <- resimet_filled |> - dplyr::filter(Site_ID %in% good_sites) |> + dplyr::filter(.data$Site_ID %in% good_sites) |> dplyr::select( - Site_ID, - TIMESTAMP_START, + .data$Site_ID, + .data$TIMESTAMP_START, dplyr::all_of(response_var), dplyr::all_of(gapfill_var), - GF_QC, + .data$GF_QC, dplyr::all_of(qc_var) ) - # Aggregate fold-level CV diagnostics - folds_all <- dplyr::bind_rows(results$folds_df) |> - dplyr::mutate(ensemble_id = m) - + # ------------------------------------------------------------ + # Save outputs + # ------------------------------------------------------------ ens_tag <- sprintf("ens%02d", m) flux_tag <- tolower(flux_var) - data_path <- file.path(outdir, sprintf("%s_gfed_dat_%s.csv", flux_tag, ens_tag)) - folds_path <- file.path(outdir, sprintf("%s_gfed_cv_folds_%s.csv", flux_tag, ens_tag)) - - if (overwrite || !file.exists(data_path)) { - data.table::fwrite(sub.df, data_path) - } - - if (overwrite || !file.exists(folds_path)) { - data.table::fwrite(folds_all, folds_path) - } + data.table::fwrite( + sub.df, + file.path(outdir, sprintf("%s_gfed_dat_%s.csv", flux_tag, ens_tag)) + ) - # Store ensemble results ensemble_results[[m]] <- list( cv_results = cv_results, filled_data = resimet_filled, - exported_data = sub.df, - folds_all = folds_all + exported_data = sub.df ) - rm(results, pred_list, gap_all, resimet_filled, cv_results, good_sites, - sub.df, folds_all, site_dfs, train_df) gc(FALSE) } From 42d8f0cbd069a20807d167523fbf1f5696c411fe Mon Sep 17 00:00:00 2001 From: yanggu Date: Thu, 19 Mar 2026 20:03:47 -0400 Subject: [PATCH 07/13] put packages into Suggests --- docker/depends/pecan_package_dependencies.csv | 8 ++++---- modules/data.land/DESCRIPTION | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv index 0fbef5edb4..230e11c589 100644 --- a/docker/depends/pecan_package_dependencies.csv +++ b/docker/depends/pecan_package_dependencies.csv @@ -33,7 +33,7 @@ "data.table","*","base/utils","Suggests",FALSE "data.table","*","base/visualization","Imports",FALSE "data.table","*","models/biocro","Imports",FALSE -"data.table","*","modules/data.land","Imports",FALSE +"data.table","*","modules/data.land","Suggests",FALSE "data.table","*","modules/data.remote","Suggests",FALSE "dataone","*","modules/data.land","Suggests",FALSE "datapack","*","modules/data.land","Suggests",FALSE @@ -42,7 +42,7 @@ "dbplyr",">= 2.4.0","base/db","Imports",FALSE "devtools","*","models/ed","Suggests",FALSE "doParallel","*","modules/data.atmosphere","Suggests",FALSE -"doParallel","*","modules/data.land","Imports",FALSE +"doParallel","*","modules/data.land","Suggests",FALSE "doParallel","*","modules/data.remote","Imports",FALSE "doSNOW","*","base/remote","Suggests",FALSE "doSNOW","*","base/utils","Suggests",FALSE @@ -669,7 +669,7 @@ "tibble","*","models/fates","Imports",FALSE "tibble","*","models/lpjguess","Imports",FALSE "tibble","*","modules/data.atmosphere","Imports",FALSE -"tibble","*","modules/data.land","Imports",FALSE +"tibble","*","modules/data.land","Suggests",FALSE "tibble","*","modules/data.remote","Suggests",FALSE "tibble","*","modules/meta.analysis","Suggests",FALSE "tictoc","*","modules/assim.sequential","Suggests",FALSE @@ -725,7 +725,7 @@ "withr","*","modules/data.land","Suggests",FALSE "withr","*","modules/uncertainty","Suggests",FALSE "xgboost","*","modules/assim.sequential","Suggests",FALSE -"xgboost","*","modules/data.land","Imports",FALSE +"xgboost","*","modules/data.land","Suggests",FALSE "XML","*","base/workflow","Imports",FALSE "XML","*","models/biocro","Imports",FALSE "XML","*","models/maat","Imports",FALSE diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION index ba5ad75b22..28fc5cd7be 100644 --- a/modules/data.land/DESCRIPTION +++ b/modules/data.land/DESCRIPTION @@ -46,11 +46,7 @@ Imports: terra, tidyr, tidyselect, - XML (>= 3.98-1.4), - data.table, - tibble, - xgboost, - doParallel + XML (>= 3.98-1.4) Suggests: dataone, datapack, @@ -77,7 +73,11 @@ Suggests: sp, testthat (>= 3.1.0), traits, - withr + withr, + xgboost, + doParallel, + data.table, + tibble Remotes: github::ropensci/traits License: BSD_3_clause + file LICENSE From b6bdea587c09a81c5623dd434d9297ef47731559 Mon Sep 17 00:00:00 2001 From: yanggu Date: Thu, 16 Apr 2026 21:36:01 -0400 Subject: [PATCH 08/13] data prep for downscaling and bcm code --- .../assim.sequential/inst/bias_correction.R | 747 ++++++++++++++++++ modules/assim.sequential/inst/data_prep.R | 633 +++++++++++++++ 2 files changed, 1380 insertions(+) create mode 100644 modules/assim.sequential/inst/bias_correction.R create mode 100644 modules/assim.sequential/inst/data_prep.R diff --git a/modules/assim.sequential/inst/bias_correction.R b/modules/assim.sequential/inst/bias_correction.R new file mode 100644 index 0000000000..f714ba8709 --- /dev/null +++ b/modules/assim.sequential/inst/bias_correction.R @@ -0,0 +1,747 @@ +## Load packages +library(data.table) +library(dplyr) +library(xgboost) +library(parallel) +library(doParallel) +library(caret) + +##' @title parse_bcm_args +##' @name parse_bcm_args +##' @author Yang Gu +##' +##' @param args Character vector from commandArgs(trailingOnly = TRUE). +##' +##' @description +##' Parse command-line arguments for flux type and response variable. +##' The first argument is the flux type ("LE" or "NEE"), and the second +##' argument is the raw ensemble variable name (ens01). +##' +##' @return +##' A list containing `flux_type`, `raw_response_var`, and `response_var`. +parse_bcm_args <- function(args) { + flux_type <- if (length(args) >= 1) toupper(args[1]) else "LE" + raw_response_var <- if (length(args) >= 2) args[2] else "ens01" + + if (!flux_type %in% c("LE", "NEE")) { + stop("flux_type must be either 'LE' or 'NEE'.") + } + + response_var <- paste0(raw_response_var, "_residual") + + return(list( + flux_type = flux_type, + raw_response_var = raw_response_var, + response_var = response_var + )) +} + +##' @title get_flux_config +##' @name get_flux_config +##' @author Yang Gu +##' +##' @param flux_type Character string, either "LE" or "NEE". +##' +##' @description +##' Return flux-specific configuration, including observation column names, +##' model mean column names, columns to remove, and output suffixes. +##' +##' @return +##' A list of flux-specific settings. +get_flux_config <- function(flux_type) { + if (flux_type == "LE") { + return(list( + flux_type = "LE", + mean_col = "Qle_mean", + mean_std_col = "flux_mean", + obs_col = "LE_F_MDS", + obs_qc_col = "LE_F_MDS_QC", + residual_col = "LE_residual", + corrected_col = "LE_corrected", + pred_residual_col = "LE_resi_pred", + gfed_file = "/projectnb/dietzelab/guYANG/Gap_fill/results/le_ens_ec_3h.csv", + output_tag = "le" + )) + } + + if (flux_type == "NEE") { + return(list( + flux_type = "NEE", + mean_col = "NEE_mean", + mean_std_col = "flux_mean", + obs_col = "NEE_CUT_USTAR50", + obs_qc_col = "NEE_CUT_USTAR50_QC", + residual_col = "NEE_residual", + corrected_col = "NEE_corrected", + pred_residual_col = "NEE_resi_pred", + gfed_file = "/projectnb/dietzelab/guYANG/Gap_fill/results/ens_ec_3h.csv", + output_tag = "nee" + )) + } + + stop("Unsupported flux_type.") +} + +##' @title read_close_points_and_envres +##' @name read_close_points_and_envres +##' @author Yang Gu +##' +##' @param matched_file Path to matched point-site file. +##' @param envres_file Path to .RData file containing `envres`. +##' +##' @description +##' Read the nearest matched site-index table and load the integrated envres +##' dataset from .RData. +##' +##' @return +##' A list containing `close_points_df` and `envres`. +read_close_points_and_envres <- function(matched_file, envres_file) { + close_points_df <- fread(matched_file) + setDT(close_points_df) + close_points_df <- close_points_df[order(min_dist_m), .SD[1], by = index] + + load(envres_file) + + return(list( + close_points_df = close_points_df, + envres = envres + )) +} + +##' @title filter_envres_for_flux +##' @name filter_envres_for_flux +##' @author Yang Gu +##' +##' @param envres Environmental result table. +##' @param flux_cfg Flux-specific configuration list. +##' +##' @description +##' Remove flux-specific observed columns, filter land-cover classes, +##' and restrict the date range used in BCM. +##' +##' @return +##' A filtered data.table. +filter_envres_for_flux <- function(envres, flux_cfg) { + cols_to_drop <- intersect( + c(flux_cfg$obs_col, flux_cfg$obs_qc_col, flux_cfg$residual_col), + names(envres) + ) + if (length(cols_to_drop) > 0) { + envres[, (cols_to_drop) := NULL] + } + + envres <- envres[as.numeric(as.character(LC)) <= 4] + + cutoff_date <- as.POSIXct("2012-07-15 23:59:59", tz = "UTC") + envres <- envres[utc > cutoff_date] + + return(envres) +} + +##' @title add_gapfill_and_predictors +##' @name add_gapfill_and_predictors +##' @author Yang Gu +##' +##' @param envres Environmental result table. +##' @param gfed Gap-fill ensemble result table. +##' @param close_points_df Nearest site-index table. +##' @param raw_response_var Raw ensemble variable name, e.g. "ens01". +##' @param flux_cfg Flux-specific configuration list. +##' +##' @description +##' Merge a selected ensemble gap-fill column into `envres`, add predictor +##' transformations, standardize the model mean column to `flux_mean`, +##' and construct the response residual. +##' +##' @return +##' A processed data.table ready for BCM modeling. +add_gapfill_and_predictors <- function(envres, + gfed, + close_points_df, + raw_response_var, + flux_cfg) { + cols_to_merge <- c(raw_response_var) + gfed_sub <- gfed[, c("Site_ID", "utc", cols_to_merge), with = FALSE] + + setkey(envres, Site_ID, utc) + setkey(gfed_sub, Site_ID, utc) + envres <- gfed_sub[envres] + + envres[, c("LC", "is_day", "KGC") := lapply(.SD, as.factor), + .SDcols = c("LC", "is_day", "KGC")] + + envres[, t2m2 := t2m^2] + envres[, PPFD2 := PPFD^2] + envres[, EVI2 := EVI^2] + envres[, VPD_t2m := VPD * t2m] + envres[, sincos := sin_doy * cos_doy] + + envres[, Site_ID := NULL] + + setkey(envres, index) + setkey(close_points_df, index) + envres[close_points_df, Site_ID := i.Site_ID] + + envres[EVI == -3000, EVI := NA_real_] + envres[NDVI == -3000, NDVI := NA_real_] + + if (flux_cfg$mean_col %in% names(envres) && + flux_cfg$mean_col != flux_cfg$mean_std_col) { + setnames(envres, old = flux_cfg$mean_col, new = flux_cfg$mean_std_col) + } + + cols_ens <- grep("ens", names(envres), value = TRUE) + cols_to_keep <- raw_response_var + cols_to_remove <- setdiff(cols_ens, cols_to_keep) + if (length(cols_to_remove) > 0) { + envres[, (cols_to_remove) := NULL] + } + + response_residual_col <- paste0(raw_response_var, "_residual") + envres[, (response_residual_col) := get(flux_cfg$mean_std_col) - get(raw_response_var)] + + return(envres) +} + +##' @title split_bcm_data +##' @name split_bcm_data +##' @author Yang Gu +##' +##' @param envres Environmental result table. +##' @param response_var Response column name. +##' +##' @description +##' Split the data into observed-site training data and unobserved-site +##' prediction data. +##' +##' @return +##' A list containing `tenvres` and `pdat`. +split_bcm_data <- function(envres, response_var) { + indices_to_keep <- envres[!is.na(Site_ID), unique(index)] + + tenvres <- envres[index %in% indices_to_keep] + setDT(tenvres) + tenvres[, LC := factor(as.character(LC))] + tenvres <- tenvres[!is.na(get(response_var))] + tenvres[, `:=`( + LC = as.factor(LC), + is_day = as.factor(is_day), + KGC = as.factor(KGC), + index = as.character(index) + )] + + envres <- envres[!(index %in% indices_to_keep)] + pdat <- envres[is.na(get(response_var))] + + return(list( + tenvres = tenvres, + pdat = pdat + )) +} + +##' @title get_bcm_model_components +##' @name get_bcm_model_components +##' @author Yang Gu +##' +##' @description +##' Return the predictor list, formula, and XGBoost hyperparameter settings +##' used in the BCM workflow. +##' +##' @return +##' A list of modeling components. +get_bcm_model_components <- function() { + predictors_all <- c( + "t2m", "sp", "d2m", "tp", "strd", "KGC", "lon", + "sin_doy", "cos_doy", "NDVI", "EVI", "twi", "PH", "Sand", "lat", + "agb", "SOC", "N", "year_since_disturb", "PPFD", "WindSpeed", + "t2m2", "PPFD2", "EVI2", "VPD_t2m", "sincos", + "sm_0_10", "sm_10_40", "sm_40_100", "sm_100_200", + "VPD", "is_day" + ) + + pred_formula <- as.formula( + paste0("~ ", paste(predictors_all, collapse = " + "), " - 1") + ) + + ### TBD: set the specific settings for different LC type + lc_param_list <- list( + `10` = list(objective = "reg:squarederror", eta = 0.02, max_depth = 4, + subsample = 0.5, colsample_bytree = 0.5, + min_child_weight = 20, gamma = 2, lambda = 4, alpha = 3), + `40` = list(objective = "reg:squarederror", eta = 0.025, max_depth = 5, + subsample = 0.55, colsample_bytree = 0.6, + min_child_weight = 15, gamma = 1, lambda = 4, alpha = 2), + `50` = list(objective = "reg:squarederror", eta = 0.015, max_depth = 3, + subsample = 0.35, colsample_bytree = 0.4, + min_child_weight = 40, gamma = 4, lambda = 6, alpha = 5), + `60` = list(objective = "reg:squarederror", eta = 0.02, max_depth = 3, + subsample = 0.4, colsample_bytree = 0.5, + min_child_weight = 25, gamma = 3, lambda = 4, alpha = 3), + `70` = list(objective = "reg:squarederror", eta = 0.02, max_depth = 3, + subsample = 0.4, colsample_bytree = 0.4, + min_child_weight = 30, gamma = 4, lambda = 5, alpha = 4) + ) + + default_params <- list( + objective = "reg:squarederror", eta = 0.1, max_depth = 6, + subsample = 0.8, colsample_bytree = 0.8, + min_child_weight = 10, gamma = 0.5, lambda = 3, alpha = 2 + ) + + return(list( + predictors_all = predictors_all, + pred_formula = pred_formula, + lc_param_list = lc_param_list, + default_params = default_params, + nrounds_fixed = 200 + )) +} + +##' @title get_valid_lc_tasks +##' @name get_valid_lc_tasks +##' @author Yang Gu +##' +##' @param tenvres Training data.table. +##' +##' @description +##' Construct leave-one-site-out CV tasks for LC classes with more than one site. +##' +##' @return +##' A data.frame of CV tasks. +get_valid_lc_tasks <- function(tenvres) { + lc_counts <- tenvres %>% + distinct(LC, index) %>% + group_by(LC) %>% + summarise(n_sites = n(), .groups = "drop") + + valid_lcs <- lc_counts %>% + filter(n_sites > 1) %>% + pull(LC) + + tasks <- tenvres %>% + filter(LC %in% valid_lcs) %>% + distinct(LC, index) %>% + arrange(LC, index) + + return(tasks) +} + +##' @title run_bcm_cv +##' @name run_bcm_cv +##' @author Yang Gu +##' +##' @param tenvres Training data.table. +##' @param tasks CV task table. +##' @param predictors_all Predictor column names. +##' @param response_var Response column name. +##' @param nrounds_fixed Fixed number of boosting rounds. +##' @param lc_param_list LC-specific XGBoost parameter list. +##' @param default_params Default XGBoost parameter list. +##' @param flux_cfg Flux-specific configuration list. +##' +##' @description +##' Run leave-one-site-out cross validation within land-cover classes. +##' +##' @return +##' A data.frame of CV predictions and diagnostics. +run_bcm_cv <- function(tenvres, + tasks, + predictors_all, + response_var, + nrounds_fixed, + lc_param_list, + default_params, + flux_cfg) { + ncores <- max(detectCores() - 1, 1) + cl <- makeCluster(ncores) + registerDoParallel(cl) + + cv_results <- foreach( + i = seq_len(nrow(tasks)), + .packages = c("dplyr", "xgboost", "data.table"), + .combine = rbind, + .export = c("tenvres", "predictors_all", "response_var", + "nrounds_fixed", "tasks", "lc_param_list", + "default_params", "flux_cfg") + ) %dopar% { + lc <- tasks$LC[i] + site <- tasks$index[i] + + df_lc <- tenvres %>% filter(LC == lc) + train_df <- df_lc %>% filter(index != site) + test_df <- df_lc %>% filter(index == site) + + if (nrow(train_df) == 0 || nrow(test_df) == 0) { + return(NULL) + } + + train_x <- data.matrix(train_df[, predictors_all, with = FALSE]) + test_x <- data.matrix(test_df[, predictors_all, with = FALSE]) + + train_y <- train_df[[response_var]] + test_y <- test_df[[response_var]] + + dtrain <- xgb.DMatrix(data = train_x, label = train_y, missing = NA) + dtest <- xgb.DMatrix(data = test_x, label = test_y, missing = NA) + watchlist <- list(train = dtrain, eval = dtest) + + params <- if (as.character(lc) %in% names(lc_param_list)) { + lc_param_list[[as.character(lc)]] + } else { + default_params + } + + model_i <- xgb.train( + params = params, + data = dtrain, + nrounds = nrounds_fixed, + watchlist = watchlist, + eval_metric = "rmse", + early_stopping_rounds = 20, + nthread = 1, + verbose = 0 + ) + + preds_i <- predict(model_i, test_x) + ev <- as.data.frame(model_i$evaluation_log) + + final_train_rmse <- ev$train_rmse[nrow(ev)] + final_eval_rmse <- ev$eval_rmse[nrow(ev)] + best_iter <- which.min(ev$eval_rmse) + + n_check <- 5 + if (nrow(ev) >= n_check + 1) { + train_tail <- tail(ev$train_rmse, n_check + 1) + eval_tail <- tail(ev$eval_rmse, n_check + 1) + train_trend <- diff(train_tail) + eval_trend <- diff(eval_tail) + has_overfit <- all(train_trend < 0) && all(eval_trend > 0) + } else { + has_overfit <- FALSE + } + + rmse_gap <- final_eval_rmse - final_train_rmse + rmse_ratio <- final_eval_rmse / final_train_rmse + + n <- nrow(test_df) + + data.frame( + LC = rep(lc, n), + index = rep(site, n), + utc = test_df$utc, + flux_mean = test_df[[flux_cfg$mean_std_col]], + observed = test_df[[response_var]], + predicted = preds_i, + train_rmse = rep(final_train_rmse, n), + eval_rmse = rep(final_eval_rmse, n), + rmse_gap = rep(rmse_gap, n), + rmse_ratio = rep(rmse_ratio, n), + best_iter = rep(best_iter, n), + has_overfit = rep(has_overfit, n), + stringsAsFactors = FALSE + ) + } + + stopCluster(cl) + registerDoSEQ() + + cv_results <- cv_results %>% + mutate( + flux_pred = flux_mean - predicted, + flux_obs = flux_mean - observed + ) + + return(cv_results) +} + +##' @title predict_bcm +##' @name predict_bcm +##' @author Yang Gu +##' +##' @param tenvres Training data.table. +##' @param pdat Prediction data.table. +##' @param response_var Response column name. +##' @param pred_formula Formula used for model.matrix construction. +##' @param predictors_all Predictor column names. +##' @param nrounds_fixed Fixed number of boosting rounds. +##' @param lc_param_list LC-specific XGBoost parameter list. +##' @param default_params Default XGBoost parameter list. +##' @param flux_cfg Flux-specific configuration list. +##' +##' @description +##' Fit one model per LC using all observed sites and predict residual +##' corrections for unobserved points. +##' +##' @return +##' A prediction data.table with flux-corrected outputs. +predict_bcm <- function(tenvres, + pdat, + response_var, + pred_formula, + predictors_all, + nrounds_fixed, + lc_param_list, + default_params, + flux_cfg) { + pdat[, flux_resi_pred := NA_real_] + + for (lc in levels(tenvres$LC)) { + df_lc <- tenvres[LC == lc] + if (nrow(df_lc) < 10) next + + mf_train_lc <- model.frame(pred_formula, data = df_lc, na.action = na.pass) + mat_train_lc <- model.matrix(pred_formula, data = mf_train_lc) + label_train_lc <- df_lc[[response_var]] + dtrain_lc <- xgb.DMatrix(data = mat_train_lc, label = label_train_lc, missing = NA) + + params_lc <- if (lc %in% names(lc_param_list)) lc_param_list[[lc]] else default_params + + bst_lc <- xgb.train( + params = params_lc, + data = dtrain_lc, + nrounds = nrounds_fixed, + verbose = 0 + ) + + pdat_lc <- pdat[LC == lc] + if (nrow(pdat_lc) == 0) next + + mf_p_lc <- model.frame(pred_formula, data = pdat_lc, na.action = na.pass) + mat_p_lc <- model.matrix(pred_formula, data = mf_p_lc) + dtest_lc <- xgb.DMatrix(data = mat_p_lc, missing = NA) + preds_lc <- predict(bst_lc, dtest_lc) + + pdat[LC == lc, flux_resi_pred := preds_lc] + } + + pdat[, flux_corrected := get(flux_cfg$mean_std_col) - flux_resi_pred] + + if (flux_cfg$pred_residual_col != "flux_resi_pred") { + pdat[, (flux_cfg$pred_residual_col) := flux_resi_pred] + } + if (flux_cfg$corrected_col != "flux_corrected") { + pdat[, (flux_cfg$corrected_col) := flux_corrected] + } + + return(pdat) +} + +##' @title compute_bcm_metrics +##' @name compute_bcm_metrics +##' @author Yang Gu +##' +##' @param cv_results Cross-validation result table. +##' +##' @description +##' Compute site-level, LC-level, and corrected-vs-baseline R2 summaries, +##' as well as overfitting diagnostics. +##' +##' @return +##' A list of summary tables. +compute_bcm_metrics <- function(cv_results) { + r2_by_site <- cv_results %>% + group_by(LC, index) %>% + summarise( + R2 = 1 - sum((observed - predicted)^2) / + sum((observed - mean(observed))^2), + .groups = "drop" + ) + + r2_by_lc <- r2_by_site %>% + group_by(LC) %>% + summarise( + R2 = mean(R2), + .groups = "drop" + ) + + overfit_table <- cv_results %>% + group_by(LC, index) %>% + summarise( + train_rmse = first(train_rmse), + eval_rmse = first(eval_rmse), + rmse_gap = first(rmse_gap), + rmse_ratio = first(rmse_ratio), + has_overfit = first(has_overfit), + .groups = "drop" + ) + + r2_by_index <- cv_results %>% + group_by(LC, index) %>% + summarise( + R2_fused = cor(flux_pred, flux_obs, use = "complete.obs")^2, + R2_baseline = cor(flux_mean, flux_obs, use = "complete.obs")^2, + .groups = "drop" + ) + + return(list( + r2_by_site = r2_by_site, + r2_by_lc = r2_by_lc, + overfit_table = overfit_table, + r2_by_index = r2_by_index + )) +} + +##' @title write_bcm_outputs +##' @name write_bcm_outputs +##' @author Yang Gu +##' +##' @param file_prefix Output file prefix. +##' @param flux_cfg Flux-specific configuration list. +##' @param overfit_table Overfitting summary table. +##' @param pdat Prediction output table. +##' @param r2_by_site Site-level R2 table. +##' @param r2_by_lc LC-level R2 table. +##' @param cv_results Cross-validation output table. +##' @param r2_by_index Index-level corrected-vs-baseline R2 table. +##' +##' @description +##' Write BCM outputs to disk, using flux-specific file suffixes. +##' +##' @return +##' No return value. +write_bcm_outputs <- function(file_prefix, + flux_cfg, + overfit_table, + pdat, + r2_by_site, + r2_by_lc, + cv_results, + r2_by_index) { + fwrite(overfit_table, paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_overfit.csv")) + fwrite(pdat, paste0(file_prefix, "_pred_lc1_", flux_cfg$output_tag, ".csv")) + + write.csv( + r2_by_site, + paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_site.csv"), + row.names = FALSE, quote = TRUE + ) + write.csv( + r2_by_lc, + paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_r2.csv"), + row.names = FALSE, quote = TRUE + ) + write.csv( + cv_results, + paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_cv.csv"), + row.names = FALSE, quote = TRUE + ) + write.csv( + r2_by_index, + paste0(file_prefix, "_lc1_", flux_cfg$output_tag, "_index_improve_r2.csv"), + row.names = FALSE + ) +} + +##' @title run_bcm_pipeline +##' @name run_bcm_pipeline +##' @author Yang Gu +##' +##' @param flux_type Flux type, either "LE" or "NEE". +##' @param raw_response_var Raw ensemble response variable, e.g. "ens01". +##' @param envres_file Path to envres .RData file. +##' @param matched_file Path to matched point-site CSV file. +##' @param output_dir Directory for writing outputs. +##' +##' @description +##' Run the full BCM workflow for a selected flux type using the integrated +##' `envres` dataset and ensemble gap-fill predictions. +##' +##' @return +##' A list containing prediction outputs and evaluation summaries. +run_bcm_pipeline <- function(flux_type, + raw_response_var, + envres_file, + matched_file, + output_dir) { + flux_cfg <- get_flux_config(flux_type) + response_var <- paste0(raw_response_var, "_residual") + + data_obj <- read_close_points_and_envres( + matched_file = matched_file, + envres_file = envres_file + ) + close_points_df <- data_obj$close_points_df + envres <- data_obj$envres + + gfed <- fread(flux_cfg$gfed_file) + + envres <- filter_envres_for_flux(envres, flux_cfg) + envres <- add_gapfill_and_predictors( + envres = envres, + gfed = gfed, + close_points_df = close_points_df, + raw_response_var = raw_response_var, + flux_cfg = flux_cfg + ) + + split_obj <- split_bcm_data(envres, response_var) + tenvres <- split_obj$tenvres + pdat <- split_obj$pdat + rm(envres) + gc() + + model_obj <- get_bcm_model_components() + predictors_all <- model_obj$predictors_all + pred_formula <- model_obj$pred_formula + lc_param_list <- model_obj$lc_param_list + default_params <- model_obj$default_params + nrounds_fixed <- model_obj$nrounds_fixed + + tasks <- get_valid_lc_tasks(tenvres) + + cv_results <- run_bcm_cv( + tenvres = tenvres, + tasks = tasks, + predictors_all = predictors_all, + response_var = response_var, + nrounds_fixed = nrounds_fixed, + lc_param_list = lc_param_list, + default_params = default_params, + flux_cfg = flux_cfg + ) + + pdat <- predict_bcm( + tenvres = tenvres, + pdat = pdat, + response_var = response_var, + pred_formula = pred_formula, + predictors_all = predictors_all, + nrounds_fixed = nrounds_fixed, + lc_param_list = lc_param_list, + default_params = default_params, + flux_cfg = flux_cfg + ) + + cols_to_keep <- c( + "utc", "index", + "flux_resi_pred", "flux_corrected", + flux_cfg$pred_residual_col, + flux_cfg$corrected_col + ) + cols_to_keep <- intersect(cols_to_keep, names(pdat)) + pdat <- pdat[, ..cols_to_keep] + + metric_obj <- compute_bcm_metrics(cv_results) + + file_prefix <- paste0(output_dir, "/", response_var) + + write_bcm_outputs( + file_prefix = file_prefix, + flux_cfg = flux_cfg, + overfit_table = metric_obj$overfit_table, + pdat = pdat, + r2_by_site = metric_obj$r2_by_site, + r2_by_lc = metric_obj$r2_by_lc, + cv_results = cv_results, + r2_by_index = metric_obj$r2_by_index + ) + + return(list( + pdat = pdat, + cv_results = cv_results, + r2_by_site = metric_obj$r2_by_site, + r2_by_lc = metric_obj$r2_by_lc, + overfit_table = metric_obj$overfit_table, + r2_by_index = metric_obj$r2_by_index + )) +} \ No newline at end of file diff --git a/modules/assim.sequential/inst/data_prep.R b/modules/assim.sequential/inst/data_prep.R new file mode 100644 index 0000000000..0f8b362cd7 --- /dev/null +++ b/modules/assim.sequential/inst/data_prep.R @@ -0,0 +1,633 @@ +## Load packages +library(ncdf4) +library(data.table) +library(dplyr) +library(future.apply) +library(tidyr) +library(lubridate) +library(purrr) +library(kgc) +library(terra) +library(lutz) + +##' @title read_annual_csv_series +##' @name read_annual_csv_series +##' @author Yang Gu +##' +##' @param years Integer vector of years. +##' @param path_template A character string template containing one `%d` +##' placeholder for year, e.g. "/path/to/file_%d.csv". +##' +##' @description +##' Read a series of annual CSV files and return them as a list of data.tables. +##' +##' @return +##' A named list of data.tables, one per year. +read_annual_csv_series <- function(years, path_template) { + out <- lapply(years, function(y) { + fread(sprintf(path_template, y)) + }) + names(out) <- as.character(years) + return(out) +} + +##' @title bind_annual_tables +##' @name bind_annual_tables +##' @author Yang Gu +##' +##' @param dt_list A list of data.tables. +##' @param drop_year Logical; if TRUE and a `year` column exists, remove it. +##' +##' @description +##' Standardize and row-bind annual tables into one combined data.table. +##' +##' @return +##' A single combined data.table. +bind_annual_tables <- function(dt_list, drop_year = TRUE) { + cleaned <- lapply(dt_list, function(df) { + df <- as.data.table(df) + if (drop_year && "year" %in% names(df)) { + df[, year := NULL] + } + return(df) + }) + out <- rbindlist(cleaned, use.names = TRUE, fill = TRUE) + return(out) +} + +##' @title extract_kgc_for_points +##' @name extract_kgc_for_points +##' @author Yang Gu +##' +##' @param coords_dt A data.table containing columns `lon`, `lat`, and `Y`. +##' @param kg_file Path to Köppen-Geiger raster. +##' @param kg_lookup Character vector mapping raster values to KGC classes. +##' +##' @description +##' Extract Köppen-Geiger climate classes for each coordinate point. +##' +##' @return +##' A data.table with columns `index`, `lon`, `lat`, and `KGC`. +extract_kgc_for_points <- function(coords_dt, kg_file, kg_lookup) { + kg_raster <- rast(kg_file) + coords_vect <- vect(coords_dt, geom = c("lon", "lat"), crs = "EPSG:4326") + ex <- terra::extract(kg_raster, coords_vect, ID = FALSE) + + vals <- as.integer(ex[[1]]) + valid <- !is.na(vals) & vals >= 1 & vals <= length(kg_lookup) + + KGC <- rep(NA_character_, length(vals)) + KGC[valid] <- kg_lookup[vals[valid]] + + points_meta <- data.table( + index = coords_dt$Y, + lon = coords_dt$lon, + lat = coords_dt$lat, + KGC = KGC + ) + points_meta[, KGC := factor(KGC, levels = kg_lookup)] + return(points_meta) +} + +##' @title add_point_metadata_to_era +##' @name add_point_metadata_to_era +##' @author Yang Gu +##' +##' @param era_dt ERA5 data.table with `index`. +##' @param coords_file Path to coordinate CSV file. +##' @param kg_file Path to Köppen-Geiger raster. +##' +##' @description +##' Add point-level metadata, including lon/lat and KGC, to ERA5 records. +##' +##' @return +##' A list with: +##' \itemize{ +##' \item `era_all`: ERA data.table merged with metadata +##' \item `points_meta`: point metadata table +##' } +add_point_metadata_to_era <- function(era_dt, coords_file, kg_file) { + test_coords <- fread(coords_file)[, .(lon, lat, Y)] + + kg_lookup <- c( + "Af","Am","Aw","BWh","BWk","BSh","BSk","Csa","Csb","Csc", + "Cwa","Cwb","Cwc","Cfa","Cfb","Cfc","Dsa","Dsb","Dsc", + "Dsd","Dwa","Dwb","Dwc","Dwd","Dfa","Dfb","Dfc","Dfd","ET","EF" + ) + + points_meta <- extract_kgc_for_points( + coords_dt = test_coords, + kg_file = kg_file, + kg_lookup = kg_lookup + ) + + era_dt <- merge( + era_dt, + points_meta, + by = "index", + all.x = TRUE, + allow.cartesian = TRUE + ) + + return(list( + era_all = era_dt, + points_meta = points_meta + )) +} + +##' @title expand_tiff_daily +##' @name expand_tiff_daily +##' @author Yang Gu +##' +##' @param tiff_file Path to TIFF-derived annual covariate CSV. +##' +##' @description +##' Expand annual/static TIFF-based covariates to daily resolution according +##' to the study year definition. +##' +##' @return +##' A daily-resolution data.table keyed by `index` and `Date`. +expand_tiff_daily <- function(tiff_file) { + tiff <- fread(tiff_file) + tiff <- as.data.table(tiff) + + tiff[, `:=`( + start_date = fifelse( + Year == 2012, + as.Date("2012-01-01"), + as.Date(paste0(Year - 1, "-07-16")) + ), + end_date = fifelse( + Year == 2012, + as.Date("2012-07-15"), + as.Date(paste0(Year, "-07-15")) + ) + )] + + tiff_daily <- tiff[, .( + Date = seq(start_date[1], end_date[1], by = "day") + ), by = .( + Year, index, cell, cell_lon, cell_lat, diff_lon, diff_lat, distance, + LC, twi, PH, Sand, agb, SOC, N, year_since_disturb + )] + + return(tiff_daily) +} + +##' @title read_soil_moisture_data +##' @name read_soil_moisture_data +##' @author Yang Gu +##' +##' @param soilm_file Path to soil moisture CSV. +##' +##' @description +##' Read and standardize soil moisture data. +##' +##' @return +##' A data.table with standardized columns `index` and `utc`. +read_soil_moisture_data <- function(soilm_file) { + soilm <- fread(soilm_file) + setnames(soilm, c("Y", "datetime"), c("index", "utc")) + return(soilm) +} + +##' @title add_daily_tiff_covariates +##' @name add_daily_tiff_covariates +##' @author Yang Gu +##' +##' @param envres Environmental data.table with `index` and `utc`. +##' @param tiff_daily Daily TIFF covariate data.table. +##' +##' @description +##' Merge daily TIFF covariates into the environmental table. +##' +##' @return +##' A merged data.table. +add_daily_tiff_covariates <- function(envres, tiff_daily) { + envres[, Date := as.Date(utc)] + setDT(envres) + setDT(tiff_daily) + setkey(envres, index, Date) + setkey(tiff_daily, index, Date) + + out <- merge( + envres, + tiff_daily, + by = c("index", "Date"), + all.x = TRUE, + sort = FALSE + ) + return(out) +} + +##' @title add_meteorological_derived_variables +##' @name add_meteorological_derived_variables +##' @author Yang Gu +##' +##' @param dt A data.table containing ERA5 meteorological variables. +##' +##' @description +##' Add derived meteorological variables such as PPFD, wind speed, +##' temperature in Celsius, relative humidity, VPD, and day/night indicator. +##' +##' @return +##' A data.table with derived variables appended. +add_meteorological_derived_variables <- function(dt) { + dt[, `:=`( + PPFD = 0.45 * ssrd / 4.57 / (3 * 3600), + WindSpeed = sqrt(u10^2 + v10^2), + t_air_C = t2m - 273.15, + d_air_C = d2m - 273.15 + )] + + dt[, `:=`( + RH = 100 * exp((17.625 * d_air_C) / (243.04 + d_air_C) - + (17.625 * t_air_C) / (243.04 + t_air_C)), + e_s = 0.6108 * exp((17.27 * t_air_C) / (t_air_C + 237.3)), + e_a = 0.6108 * exp((17.27 * d_air_C) / (d_air_C + 237.3)), + is_day = as.integer(ssrd > 10) + )] + + dt[, VPD := pmax(e_s - e_a, 0)] + dt[, c("t_air_C", "d_air_C", "e_s", "e_a") := NULL] + + return(dt) +} + +##' @title add_local_time_variables +##' @name add_local_time_variables +##' @author Yang Gu +##' +##' @param dt A data.table containing `index`, `lat.x`, `lon.x`, and `utc`. +##' +##' @description +##' Add local timezone, local datetime, and seasonal harmonic variables. +##' +##' @return +##' A data.table with local time variables appended. +add_local_time_variables <- function(dt) { + site_coords <- unique(dt[, .(index, lat.x, lon.x)]) + site_coords[, tz := lutz::tz_lookup_coords(lat.x, lon.x, method = "accurate")] + + dt[site_coords, tz := i.tz, on = "index"] + dt[, tlocal := with_tz(utc, tz = unique(tz)), by = tz] + + dt[, year_local := year(tlocal)] + dt[, days_in_year := ifelse(leap_year(year_local), 366, 365)] + dt[, doy := yday(tlocal) + + hour(tlocal) / 24 + + minute(tlocal) / 1440 + + second(tlocal) / 86400] + + dt[, `:=`( + sin_doy = sin(2 * pi * doy / days_in_year), + cos_doy = cos(2 * pi * doy / days_in_year) + )] + + cols_to_drop <- intersect( + c("cell", "cell_lon", "cell_lat", "diff_lon", "diff_lat", + "distance", "tz", "lat.y", "lon.y", "lon_index", + "lat_index", "year_local", "days_in_year"), + names(dt) + ) + if (length(cols_to_drop) > 0) { + dt[, (cols_to_drop) := NULL] + } + + return(dt) +} + +##' @title read_and_fill_modis_daily +##' @name read_and_fill_modis_daily +##' @author Yang Gu +##' +##' @param modis_files Character vector of MODIS CSV file paths. +##' @param start_date Start date. +##' @param end_date End date. +##' +##' @description +##' Read MODIS NDVI/EVI data and fill to daily resolution using nearest +##' available observations. +##' +##' @return +##' A data.table with columns `index`, `Date`, `EVI`, and `NDVI`. +read_and_fill_modis_daily <- function(modis_files, + start_date = as.Date("2012-01-01"), + end_date = as.Date("2024-12-31")) { + dat <- rbindlist(lapply(modis_files, fread), use.names = TRUE, fill = TRUE) + + setnames( + dat, + old = c("ID", "Latitude", "Longitude", + "MYD13Q1_061__250m_16_days_EVI", + "MYD13Q1_061__250m_16_days_NDVI"), + new = c("index", "lat", "lon", "EVI", "NDVI") + ) + + dat <- dat[, .(index, Date, EVI, NDVI)] + + unique_idx <- unique(dat$index) + all_days <- data.table( + index = rep(unique_idx, each = as.integer(end_date - start_date + 1)), + Date = rep(seq(start_date, end_date, by = "day"), times = length(unique_idx)), + lat = NA_real_, + lon = NA_real_, + EVI = NA_real_, + NDVI = NA_real_ + ) + + setkey(dat, index, Date) + setkey(all_days, index, Date) + + all_days[, Date_cal := Date] + filled <- dat[all_days, on = .(index, Date), roll = "nearest"] + final_df <- filled[, .(index, Date = Date_cal, EVI, NDVI)] + + return(final_df) +} + +##' @title add_modis_covariates +##' @name add_modis_covariates +##' @author Yang Gu +##' +##' @param envres Environmental result table. +##' @param modis_dt Daily MODIS table. +##' +##' @description +##' Merge daily MODIS EVI/NDVI data into the environmental result table. +##' +##' @return +##' A merged data.table. +add_modis_covariates <- function(envres, modis_dt) { + out <- merge( + envres, + modis_dt, + by = c("index", "Date"), + all.x = TRUE + ) + return(out) +} + +##' @title read_sipnet_flux_outputs +##' @name read_sipnet_flux_outputs +##' @author Yang Gu +##' +##' @param years Integer vector of years. +##' @param path_template File path template for SIPNET annual CSV files. +##' +##' @description +##' Read and combine annual SIPNET flux output files. +##' +##' @return +##' A data.table with columns `utc`, `index`, `NEE_mean`, `Qle_mean`, `GPP_mean`. +read_sipnet_flux_outputs <- function(years, path_template) { + sipnet_list <- read_annual_csv_series(years, path_template) + sipnet_all <- bind_annual_tables(sipnet_list, drop_year = TRUE) + + setnames(sipnet_all, "time", "utc") + output.df <- sipnet_all[, .(utc, index, NEE_mean, Qle_mean, GPP_mean)] + + return(output.df) +} + +##' @title add_sipnet_outputs +##' @name add_sipnet_outputs +##' @author Yang Gu +##' +##' @param envres Environmental result table. +##' @param sipnet_dt SIPNET output table. +##' +##' @description +##' Merge SIPNET outputs into the environmental result table and convert +##' NEE units to observation-compatible units. +##' +##' @return +##' A merged data.table. +add_sipnet_outputs <- function(envres, sipnet_dt) { + out <- merge( + envres, + sipnet_dt[, .(utc, index, NEE_mean, Qle_mean, GPP_mean)], + by = c("utc", "index"), + all.x = TRUE, + sort = FALSE + ) + + out[, NEE_mean := NEE_mean * 1e8 / 1.157407] + return(out) +} + +##' @title read_and_match_ec_data +##' @name read_and_match_ec_data +##' @author Yang Gu +##' +##' @param ec_file Path to eddy covariance data CSV. +##' @param matched_file Path to matched point-site CSV. +##' +##' @description +##' Read eddy covariance data, fill missing flux variables using fallback +##' columns, and match site IDs to spatial indices. +##' +##' @return +##' A data.table with columns matched to `utc` and `index`. +read_and_match_ec_data <- function(ec_file, matched_file) { + resimet.df <- fread(ec_file) + close_points_df <- fread(matched_file) + + setDT(close_points_df) + close_points_df <- close_points_df[order(min_dist_m), .SD[1], by = index] + + resimet.df[is.na(NEE_CUT_USTAR50) & !is.na(NEE_VUT_USTAR50), + NEE_CUT_USTAR50 := NEE_VUT_USTAR50] + resimet.df[is.na(GPP_NT_CUT_USTAR50) & !is.na(GPP_NT_VUT_USTAR50), + GPP_NT_CUT_USTAR50 := GPP_NT_VUT_USTAR50] + + resimet.df <- resimet.df[, .( + utc, NEE_CUT_USTAR50, LE_F_MDS, GPP_NT_CUT_USTAR50, Site_ID + )] + + site_index_map <- close_points_df[, .(Site_ID, index)] + + resimet.df <- merge( + resimet.df, + site_index_map, + by = "Site_ID", + all.x = TRUE + ) + + resimet.df <- resimet.df[, .SD[1], by = .(utc, index)] + return(resimet.df) +} + +##' @title add_ec_observations_and_residuals +##' @name add_ec_observations_and_residuals +##' @author Yang Gu +##' +##' @param envres Environmental result table. +##' @param ec_dt Eddy covariance table. +##' +##' @description +##' Merge EC observations into the environmental result table and compute +##' residuals between SIPNET outputs and EC observations. +##' +##' @return +##' A data.table with residual columns added. +add_ec_observations_and_residuals <- function(envres, ec_dt) { + out <- merge( + envres, + ec_dt, + by = c("utc", "index"), + all.x = TRUE, + sort = FALSE + ) + + out[, `:=`( + NEE_residual = NEE_mean - NEE_CUT_USTAR50, + LE_residual = Qle_mean - LE_F_MDS, + GPP_residual = GPP_mean - GPP_NT_CUT_USTAR50 + )] + + out <- out[, .SD[1], by = .(utc, index)] + return(out) +} + +##' @title finalize_envres_columns +##' @name finalize_envres_columns +##' @author Yang Gu +##' +##' @param dt Final environmental result table. +##' +##' @description +##' Clean up duplicated coordinate columns and standardize final names. +##' +##' @return +##' A cleaned data.table ready for saving. +finalize_envres_columns <- function(dt) { + drop_cols <- intersect(c("lat.y", "lon.y"), names(dt)) + if (length(drop_cols) > 0) { + dt[, (drop_cols) := NULL] + } + + rename_old <- intersect(c("lat.x", "lon.x"), names(dt)) + rename_new <- c("lat", "lon")[c("lat.x", "lon.x") %in% rename_old] + + if (length(rename_old) > 0) { + setnames(dt, rename_old, rename_new) + } + + return(dt) +} + +##' @title build_envres_dataset +##' @name build_envres_dataset +##' @author Yang Gu +##' +##' @param years Integer vector of years to process. +##' @param era_template File path template for ERA annual CSV files. +##' @param coords_file Path to coordinate CSV. +##' @param kg_file Path to Köppen-Geiger raster. +##' @param tiff_file Path to TIFF annual covariate CSV. +##' @param soilm_file Path to soil moisture CSV. +##' @param modis_files Character vector of MODIS CSV files. +##' @param sipnet_template File path template for SIPNET annual CSV files. +##' @param ec_file Path to eddy covariance CSV. +##' @param matched_file Path to matched spatial index CSV. +##' +##' @description +##' Build the full environmental modeling dataset by combining ERA5, +##' soil moisture, TIFF-derived covariates, MODIS, SIPNET outputs, and +##' eddy covariance observations. +##' +##' @return +##' A final integrated data.table. +build_envres_dataset <- function( + years, + era_template, + coords_file, + kg_file, + tiff_file, + soilm_file, + modis_files, + sipnet_template, + ec_file, + matched_file +) { + ## ERA5 + era_list <- read_annual_csv_series(years, era_template) + era_all <- bind_annual_tables(era_list, drop_year = TRUE) + rm(era_list) + gc() + + era_meta <- add_point_metadata_to_era( + era_dt = era_all, + coords_file = coords_file, + kg_file = kg_file + ) + era_all <- era_meta$era_all + points_meta <- era_meta$points_meta + gc() + + ## TIFF + tiff_daily <- expand_tiff_daily(tiff_file) + gc() + + ## Soil moisture + soilm <- read_soil_moisture_data(soilm_file) + + ## Merge ERA + soil moisture + envres <- merge(era_all, soilm, by = c("index", "utc"), all.x = TRUE) + rm(era_all, soilm) + gc() + + ## Add daily TIFF covariates + envres <- add_daily_tiff_covariates(envres, tiff_daily) + rm(tiff_daily) + gc() + + ## Derived meteorological variables + envres <- add_meteorological_derived_variables(envres) + gc() + + ## Local time and seasonal harmonics + envres <- add_local_time_variables(envres) + gc() + + ## MODIS + modis_dt <- read_and_fill_modis_daily(modis_files) + envres <- add_modis_covariates(envres, modis_dt) + rm(modis_dt) + gc() + + ## SIPNET + sipnet_dt <- read_sipnet_flux_outputs(years, sipnet_template) + envres <- add_sipnet_outputs(envres, sipnet_dt) + rm(sipnet_dt) + gc() + + ## Eddy covariance + ec_dt <- read_and_match_ec_data(ec_file, matched_file) + envres <- add_ec_observations_and_residuals(envres, ec_dt) + rm(ec_dt, points_meta) + gc() + + ## Final cleanup + envres <- finalize_envres_columns(envres) + gc() + + return(envres) +} + +##' @title save_envres_dataset +##' @name save_envres_dataset +##' @author Yang Gu +##' +##' @param envres Final environmental result data.table. +##' @param output_file Output `.RData` file path. +##' +##' @description +##' Save the final dataset to an `.RData` file. +##' +##' @return +##' No return value. Saves `envres` to disk. +save_envres_dataset <- function(envres, output_file) { + save(envres, file = output_file) +} \ No newline at end of file From a8b9dc94fc091688998948664050254b81cc1870 Mon Sep 17 00:00:00 2001 From: ygu Date: Thu, 16 Apr 2026 22:17:31 -0400 Subject: [PATCH 09/13] Update modules/data.land/R/SIPNET_flux_gapfill.R Co-authored-by: Michael Dietze --- modules/data.land/R/SIPNET_flux_gapfill.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/data.land/R/SIPNET_flux_gapfill.R b/modules/data.land/R/SIPNET_flux_gapfill.R index 2543658981..1960134311 100644 --- a/modules/data.land/R/SIPNET_flux_gapfill.R +++ b/modules/data.land/R/SIPNET_flux_gapfill.R @@ -313,7 +313,7 @@ fit_site_xgb_cv <- function(subdf, #' @author Yang Gu #' @importFrom foreach %dopar% #' @export -xgb_gapfill <- function(site.dir, +xgb_flux_gapfill <- function(site.dir, pred.var.dir, flux_var, outdir, From 706252a86860e457e215eeaf4a8d85a4ec263492 Mon Sep 17 00:00:00 2001 From: yanggu Date: Tue, 28 Apr 2026 16:49:17 -0400 Subject: [PATCH 10/13] upload the revised version to repo --- local/sda_runner.R | 154 ++++++++++++++ models/sipnet/R/read_restart.SIPNET.R | 198 ++++++++++-------- .../assim.sequential/R/Analysis_sda_block.R | 32 ++- .../assim.sequential/R/sda.enkf_parallel.R | 34 ++- modules/data.land/NAMESPACE | 2 +- 5 files changed, 319 insertions(+), 101 deletions(-) create mode 100644 local/sda_runner.R diff --git a/local/sda_runner.R b/local/sda_runner.R new file mode 100644 index 0000000000..5ff798610d --- /dev/null +++ b/local/sda_runner.R @@ -0,0 +1,154 @@ +# loading libraries +library(dplyr) +library(xts) +library(PEcAn.all) +library(purrr) +library(furrr) +library(lubridate) +library(nimble) +library(ncdf4) +library(PEcAnAssimSequential) +library(dplyr) +library(sp) +library(raster) +library(zoo) +library(ggplot2) +library(mnormt) +library(sjmisc) +library(stringr) +library(doParallel) +library(doSNOW) +library(data.table) +library(Kendall) +library(lgarch) +library(parallel) +library(foreach) +library(terra) +setwd("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/") +## read settings xml file. +load("/projectnb/dietzelab/guYANG/pecan/runners/test10/pecan_flux.RData") +# Change dir name and settings +settings$outdir <- "/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/output_inter_q_2/" +settings$rundir <- file.path(settings$outdir, "run") +settings$modeloutdir <- file.path(settings$outdir, "out") +settings$host$rundir <- file.path(settings$outdir, "run") +settings$host$outdir <- file.path(settings$outdir, "out") +settings$host$folder <- file.path(settings$outdir, "out") +settings$ensemble$size <- 5 +settings$state.data.assimilation$adjustment <- "TRUE" +settings$host$prerun <- "module load R/4.4.0" +###### Change Q type +settings$state.data.assimilation$q.type <- "wishart" +## Fix the multi output in one timestep bug +settings$model$jobtemplate <- "/projectnb/dietzelab/guYANG/pecan/runners/test7/sipnet_template.job" + +# Load the selected sites +load("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/sda_idx.Rdata") +all_ids <- vapply(settings, \(s) as.character(s$run$site$id), "") +# Sub settings for this run +settings <- settings[all_ids %in% keep_ids] +settings <- PEcAn.settings::as.MultiSettings(settings) + +# setup the batch job settings. +general.job <- list(cores = 28, folder.num = 80) +batch.settings = structure(list( + general.job = general.job, + qsub.cmd = "qsub -l h_rt=24:00:00 -l mem_per_core=4G -l buyin -pe omp @CORES@ -V -N @NAME@ -o @STDOUT@ -e @STDERR@ -S /bin/bash" +)) +settings$state.data.assimilation$batch.settings <- batch.settings + +# update settings with the actual PFTs. +settings <- PEcAn.settings::prepare.settings(settings) + +# load 6 obs +load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.mean.RData") +load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.cov.RData") + +# load 4 obs +# load("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/observation/Rdata_with_attributes/obs.mean.LandTrendr_GEDI.Rdata") +# load("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/observation/Rdata_with_attributes/obs.cov.LandTrendr_GEDI.Rdata") + +sub_obs <- function(L, keep) setNames(lapply(L, \(l) l[names(l) %in% keep]), names(L)) +obs.mean <- sub_obs(obs.mean, keep_ids) +obs.cov <- sub_obs(obs.cov, keep_ids) + +# replace zero observations and variances with small numbers. +for (i in 1:length(obs.mean)) { + if(is.null(obs.mean[[i]][[1]])){ + next + } + for (j in 1:length(obs.mean[[i]])) { + if (length(obs.mean[[i]][[j]])==0) { + next + } + obs.mean[[i]][[j]][which(obs.mean[[i]][[j]]==0)] <- 0.01 + if(length(obs.cov[[i]][[j]]) > 1){ + diag(obs.cov[[i]][[j]])[which(diag(obs.cov[[i]][[j]]<=0.1))] <- 0.1 + }else{ + if(obs.cov[[i]][[j]] <= 0.1){ + obs.cov[[i]][[j]] <- 0.1 + } + } + } +} + +if (length(obs.cov[[i]][[j]]) > 1) { + d <- diag(obs.cov[[i]][[j]]) + d[d <= 0.1] <- 0.1 + diag(obs.cov[[i]][[j]]) <- d +} + +# load PFT parameter file. +samples_src <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/samples.Rdata" +samples_dst <- file.path(settings$outdir, "samples.Rdata") +dir.create(settings$outdir, recursive = TRUE, showWarnings = FALSE) +if (!file.exists(samples_dst)) file.copy(samples_src, samples_dst, overwrite = TRUE) + +control <- list( + TimeseriesPlot = FALSE, + OutlierDetection = FALSE, + send_email = NULL, + keepNC = TRUE, + forceRun = TRUE, + run_parallel = FALSE, + MCMC.args = NULL, + merge_nc = FALSE, + execution = "qsub_parallel" # or "qsub" / "local" +) + +source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/sda.enkf_local.R") +source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/read.restart.SIPNET.R") +source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/analysis_sda_block.R") +source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/MCMC_block_function.R") +# source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/sda.enkf_param.R") + +###### Be careful: param +res <- PEcAnAssimSequential:::sda.enkf_local( + settings = settings, + obs.mean = obs.mean, + obs.cov = obs.cov, + Q = NULL, + pre_enkf_params = NULL, + ensemble.samples = NULL, + control = control +) + +# job_lines <- c( +# "#!/bin/bash", +# "module load R/4.4.0", +# "Rscript /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/pecan_sda_runner.R" +# ) +# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.sh") + +# qsub -l h_rt=2:00:00 \ +# -l buyin \ +# -l mem_per_core=8G \ +# -pe omp 28 \ +# -V \ +# -N pecan_sda_runner \ +# -o /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.out \ +# -e /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.err \ +# -M yanggu@bu.edu \ +# -m abe \ +# -S /bin/bash \ +# /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.sh diff --git a/models/sipnet/R/read_restart.SIPNET.R b/models/sipnet/R/read_restart.SIPNET.R index 4af0aeedbe..ad835fff0f 100755 --- a/models/sipnet/R/read_restart.SIPNET.R +++ b/models/sipnet/R/read_restart.SIPNET.R @@ -8,140 +8,172 @@ ##' ##' @return X.vec vector of forecasts ##' @export -read_restart.SIPNET <- function(outdir, runid, stop.time, settings, var.names, params) { - - prior.sla <- params[[which(!names(params) %in% c("soil", "soil_SDA", "restart"))[1]]]$SLA - +read_restart.SIPNET <- function (outdir, runid, stop.time, settings, var.names, params) +{ + prior.sla <- params[[which(!names(params) %in% c("soil", + "soil_SDA", "restart"))[1]]]$SLA forecast <- list() - params$restart <-c() #state.vars not in var.names will be added here - #SIPNET inital states refer to models/sipnet/inst/template.param - state.vars <- c("SWE", "SoilMoist", "SoilMoistFrac", "AbvGrndWood", "TotSoilCarb", "LAI", - "litter_carbon_content", "fine_root_carbon_content", + params$restart <- c() + state.vars <- c("SWE", "SoilMoist", "SoilMoistFrac", "AbvGrndWood", "NEE","Qle", + "TotSoilCarb", "LAI", "litter_carbon_content", "fine_root_carbon_content", "coarse_root_carbon_content", "litter_mass_content_of_water") - #when adding new state variables make sure the naming is consistent across read_restart, write_restart and write.configs - #pre-populate parsm$restart with NAs so state names can be added + params$restart <- rep(NA, length(setdiff(state.vars, var.names))) - #add states to params$restart NOT in var.names names(params$restart) <- setdiff(state.vars, var.names) - # Read ensemble output - ens <- PEcAn.utils::read.output(runid = runid, - outdir = file.path(outdir, runid), - start.year = lubridate::year(stop.time), - end.year = lubridate::year(stop.time), - variables = c(state.vars,"time_bounds")) - #calculate last - start.time <- as.Date(paste0(lubridate::year(stop.time),"-01-01")) - time_var <- ens$time_bounds[1,] - real_time <- as.POSIXct(time_var*3600*24, origin = start.time) - # last <- which(as.Date(real_time)==as.Date(stop.time))[1] - last <- which(as.Date(real_time)==as.Date(stop.time))[length(which(as.Date(real_time)==as.Date(stop.time)))] + ens <- PEcAn.utils::read.output(runid = runid, outdir = file.path(outdir, + runid), start.year = lubridate::year(stop.time), end.year = lubridate::year(stop.time), + variables = c(state.vars, "time_bounds")) + + # 4) 若关键变量缺失,直接给出可读的报错信息(避免在 ud_convert 里才崩) + must_have <- c("AbvGrndWood","fine_root_carbon_content","coarse_root_carbon_content") + missing <- must_have[ sapply(ens[must_have], is.null) ] + if (length(missing) > 0) { + stop("Missing variables in NetCDF for runid ", runid, ": ", + paste(missing, collapse=", "), + " | Available: ", paste(names(ens), collapse=", "), + call. = FALSE) + } + + start.time <- as.Date(paste0(lubridate::year(stop.time), + "-01-01")) + time_var <- ens$time_bounds[1, ] + real_time <- as.POSIXct(time_var * 3600 * 24, origin = start.time) + last <- which(as.Date(real_time) == as.Date(stop.time))[length(which(as.Date(real_time) == + as.Date(stop.time)))] + + # print("breakpoint3") + # print(list( + # last = last, + # has_AbvGrndWood = !is.null(ens$AbvGrndWood), + # len_AbvGrndWood = if (is.null(ens$AbvGrndWood)) NA_integer_ else length(ens$AbvGrndWood), + # has_fine = !is.null(ens$fine_root_carbon_content), + # len_fine = if (is.null(ens$fine_root_carbon_content)) NA_integer_ else length(ens$fine_root_carbon_content), + # has_coarse = !is.null(ens$coarse_root_carbon_content), + # len_coarse = if (is.null(ens$coarse_root_carbon_content)) NA_integer_ else length(ens$coarse_root_carbon_content) + # )) + + if (is.null(ens$AbvGrndWood)) { + stop("ens$AbvGrndWood is NULL. Available vars: ", paste(names(ens), collapse=", "), call. = FALSE) + } + if (is.na(last) || length(ens$AbvGrndWood) < last) { + stop("Index 'last' invalid for AbvGrndWood: last=", last, + " len=", length(ens$AbvGrndWood), call. = FALSE) + } - #### PEcAn Standard Outputs if ("AbvGrndWood" %in% var.names) { - forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(ens$AbvGrndWood[last], "kg/m^2", "Mg/ha") + # print("breakpoint3.1") + forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(ens$AbvGrndWood[last], + "kg/m^2", "Mg/ha") names(forecast[[length(forecast)]]) <- c("AbvGrndWood") - - wood_total_C <- ens$AbvGrndWood[last] + ens$fine_root_carbon_content[last] + ens$coarse_root_carbon_content[last] - if (wood_total_C<=0) wood_total_C <- 0.0001 # Making sure we are not making Nans in case there is no plant living there. - - params$restart["abvGrndWoodFrac"] <- ens$AbvGrndWood[last] / wood_total_C - params$restart["coarseRootFrac"] <- ens$coarse_root_carbon_content[last] / wood_total_C - params$restart["fineRootFrac"] <- ens$fine_root_carbon_content[last] / wood_total_C - }else{ - params$restart["AbvGrndWood"] <- PEcAn.utils::ud_convert(ens$AbvGrndWood[last], "kg/m^2", "g/m^2") - # calculate fractions, store in params, will use in write_restart - wood_total_C <- ens$AbvGrndWood[last] + ens$fine_root_carbon_content[last] + ens$coarse_root_carbon_content[last] - if (wood_total_C<=0) wood_total_C <- 0.0001 # Making sure we are not making Nans in case there is no plant living there. - - params$restart["abvGrndWoodFrac"] <- ens$AbvGrndWood[last] / wood_total_C - params$restart["coarseRootFrac"] <- ens$coarse_root_carbon_content[last] / wood_total_C - params$restart["fineRootFrac"] <- ens$fine_root_carbon_content[last] / wood_total_C + wood_total_C <- ens$AbvGrndWood[last] + ens$fine_root_carbon_content[last] + + ens$coarse_root_carbon_content[last] + if (wood_total_C <= 0) + wood_total_C <- 1e-04 + params$restart["abvGrndWoodFrac"] <- ens$AbvGrndWood[last]/wood_total_C + params$restart["coarseRootFrac"] <- ens$coarse_root_carbon_content[last]/wood_total_C + params$restart["fineRootFrac"] <- ens$fine_root_carbon_content[last]/wood_total_C + } + else { + params$restart["AbvGrndWood"] <- PEcAn.utils::ud_convert(ens$AbvGrndWood[last], + "kg/m^2", "g/m^2") + wood_total_C <- ens$AbvGrndWood[last] + ens$fine_root_carbon_content[last] + + ens$coarse_root_carbon_content[last] + if (wood_total_C <= 0) + wood_total_C <- 1e-04 + params$restart["abvGrndWoodFrac"] <- ens$AbvGrndWood[last]/wood_total_C + params$restart["coarseRootFrac"] <- ens$coarse_root_carbon_content[last]/wood_total_C + params$restart["fineRootFrac"] <- ens$fine_root_carbon_content[last]/wood_total_C } if ("GWBI" %in% var.names) { - forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(mean(ens$GWBI), "kg/m^2/s", "Mg/ha/yr") + forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(mean(ens$GWBI), + "kg/m^2/s", "Mg/ha/yr") names(forecast[[length(forecast)]]) <- c("GWBI") } - - # Reading in NET Ecosystem Exchange for SDA - unit is kg C m-2 s-1 and the average is estimated if ("NEE" %in% var.names) { - forecast[[length(forecast) + 1]] <- mean(ens$NEE) ## - names(forecast[[length(forecast)]]) <- c("NEE") + forecast[[length(forecast) + 1]] <- + nee_model_to_obs(mean(ens$NEE)) + names(forecast[[length(forecast)]]) <- "NEE" } - - # Reading in Latent heat flux for SDA - unit is MW m-2 if ("Qle" %in% var.names) { - forecast[[length(forecast) + 1]] <- ens$Qle[last]*1e-6 ## + forecast[[length(forecast) + 1]] <- mean(ens$Qle) names(forecast[[length(forecast)]]) <- c("Qle") } - if ("leaf_carbon_content" %in% var.names) { - forecast[[length(forecast) + 1]] <- ens$leaf_carbon_content[last] ## kgC/m2*m2/kg*2kg/kgC + forecast[[length(forecast) + 1]] <- ens$leaf_carbon_content[last] names(forecast[[length(forecast)]]) <- c("LeafC") } - if ("LAI" %in% var.names) { - forecast[[length(forecast) + 1]] <- ens$LAI[last] ## m2/m2 + forecast[[length(forecast) + 1]] <- ens$LAI[last] names(forecast[[length(forecast)]]) <- c("LAI") - }else{ + } + else { params$restart["LAI"] <- ens$LAI[last] } - - litter_carbon_content <- ens$litter_carbon_content[last] %||% NA_real_ ##kgC/m2 if ("litter_carbon_content" %in% var.names) { - forecast[[length(forecast) + 1]] <- litter_carbon_content + forecast[[length(forecast) + 1]] <- ens$litter_carbon_content[last] names(forecast[[length(forecast)]]) <- c("litter_carbon_content") - }else{ - params$restart["litter_carbon_content"] <- PEcAn.utils::ud_convert(litter_carbon_content, 'kg m-2', 'g m-2') } - - litter_mass_content_of_water <- ens$litter_mass_content_of_water[last] %||% NA_real_ ##kgC/m2 + else { + params$restart["litter_carbon_content"] <- PEcAn.utils::ud_convert(ens$litter_carbon_content[last], + "kg m-2", "g m-2") + } if ("litter_mass_content_of_water" %in% var.names) { - forecast[[length(forecast) + 1]] <- litter_mass_content_of_water + forecast[[length(forecast) + 1]] <- ens$litter_mass_content_of_water[last] names(forecast[[length(forecast)]]) <- c("litter_mass_content_of_water") - }else{ - params$restart["litter_mass_content_of_water"] <- litter_mass_content_of_water } - + else { + params$restart["litter_mass_content_of_water"] <- ens$litter_mass_content_of_water[last] + } if ("SoilMoist" %in% var.names) { forecast[[length(forecast) + 1]] <- ens$SoilMoist[last] names(forecast[[length(forecast)]]) <- c("SoilMoist") - }else{ + } + else { params$restart["SoilMoist"] <- ens$SoilMoist[last] } - if ("SoilMoistFrac" %in% var.names) { - forecast[[length(forecast) + 1]] <- ens$SoilMoistFrac[last]*100 ## here we multiply it by 100 to convert from proportion to percentage. + forecast[[length(forecast) + 1]] <- ens$SoilMoistFrac[last] * + 100 names(forecast[[length(forecast)]]) <- c("SoilMoistFrac") - }else{ + } + else { params$restart["SoilMoistFrac"] <- ens$SoilMoistFrac[last] } - - # This is snow if ("SWE" %in% var.names) { - forecast[[length(forecast) + 1]] <- ens$SWE[last] ## kgC/m2 + forecast[[length(forecast) + 1]] <- ens$SWE[last] names(forecast[[length(forecast)]]) <- c("SWE") - }else{ + } + else { params$restart["SWE"] <- ens$SWE[last]/10 } - if ("TotLivBiom" %in% var.names) { - forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(ens$TotLivBiom[last], "kg/m^2", "Mg/ha") + forecast[[length(forecast) + 1]] <- PEcAn.utils::ud_convert(ens$TotLivBiom[last], + "kg/m^2", "Mg/ha") names(forecast[[length(forecast)]]) <- c("TotLivBiom") } - if ("TotSoilCarb" %in% var.names) { forecast[[length(forecast) + 1]] <- ens$TotSoilCarb[last] names(forecast[[length(forecast)]]) <- c("TotSoilCarb") - }else{ - params$restart["TotSoilCarb"] <- PEcAn.utils::ud_convert(ens$TotSoilCarb[last], 'kg m-2', 'g m-2') # kgC/m2 -> gC/m2 } - - #remove any remaining NAs from params$restart + else { + params$restart["TotSoilCarb"] <- PEcAn.utils::ud_convert(ens$TotSoilCarb[last], + "kg m-2", "g m-2") + } params$restart <- stats::na.omit(params$restart) - + # print(runid) X_tmp <- list(X = unlist(forecast), params = params) - + ### Check + # print(X_tmp) return(X_tmp) -} # read_restart.SIPNET \ No newline at end of file + # print("end of write.config.SIPNET") +} + +nee_model_to_obs <- function(x) { + x * 1e8 / 1.157407 +} + +nee_obs_to_model <- function(x) { + x * 1.157407 / 1e8 +} diff --git a/modules/assim.sequential/R/Analysis_sda_block.R b/modules/assim.sequential/R/Analysis_sda_block.R index a032ae4be3..7fef2997e6 100644 --- a/modules/assim.sequential/R/Analysis_sda_block.R +++ b/modules/assim.sequential/R/Analysis_sda_block.R @@ -461,9 +461,23 @@ MCMC_block_function <- function(block) { control = list(propCov= block$data$pf, adaptScaleOnly = TRUE, latents = "X", pfOptimizeNparticles = TRUE)) #add toggle Y sampler. - for (i in 1:block$constant$YN) { - conf$addSampler(paste0("y.censored[", i, "]"), 'toggle', control=list(type='RW')) + # Revise 1: only those who needed is y.censored + na_idx <- which(is.na(block$data$y.censored)) + + if (length(na_idx) > 0) { + for (i in na_idx) { + conf$addSampler( + target = paste0("y.censored[", i, "]"), + type = "toggle", + control = list(type = "RW") + ) + } } + ## Delete this + # for (i in 1:block$constant$YN) { + # conf$addSampler(paste0("y.censored[", i, "]"), 'toggle', control=list(type='RW')) + # } + ## Finish Revise # conf$printSamplers() #compile MCMC Rmcmc <- nimble::buildMCMC(conf) @@ -471,12 +485,14 @@ MCMC_block_function <- function(block) { Cmcmc <- nimble::compileNimble(Rmcmc, project = model_pred, showCompilerOutput = FALSE) #if we don't have any NA in the Y. - if (!any(is.na(block$data$y.censored))) { - #add toggle Y sampler. - for(i in 1:block$constant$YN) { - valueInCompiledNimbleFunction(Cmcmc$samplerFunctions[[samplerNumberOffset+i]], 'toggle', 0) - } - } + ## Revise 2: Delete this + # if (!any(is.na(block$data$y.censored))) { + # #add toggle Y sampler. + # for(i in 1:block$constant$YN) { + # valueInCompiledNimbleFunction(Cmcmc$samplerFunctions[[samplerNumberOffset+i]], 'toggle', 0) + # } + # } + ## Finish Revise #run MCMC dat <- runMCMC(Cmcmc, niter = block$MCMC$niter, nburnin = block$MCMC$nburnin, thin = block$MCMC$nthin, nchains = block$MCMC$nchain) diff --git a/modules/assim.sequential/R/sda.enkf_parallel.R b/modules/assim.sequential/R/sda.enkf_parallel.R index 3d9b073893..e8633d5c2f 100644 --- a/modules/assim.sequential/R/sda.enkf_parallel.R +++ b/modules/assim.sequential/R/sda.enkf_parallel.R @@ -169,17 +169,33 @@ sda.enkf_local <- function(settings, for (i in 1:length(settings$run$inputs$met$path)) { #---------------- model specific split inputs ### model specific split inputs + ### Revise: delete those + # settings$run$inputs$met$path[[i]] <- do.call( + # my.split_inputs, + # args = list( + # settings = settings, + # start.time = lubridate::ymd_hms(settings$run$site$met.start, truncated = 3), # This depends if we are restart or not + # stop.time = lubridate::ymd_hms(settings$run$site$met.end, truncated = 3), + # inputs = settings$run$inputs$met$path[[i]], + # outpath = paste0(paste0(settings$outdir, "/Extracted_met/"), settings$run$site$id), + # overwrite =F + # ) + # ) + split_args <- list( + start.time = lubridate::ymd_hms(settings$run$site$met.start, truncated = 3), + stop.time = lubridate::ymd_hms(settings$run$site$met.end, truncated = 3), + inputs = settings$run$inputs$met$path[[i]], + outpath = file.path(settings$outdir, "Extracted_met", settings$run$site$id), + overwrite = FALSE + ) + if (settings$model$type != "SIPNET") { + split_args <- c(list(settings = settings), split_args) + } settings$run$inputs$met$path[[i]] <- do.call( my.split_inputs, - args = list( - settings = settings, - start.time = lubridate::ymd_hms(settings$run$site$met.start, truncated = 3), # This depends if we are restart or not - stop.time = lubridate::ymd_hms(settings$run$site$met.end, truncated = 3), - inputs = settings$run$inputs$met$path[[i]], - outpath = paste0(paste0(settings$outdir, "/Extracted_met/"), settings$run$site$id), - overwrite =F - ) + args = split_args ) + ### Finish Revise # changing the start and end date which will be used for model2netcdf.model settings$run$start.date <- lubridate::ymd_hms(settings$state.data.assimilation$start.date, truncated = 3) settings$run$end.date <- lubridate::ymd_hms(settings$state.data.assimilation$end.date, truncated = 3) @@ -211,7 +227,7 @@ sda.enkf_local <- function(settings, # find a site that has all registered inputs except for the parameter field. if (all(names.sampler %in% names.site.input)) { input_design <- PEcAn.uncertainty::generate_joint_ensemble_design(settings = settings[[i]], - ensemble_samples = ensemble.samples, + ensemble_size = nens)[[1]] break } diff --git a/modules/data.land/NAMESPACE b/modules/data.land/NAMESPACE index 14e120593c..ddfec52ad7 100644 --- a/modules/data.land/NAMESPACE +++ b/modules/data.land/NAMESPACE @@ -75,7 +75,7 @@ export(to_co2e) export(validate_events_json) export(write_ic) export(write_veg) -export(xgb_gapfill) +export(xgb_flux_gapfill) importFrom(dplyr,"%>%") importFrom(foreach,"%dopar%") importFrom(magrittr,"%>%") From 5367eb870fd9f7f61ee4ac05f63c06a325cde0d6 Mon Sep 17 00:00:00 2001 From: yanggu Date: Thu, 30 Apr 2026 14:17:04 -0400 Subject: [PATCH 11/13] multi chain MCMC config & fix NEE unit --- local/sda_runner.R | 56 ++++++++------- models/sipnet/R/model2netcdf.SIPNET.R | 66 +++++++++-------- models/sipnet/man/model2netcdf.SIPNET.Rd | 2 +- .../assim.sequential/R/Analysis_sda_block.R | 70 ++++++++++++++++++- .../assim.sequential/R/sda.enkf_parallel.R | 1 + 5 files changed, 131 insertions(+), 64 deletions(-) diff --git a/local/sda_runner.R b/local/sda_runner.R index 5ff798610d..cddc255cd8 100644 --- a/local/sda_runner.R +++ b/local/sda_runner.R @@ -24,18 +24,18 @@ library(lgarch) library(parallel) library(foreach) library(terra) -setwd("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/") +setwd("/projectnb/dietzelab/guYANG/pecan/") ## read settings xml file. -load("/projectnb/dietzelab/guYANG/pecan/runners/test10/pecan_flux.RData") +load("/projectnb/dietzelab/guYANG/pecan/pecan.Rdata") # Change dir name and settings -settings$outdir <- "/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/output_inter_q_2/" +settings$outdir <- "/projectnb/dietzelab/guYANG/pecan/output/adjust/" settings$rundir <- file.path(settings$outdir, "run") settings$modeloutdir <- file.path(settings$outdir, "out") settings$host$rundir <- file.path(settings$outdir, "run") settings$host$outdir <- file.path(settings$outdir, "out") settings$host$folder <- file.path(settings$outdir, "out") -settings$ensemble$size <- 5 -settings$state.data.assimilation$adjustment <- "TRUE" +settings$ensemble$size <- 25 +settings$state.data.assimilation$adjustment <- "FALSE" settings$host$prerun <- "module load R/4.4.0" ###### Change Q type settings$state.data.assimilation$q.type <- "wishart" @@ -64,10 +64,6 @@ settings <- PEcAn.settings::prepare.settings(settings) load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.mean.RData") load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.cov.RData") -# load 4 obs -# load("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/observation/Rdata_with_attributes/obs.mean.LandTrendr_GEDI.Rdata") -# load("/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/observation/Rdata_with_attributes/obs.cov.LandTrendr_GEDI.Rdata") - sub_obs <- function(L, keep) setNames(lapply(L, \(l) l[names(l) %in% keep]), names(L)) obs.mean <- sub_obs(obs.mean, keep_ids) obs.cov <- sub_obs(obs.cov, keep_ids) @@ -99,7 +95,7 @@ if (length(obs.cov[[i]][[j]]) > 1) { } # load PFT parameter file. -samples_src <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/samples.Rdata" +samples_src <- "/projectnb/dietzelab/guYANG/pecan/cali_samples.Rdata" samples_dst <- file.path(settings$outdir, "samples.Rdata") dir.create(settings$outdir, recursive = TRUE, showWarnings = FALSE) if (!file.exists(samples_dst)) file.copy(samples_src, samples_dst, overwrite = TRUE) @@ -108,7 +104,7 @@ control <- list( TimeseriesPlot = FALSE, OutlierDetection = FALSE, send_email = NULL, - keepNC = TRUE, + keepNC = FALSE, forceRun = TRUE, run_parallel = FALSE, MCMC.args = NULL, @@ -116,13 +112,6 @@ control <- list( execution = "qsub_parallel" # or "qsub" / "local" ) -source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/sda.enkf_local.R") -source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/read.restart.SIPNET.R") -source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/analysis_sda_block.R") -source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/MCMC_block_function.R") -# source("/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/sda.enkf_param.R") - -###### Be careful: param res <- PEcAnAssimSequential:::sda.enkf_local( settings = settings, obs.mean = obs.mean, @@ -130,25 +119,40 @@ res <- PEcAnAssimSequential:::sda.enkf_local( Q = NULL, pre_enkf_params = NULL, ensemble.samples = NULL, - control = control + control = list( + TimeseriesPlot = FALSE, + OutlierDetection = FALSE, + send_email = NULL, + keepNC = TRUE, + forceRun = TRUE, + run_parallel = FALSE, + MCMC.args = list( + niter = 200000, + nburnin = 100000, + nthin = 5, + nchain = 3 + ), + merge_nc = FALSE, + execution = "qsub_parallel" # or "qsub" / "local" + ) ) # job_lines <- c( # "#!/bin/bash", # "module load R/4.4.0", -# "Rscript /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/wishart_sda/pecan_sda_runner.R" +# "Rscript /projectnb/dietzelab/guYANG/pecan/pecan/local/sda_runner.R" # ) -# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.sh") +# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.sh") -# qsub -l h_rt=2:00:00 \ +# qsub -l h_rt=6:00:00 \ # -l buyin \ # -l mem_per_core=8G \ # -pe omp 28 \ # -V \ -# -N pecan_sda_runner \ -# -o /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.out \ -# -e /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.err \ +# -N sda_runner \ +# -o /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.out \ +# -e /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.err \ # -M yanggu@bu.edu \ # -m abe \ # -S /bin/bash \ -# /projectnb/dietzelab/guYANG/pecan/runners/wishart_sda/logs/pecan_sda_runner.sh +# /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.sh diff --git a/models/sipnet/R/model2netcdf.SIPNET.R b/models/sipnet/R/model2netcdf.SIPNET.R index 5d96f615fe..8264c8f9d7 100644 --- a/models/sipnet/R/model2netcdf.SIPNET.R +++ b/models/sipnet/R/model2netcdf.SIPNET.R @@ -58,7 +58,7 @@ mergeNC <- function( #' @export #' @author Shawn Serbin, Michael Dietze model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, delete.raw = FALSE, revision = NULL, prefix = "sipnet.out", - overwrite = FALSE, conflict = FALSE) { + overwrite = TRUE, conflict = FALSE) { ### Read in model output in SIPNET format sipnet_out_file <- file.path(outdir, prefix) # SIPNET v1 had a "Notes" comment line before the header; v2 removed it. @@ -119,8 +119,8 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, timestep.s <- 86400 / out_day - - + + ## Unit conversions # # CKB 20260407: Not using ud_convert here is intentional! @@ -134,7 +134,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, cm_step_to_mm_sec <- function(x) x * 10 / timestep.s sipnet_output <- sipnet_output |> dplyr::mutate( - + # C and N pools dplyr::across( .cols = c( @@ -145,7 +145,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, ), .fns = g_to_kg ), - + # C and N fluxes dplyr::across( .cols = c( @@ -154,7 +154,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, ), .fns = g_step_to_kg_sec ), - + # Water pools dplyr::across( .cols = c( @@ -163,7 +163,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, ), .fns = cm_to_mm ), - + # Water fluxes dplyr::across( .cols = dplyr::all_of("evapotranspiration"), @@ -172,12 +172,12 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, # Water flux special case: # Sipnet reports transpiration, and no other variables, in cm/day not cm/timestep. fluxestranspiration = cm_to_mm(.data$fluxestranspiration) / 86400, # cm/day -> mm/sec - + # Date and time datetime = sipnet2datetime(.data$year, .data$day, .data$time) ) - - + + # calculate LAI for standard output # LAI = plantLeafC / leafCSpWt # both operands are in carbon units (gC/m2 and gC/m2_leaf), @@ -187,22 +187,20 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, "sipnet.param"), stringsAsFactors = FALSE) leafCSpWt <- param[param[, 1] == "leafCSpWt", 2] SLA <- 1000 / leafCSpWt # m2 leaf / kg C - + ### Loop over years in SIPNET output to create separate netCDF outputs for (y in year_seq) { - #initialize the conflicted as FALSE conflicted <- FALSE - conflict <- TRUE #conflict is set to TRUE to enable the rename of yearly nc file for merging SDA results with sub-annual data - #if we have conflicts on this file. - if (file.exists(file.path(outdir, paste(y, "nc", sep = "."))) & overwrite == FALSE & conflict == FALSE) { - next - }else if(file.exists(file.path(outdir, paste(y, "nc", sep = "."))) & conflict){ - conflicted <- TRUE - file.rename(file.path(outdir, paste(y, "nc", sep = ".")), file.path(outdir, "previous.nc")) + nc_file <- file.path(outdir, paste(y, "nc", sep = ".")) + if (file.exists(nc_file)) { + ok <- file.remove(nc_file) + if (!ok) { + stop("Failed to remove existing NetCDF file: ", nc_file) + } } - print(paste("---- Processing year: ", y)) # turn on for debugging - + print(paste("---- Processing year: ", y)) # turn on for debugging + ## Subset data for processing sub.sipnet.output <- subset(sipnet_output, sipnet_output$year == y) @@ -238,10 +236,10 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, "coarse_root_carbon_content" = sub.sipnet.output$coarseRootC, "LAI" = sub.sipnet.output$plantLeafC * SLA, "TotLivBiom" = sub.sipnet.output$plantWoodC + sub.sipnet.output$plantLeafC + - sub.sipnet.output$coarseRootC + sub.sipnet.output$fineRootC, + sub.sipnet.output$coarseRootC + sub.sipnet.output$fineRootC, "TotSoilCarb" = sub.sipnet.output$soil + sub.sipnet.output$litter, "AGB" = sub.sipnet.output$plantWoodC + sub.sipnet.output$plantLeafC, - + # Water variables: # Liquid water units are cm in Sipnet; in PEcAn they're kg water m-2 # (which is equivalent to mm: (water density = 1000 kg m-3) * (1 m/ 1000 mm) = (1 kg m-2)/mm @@ -255,14 +253,14 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, "SoilMoistFrac" = sub.sipnet.output$soilWetnessFrac, "SWE" = sub.sipnet.output$snow # Snow Water Equivalent ) - + if ("litterWater" %in% names(sub.sipnet.output)) { # Removed in SIPNET v2; only extract if present output[["litter_mass_content_of_water"]] <- sub.sipnet.output$litterWater } if ("woodCreation" %in% names(sub.sipnet.output)) { # Added in SIPNET v2; only extract if present output[["GWBI"]] <- sub.sipnet.output$woodCreation } - + # columns only present in sipnet >= v2 with N and methane turned on if ("minN" %in% names(sub.sipnet.output)) { output[["mineral_N"]] <- sub.sipnet.output$minN @@ -288,7 +286,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, if ("ch4" %in% names(sub.sipnet.output)) { output[["CH4_flux"]] <- sub.sipnet.output$ch4 } - + output[["time_bounds"]] <- c(rbind(bounds[,1], bounds[,2])) # ******************** Declare netCDF variables ********************# @@ -344,7 +342,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, longname = "history time interval endpoints", dim=list(time_interval,time = t), prec = "double") ) - + if ("litter_mass_content_of_water" %in% names(output)) { nc_var[["litter_mass_content_of_water"]] <- PEcAn.utils::to_ncvar("litter_mass_content_of_water", dims) } @@ -354,30 +352,30 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, } if ("mineral_N" %in% names(output)) { nc_var[["mineral_N"]] <- ncdf4::ncvar_def("mineral_N", units = "kg N m-2", - dim = list(lon, lat, t), missval = -999, longname = "Soil mineral nitrogen") + dim = list(lon, lat, t), missval = -999, longname = "Soil mineral nitrogen") } if ("soil_organic_N" %in% names(output)) { nc_var[["soil_organic_N"]] <- ncdf4::ncvar_def("soil_organic_N", units = "kg N m-2", - dim = list(lon, lat, t), missval = -999, longname = "Soil organic nitrogen") + dim = list(lon, lat, t), missval = -999, longname = "Soil organic nitrogen") } if ("litter_N" %in% names(output)) { nc_var[["litter_N"]] <- ncdf4::ncvar_def("litter_N", units = "kg N m-2", - dim = list(lon, lat, t), missval = -999, longname = "Litter nitrogen") + dim = list(lon, lat, t), missval = -999, longname = "Litter nitrogen") } if ("N2O_flux" %in% names(output)) { nc_var[["N2O_flux"]] <- PEcAn.utils::to_ncvar("N2O_flux", dims) } if ("N_leaching" %in% names(output)) { nc_var[["N_leaching"]] <- ncdf4::ncvar_def("N_leaching", units = "kg N m-2 s-1", - dim = list(lon, lat, t), missval = -999, longname = "Nitrogen leaching flux") + dim = list(lon, lat, t), missval = -999, longname = "Nitrogen leaching flux") } if ("N_fixation" %in% names(output)) { nc_var[["N_fixation"]] <- ncdf4::ncvar_def("N_fixation", units = "kg N m-2 s-1", - dim = list(lon, lat, t), missval = -999, longname = "Nitrogen fixation flux") + dim = list(lon, lat, t), missval = -999, longname = "Nitrogen fixation flux") } if ("N_uptake" %in% names(output)) { nc_var[["N_uptake"]] <- ncdf4::ncvar_def("N_uptake", units = "kg N m-2 s-1", - dim = list(lon, lat, t), missval = -999, longname = "Plant nitrogen uptake flux") + dim = list(lon, lat, t), missval = -999, longname = "Plant nitrogen uptake flux") } if ("CH4_flux" %in% names(output)) { nc_var[["CH4_flux"]] <- PEcAn.utils::to_ncvar("CH4_flux", dims) @@ -449,4 +447,4 @@ sipnet2datetime <- function(year, doy, hour){ ) as.POSIXct(datetime) -} +} \ No newline at end of file diff --git a/models/sipnet/man/model2netcdf.SIPNET.Rd b/models/sipnet/man/model2netcdf.SIPNET.Rd index 5042dbc734..01d55577d4 100644 --- a/models/sipnet/man/model2netcdf.SIPNET.Rd +++ b/models/sipnet/man/model2netcdf.SIPNET.Rd @@ -13,7 +13,7 @@ model2netcdf.SIPNET( delete.raw = FALSE, revision = NULL, prefix = "sipnet.out", - overwrite = FALSE, + overwrite = TRUE, conflict = FALSE ) } diff --git a/modules/assim.sequential/R/Analysis_sda_block.R b/modules/assim.sequential/R/Analysis_sda_block.R index 7fef2997e6..2d9f2eb16e 100644 --- a/modules/assim.sequential/R/Analysis_sda_block.R +++ b/modules/assim.sequential/R/Analysis_sda_block.R @@ -84,6 +84,19 @@ analysis_sda_block <- function (settings, block.list.all, X, obs.mean, obs.cov, foreach::registerDoSEQ() PEcAn.logger::logger.info("Completed!") + #### Revise: Add MCMC block diag save + mcmc_diag <- purrr::map( + block.list.all[[t]], + "mcmc_diag" + ) + mcmc_diag <- mcmc_diag[!vapply(mcmc_diag, is.null, logical(1))] + if (length(mcmc_diag) > 0) { + mcmc_diag <- dplyr::bind_rows(mcmc_diag) + } else { + mcmc_diag <- NULL + } + #### Finish Revise + #convert from block lists to vector values. if ("try-error" %in% class(try(V <- block.2.vector(block.list.all[[t]], X, H, settings$state.data.assimilation$adjustment)))) { PEcAn.logger::logger.severe("Something wrong within the block.2.vector function.") @@ -98,6 +111,7 @@ analysis_sda_block <- function (settings, block.list.all, X, obs.mean, obs.cov, Pa = V$Pa, Y = Y, R = R, + mcmc_diag = mcmc_diag, analysis = V$analysis)) } @@ -494,10 +508,60 @@ MCMC_block_function <- function(block) { # } ## Finish Revise - #run MCMC - dat <- runMCMC(Cmcmc, niter = block$MCMC$niter, nburnin = block$MCMC$nburnin, thin = block$MCMC$nthin, nchains = block$MCMC$nchain) - #update aq, bq, mua, and pa + ## Revise: + # Delete the original MCMC at fits nchain = 1 + # #run MCMC + # dat <- runMCMC(Cmcmc, niter = block$MCMC$niter, nburnin = block$MCMC$nburnin, thin = block$MCMC$nthin, nchains = block$MCMC$nchain) + mcmc_diag <- NULL + + if (block$MCMC$nchain == 1) { + dat <- runMCMC( + Cmcmc, + niter = block$MCMC$niter, + nburnin = block$MCMC$nburnin, + thin = block$MCMC$nthin, + nchains = block$MCMC$nchain + ) + } else { + ## multi-chain code + dat_raw <- runMCMC( + Cmcmc, + niter = block$MCMC$niter, + nburnin = block$MCMC$nburnin, + thin = block$MCMC$nthin, + nchains = block$MCMC$nchain, + samplesAsCodaMCMC = TRUE, + summary = FALSE + ) + + rhat <- coda::gelman.diag( + dat_raw, + autoburnin = FALSE, + multivariate = FALSE + )$psrf[, "Point est."] + + ess <- coda::effectiveSize(dat_raw) + + mcmc_diag <- data.frame( + parameter = names(rhat), + rhat = as.numeric(rhat), + ess = as.numeric(ess[names(rhat)]) + ) + + ## combine chains + dat <- as.matrix(dat_raw) + } + + str(dat_raw); dim(dat); head(colnames(dat)) + + ## save diag into block + block$mcmc_diag <- mcmc_diag + + #update aq, bq, mua, and pa M <- colMeans(dat) + + + block$update$aq <- block$Inits$q if (block$constant$q.type == 3) { #if it's a vector q case diff --git a/modules/assim.sequential/R/sda.enkf_parallel.R b/modules/assim.sequential/R/sda.enkf_parallel.R index e8633d5c2f..6db15b12c1 100644 --- a/modules/assim.sequential/R/sda.enkf_parallel.R +++ b/modules/assim.sequential/R/sda.enkf_parallel.R @@ -479,6 +479,7 @@ sda.enkf_local <- function(settings, analysis = ANALYSIS[[obs.t]], enkf.params = enkf.params[[obs.t]], ens_weights[[obs.t]], + mcmc_diag = enkf.params[[obs.t]]$mcmc_diag, params.list = params.list, restart.list = restart.list, debias.out = debias.out) From 14b4ff4d8db63bc4dc72192add6b33a97caf5f7b Mon Sep 17 00:00:00 2001 From: yanggu Date: Tue, 12 May 2026 14:13:53 -0400 Subject: [PATCH 12/13] update bf byproduct --- local/sda_runner.R | 58 +++++++--- models/sipnet/R/model2netcdf.SIPNET.R | 2 +- .../assim.sequential/R/SDA_OBS_Assembler.R | 3 +- .../assim.sequential/R/sda.enkf_parallel.R | 15 +++ modules/data.land/R/Soilgrids_SoilC_prep.R | 105 +++++++++++++----- 5 files changed, 138 insertions(+), 45 deletions(-) diff --git a/local/sda_runner.R b/local/sda_runner.R index cddc255cd8..61e86d0cb4 100644 --- a/local/sda_runner.R +++ b/local/sda_runner.R @@ -27,8 +27,13 @@ library(terra) setwd("/projectnb/dietzelab/guYANG/pecan/") ## read settings xml file. load("/projectnb/dietzelab/guYANG/pecan/pecan.Rdata") + +### PEcAn Settings +# settings_dir <- "/projectnb/dietzelab/guYANG/pecan/output/pecan_flux.xml" +# settings <- PEcAn.settings::read.settings(settings_dir) + # Change dir name and settings -settings$outdir <- "/projectnb/dietzelab/guYANG/pecan/output/adjust/" +settings$outdir <- "/projectnb/dietzelab/guYANG/pecan/output/5obs_monthly/" settings$rundir <- file.path(settings$outdir, "run") settings$modeloutdir <- file.path(settings$outdir, "out") settings$host$rundir <- file.path(settings$outdir, "run") @@ -38,7 +43,9 @@ settings$ensemble$size <- 25 settings$state.data.assimilation$adjustment <- "FALSE" settings$host$prerun <- "module load R/4.4.0" ###### Change Q type -settings$state.data.assimilation$q.type <- "wishart" +settings$state.data.assimilation$q.type <- "vector" +settings$state.data.assimilation$aqq.Init <- "1" +settings$state.data.assimilation$bqq.Init <- "1" ## Fix the multi output in one timestep bug settings$model$jobtemplate <- "/projectnb/dietzelab/guYANG/pecan/runners/test7/sipnet_template.job" @@ -60,9 +67,12 @@ settings$state.data.assimilation$batch.settings <- batch.settings # update settings with the actual PFTs. settings <- PEcAn.settings::prepare.settings(settings) -# load 6 obs -load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.mean.RData") -load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.cov.RData") +# load obs +# load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.mean.RData") +# load("/projectnb/dietzelab/guYANG/pecan/runners/test10/obs.cov.RData") + +load("/projectnb/dietzelab/guYANG/pecan/output/obs/Rdata/obs.mean.monthly.noSM.Rdata") +load("/projectnb/dietzelab/guYANG/pecan/output/obs/Rdata/obs.cov.monthly.noSM.Rdata") sub_obs <- function(L, keep) setNames(lapply(L, \(l) l[names(l) %in% keep]), names(L)) obs.mean <- sub_obs(obs.mean, keep_ids) @@ -95,19 +105,25 @@ if (length(obs.cov[[i]][[j]]) > 1) { } # load PFT parameter file. -samples_src <- "/projectnb/dietzelab/guYANG/pecan/cali_samples.Rdata" +# samples_src <- "/projectnb/dietzelab/guYANG/pecan/cali_samples.Rdata" +samples_src <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/SDA_8k_site/samples.Rdata" samples_dst <- file.path(settings$outdir, "samples.Rdata") dir.create(settings$outdir, recursive = TRUE, showWarnings = FALSE) if (!file.exists(samples_dst)) file.copy(samples_src, samples_dst, overwrite = TRUE) -control <- list( +control = list( TimeseriesPlot = FALSE, OutlierDetection = FALSE, send_email = NULL, - keepNC = FALSE, + keepNC = TRUE, forceRun = TRUE, run_parallel = FALSE, - MCMC.args = NULL, + MCMC.args = list( + niter = 200000, + nburnin = 100000, + nthin = 5, + nchain = 3 + ), merge_nc = FALSE, execution = "qsub_parallel" # or "qsub" / "local" ) @@ -137,22 +153,34 @@ res <- PEcAnAssimSequential:::sda.enkf_local( ) ) +#### qsub +# PEcAnAssimSequential:::qsub_sda( +# settings = settings, +# obs.mean = obs.mean, +# obs.cov = obs.cov, +# Q = NULL, +# pre_enkf_params = NULL, +# ensemble.samples = NULL, +# outdir = "/projectnb/dietzelab/guYANG/pecan/output/qsub", +# control = control +# ) + # job_lines <- c( # "#!/bin/bash", # "module load R/4.4.0", # "Rscript /projectnb/dietzelab/guYANG/pecan/pecan/local/sda_runner.R" # ) -# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.sh") +# writeLines(job_lines, "/projectnb/dietzelab/guYANG/pecan/output/log/sda_runner_1.sh") -# qsub -l h_rt=6:00:00 \ +# qsub -l h_rt=10:00:00 \ # -l buyin \ # -l mem_per_core=8G \ # -pe omp 28 \ # -V \ -# -N sda_runner \ -# -o /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.out \ -# -e /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.err \ +# -N sda_runner_1 \ +# -o /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner_1.out \ +# -e /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner_1.err \ # -M yanggu@bu.edu \ # -m abe \ # -S /bin/bash \ -# /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner.sh +# /projectnb/dietzelab/guYANG/pecan/output/log/sda_runner_1.sh diff --git a/models/sipnet/R/model2netcdf.SIPNET.R b/models/sipnet/R/model2netcdf.SIPNET.R index 8264c8f9d7..aa17e0b9e3 100644 --- a/models/sipnet/R/model2netcdf.SIPNET.R +++ b/models/sipnet/R/model2netcdf.SIPNET.R @@ -158,7 +158,7 @@ model2netcdf.SIPNET <- function(outdir, sitelat, sitelon, start_date, end_date, # Water pools dplyr::across( .cols = c( - dplyr::all_of(c("soilWater", "soilWetnessFrac", "snow")), + dplyr::all_of(c("soilWater", "snow")), dplyr::any_of("litterWater") # Only present in V1 output ), .fns = cm_to_mm diff --git a/modules/assim.sequential/R/SDA_OBS_Assembler.R b/modules/assim.sequential/R/SDA_OBS_Assembler.R index 315d1fdfb4..53ba5d8066 100644 --- a/modules/assim.sequential/R/SDA_OBS_Assembler.R +++ b/modules/assim.sequential/R/SDA_OBS_Assembler.R @@ -44,8 +44,7 @@ SDA_OBS_Assembler <- function(settings){ site.list$lon <- as.numeric(site.list$lon) list(site_id=site.list$id, lat=site.list$lat, lon=site.list$lon, site_name=site.list$name) })%>% - dplyr::bind_rows() %>% - as.list() + dplyr::bind_rows() #convert from timestep to time points if (length(Obs_Prep$timestep)>0){ diff --git a/modules/assim.sequential/R/sda.enkf_parallel.R b/modules/assim.sequential/R/sda.enkf_parallel.R index 6db15b12c1..6cd321422f 100644 --- a/modules/assim.sequential/R/sda.enkf_parallel.R +++ b/modules/assim.sequential/R/sda.enkf_parallel.R @@ -596,6 +596,21 @@ qsub_sda <- function(settings, if (is.null(outdir)) { outdir <- settings$outdir } + + ## Revise + # ensure ensemble.samples is available before splitting batch jobs + if (is.null(ensemble.samples)) { + sample_file <- file.path(outdir, "samples.Rdata") + if (!file.exists(sample_file)) { + stop("Cannot find samples.Rdata at: ", sample_file) + } + load(sample_file) # expects object named ensemble.samples + if (is.null(ensemble.samples)) { + stop("samples.Rdata was loaded, but object `ensemble.samples` was not found.") + } + } + ## Stop Revise + # create folder for storing job outputs. batch.folder <- file.path(outdir, prefix) # delete the whole folder if it's not empty. diff --git a/modules/data.land/R/Soilgrids_SoilC_prep.R b/modules/data.land/R/Soilgrids_SoilC_prep.R index 4fc88cca8c..a808df1c54 100644 --- a/modules/data.land/R/Soilgrids_SoilC_prep.R +++ b/modules/data.land/R/Soilgrids_SoilC_prep.R @@ -12,37 +12,88 @@ #' #' @author Dongchen Zhang #' @importFrom magrittr %>% -Soilgrids_SoilC_prep <- function(site_info, start_date, end_date, time_points, - outdir = NULL, export_csv = FALSE){ - #if we export CSV but didn't provide any path - if(as.logical(export_csv) & is.null(outdir)){ - PEcAn.logger::logger.info("If you want to export CSV file, please ensure input the outdir!") +Soilgrids_SoilC_prep <- function(site_info, start_date, end_date, time_points, + outdir = NULL, export_csv = FALSE) { + + if (as.logical(export_csv) & is.null(outdir)) { + PEcAn.logger::logger.info( + "If you want to export CSV file, please ensure input the outdir!" + ) return(0) - }else if(as.logical(export_csv) & !file.exists(file.path(outdir, "soilgrids_soilC_data.csv"))){ - #if we want to export the csv file for soilgrids data. - Previous_CSV <- PEcAn.data.land::soilgrids_soilC_extract(site_info, outdir) - }else if(!as.logical(export_csv) & !file.exists(file.path(outdir, "soilgrids_soilC_data.csv"))){ - #if we don't want to export the csv file. + } + + soilc_file <- if (!is.null(outdir)) { + file.path(outdir, "soilgrids_soilC_data.csv") + } else { + NA_character_ + } + + if (!is.null(outdir) && !file.exists(soilc_file)) { + + if (as.logical(export_csv)) { + Previous_CSV <- PEcAn.data.land::soilgrids_soilC_extract(site_info, outdir) + } else { + Previous_CSV <- PEcAn.data.land::soilgrids_soilC_extract(site_info) + } + + } else if (!is.null(outdir) && file.exists(soilc_file)) { + + Previous_CSV <- as.data.frame(utils::read.csv(soilc_file)) + + } else { + Previous_CSV <- PEcAn.data.land::soilgrids_soilC_extract(site_info) - }else if(file.exists(file.path(outdir, "soilgrids_soilC_data.csv"))){ - #if we have previous extracted soilgrids csv file. - Previous_CSV <- as.data.frame(utils::read.csv(file.path(outdir, "soilgrids_soilC_data.csv"))) - SoilC_Output <- matrix(NA, length(site_info$site_id), 2*length(time_points)+1) %>% - `colnames<-`(c("site_id", paste0(time_points, "_TotSoilCarb"), paste0(time_points, "_SD"))) %>% as.data.frame()#we need: site_id, agb, sd, target time point. - SoilC_Output$site_id <- site_info$site_id - #loop over time and site - for (i in seq_along(time_points)) { - t <- time_points[i] - for (id in site_info$site_id) { - site_SoilC <- Previous_CSV[which(Previous_CSV$Site_ID == id),] - if (dim(site_SoilC)[1] == 0) { - next - } - SoilC_Output[which(SoilC_Output$site_id==id), paste0(t, "_TotSoilCarb")] <- site_SoilC$Total_soilC_0.200cm - SoilC_Output[which(SoilC_Output$site_id==id), paste0(t, "_SD")] <- site_SoilC$Std_soilC_0.200cm + } + + if (!is.null(outdir) && file.exists(soilc_file)) { + Previous_CSV <- as.data.frame(utils::read.csv(soilc_file)) + } + + SoilC_Output <- matrix( + NA, + length(site_info$site_id), + 2 * length(time_points) + 1 + ) %>% + `colnames<-`( + c( + "site_id", + paste0(time_points, "_TotSoilCarb"), + paste0(time_points, "_SD") + ) + ) %>% + as.data.frame() + + SoilC_Output$site_id <- site_info$site_id + + for (i in seq_along(time_points)) { + t <- time_points[i] + + for (id in site_info$site_id) { + site_SoilC <- Previous_CSV[ + which(as.character(Previous_CSV$Site_ID) == as.character(id)), + ] + + if (nrow(site_SoilC) == 0) { + next } + + SoilC_Output[ + which(as.character(SoilC_Output$site_id) == as.character(id)), + paste0(t, "_TotSoilCarb") + ] <- site_SoilC$Total_soilC_0.200cm + + SoilC_Output[ + which(as.character(SoilC_Output$site_id) == as.character(id)), + paste0(t, "_SD") + ] <- site_SoilC$Std_soilC_0.200cm } } + PEcAn.logger::logger.info("Soilgrids SoilC Prep Completed!") - list(SoilC_Output = SoilC_Output, time_points = time_points, var = "TotSoilCarb") + + list( + SoilC_Output = SoilC_Output, + time_points = time_points, + var = "TotSoilCarb" + ) } \ No newline at end of file From 8e2c97d2be7451f5695bb8bb0a65d94a75b58270 Mon Sep 17 00:00:00 2001 From: yanggu Date: Tue, 12 May 2026 14:56:55 -0400 Subject: [PATCH 13/13] monthly sipnet available --- base/utils/R/read.output.R | 6 ++-- .../assim.sequential/R/sda.enkf_parallel.R | 33 ++++++++++++++----- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/base/utils/R/read.output.R b/base/utils/R/read.output.R index a641b113aa..1df8fc741d 100644 --- a/base/utils/R/read.output.R +++ b/base/utils/R/read.output.R @@ -77,9 +77,9 @@ read.output <- function(runid, outdir, } # create list of *.nc years - look only for files formatted as - # YYYY.nc, the default pecan output file name standard + # YYYY.nc or YYYY-MM-DD.nc if (is.null(ncfiles)) { - ncfiles_sub <- list.files(path = outdir, pattern = "^-?[[:digit:]]{4}\\.nc$", full.names = FALSE) + ncfiles_sub <- list.files(path = outdir, pattern = "^-?[[:digit:]]{4}(-[[:digit:]]{2}-[[:digit:]]{2})?\\.nc$", full.names = FALSE) ncfiles <- file.path(outdir, ncfiles_sub) } else { # Assume the NetCDF files follow the PEcAn standard format @@ -128,7 +128,7 @@ read.output <- function(runid, outdir, # Deduce file years from their names nc_years <- suppressWarnings( - as.numeric(gsub("^(-?[[:digit:]]{4})\\.nc", "\\1", ncfiles_sub)) + as.numeric(gsub("^(-?[[:digit:]]{4})(-[[:digit:]]{2}-[[:digit:]]{2})?\\.nc$", "\\1", ncfiles_sub)) ) if (any(is.na(nc_years))) { PEcAn.logger::logger.debug( diff --git a/modules/assim.sequential/R/sda.enkf_parallel.R b/modules/assim.sequential/R/sda.enkf_parallel.R index 6cd321422f..07d69cf203 100644 --- a/modules/assim.sequential/R/sda.enkf_parallel.R +++ b/modules/assim.sequential/R/sda.enkf_parallel.R @@ -109,13 +109,20 @@ sda.enkf_local <- function(settings, ###-------------------------------------------------------------------### ### check dates before data assimilation ### ###-------------------------------------------------------------------###---- - #filtering obs data based on years specifited in setting > state.data.assimilation + #Filter observation dates to SDA bounds start.cut <- lubridate::ymd_hms(settings$state.data.assimilation$start.date, truncated = 3) - Start.year <- (lubridate::year(settings$state.data.assimilation$start.date)) - End.year <- lubridate::year(settings$state.data.assimilation$end.date) # dates that assimilations will be done for - obs will be subsetted based on this - assim.sda <- Start.year:End.year - obs.mean <- obs.mean[sapply(lubridate::year(names(obs.mean)), function(obs.year) obs.year %in% (assim.sda))] #checks obs.mean dates against assimyear dates - obs.cov <- obs.cov[sapply(lubridate::year(names(obs.cov)), function(obs.year) obs.year %in% (assim.sda))] #checks obs.cov dates against assimyear dates + # Start.year <- (lubridate::year(settings$state.data.assimilation$start.date)) + # End.year <- lubridate::year(settings$state.data.assimilation$end.date) # dates that assimilations will be done for - obs will be subsetted based on this + # assim.sda <- Start.year:End.year + # obs.mean <- obs.mean[sapply(lubridate::year(names(obs.mean)), function(obs.year) obs.year %in% (assim.sda))] #checks obs.mean dates against assimyear dates + # obs.cov <- obs.cov[sapply(lubridate::year(names(obs.cov)), function(obs.year) obs.year %in% (assim.sda))] #checks obs.cov dates against assimyear dates + if (is.na(start.cut)) { + start.cut <- as.POSIXct(as.Date(settings$state.data.assimilation$start.date), tz = "UTC") + } + end.cut <- lubridate::ymd_hms(settings$state.data.assimilation$end.date, truncated = 3) + if (is.na(end.cut)) { + end.cut <- as.POSIXct(as.Date(settings$state.data.assimilation$end.date), tz = "UTC") + } #checking that there are dates in obs.mean and adding midnight as the time obs.times <- names(obs.mean) obs.times.POSIX <- lubridate::ymd_hms(obs.times) @@ -127,11 +134,17 @@ sda.enkf_local <- function(settings, ### Data does not have time associated with dates ### Adding 12:59:59PM assuming next time step starts one second later # PEcAn.logger::logger.warn("Pumpkin Warning: adding one minute before midnight time assumption to dates associated with data") - obs.times.POSIX[i] <- lubridate::ymd_hms(paste(obs.times[i], "23:59:59")) + # obs.times.POSIX[i] <- lubridate::ymd_hms(paste(obs.times[i], "23:59:59")) + # If dates are provided without times, standardize to 00:00:00 UTC. + obs.times.POSIX[i] <- lubridate::ymd_hms(paste(obs.times[i], "00:00:00")) } } } obs.times <- obs.times.POSIX + keep_obs <- which(obs.times >= start.cut & obs.times <= end.cut) + obs.times <- obs.times[keep_obs] + obs.mean <- obs.mean[keep_obs] + obs.cov <- obs.cov[keep_obs] read_restart_times <- c(lubridate::ymd_hms(start.cut, truncated = 3), obs.times) nt <- length(obs.times) #sets length of for loop for Forecast/Analysis if (nt==0) PEcAn.logger::logger.severe('There has to be at least one Obs.') @@ -262,7 +275,8 @@ sda.enkf_local <- function(settings, for (i in seq_len(nens)) { #---------------- model specific split inputs split_args <- list( - start.time = (lubridate::ymd_hms(obs.times[t - 1], truncated = 3) + lubridate::second(lubridate::hms("00:00:01"))), + # start.time = (lubridate::ymd_hms(obs.times[t - 1], truncated = 3) + lubridate::second(lubridate::hms("00:00:01"))), + start.time = lubridate::ymd_hms(obs.times[t - 1], truncated = 3), stop.time = lubridate::ymd_hms(obs.times[t], truncated = 3), inputs = inputs$met$samples[[i]] ) @@ -289,7 +303,8 @@ sda.enkf_local <- function(settings, } list( runid = configs$runs$id, - start.time = strptime(obs.times[t -1], format = "%Y-%m-%d %H:%M:%S") + lubridate::second(lubridate::hms("00:00:01")), + # start.time = strptime(obs.times[t -1], format = "%Y-%m-%d %H:%M:%S") + lubridate::second(lubridate::hms("00:00:01")), + start.time = strptime(obs.times[t -1], format = "%Y-%m-%d %H:%M:%S"), stop.time = strptime(obs.times[t], format ="%Y-%m-%d %H:%M:%S"), settings = settings, new.state = new_state_site,