diff --git a/modules/data.remote/inst/ccmmf/events/make_events_statewide.R b/modules/data.remote/inst/ccmmf/events/make_events_statewide.R new file mode 100644 index 00000000000..dd2d8fac372 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/events/make_events_statewide.R @@ -0,0 +1,522 @@ +#!/usr/bin/env Rscript +# Build statewide PEcAn-style event files for one year (Parquet + JSON) from matched LandIQ+MSLSP. +# Writes phenology, planting, harvest, and optionally tillage event tables under event_files/. +# +# Main inputs: CCMMF_MANAGEMENT (paths to matched assigned_year=Y.parquet, scripts/traits pool, +# tillage/ndti_v4.1). Optional: HARVEST_LOOKUP_RDS, HARVEST_WOODY_DESTRUCTIVE, TILLAGE_* env. +# Main outputs: phenology_statewide_Y, planting_statewide_Y, harvest_statewide_Y, tillage_statewide_*. +# How to run: Rscript make_events_statewide.R [phenology|planting|harvest|tillage] +# Workflow: end of monitoring chart (statewide outputs). Phenology/planting/harvest read assigned +# parquet and pool_calculations_from_lookup.R. Harvest dates: row/rice use mslsp_OGMn; hay/woody +# use mslsp_OGD. Tillage uses NDTI plus assigned parcels in a buffered year window. + +suppressPackageStartupMessages({ + library(data.table) + library(arrow) + library(jsonlite) + library(lubridate) +}) + +path_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +matched_dir <- file.path(path_management, "phenology", "matched_landiq_mslsp_v4.1") +pool_script <- file.path(path_management, "scripts", "traits", "pool_calculations_from_lookup.R") +tillage_metrics_script <- file.path(path_management, "scripts", "tillage", "tillage_metrics.R") +ndti_root <- file.path(path_management, "tillage", "ndti_v4.1") +out_dir <- file.path(path_management, "event_files") + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 1L) { + stop("Usage: Rscript make_events_statewide.R [phenology|planting|harvest|tillage]") +} +year_arg <- as.integer(args[1L]) +if (is.na(year_arg)) { + stop("Year must be an integer, got: ", args[1L]) +} +event_type <- if (length(args) >= 2L) { + match.arg(args[2L], c("phenology", "planting", "harvest", "tillage")) +} else { + NULL +} + +run_phenology <- is.null(event_type) || event_type == "phenology" +run_planting <- is.null(event_type) || event_type == "planting" +run_harvest <- is.null(event_type) || event_type == "harvest" +run_tillage <- !is.null(event_type) && event_type == "tillage" + +msg_suffix <- if (is.null(event_type)) { + " (phenology + planting + harvest)" +} else { + paste0(" event_type=", event_type) +} +message("[make_events_statewide] year=", year_arg, msg_suffix) + +dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + +#### Single-year assigned (phenology, planting, harvest) +if (run_phenology || run_planting || run_harvest) { + assigned_file <- file.path(matched_dir, sprintf("assigned_year=%d.parquet", year_arg)) + if (!file.exists(assigned_file)) { + stop("Missing assigned file: ", assigned_file) + } + assigned <- as.data.table(read_parquet(assigned_file)) + assigned[, parcel_id := as.character(parcel_id)] + matched <- assigned[assigned_by == "matched"] + message("[assigned] ", nrow(assigned), " rows; matched: ", nrow(matched)) + + n_before <- nrow(matched) + matched <- matched[ + !is.na(landiq_CLASS) & !is.na(landiq_SUBCLASS) & !is.na(landiq_PFT) + ] + message(" Dropped ", n_before - nrow(matched), " matched rows missing crop/PFT; ", nrow(matched), " remain") + + if (run_phenology || run_planting) { + n_ev <- nrow(matched) + matched <- matched[!is.na(mslsp_EVImax) & !is.na(mslsp_EVIamp)] + message(" EVI filter: dropped ", n_ev - nrow(matched), " rows; ", nrow(matched), " remain") + } + + if (run_harvest) { + for (col in c("mslsp_OGMn", "mslsp_OGD")) { + if (!col %in% names(matched)) { + matched[, (col) := NA] + } + } + matched[, pft_l := tolower(trimws(as.character(landiq_PFT)))] + matched[, harvest_date_str := NA_character_] + matched[pft_l %in% c("row", "rice"), harvest_date_str := as.character(mslsp_OGMn)] + matched[pft_l %in% c("hay", "woody"), harvest_date_str := as.character(mslsp_OGD)] + n_hd <- nrow(matched) + matched <- matched[ + !is.na(harvest_date_str) & nzchar(harvest_date_str) & harvest_date_str != "NA" + ] + message( + " Harvest-date filter (annuals OGMn, perennials OGD; no Peak): dropped ", + n_hd - nrow(matched), " rows; ", nrow(matched), " remain" + ) + matched[, pft_l := NULL] + } +} + +if (run_planting || run_harvest) { + pool_env <- new.env(parent = globalenv()) + source(pool_script, local = pool_env) + harvest_rds <- Sys.getenv("HARVEST_LOOKUP_RDS", "") + if (nzchar(harvest_rds)) { + lk <- pool_env$load_trait_lookup(harvest_path = harvest_rds) + message("[pool] Loaded trait lookup (harvest_path=", harvest_rds, ")") + } else { + lk <- pool_env$load_trait_lookup() + message("[pool] Loaded trait lookup (default planting + harvest RDS paths)") + } +} + +harvest_destructive_default <- tolower(Sys.getenv("HARVEST_WOODY_DESTRUCTIVE", "0")) %in% c("1", "true", "yes") + +#### PHENOLOGY +if (run_phenology) { + message("[phenology] Building events") + pheno <- matched[, .( + site_id = parcel_id, + year = lubridate::year(mslsp_Peak), + leafonday = as.character(mslsp_50PCGI), + leafoffday = as.character(mslsp_50PCGD) + )] + pheno <- pheno[!is.na(leafonday) & !is.na(leafoffday)] + setorder(pheno, site_id, year) + + pheno_parquet <- file.path(out_dir, sprintf("phenology_statewide_%d.parquet", year_arg)) + write_parquet(pheno, pheno_parquet) + message(" Wrote ", pheno_parquet, " (", nrow(pheno), " rows)") + + pheno_json_path <- file.path(out_dir, sprintf("phenology_statewide_%d.json", year_arg)) + pheno_list <- lapply(split(pheno, pheno$site_id), function(rows) { + lapply(seq_len(nrow(rows)), function(i) { + list( + event_type = "phenology", + year = rows$year[i], + leafonday = rows$leafonday[i], + leafoffday = rows$leafoffday[i] + ) + }) + }) + write(toJSON(pheno_list, auto_unbox = TRUE, pretty = TRUE), pheno_json_path) + message(" Wrote ", pheno_json_path) +} + +#### PLANTING +if (run_planting) { + message("[planting] Building events (C/N pools via LAI)") + planting_date_str <- as.character(matched$mslsp_OGI) + + planting_rows <- vector("list", nrow(matched)) + for (i in seq_len(nrow(matched))) { + row <- matched[i] + p <- tryCatch( + pool_env$initialize_planting( + ID = row$parcel_id, + DATE = planting_date_str[i], + PFT = row$landiq_PFT, + lk = lk, + class = row$landiq_CLASS, + subclass = row$landiq_SUBCLASS, + mslsp_EVImax = row$mslsp_EVImax, + mslsp_EVIamp = row$mslsp_EVIamp + ), + error = function(e) NULL + ) + if (!is.null(p) && nrow(p) > 0) { + code <- paste0(trimws(as.character(row$landiq_CLASS)), as.character(row$landiq_SUBCLASS)) + planting_rows[[i]] <- data.table( + site_id = row$parcel_id, + year = row$year, + season = row$season, + date = planting_date_str[i], + code = code, + PFT = row$landiq_PFT, + LAI = as.numeric(p$LAI[1]), + C_LEAF = as.numeric(p$C_LEAF[1]), + C_STEM = as.numeric(p$C_STEM[1]), + C_FINEROOT = as.numeric(p$C_FINEROOT[1]), + C_COARSEROOT = as.numeric(p$C_COARSEROOT[1]), + N_LEAF = as.numeric(p$N_LEAF[1]), + N_STEM = as.numeric(p$N_STEM[1]), + N_FINEROOT = as.numeric(p$N_FINEROOT[1]), + N_COARSEROOT = as.numeric(p$N_COARSEROOT[1]) + ) + } + if (i %% 10000L == 0L) { + message(" ", i, "/", nrow(matched), " done") + } + } + planting_dt <- rbindlist(planting_rows, use.names = TRUE, fill = TRUE) + setorder(planting_dt, site_id, year, season) + planting_dt[, event_type := "planting"] + setcolorder(planting_dt, c("event_type", setdiff(names(planting_dt), "event_type"))) + + plant_parquet <- file.path(out_dir, sprintf("planting_statewide_%d.parquet", year_arg)) + write_parquet(planting_dt, plant_parquet) + message(" Wrote ", plant_parquet, " (", nrow(planting_dt), " rows)") + + plant_json_path <- file.path(out_dir, sprintf("planting_statewide_%d.json", year_arg)) + plant_list <- lapply(split(planting_dt, planting_dt$site_id), function(rows) { + lapply(seq_len(nrow(rows)), function(i) { + list( + event_type = rows$event_type[i], + date = rows$date[i], + year = rows$year[i], + season = rows$season[i], + crop = rows$code[i], + PFT = rows$PFT[i], + LAI = rows$LAI[i], + leaf_c_kg_m2 = rows$C_LEAF[i], + stem_c_kg_m2 = rows$C_STEM[i], + fineroot_c_kg_m2 = rows$C_FINEROOT[i], + coarseroot_c_kg_m2 = rows$C_COARSEROOT[i], + leaf_n_kg_m2 = rows$N_LEAF[i], + stem_n_kg_m2 = rows$N_STEM[i], + fineroot_n_kg_m2 = rows$N_FINEROOT[i], + coarseroot_n_kg_m2 = rows$N_COARSEROOT[i] + ) + }) + }) + write(toJSON(plant_list, auto_unbox = TRUE, pretty = TRUE), plant_json_path) + message(" Wrote ", plant_json_path) +} + +#### HARVEST (removal fractions for SIPNET / PEcAn; same LandIQ row as planting) +if (run_harvest) { + message("[harvest] Building events (lookup-based removal fractions)") + has_dest_col <- "destructive" %in% names(matched) + + harvest_rows <- vector("list", nrow(matched)) + for (i in seq_len(nrow(matched))) { + row <- matched[i] + code <- paste0(trimws(as.character(row$landiq_CLASS)), as.character(row$landiq_SUBCLASS)) + dest <- if (has_dest_col) isTRUE(as.logical(row$destructive[1])) else harvest_destructive_default + h <- tryCatch( + pool_env$initialize_harvest_from_lookup( + ID = row$parcel_id, + DATE = as.character(row$harvest_date_str)[1], + code = code, + PFT = row$landiq_PFT, + lk = lk, + destructive = dest + ), + error = function(e) NULL + ) + if (!is.null(h) && nrow(h) > 0) { + harvest_rows[[i]] <- data.table( + site_id = row$parcel_id, + year = row$year, + season = row$season, + date = as.character(row$harvest_date_str)[1], + CLASS_SUBCLASS = code, + PFT = row$landiq_PFT, + frac_above_removed_0to1 = as.numeric(h$AGB_REMOVED[1]), + frac_above_to_litter_0to1 = as.numeric(h$AGB_LITTER[1]), + frac_below_removed_0to1 = as.numeric(h$BGB_REMOVED[1]), + frac_below_to_litter_0to1 = as.numeric(h$BGB_LITTER[1]) + ) + } + if (i %% 10000L == 0L) { + message(" ", i, "/", nrow(matched), " done") + } + } + harvest_dt <- rbindlist(harvest_rows, use.names = TRUE, fill = TRUE) + setorder(harvest_dt, site_id, year, season) + harvest_dt[, event_type := "harvest"] + setcolorder(harvest_dt, c("event_type", setdiff(names(harvest_dt), "event_type"))) + + harvest_parquet <- file.path(out_dir, sprintf("harvest_statewide_%d.parquet", year_arg)) + write_parquet(harvest_dt, harvest_parquet) + message(" Wrote ", harvest_parquet, " (", nrow(harvest_dt), " rows)") + + harvest_json_path <- file.path(out_dir, sprintf("harvest_statewide_%d.json", year_arg)) + harvest_list <- lapply(split(harvest_dt, harvest_dt$site_id), function(rows) { + lapply(seq_len(nrow(rows)), function(i) { + list( + event_type = rows$event_type[i], + date = rows$date[i], + year = rows$year[i], + season = rows$season[i], + crop = rows$CLASS_SUBCLASS[i], + PFT = rows$PFT[i], + frac_above_removed_0to1 = rows$frac_above_removed_0to1[i], + frac_above_to_litter_0to1 = rows$frac_above_to_litter_0to1[i], + frac_below_removed_0to1 = rows$frac_below_removed_0to1[i], + frac_below_to_litter_0to1 = rows$frac_below_to_litter_0to1[i] + ) + }) + }) + write(toJSON(harvest_list, auto_unbox = TRUE, pretty = TRUE), harvest_json_path) + message(" Wrote ", harvest_json_path) +} + +#### TILLAGE (buffered multi-year assigned + NDTI; parcel chunks) +if (run_tillage) { + suppressPackageStartupMessages(library(dplyr)) + if (!file.exists(tillage_metrics_script)) { + stop("Missing tillage_metrics.R: ", tillage_metrics_script) + } + source(tillage_metrics_script) + + buf <- suppressWarnings(as.integer(Sys.getenv("TILLAGE_BUFFER_YEARS", "1"))) + if (is.na(buf) || buf < 0L) { + buf <- 1L + } + chunk_n <- suppressWarnings(as.integer(Sys.getenv("TILLAGE_PARCEL_CHUNK", "3000"))) + if (is.na(chunk_n) || chunk_n < 1L) { + chunk_n <- 3000L + } + + year_first <- year_arg + year_last <- year_arg + load_years <- seq(year_first - buf, year_last + buf) + + message( + "[tillage] output year ", year_arg, " | load years ", min(load_years), ":", max(load_years), + " | chunk ", chunk_n + ) + + list_ndti_parquet <- function(yrs) { + fl <- character(0) + for (y in yrs) { + ydir <- file.path(ndti_root, sprintf("year=%d", y)) + if (!dir.exists(ydir)) { + next + } + fl <- c( + fl, + Sys.glob(file.path(ydir, sprintf("ndti_year=%d_month=*.parquet", y))), + Sys.glob(file.path(ydir, "*.parquet")) + ) + } + unique(fl[file.exists(fl)]) + } + ndti_files <- list_ndti_parquet(load_years) + if (length(ndti_files) == 0L) { + stop("[tillage] No NDTI parquet under ", ndti_root) + } + message("[tillage] NDTI files found: ", length(ndti_files)) + + mslsp_parts <- list() + for (y in load_years) { + f <- file.path(matched_dir, sprintf("assigned_year=%d.parquet", y)) + if (!file.exists(f)) { + message("[tillage] skip missing ", f) + next + } + mslsp_parts[[length(mslsp_parts) + 1L]] <- as.data.table(read_parquet(f)) + } + if (length(mslsp_parts) == 0L) { + stop("[tillage] No assigned parquet for load years") + } + + mslsp_all <- rbindlist(mslsp_parts, use.names = TRUE, fill = TRUE) + mslsp_all[, parcel_id := as.character(parcel_id)] + mslsp_all <- mslsp_all[assigned_by == "matched"] + mslsp_all <- mslsp_all[!is.na(mslsp_OGI) & !is.na(mslsp_OGMn)] + mslsp_all[, OGI_date := as.Date(mslsp_OGI)] + mslsp_all[, OGMn_date := as.Date(mslsp_OGMn)] + mslsp_all <- mslsp_all[!is.na(OGI_date) & !is.na(OGMn_date)] + + phenology_full <- mslsp_all[, .(parcel_id, year, OGI_date, OGMn_date)] + pft_y <- mslsp_all[, .(PFT = landiq_PFT[1L]), by = .(parcel_id, year)] + + message("[tillage] phenology rows ", nrow(phenology_full), " | parcels ", uniqueN(phenology_full$parcel_id)) + + read_ndti_for_parcels <- function(parcel_ids, yrs, root) { + pid_unique <- unique(as.character(parcel_ids)) + parts <- list() + for (y in yrs) { + ydir <- file.path(root, sprintf("year=%d", y)) + if (!dir.exists(ydir)) { + next + } + # open_dataset(ydir) fails when year=* contains non-parquet (e.g. logs/*.log) + fl <- c( + Sys.glob(file.path(ydir, sprintf("ndti_year=%d_month=*.parquet", y))), + Sys.glob(file.path(ydir, "*.parquet")) + ) + fl <- unique(fl[file.exists(fl)]) + if (length(fl) == 0L) { + next + } + ds <- tryCatch(arrow::open_dataset(fl), error = function(e) NULL) + if (is.null(ds)) { + next + } + sub <- tryCatch( + ds |> + dplyr::filter(parcel_id %in% pid_unique) |> + dplyr::collect(), + error = function(e) NULL + ) + if (!is.null(sub) && nrow(sub) > 0L) { + parts[[length(parts) + 1L]] <- as.data.table(sub) + } + } + if (length(parts) == 0L) { + return(data.table()) + } + rbindlist(parts, use.names = TRUE, fill = TRUE) + } + + all_pids <- unique(phenology_full$parcel_id) + n_chunk <- ceiling(length(all_pids) / chunk_n) + results <- vector("list", n_chunk) + + for (ic in seq_len(n_chunk)) { + i0 <- (ic - 1L) * chunk_n + 1L + i1 <- min(ic * chunk_n, length(all_pids)) + pchunk <- all_pids[i0:i1] + message("[tillage] chunk ", ic, "/", n_chunk, " parcels ", i0, ":", i1) + + pheno_chunk <- phenology_full[parcel_id %in% pchunk] + if (nrow(pheno_chunk) == 0L) { + next + } + + ndti_chunk <- read_ndti_for_parcels(pchunk, load_years, ndti_root) + if (nrow(ndti_chunk) == 0L) { + message(" no NDTI rows") + next + } + ndti_chunk[, date := as.Date(date)] + ndti_chunk <- merge(ndti_chunk, pft_y, by = c("parcel_id", "year"), all.x = TRUE) + ndti_chunk <- ndti_chunk[!is.na(PFT) & nzchar(as.character(PFT))] + + common <- intersect(unique(ndti_chunk$parcel_id), unique(pheno_chunk$parcel_id)) + if (length(common) == 0L) { + message(" no ndti/phenology overlap") + next + } + ndti_chunk <- ndti_chunk[parcel_id %in% common] + pheno_chunk <- pheno_chunk[parcel_id %in% common] + + res <- tryCatch( + tillage_metrics(ndti_table = ndti_chunk, phenology_table = pheno_chunk), + error = function(e) { + warning("[tillage] tillage_metrics failed chunk ", ic, ": ", conditionMessage(e)) + NULL + } + ) + if (!is.null(res) && nrow(res) > 0L) { + results[[ic]] <- as.data.table(res) + } + } + + all_res <- rbindlist(results[!vapply(results, is.null, NA)], use.names = TRUE, fill = TRUE) + if (nrow(all_res) == 0L) { + stop("[tillage] No results (check NDTI overlap and errors above)") + } + + all_res[, parcel_id := as.character(parcel_id)] + all_res[, year := as.integer(year)] + + for (y in year_first:year_last) { + yr_dt <- all_res[year == y] + pq <- file.path(out_dir, sprintf("tillage_statewide_%d.parquet", y)) + js <- file.path(out_dir, sprintf("tillage_statewide_%d.json", y)) + + if (nrow(yr_dt) == 0L) { + message("[tillage] year ", y, ": no rows; writing empty outputs") + write_parquet( + data.table(event_type = character(), parcel_id = character(), year = integer()), + pq + ) + writeLines("{}", js) + next + } + + n_pre <- nrow(yr_dt) + ord_cols <- intersect(c("parcel_id", "OGMn_date", "min_date", "max_date"), names(yr_dt)) + if (length(ord_cols) > 0L) { + data.table::setorderv(yr_dt, ord_cols) + } + yr_dt <- unique(yr_dt, by = c("parcel_id", "OGMn_date")) + if (nrow(yr_dt) < n_pre) { + message( + "[tillage] year ", y, ": deduped ", n_pre - nrow(yr_dt), + " duplicate row(s) (parcel_id + OGMn_date); kept first after sort" + ) + } + + out_dt <- copy(yr_dt) + out_dt[, site_id := parcel_id] + out_dt[, event_type := "tillage"] + date_cols <- names(out_dt)[vapply(out_dt, function(z) inherits(z, "Date"), NA)] + for (cn in date_cols) { + out_dt[, (cn) := as.character(get(cn))] + } + setcolorder(out_dt, c("event_type", setdiff(names(out_dt), "event_type"))) + write_parquet(out_dt, pq) + message(" Wrote ", pq, " (", nrow(out_dt), " rows)") + + json_list <- lapply(split(out_dt, out_dt$site_id), function(rows) { + lapply(seq_len(nrow(rows)), function(i) { + list( + event_type = rows$event_type[i], + year = rows$year[i], + PFT = rows$PFT[i], + OGMn_date = rows$OGMn_date[i], + max_date = rows$max_date[i], + max_ndti = rows$max_ndti[i], + min_date = rows$min_date[i], + min_ndti = rows$min_ndti[i], + min_n_valid = rows$min_n_valid[i], + min_sd = rows$min_sd[i], + ndti_pct_change = rows$ndti_pct_change[i], + min_val_date_before = rows$min_val_date_before[i], + min_val_n_before = rows$min_val_n_before[i], + min_val_date_after = rows$min_val_date_after[i], + min_val_n_after = rows$min_val_n_after[i] + ) + }) + }) + write(toJSON(json_list, auto_unbox = TRUE, pretty = TRUE), js) + message(" Wrote ", js) + } +} + +message("[make_events_statewide] Done for year=", year_arg) diff --git a/modules/data.remote/inst/ccmmf/hls/build_hls_parcel_tile_map.R b/modules/data.remote/inst/ccmmf/hls/build_hls_parcel_tile_map.R new file mode 100644 index 00000000000..04ec1a0d1f4 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/build_hls_parcel_tile_map.R @@ -0,0 +1,188 @@ +# Build a parcel-year to HLS tile map for California agricultural parcels (LandIQ v4.1). +# Reads crops_all_years.parquet filtered by agricultural CLASS/SUBCLASS, loads parcel +# geometries, intersects with the HLS tile grid, and writes RDS plus tile parcel counts. +# +# Main inputs: CCMMF_LANDIQ_V4 (LandIQ root), CCMMF_MANAGEMENT (crop lookup, tile extent), +# optional CLI year_min year_max overwrite. +# Main outputs: hls_parcel_tile_map_v4.1_years=MIN-MAX.rds, hls_tile_parcel_counts CSV, +# optional removed parcel-years CSV when geometries fail QC. +# How to run: Rscript scripts/hls/build_hls_parcel_tile_map.R [year_min] [year_max] [overwrite] +# Workflow: upstream of tilewise MSLSP/NDTI drivers; run build_hls_tile_extent.R first. + +suppressPackageStartupMessages({ + library(sf) + library(data.table) + library(arrow) + library(dplyr) +}) +sf::sf_use_s2(FALSE) + +#### Configuration + +path_landiq_v4 <- Sys.getenv("CCMMF_LANDIQ_V4", "/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1") +path_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +path_parcels <- file.path(path_landiq_v4, "parcels-consolidated.gpkg") +path_crops_parq <- file.path(path_landiq_v4, "crops_all_years.parq") +path_cropcode_lookup <- file.path(path_management, "LandIQ_cropCode_lookup_table.csv") +path_tiles <- file.path(path_management, "hls_tile_extent.rds") +path_out <- path_management + +#### Parse arguments + +args <- commandArgs(trailingOnly = TRUE) +year_min <- if (length(args) >= 1) as.integer(args[1]) else 2016L +year_max <- if (length(args) >= 2) as.integer(args[2]) else 2024L +overwrite <- length(args) >= 3 && tolower(args[3]) %in% c("overwrite", "true", "t", "1", "yes", "y") + +out_file <- file.path(path_out, sprintf("hls_parcel_tile_map_v4.1_years=%d-%d.rds", year_min, year_max)) +if (file.exists(out_file) && !overwrite) quit(save = "no", status = 0) +if (!file.exists(path_tiles)) { + stop("Tile extent not found. Run: Rscript scripts/hls/build_hls_tile_extent.R") +} + +tile_prep <- readRDS(path_tiles) +tile_extent <- tile_prep$tile_extent_sf +used_crs <- tile_prep$used_crs + +#### Parcel-year rows (agricultural only via CLASS and SUBCLASS join) + +# Join on CLASS+SUBCLASS so subclass-level PFT differences (e.g. T19 vs T28 woody) stay correct. +lookup <- fread(path_cropcode_lookup) +ag_pairs <- unique(lookup[is_agricultural == TRUE, + .(CLASS = trimws(CLASS), SUBCLASS = as.character(SUBCLASS))]) +ag_classes_filter <- unique(ag_pairs$CLASS) + +parcel_year_raw <- arrow::open_dataset(path_crops_parq) |> + dplyr::filter(year >= year_min, year <= year_max, CLASS %in% ag_classes_filter) |> + dplyr::select(parcel_id, year, CLASS, SUBCLASS) |> + dplyr::collect() |> + as.data.table() +parcel_year_raw[, CLASS := trimws(as.character(CLASS))] +parcel_year_raw[, SUBCLASS := as.character(SUBCLASS)] +parcel_year <- merge(parcel_year_raw, ag_pairs, by = c("CLASS", "SUBCLASS"))[ + , .(parcel_id = as.character(parcel_id), year = as.integer(year)) +] |> unique() +parcel_year[, parcel_id := as.character(parcel_id)] +parcel_year[, year := as.integer(year)] +message("Parcel-year rows (agricultural, ", year_min, "-", year_max, "): ", nrow(parcel_year)) + +#### Load parcel geometry in chunks (avoid huge SQL IN lists) + +ids <- unique(parcel_year$parcel_id) +layer <- st_layers(path_parcels)$name[1] +chunks <- split(ids, ceiling(seq_along(ids) / 5000L)) +geom_chunks <- lapply(chunks, function(x) { + esc <- gsub("'", "''", x, fixed = TRUE) + q <- sprintf('SELECT * FROM "%s" WHERE parcel_id IN (%s)', layer, paste0("'", esc, "'", collapse = ",")) + st_read(path_parcels, query = q, quiet = TRUE) +}) +parcels <- do.call(rbind, geom_chunks) +parcels$parcel_id <- as.character(parcels$parcel_id) + +#### QC: drop invalid or empty geometries (bad WKB can break OGR) + +valid <- tryCatch( + !sf::st_is_empty(sf::st_geometry(parcels)), + error = function(e) { + message("Bulk geometry check failed; checking row-by-row for corrupt geometries.") + vapply(seq_len(nrow(parcels)), function(i) { + tryCatch(!sf::st_is_empty(sf::st_geometry(parcels)[i]), error = function(e) FALSE) + }, logical(1)) + } +) +removed_log <- if (any(!valid)) { + parcel_year[parcel_id %in% parcels$parcel_id[!valid], .(parcel_id, year)] +} else { + data.table(parcel_id = character(), year = integer()) +} +parcels <- parcels[valid, ] + +#### Reproject to tile CRS (row-by-row fallback if bulk transform fails) + +parcels_tr <- tryCatch(sf::st_transform(parcels, used_crs), error = function(e) NULL) +if (is.null(parcels_tr)) { + message("Bulk st_transform failed; checking row-by-row.") + chunk_size <- 5000L + n <- nrow(parcels) + good <- logical(n) + for (start in seq(1L, n, by = chunk_size)) { + end <- min(start + chunk_size - 1L, n) + chk <- tryCatch(sf::st_transform(parcels[start:end, ], used_crs), error = function(e) NULL) + if (!is.null(chk)) { + good[start:end] <- TRUE + } else { + for (i in start:end) { + good[i] <- tryCatch({ + sf::st_transform(parcels[i, ], used_crs) + TRUE + }, error = function(e) FALSE) + } + } + } + drop_ids <- parcels$parcel_id[!good] + if (length(drop_ids) > 0) { + removed_log <- rbind(removed_log, parcel_year[parcel_id %in% drop_ids, .(parcel_id, year)]) + } + parcels <- parcels[good, ] + parcels <- sf::st_transform(parcels, used_crs) +} else { + parcels <- parcels_tr +} + +#### Spatial join: parcel polygon intersects tile polygon (any overlap counts) + +hits <- tryCatch(sf::st_intersects(parcels, tile_extent), error = function(e) NULL) +if (is.null(hits)) { + message("Bulk st_intersects failed; checking row-by-row.") + n <- nrow(parcels) + good <- logical(n) + for (i in seq_len(n)) { + good[i] <- tryCatch({ + hi <- sf::st_intersects(parcels[i, ], tile_extent) + length(hi[[1]]) >= 0 + TRUE + }, error = function(e) FALSE) + } + drop_ids <- parcels$parcel_id[!good] + if (length(drop_ids) > 0) { + removed_log <- rbind(removed_log, parcel_year[parcel_id %in% drop_ids, .(parcel_id, year)]) + } + parcels <- parcels[good, ] + hits <- sf::st_intersects(parcels, tile_extent) +} +keep <- lengths(hits) > 0 +parcels <- parcels[keep, ] +hits <- hits[keep] + +if (nrow(removed_log) > 0) { + removed_log <- unique(removed_log) + removed_log_file <- file.path(path_out, sprintf("hls_parcel_tile_map_removed_v4.1_years=%d-%d.csv", year_min, year_max)) + dir.create(path_out, recursive = TRUE, showWarnings = FALSE) + fwrite(removed_log, removed_log_file) + message("Dropped ", nrow(removed_log), " parcel-years with invalid geometry; log: ", removed_log_file) +} + +#### Build parcel to tiles table and join to parcel_year + +tile_by_parcel <- data.table( + parcel_id = parcels$parcel_id, + tileIDs = vapply(hits, function(i) paste(tile_extent$tile_id[i], collapse = ","), character(1)), + n_tiles = lengths(hits) +) +setkey(tile_by_parcel, parcel_id) +setkey(parcel_year, parcel_id) +out <- tile_by_parcel[parcel_year, nomatch = 0][, .(parcel_id, year, tileIDs, n_tiles)] + +#### Tile to parcel counts (for scheduling) + +tile_long <- out[, .(tile_id = unlist(strsplit(tileIDs, ",", fixed = TRUE))), by = .(parcel_id, year)] +tile_counts <- tile_long[, .(n_parcels = .N), by = .(tile_id, year)] +setorder(tile_counts, tile_id, year) + +#### Write outputs + +dir.create(path_out, recursive = TRUE, showWarnings = FALSE) +saveRDS(out, out_file) +tile_counts_file <- file.path(path_out, sprintf("hls_tile_parcel_counts_v4.1_years=%d-%d.csv", year_min, year_max)) +fwrite(tile_counts, tile_counts_file) +message("Wrote tile->parcel counts: ", tile_counts_file) diff --git a/modules/data.remote/inst/ccmmf/hls/combine_mslsp_tilepieces.R b/modules/data.remote/inst/ccmmf/hls/combine_mslsp_tilepieces.R new file mode 100644 index 00000000000..90ab1fb9b3b --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/combine_mslsp_tilepieces.R @@ -0,0 +1,69 @@ +#!/usr/bin/env Rscript +# Combine per-tile MSLSP tilepieces into one annual Parquet for a year. +# Aggregates duplicate parcels at tile boundaries with weighted means and QA modes. +# +# Main inputs: prep from mslsp_prep_static_tilewise(year); tilepieces under tilepieces_year=. +# Main outputs: mslsp_year=Y.parquet in the year output directory. +# How to run: Rscript combine_mslsp_tilepieces.R [overwrite], or via driver combine. +# Workflow: monitoring workflow merge step for MSLSP. + +script_dir <- if (length(file_arg <- commandArgs(trailingOnly = FALSE)[grepl("^--file=", + commandArgs(trailingOnly = FALSE))])) { + dirname(sub("^--file=", "", file_arg[1])) +} else "." +source(file.path(script_dir, "extract_summary_core.R")) +source(file.path(script_dir, "tilewise_mslsp_implementation.R")) + +#### Combine function +# Read all tilepieces for a year, aggregate cross-tile duplicates, write Parquet. +# Weighted aggregation across tiles (when a parcel spans multiple tiles): +# metric mean: SUM(w_valid * metric_mean) / SUM(w_valid) +# metric sd: derived from parallel variance identity +# QA mode: weighted mode (tile with greatest coverage area wins) +# na_frac: weighted mean of per-tile na_frac values +mslsp_combine <- function(prep, time_key, overwrite = FALSE, verbose = TRUE) { + year <- prep$year + + tilepieces_dir <- file.path(prep$out_dir, sprintf("tilepieces_year=%d", year)) + out_path <- file.path(prep$out_dir, sprintf("mslsp_year=%d.parquet", year)) + + if (file.exists(out_path) && !overwrite) { + if (verbose) message("[combine] skip (exists): ", out_path) + return(invisible(out_path)) + } + + tile_files <- list.files(tilepieces_dir, "^tile=.*\\.csv\\.gz$", full.names = TRUE) + if (length(tile_files) == 0) stop("No tilepieces found in: ", tilepieces_dir) + if (verbose) message("[combine] ", length(tile_files), " tilepieces -> ", out_path) + + dt <- rbindlist(lapply(tile_files, function(f) { + tryCatch(fread(f, showProgress = FALSE), error = function(e) NULL) + }), fill = TRUE, use.names = TRUE) + + if (nrow(dt) == 0) { + if (verbose) message("[combine] no rows - writing empty parquet") + arrow::write_parquet(dt, out_path) + return(invisible(out_path)) + } + + agg <- mslsp_aggregate_tilepieces(dt, year) + arrow::write_parquet(agg, out_path) + if (verbose) message("[combine] wrote ", nrow(agg), " rows") + invisible(out_path) +} + +#### CLI + +if (sys.nframe() == 0) { + args <- commandArgs(trailingOnly = TRUE) + is_ow <- function(x) tolower(x) %in% c("true", "t", "yes", "y", "overwrite") + overwrite_flag <- any(sapply(args, is_ow)) + + if (length(args) < 1) stop( + "Usage: Rscript combine_mslsp_tilepieces.R [overwrite]" + ) + + year_arg <- as.integer(args[1]) + prep <- mslsp_prep_static_tilewise(year_arg) + mslsp_combine(prep, 1L, overwrite = overwrite_flag) +} diff --git a/modules/data.remote/inst/ccmmf/hls/combine_ndti_tilepieces.R b/modules/data.remote/inst/ccmmf/hls/combine_ndti_tilepieces.R new file mode 100644 index 00000000000..93a23753303 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/combine_ndti_tilepieces.R @@ -0,0 +1,118 @@ +#!/usr/bin/env Rscript +# Combine per-tile NDTI tilepieces into one monthly Parquet for a year and month. +# Aggregates duplicate (parcel_id, date) rows with weighted mean and variance rules. +# +# Main inputs: prep from ndti_prep_static_tilewise(year); tilepieces for that month. +# Main outputs: ndti_year=Y_month=MM.parquet under the year directory. +# How to run: Rscript combine_ndti_tilepieces.R [overwrite], or via driver. +# Workflow: monitoring workflow merge step for NDTI. + +script_dir <- if (length(file_arg <- commandArgs(trailingOnly = FALSE)[grepl("^--file=", commandArgs(trailingOnly = FALSE))])) { + dirname(sub("^--file=", "", file_arg[1])) +} else "." +source(file.path(script_dir, "tilewise_ndti_implementation.R")) + +#### Combine function +# Read all tilepieces for one month, aggregate cross-tile duplicates, write parquet. +# Weighted aggregation across tiles: +# mean: sum(w * x) / sum(w) +# sd: via parallel variance identity: Var = E[X^2] - E[X]^2 +# na_frac: recover w_total = w_valid / (1 - na_frac) per tile row, then +# re-derive na_frac = 1 - sum(w_valid) / sum(w_total) across tiles +ndti_combine <- function(prep, month, overwrite = FALSE, verbose = TRUE) { + year <- prep$year + month <- as.integer(month) + + out_path <- path_monthly_output(prep$out_dir, year, month) + if (file.exists(out_path) && !overwrite) { + if (verbose) message("[combine] skip (exists): ", out_path) + return(invisible(out_path)) + } + + tilepieces_dir <- path_tilepieces(prep$out_dir, year, month) + tile_files <- list.files(tilepieces_dir, "^tile=.*\\.csv\\.gz$", full.names = TRUE) + if (length(tile_files) == 0) stop("No tilepieces found in: ", tilepieces_dir) + if (verbose) message("[combine] ", length(tile_files), " tilepieces -> ", out_path) + + col_types <- list( + character = "parcel_id", + integer = "n_valid", + double = c("ndti_mean", "ndti_sd", "w_valid", "sum_w2", "na_frac") + # date and parcel_id read as character; date is cast after rbindlist + ) + + # Read all tiles; fread handles .gz natively + all_tiles <- rbindlist( + lapply(tile_files, function(f) { + dt <- tryCatch(fread(f, colClasses = col_types), error = function(e) NULL) + if (is.null(dt) || nrow(dt) == 0) return(NULL) + dt + }), + fill = TRUE + ) + + all_tiles <- all_tiles[!is.na(ndti_mean)] + all_tiles[, date := as.Date(date)] + + if (nrow(all_tiles) == 0) { + if (verbose) message("[combine] no non-null rows - writing empty parquet") + result <- data.table( + parcel_id = character(), year = integer(), + date = as.Date(integer(0), origin = "1970-01-01"), + ndti_mean = double(), ndti_sd = double(), + n_valid = integer(), w_valid = double(), + sum_w2 = double(), na_frac = double() + ) + arrow::write_parquet(result, out_path) + return(invisible(out_path)) + } + + # Recover per-row w_total for na_frac re-derivation across tiles + all_tiles[, w_total := fifelse(!is.na(na_frac) & na_frac < 1, + w_valid / (1 - na_frac), 0)] + + result <- all_tiles[, { + sw <- sum(w_valid) + mu <- sum(w_valid * ndti_mean) / sw + wt <- sum(w_total) + .( + ndti_mean = mu, + # parallel variance: E[X^2] - E[X]^2 + ndti_sd = sqrt(pmax(0, + sum(w_valid * (ndti_sd^2 + ndti_mean^2)) / sw - mu^2 + )), + n_valid = sum(as.integer(n_valid)), + w_valid = sw, + sum_w2 = sum(sum_w2), + na_frac = fifelse(wt > 0, 1 - sw / wt, NA_real_) + ) + }, by = .(parcel_id, date)] + + setorder(result, parcel_id, date) + result[, year := year] + setcolorder(result, c("parcel_id", "year", "date", + "ndti_mean", "ndti_sd", "n_valid", "w_valid", "sum_w2", "na_frac")) + + arrow::write_parquet(result, out_path) + if (verbose) message("[combine] wrote ", nrow(result), " rows") + invisible(out_path) +} + +#### CLI + +if (sys.nframe() == 0) { + args <- commandArgs(trailingOnly = TRUE) + is_ow <- function(x) tolower(x) %in% c("true", "t", "yes", "y", "overwrite") + overwrite_flag <- any(sapply(args, is_ow)) + + if (length(args) < 2) stop( + "Usage: Rscript combine_ndti_tilepieces.R [overwrite]" + ) + + year_arg <- as.integer(args[1]) + month_arg <- suppressWarnings(as.integer(args[2])) + if (is.na(month_arg) || month_arg < 1L || month_arg > 12L) stop("Month must be 1-12") + + prep <- ndti_prep_static_tilewise(year_arg) + ndti_combine(prep, month_arg, overwrite = overwrite_flag) +} diff --git a/modules/data.remote/inst/ccmmf/hls/extract_summary_core.R b/modules/data.remote/inst/ccmmf/hls/extract_summary_core.R new file mode 100644 index 00000000000..4be6294ae6a --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/extract_summary_core.R @@ -0,0 +1,95 @@ +# Weighted summary helpers for HLS polygon extraction (MSLSP-style products). +# Computes weighted mean, sd, QA mode, coverage weights, and na_frac for parcel summaries. +# +# Main inputs: data.table from raster extraction with ID, weight, layer columns. +# Main outputs: summarized data.table rows per parcel_id (or id_col). +# How to run: sourced by tilewise_mslsp_implementation.R; not a standalone CLI. +# Workflow: shared by MSLSP tile extraction and combine steps. + +suppressPackageStartupMessages(library(data.table)) + +#### Weighted statistics +# Returns mean, sd, n_valid, w_valid, sum_w2, na_frac for SE and quality flags. +weighted_stats <- function(x, w) { + ok <- !is.na(x) & !is.na(w) & w > 0 + if (!any(ok)) { + return(list(mean = NA_real_, sd = NA_real_, n_valid = 0L, + w_valid = 0, sum_w2 = 0, na_frac = NA_real_)) + } + x2 <- as.numeric(x[ok]) + w2 <- as.numeric(w[ok]) + wsum <- sum(w2) + mu <- sum(w2 * x2) / wsum + sd <- sqrt(sum(w2 * (x2 - mu)^2) / wsum) + + w_all <- w[!is.na(w) & w > 0] + w_na <- w[is.na(x) & !is.na(w) & w > 0] + na_frac <- if (length(w_all) == 0) NA_real_ else sum(w_na, na.rm = TRUE) / sum(w_all, na.rm = TRUE) + + list( + mean = as.numeric(mu), + sd = as.numeric(sd), + n_valid = as.integer(length(x2)), + w_valid = as.numeric(wsum), + sum_w2 = as.numeric(sum(w2^2)), + na_frac = as.numeric(na_frac) + ) +} + +weighted_mode_stats <- function(x, w) { + ok <- !is.na(x) & !is.na(w) & w > 0 + if (!any(ok)) return(list(mode = NA_real_, mode_frac = NA_real_)) + dt <- data.table(x = x[ok], w = w[ok]) + agg <- dt[, .(w_sum = sum(w)), by = x][order(-w_sum)] + mode_val <- agg$x[1] + mode_frac <- agg$w_sum[1] / sum(agg$w_sum) + list(mode = as.numeric(mode_val), mode_frac = as.numeric(mode_frac)) +} + +# One row per polygon: coverage stats, weighted mean/sd per continuous layer, QA mode per QA layer. +summarize_extract <- function(ex_dt, id_vec, layer_names, qa_names = character(), id_col = "parcel_id") { + stopifnot("ID" %in% names(ex_dt), "weight" %in% names(ex_dt)) + stopifnot(length(id_vec) >= max(ex_dt$ID, na.rm = TRUE)) + + dt <- as.data.table(ex_dt) + dt[, (id_col) := id_vec[ID]] + wcol <- "weight" + + cont_layers <- setdiff(layer_names, qa_names) + if (length(cont_layers) == 0) stop("No continuous layers to compute coverage stats.") + coverage_layer <- cont_layers[1] + + sup <- dt[, { + s <- weighted_stats(get(coverage_layer), get(wcol)) + list(n_valid = as.integer(s$n_valid), w_valid = as.numeric(s$w_valid), + sum_w2 = as.numeric(s$sum_w2), na_frac = as.numeric(s$na_frac)) + }, by = c(id_col)] + + cont_out <- dt[, { + out <- list() + for (nm in cont_layers) { + s <- weighted_stats(get(nm), get(wcol)) + out[[paste0(nm, "_mean")]] <- as.numeric(s$mean) + out[[paste0(nm, "_sd")]] <- as.numeric(s$sd) + } + out + }, by = c(id_col)] + + qa_out <- NULL + if (length(qa_names) > 0) { + qa_out <- dt[, { + out <- list() + for (q in qa_names) { + m <- weighted_mode_stats(get(q), get(wcol)) + out[[paste0(q, "_mode")]] <- as.numeric(m$mode) + out[[paste0(q, "_mode_frac")]] <- as.numeric(m$mode_frac) + } + out + }, by = c(id_col)] + } + + res <- merge(sup, cont_out, by = id_col, all = TRUE) + if (!is.null(qa_out)) res <- merge(res, qa_out, by = id_col, all = TRUE) + res[] +} + diff --git a/modules/data.remote/inst/ccmmf/hls/tilewise_core.R b/modules/data.remote/inst/ccmmf/hls/tilewise_core.R new file mode 100644 index 00000000000..4fdccac76c8 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/tilewise_core.R @@ -0,0 +1,411 @@ +# Shared tilewise orchestration for HLS-derived products (MSLSP, NDTI). +# Implements prep-static, per-tile extract to tilepieces, combine to Parquet, and optional merge. +# +# Main inputs: product object (see Product interface at bottom of file). +# Main outputs: tilepiece CSV.gz dirs and product-specific Parquet paths. +# How to run: sourced from tilewise_*_driver.R; not a standalone CLI. +# Workflow: core of monitoring workflow stages S2, S5 for raster summaries. + +suppressPackageStartupMessages({ + library(sf) + library(data.table) + library(stringr) +}) + +#### Logging +# Timestamped log to console and optional file. Driver calls tilewise_log_init() +# to enable file sink; tw_log() works console-only if not set. + +.tw_state <- new.env(parent = emptyenv()) +.tw_state$log_con <- NULL +.tw_state$log_path <- NULL + +tilewise_log_init <- function(log_path) { + dir.create(dirname(log_path), recursive = TRUE, showWarnings = FALSE) + .tw_state$log_con <- file(log_path, open = "at") + .tw_state$log_path <- log_path + do.call("on.exit", list(quote(tilewise_log_close()), add = TRUE), envir = parent.frame()) + tw_log("INFO", "=== run started pid=", Sys.getpid(), " ===") + message("[log] writing to: ", log_path) + invisible(log_path) +} + +tilewise_log_close <- function() { + if (!is.null(.tw_state$log_con)) { + tw_log("INFO", "=== run finished ===") + close(.tw_state$log_con) + .tw_state$log_con <- NULL + .tw_state$log_path <- NULL + } +} + +tw_log <- function(level = "INFO", ...) { + msg <- paste0(c(...), collapse = "") + ts <- format(Sys.time(), "%Y-%m-%d %H:%M:%S") + line <- sprintf("%s [%-5s] %s", ts, level, msg) + message(line) + if (!is.null(.tw_state$log_con)) { + writeLines(line, .tw_state$log_con) + flush(.tw_state$log_con) + } +} + +#### Generic helpers + +parcel_id_to_bucket <- function(parcel_ids, n_buckets) { + x <- suppressWarnings(as.integer(parcel_ids)) + x[is.na(x)] <- 0L + (abs(x) %% as.integer(n_buckets)) + 1L +} + +append_to_csv <- function(data, filepath) { + if (file.exists(filepath)) { + fwrite(data, filepath, append = TRUE, col.names = FALSE) + } else { + fwrite(data, filepath) + } +} + +concat_gzipped_csvs <- function(input_files, output_file) { + input_files <- sort(input_files[file.exists(input_files)]) + if (length(input_files) == 0) stop("No input files to combine") + if (length(input_files) == 1) { + cmd <- sprintf("zcat %s | gzip -c > %s", shQuote(input_files[1]), shQuote(output_file)) + } else { + rest <- paste(shQuote(input_files[-1]), collapse = " ") + cmd <- sprintf( + "(zcat %s; for f in %s; do zcat \"$f\" | sed '1d'; done) | gzip -c > %s", + shQuote(input_files[1]), rest, shQuote(output_file) + ) + } + if (system(cmd) != 0L) stop("Concatenation failed: ", output_file) + invisible(output_file) +} + +# Build a list mapping tile_id -> row indices in prep$polys. +# prep$polys must have a tile_ids list column (each element is a character vector +# of tile IDs the parcel overlaps). +build_tile_to_parcel_indices <- function(polys) { + tile_ids_list <- polys$tile_ids + rows <- rep.int(seq_len(nrow(polys)), lengths(tile_ids_list)) + tiles <- unlist(tile_ids_list, use.names = FALSE) + ok <- !is.na(tiles) & nzchar(tiles) + split(rows[ok], tiles[ok]) +} + +sanitize_tile_id <- function(id) gsub("[^0-9A-Za-z]+", "_", id) + +#### tilewise_run (extract per tile and scene) + +tilewise_run <- function(prep, time_key, product, overwrite = FALSE, verbose = TRUE) { + year <- prep$year + time_key <- as.integer(time_key) + + scene_index_dt <- product$scene_index(year, time_key, verbose = verbose) + if (is.null(scene_index_dt) || nrow(scene_index_dt) == 0) { + if (verbose) tw_log("WARN", "no scenes for year=", year, " time_key=", time_key) + return(invisible(NULL)) + } + tile_col <- product$scene_index_tile_col + if (!tile_col %in% names(scene_index_dt)) stop("scene_index must have column: ", tile_col) + + tile_to_indices <- build_tile_to_parcel_indices(prep$polys) + tile_ids <- sort(names(tile_to_indices)) + + # Smoke-test: restrict to one tile via env var. + # TILEWISE_ONE_TILE=1 -> first tile + # TILEWISE_ONE_TILE=10SFF -> specific tile + one_tile <- Sys.getenv("TILEWISE_ONE_TILE", "") + if (nzchar(one_tile) && length(tile_ids) > 0) { + one_tile_lower <- tolower(one_tile) + if (one_tile_lower %in% c("1", "true", "yes", "y", "first")) { + tile_ids <- tile_ids[1] + } else if (one_tile %in% tile_ids) { + tile_ids <- one_tile + } else { + stop("TILEWISE_ONE_TILE is set but tile not found in prep: ", one_tile) + } + if (verbose) tw_log("INFO", "TILEWISE_ONE_TILE -> ", tile_ids) + } + + tilepieces_dir <- product$path_tilepieces(prep$out_dir, year, time_key) + dir.create(tilepieces_dir, recursive = TRUE, showWarnings = FALSE) + + relevant_scene_rows <- scene_index_dt[get(tile_col) %in% tile_ids] + if (verbose) { + tw_log("INFO", + "year=", year, " time_key=", time_key, + " tiles=", length(tile_ids), + " relevant_scenes=", nrow(relevant_scene_rows) + ) + } + + # Per-tile timing: one row per processed tile, written to tilepieces_dir at the end. + timing_rows <- list() + + write_empty_tilepiece <- function(tmp, gz) { + empty <- product$empty_tilepiece_schema() + fwrite(empty, tmp) + gz_status <- system2("gzip", c("-f", tmp)) + if (gz_status != 0L) { + tw_log("WARN", "gzip failed for empty tilepiece ", tmp, "; keeping .csv") + } else { + renamed <- file.rename(paste0(tmp, ".gz"), gz) + if (!isTRUE(renamed)) tw_log("WARN", "rename failed for empty tilepiece ", tmp) + } + } + + for (tile_id in tile_ids) { + parcel_indices <- tile_to_indices[[tile_id]] + if (length(parcel_indices) == 0) next + + tile_safe <- sanitize_tile_id(tile_id) + output_gz <- file.path(tilepieces_dir, sprintf("tile=%s.csv.gz", tile_safe)) + output_csv <- file.path(tilepieces_dir, sprintf("tile=%s.csv", tile_safe)) + if ((file.exists(output_gz) || file.exists(output_csv)) && !overwrite) next + + output_tmp <- output_csv + if (file.exists(output_tmp)) file.remove(output_tmp) + + tile_start <- proc.time()[["elapsed"]] + start_ts <- format(Sys.time(), "%Y-%m-%d %H:%M:%S") + n_rows <- 0L + + # Scenes for this tile - fetched before geometry so prepare_tile can use CRS. + # Use cur_tile (not tile_id) so data.table doesn't match the column of same name. + cur_tile <- tile_id + scenes_this_tile <- scene_index_dt[get(tile_col) == cur_tile] + + if (nrow(scenes_this_tile) == 0) { + if (verbose) tw_log("INFO", "tile=", tile_id, " 0 scenes - writing empty tilepiece") + write_empty_tilepiece(output_tmp, output_gz) + timing_rows[[length(timing_rows) + 1]] <- list( + tile_id = tile_id, n_parcels = length(parcel_indices), n_scenes = 0L, + n_rows_written = 0L, status = "empty_no_scenes", + elapsed_sec = round(proc.time()[["elapsed"]] - tile_start, 1), + start_ts = start_ts, end_ts = format(Sys.time(), "%Y-%m-%d %H:%M:%S") + ) + next + } + + # Per-tile geometry context. + # If the product defines prepare_tile, call it (e.g. NDTI loads geometry + # lazily from the GPKG for just this tile's parcels, already reprojected). + # Otherwise fall back to slicing prep$polys (MSLSP loads all geometry upfront). + if (!is.null(product$prepare_tile)) { + parcel_ids_tile <- prep$polys$parcel_id[parcel_indices] + tile_parcels <- product$prepare_tile(prep, tile_id, parcel_ids_tile, scenes_this_tile) + } else { + tile_parcels <- prep$polys[parcel_indices, ] + tile_parcels$tile_ids <- replicate(nrow(tile_parcels), tile_id, simplify = FALSE) + } + + if (is.null(tile_parcels) || nrow(tile_parcels) == 0) { + if (verbose) tw_log("WARN", "tile=", tile_id, " no parcel geometry - writing empty tilepiece") + write_empty_tilepiece(output_tmp, output_gz) + timing_rows[[length(timing_rows) + 1]] <- list( + tile_id = tile_id, n_parcels = length(parcel_indices), n_scenes = nrow(scenes_this_tile), + n_rows_written = 0L, status = "empty_no_geometry", + elapsed_sec = round(proc.time()[["elapsed"]] - tile_start, 1), + start_ts = start_ts, end_ts = format(Sys.time(), "%Y-%m-%d %H:%M:%S") + ) + next + } + + if (verbose) tw_log("INFO", + "tile=", tile_id, + " parcels=", nrow(tile_parcels), + " scenes=", nrow(scenes_this_tile)) + + tile_status <- "ok" + for (scene_idx in seq_len(nrow(scenes_this_tile))) { + scene_row <- scenes_this_tile[scene_idx] + result <- tryCatch( + product$process_scene(prep, scene_row, tile_parcels, tile_id), + error = function(e) { + tw_log("ERROR", "tile=", tile_id, " scene=", scene_idx, " ", conditionMessage(e)) + tile_status <<- "error" + NULL + } + ) + if (!is.null(result) && nrow(result) > 0) { + result[, tile_id := tile_id] + append_to_csv(result, output_tmp) + n_rows <- n_rows + nrow(result) + } + } + + if (file.exists(output_tmp)) { + gz_status <- system2("gzip", c("-f", output_tmp)) + if (gz_status != 0L) { + tw_log("WARN", "gzip failed for ", output_tmp, "; keeping .csv and continuing") + } else { + renamed <- file.rename(paste0(output_tmp, ".gz"), output_gz) + if (!isTRUE(renamed)) { + tw_log("WARN", "rename to canonical .csv.gz failed for ", output_tmp) + } + } + } else { + write_empty_tilepiece(output_tmp, output_gz) + } + + elapsed <- round(proc.time()[["elapsed"]] - tile_start, 1) + if (verbose) tw_log("TIME", "tile=", tile_id, " done rows=", n_rows, " elapsed=", elapsed, "s") + + timing_rows[[length(timing_rows) + 1]] <- list( + tile_id = tile_id, n_parcels = nrow(tile_parcels), n_scenes = nrow(scenes_this_tile), + n_rows_written = n_rows, status = tile_status, + elapsed_sec = elapsed, + start_ts = start_ts, end_ts = format(Sys.time(), "%Y-%m-%d %H:%M:%S") + ) + } + + # Write per-tile timing summary so slow/failed tiles are easy to spot. + if (length(timing_rows) > 0) { + timing_dt <- rbindlist(timing_rows) + timing_dt[, `:=`(year = year, time_key = time_key)] + timing_path <- file.path(tilepieces_dir, "_tile_timing.csv") + fwrite(timing_dt, timing_path) + if (verbose) tw_log("INFO", "tile timing written to: ", timing_path) + } + + invisible(tilepieces_dir) +} + +#### tilewise_combine (aggregate tilepieces) + +tilewise_combine <- function(prep, time_key, product, n_buckets = 256L, + overwrite = FALSE, verbose = TRUE) { + year <- prep$year + time_key <- as.integer(time_key) + + # Products that define combine() bypass the generic bucket + # scatter/aggregate path entirely. + if (!is.null(product$combine)) { + return(product$combine(prep, time_key, overwrite = overwrite, verbose = verbose)) + } + + #### Generic bucket combine (products without custom combine) + tilepieces_dir <- product$path_tilepieces(prep$out_dir, year, time_key) + parts_dir <- product$path_combine_parts(prep$out_dir, year, time_key) + tile_files <- list.files(tilepieces_dir, "^tile=.*\\.csv(\\.gz)?$", full.names = TRUE) + if (length(tile_files) == 0) stop("No tilepieces in: ", tilepieces_dir) + + dir.create(parts_dir, recursive = TRUE, showWarnings = FALSE) + done_marker <- file.path(parts_dir, "DONE.marker") + if (file.exists(done_marker) && !overwrite) { + if (verbose) tw_log("INFO", "[combine] already done: ", parts_dir) + return(invisible(parts_dir)) + } + + tmp_dir <- file.path(parts_dir, "tmp_buckets") + dir.create(tmp_dir, recursive = TRUE, showWarnings = FALSE) + bucket_paths <- file.path(tmp_dir, sprintf("bucket=%03d.csv", seq_len(n_buckets))) + if (overwrite) for (p in bucket_paths) if (file.exists(p)) file.remove(p) + + # Pass 1: scatter rows into buckets so all rows for one parcel land together. + if (verbose) tw_log("INFO", "[combine] pass1: scatter into ", n_buckets, " buckets") + for (f in tile_files) { + dt <- try(fread(f, showProgress = FALSE), silent = TRUE) + if (inherits(dt, "try-error") || nrow(dt) == 0) next + if (!product$validate_tilepiece(dt)) next + dt_scatter <- product$prepare_for_scatter(dt, n_buckets) + by_bucket <- split(dt_scatter, dt_scatter$bucket) + for (bn in names(by_bucket)) { + append_to_csv(product$scatter_cols(by_bucket[[bn]]), bucket_paths[as.integer(bn)]) + } + } + + # Pass 2: aggregate each bucket independently (bounded memory). + if (verbose) tw_log("INFO", "[combine] pass2: aggregate buckets") + empty_part <- product$empty_part_schema(year) + for (b in seq_len(n_buckets)) { + part_file <- file.path(parts_dir, sprintf("part=%03d.csv.gz", b)) + if (!file.exists(bucket_paths[b])) { + fwrite(empty_part, sub("\\.gz$", "", part_file)) + system2("gzip", c("-f", sub("\\.gz$", "", part_file))) + next + } + dt <- fread(bucket_paths[b], showProgress = FALSE) + if (nrow(dt) == 0) { + fwrite(empty_part, sub("\\.gz$", "", part_file)) + system2("gzip", c("-f", sub("\\.gz$", "", part_file))) + next + } + agg <- product$aggregate_bucket(dt, year) + fwrite(agg, sub("\\.gz$", "", part_file)) + system2("gzip", c("-f", sub("\\.gz$", "", part_file))) + } + writeLines(c( + sprintf("year=%d", year), + sprintf("time_key=%d", time_key), + sprintf("n_buckets=%d", n_buckets) + ), done_marker) + unlink(tmp_dir, recursive = TRUE, force = TRUE) + invisible(parts_dir) +} + +#### tilewise_merge (concatenate bucket parts) + +tilewise_merge <- function(prep, time_key, product, overwrite = FALSE, verbose = TRUE) { + year <- prep$year + time_key <- as.integer(time_key) + + # Products that define combine() already wrote the final output there. + if (!is.null(product$combine)) { + out_path <- product$path_final_output(prep$out_dir, year, time_key) + if (file.exists(out_path)) { + if (verbose) message("[merge] skipped - combine already produced: ", out_path) + return(invisible(out_path)) + } + # Final output missing - run combine now. + if (verbose) message("[merge] final output missing, running combine") + return(product$combine(prep, time_key, overwrite = overwrite, verbose = verbose)) + } + + # Concatenate bucket parts into final output. + parts_dir <- product$path_combine_parts(prep$out_dir, year, time_key) + part_files <- sort(list.files(parts_dir, "^part=[0-9]+\\.csv\\.gz$", full.names = TRUE)) + if (length(part_files) == 0) stop("No part files in: ", parts_dir) + output_path <- product$path_final_output(prep$out_dir, year, time_key) + if (file.exists(output_path) && !overwrite) { + if (verbose) tw_log("INFO", "[merge] skip (exists): ", output_path) + return(invisible(output_path)) + } + if (verbose) tw_log("INFO", "[merge] concatenating ", length(part_files), " parts -> ", output_path) + concat_gzipped_csvs(part_files, output_path) + invisible(output_path) +} + +tilewise_prep_static <- function(year, product, ...) { + product$prep_static(year, ...) +} + +#### Product interface +# Required: +# product$prep_static(year, ...) +# -> list(year, polys, out_dir) +# polys must have columns: parcel_id, tile_ids (list of tile ID strings) +# polys may be an sf object (MSLSP) or a plain data.table (NDTI) +# product$scene_index(year, time_key, verbose) -> data.table with tile column +# product$scene_index_tile_col -> column name in scene_index result +# product$process_scene(prep, scene_row, tile_parcels, tile_id) -> data.table or NULL +# product$path_tilepieces(out_dir, year, time_key) -> directory path +# product$path_final_output(out_dir, year, time_key) -> file path +# product$empty_tilepiece_schema() -> zero-row data.table with correct columns +# +# Optional: +# product$prepare_tile(prep, tile_id, parcel_ids, scenes_this_tile) +# -> tile-specific context (e.g. sf geometry) passed to process_scene +# when NULL, framework slices prep$polys[parcel_indices, ] +# product$combine(prep, time_key, overwrite, verbose) +# -> full combine: read tilepieces, aggregate, write final output +# when NULL, framework uses generic bucket scatter/aggregate/merge +# +# Required only when combine is NULL (generic bucket path): +# product$path_combine_parts(out_dir, year, time_key) -> parts directory +# product$validate_tilepiece(dt) -> logical +# product$prepare_for_scatter(dt, n_buckets) -> dt with bucket column +# product$scatter_cols(dt) -> dt without bucket column +# product$aggregate_bucket(dt, year) -> aggregated dt +# product$empty_part_schema(year) -> zero-row data.table diff --git a/modules/data.remote/inst/ccmmf/hls/tilewise_mslsp_driver.R b/modules/data.remote/inst/ccmmf/hls/tilewise_mslsp_driver.R new file mode 100644 index 00000000000..64bd9c11a09 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/tilewise_mslsp_driver.R @@ -0,0 +1,72 @@ +#!/usr/bin/env Rscript +# MSLSP tilewise driver: prep-static, extract, combine, or all for one year. +# Uses tilewise_core plus MSLSP implementation and combine_mslsp_tilepieces.R. +# +# Main inputs: year, command prep-static|extract|combine|all, optional overwrite. +# Main outputs: tilepiece CSV.gz under phenology/raw_mslsp_v4.1, then mslsp_year=Y.parquet. +# How to run: Rscript tilewise_mslsp_driver.R [prep-static|extract|combine|all] [overwrite] +# Workflow: monitoring workflow stage S1-S5 for MSLSP (see monitoring_workflow_flowchart.mmd). + +script_dir <- if (length(file_arg <- commandArgs(trailingOnly = FALSE)[grepl("^--file=", + commandArgs(trailingOnly = FALSE))])) { + dirname(sub("^--file=", "", file_arg[1])) +} else "." + +# Arrow registers filesystem handlers lazily. Touch parquet before sf/terra/GDAL load. +suppressPackageStartupMessages(library(arrow)) +local({ + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp)) + arrow::write_parquet(data.frame(x = 1L), tmp) + arrow::read_parquet(tmp) +}) + +source(file.path(script_dir, "extract_summary_core.R")) +source(file.path(script_dir, "tilewise_core.R")) +source(file.path(script_dir, "tilewise_mslsp_implementation.R")) +source(file.path(script_dir, "combine_mslsp_tilepieces.R")) + +product <- product_mslsp() + +args <- commandArgs(trailingOnly = TRUE) +is_overwrite <- function(x) tolower(x) %in% c("true", "t", "yes", "y", "overwrite") +overwrite_flag <- any(sapply(args, is_overwrite)) +args <- args[!sapply(args, is_overwrite)] + +if (length(args) < 1) { + stop("Usage: Rscript tilewise_mslsp_driver.R [prep-static|extract|combine|all] [overwrite]") +} + +year_arg <- as.integer(args[1]) +command <- if (length(args) >= 2 && nzchar(args[2])) tolower(args[2]) else "all" +mslsp_time_key <- 1L + +ts <- format(Sys.time(), "%Y%m%d_%H%M%S") +log_dir <- file.path(mslsp_out_root, sprintf("year=%d", year_arg), "logs") +tilewise_log_init(file.path(log_dir, + sprintf("mslsp_%s_%s.log", command, ts))) + +tw_log("INFO", "MSLSP driver year=", year_arg, " command=", command, + " overwrite=", overwrite_flag, + " pid=", Sys.getpid(), + " SGE_TASK_ID=", Sys.getenv("SGE_TASK_ID", "")) + +if (command == "prep-static") { + tilewise_prep_static(year_arg, product) + +} else if (command == "extract") { + prep <- tilewise_prep_static(year_arg, product) + tilewise_run(prep, mslsp_time_key, product, overwrite = overwrite_flag) + +} else if (command == "combine") { + prep <- tilewise_prep_static(year_arg, product) + tilewise_combine(prep, mslsp_time_key, product, overwrite = overwrite_flag) + +} else if (command == "all") { + prep <- tilewise_prep_static(year_arg, product) + tilewise_run(prep, mslsp_time_key, product, overwrite = overwrite_flag) + tilewise_combine(prep, mslsp_time_key, product, overwrite = overwrite_flag) + +} else { + stop("Unknown command: ", command, ". Use: prep-static | extract | combine | all") +} diff --git a/modules/data.remote/inst/ccmmf/hls/tilewise_mslsp_implementation.R b/modules/data.remote/inst/ccmmf/hls/tilewise_mslsp_implementation.R new file mode 100644 index 00000000000..87e22c67b98 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/tilewise_mslsp_implementation.R @@ -0,0 +1,525 @@ +# MSLSP product definition for the tilewise framework (prep, scenes, extract, combine). +# Sourced after extract_summary_core.R and combine_mslsp_tilepieces.R by the MSLSP driver. +# +# Main inputs: env paths for LandIQ, MSLSP NetCDF roots, parcel-tile map RDS. +# Main outputs: tilepieces and annual Parquet under phenology/raw_mslsp_v4.1. +# How to run: sourced only; use tilewise_mslsp_driver.R for CLI. +# Workflow: monitoring workflow MSLSP branch. + +#### Configuration +mslsp_landiq_v4 <- Sys.getenv("CCMMF_LANDIQ_V4", "/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1") +mslsp_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +mslsp_parcels_gpkg <- file.path(mslsp_landiq_v4, "parcels-consolidated.gpkg") +mslsp_crops_parq <- file.path(mslsp_landiq_v4, "crops_all_years.parq") +mslsp_cropcode_lookup <- file.path(mslsp_management, "LandIQ_cropCode_lookup_table.csv") +mslsp_legacy_dir <- Sys.getenv("mslsp_legacy_dir", "/projectnb/dietzelab/ccmmf/HLS_data") +mslsp_new_base <- Sys.getenv("mslsp_new_base", "/projectnb/dietzelab/ccmmf/data_phen/output") +mslsp_out_root <- file.path(mslsp_management, "phenology/raw_mslsp_v4.1") +mslsp_parcel_tilemap <- Sys.getenv( + "mslsp_parcel_tilemap", + file.path(mslsp_management, "hls_parcel_tile_map_v4.1_years=2016-2024.rds") +) + +mslsp_metrics <- c("OGI", "50PCGI", "OGMx", "Peak", "OGD", "50PCGD", "OGMn", + "EVImax", "EVIamp", "EVIarea") +mslsp_qa_cat <- c("gupQA", "gdownQA") +mslsp_year_fields_mode <- c("NumCycles") +mslsp_year_fields_mean <- c("numObs") + +mslsp_state <- new.env(parent = emptyenv()) + +#### Scene index helpers + +mslsp_nc_path <- function(tile_id, year) { + yr <- as.integer(year) + candidates <- c( + file.path(mslsp_legacy_dir, paste0("MSLSP_", tile_id, "_", yr, ".nc")), + file.path(mslsp_new_base, tile_id, "phenoMetrics", paste0("MSLSP_", tile_id, "_", yr, ".nc")) + ) + hit <- candidates[file.exists(candidates)] + if (length(hit) == 0) NA_character_ else hit[1] +} + +mslsp_load_parcel_tilemap <- function(year) { + if (!file.exists(mslsp_parcel_tilemap)) return(NULL) + dt <- as.data.table(readRDS(mslsp_parcel_tilemap)) + if (!all(c("parcel_id", "tileIDs", "year") %in% names(dt))) return(NULL) + yr <- as.integer(year) + dt <- dt[as.integer(year) == yr, .(parcel_id = as.character(parcel_id), tileIDs)] + setkey(dt, parcel_id) + dt +} + +mslsp_scene_index <- function(year, time_key, verbose = TRUE) { + yr <- as.integer(year) + key <- as.character(yr) + tiles <- if (exists(key, envir = mslsp_state, inherits = FALSE)) { + get(key, envir = mslsp_state, inherits = FALSE) + } else { + tilemap <- mslsp_load_parcel_tilemap(yr) + if (is.null(tilemap) || nrow(tilemap) == 0) character() else { + sort(unique(unlist(strsplit(tilemap$tileIDs, ",", fixed = TRUE), use.names = FALSE))) + } + } + if (length(tiles) == 0) return(data.table(tile_id = character(), path = character())) + paths <- vapply(tiles, mslsp_nc_path, character(1), year = yr) + ok <- !is.na(paths) & file.exists(paths) + out <- data.table(tile_id = as.character(tiles[ok]), path = paths[ok]) + if (verbose) { + missing_tiles <- as.character(tiles[!ok]) + if (length(missing_tiles) > 0) { + sample_tiles <- paste(head(sort(unique(missing_tiles)), 12), collapse = ",") + if (length(unique(missing_tiles)) > 12) sample_tiles <- paste0(sample_tiles, ",...") + msg <- paste0("[MSLSP] scene_index year=", yr, + " missing_tile_nc=", length(unique(missing_tiles)), + " sample=", sample_tiles) + if (exists("tw_log", mode = "function")) { + tw_log("WARN", msg) + } else { + message(msg) + } + } + if (nrow(out) > 0) message("[MSLSP] scene_index year=", yr, " tiles=", nrow(out)) + } + out +} + +#### Static prep +# PFT from lookup (CLASS, SUBCLASS -> PFT) so subclass-level differences (e.g. T19 vs T28 woody) are correct. +mslsp_prep_static_tilewise <- function(year, ...) { + yr <- as.integer(year) + sf::sf_use_s2(FALSE) + + lookup <- data.table::fread(mslsp_cropcode_lookup) + ag_pairs <- lookup[is_agricultural == TRUE, + .(CLASS = trimws(CLASS), SUBCLASS = as.character(SUBCLASS), PFT)] + + # Read once from the single parquet file and filter in-memory. + # This avoids Arrow dataset URI registration issues seen on SCC for some jobs. + pq_result <- as.data.table(arrow::read_parquet( + mslsp_crops_parq, + col_select = c("parcel_id", "CLASS", "SUBCLASS", "year") + )) + ca_crop <- pq_result[as.integer(year) == yr, .(parcel_id, CLASS, SUBCLASS)] + ca_crop[, CLASS := trimws(as.character(CLASS))] + ca_crop[, SUBCLASS := as.character(SUBCLASS)] + # Inner join: filters to agricultural only AND attaches PFT + ca_crop <- merge(ca_crop, ag_pairs, by = c("CLASS", "SUBCLASS")) + crop_ids <- unique(as.character(ca_crop$parcel_id)) + if (length(crop_ids) == 0) stop("No agricultural parcel_ids found for year ", yr) + + tilemap <- mslsp_load_parcel_tilemap(yr) + if (is.null(tilemap)) { + stop( + "Parcel-tile map not found for MSLSP.\n", + " Path: ", mslsp_parcel_tilemap, "\n", + " Build it: Rscript scripts/hls/build_hls_parcel_tile_map.R 2016 2024 overwrite" + ) + } + tile_sub <- tilemap[crop_ids, on = "parcel_id", nomatch = 0] + missing_map <- setdiff(crop_ids, tile_sub$parcel_id) + if (length(missing_map) > 0) { + message("[MSLSP prep] parcel_ids missing from tile map (dropped): ", length(missing_map)) + crop_ids <- intersect(crop_ids, tile_sub$parcel_id) + tile_sub <- tilemap[crop_ids, on = "parcel_id", nomatch = 0] + } + + # Load geometries from v4.1 GPKG in batches to avoid huge SQL IN clauses. + layer <- sf::st_layers(mslsp_parcels_gpkg)$name[1] + ids <- unique(tile_sub$parcel_id) + chunks <- split(ids, ceiling(seq_along(ids) / 5000L)) + geom_chunks <- lapply(chunks, function(x) { + esc <- gsub("'", "''", x, fixed = TRUE) + q <- sprintf('SELECT * FROM "%s" WHERE parcel_id IN (%s)', + layer, paste0("'", esc, "'", collapse = ",")) + sf::st_read(mslsp_parcels_gpkg, query = q, quiet = TRUE) + }) + polys <- do.call(rbind, geom_chunks) + polys$parcel_id <- as.character(polys$parcel_id) + polys <- polys[!sf::st_is_empty(sf::st_geometry(polys)), ] + + missing_geom <- setdiff(ids, polys$parcel_id) + if (length(missing_geom) > 0) { + message("[MSLSP prep] parcel_ids missing geometry (dropped): ", length(missing_geom)) + tile_sub <- tile_sub[!parcel_id %in% missing_geom] + } + tile_sub <- tile_sub[polys$parcel_id, on = "parcel_id", nomatch = 0] + + # Attach tile_ids list column so tilewise_core can build the tile->parcel index. + polys$tile_ids <- strsplit(tile_sub$tileIDs, ",", fixed = TRUE) + active_tiles <- sort(unique(unlist(polys$tile_ids, use.names = FALSE))) + mslsp_state[[as.character(yr)]] <- active_tiles + + out_dir <- file.path(mslsp_out_root, paste0("year=", yr)) + dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + + list(year = yr, polys = polys, out_dir = out_dir, active_tiles = active_tiles) +} + +#### Scene extraction +mslsp_process_scene_tilewise <- function(prep, scene_row, parcels_this_tile, tile_id) { + log_skip <- function(level = "WARN", reason, extra = "") { + msg <- paste0("[MSLSP extract] tile=", tile_id, + " reason=", reason, + if (nzchar(extra)) paste0(" ", extra) else "") + if (exists("tw_log", mode = "function")) { + tw_log(level, msg) + } else { + message(msg) + } + } + + if (!requireNamespace("exactextractr", quietly = TRUE)) { + stop("exactextractr is required for MSLSP extraction. Install with: install.packages('exactextractr')") + } + path <- scene_row$path[1] + if (!file.exists(path)) { + log_skip("WARN", "missing_scene_file", paste0("path=", path)) + return(NULL) + } + yr <- prep$year + + r_all <- try(terra::rast(path), silent = TRUE) + if (inherits(r_all, "try-error")) { + log_skip("WARN", "unreadable_scene_raster", paste0("path=", path)) + return(NULL) + } + + # Identify which layers are present in this NetCDF. + nms <- names(r_all) + qa1 <- intersect(mslsp_qa_cat, nms) + qa2 <- intersect(paste0(mslsp_qa_cat, "_2"), nms) + yr_mode <- intersect(mslsp_year_fields_mode, nms) # e.g. NumCycles + yr_mean <- intersect(mslsp_year_fields_mean, nms) # e.g. numObs + + cycle1_layers <- intersect(c(mslsp_metrics, qa1, yr_mode, yr_mean), nms) + cycle2_layers <- intersect(c(paste0(mslsp_metrics, "_2"), qa2), nms) + if (length(cycle1_layers) == 0 && length(cycle2_layers) == 0) { + log_skip("WARN", "no_expected_layers_in_scene", paste0("path=", path)) + return(NULL) + } + + # Reproject parcels to the raster CRS and crop to their bounding box. + # Both cycles share this single crop of the full-tile raster. + target_crs <- terra::crs(r_all) + parcels_tr <- sf::st_transform(parcels_this_tile, target_crs) + bbox_vect <- terra::vect(sf::st_as_sfc(sf::st_bbox(parcels_tr))) + r_crop <- try(terra::crop(r_all, bbox_vect), silent = TRUE) + if (inherits(r_crop, "try-error")) { + log_skip("WARN", "crop_extent_no_overlap", paste0("path=", path)) + return(NULL) + } + + pid_vec <- as.character(parcels_tr$parcel_id) + + # Build an exactextractr per-polygon callback for a given set of layers. + # + # exactextractr calls this once per polygon with: + # values - data.frame with one column per raster layer, one row per pixel + # cov_fracs - numeric vector of pixel coverage fractions (0-1) + # + # Arguments: + # metric_cols - layer names aggregated by weighted mean + SD + # mode_cols - layer names aggregated by weighted mode + mode_frac + # key_col - single layer used to define "valid" pixels: + # a pixel is valid if it overlaps the polygon and key_col is not NA + # col_rename - function applied to a layer name before naming the output column; + # for cycle 2 this strips "_2" so both cycles share the same schema + # (e.g. "OGI_2" becomes "OGI", output is "OGI_mean" not "OGI_2_mean") + make_summarize_poly <- function(metric_cols, mode_cols, key_col, col_rename = identity) { + function(values, cov_fracs) { + pos <- cov_fracs > 0 + ok <- pos & !is.na(values[[key_col]]) + w_total <- sum(cov_fracs[pos]) + w_valid_key <- sum(cov_fracs[ok]) + + out <- list( + n_valid = sum(ok), + w_valid = w_valid_key, + sum_w2 = sum(cov_fracs[ok]^2), # for n_eff = w_valid^2 / sum_w2 -> SE = sd / sqrt(n_eff) + na_frac = if (w_total > 0) 1 - w_valid_key / w_total else NA_real_ + ) + + for (col in metric_cols) { + s <- weighted_stats(values[[col]], cov_fracs) + base <- col_rename(col) + out[[paste0(base, "_mean")]] <- s$mean + out[[paste0(base, "_sd")]] <- s$sd + } + + for (col in mode_cols) { + m <- weighted_mode_stats(values[[col]], cov_fracs) + base <- col_rename(col) + out[[paste0(base, "_mode")]] <- m$mode + out[[paste0(base, "_mode_frac")]] <- m$mode_frac + } + + # Return a 1-row data.frame so exactextractr combines as one row per polygon + # (returning a raw list causes wrong combination: one row per list element). + # check.names=FALSE is required: layer names like "50PCGI_mean" start with a digit + # and would otherwise be silently renamed to "X50PCGI_mean" by as.data.frame(). + as.data.frame(out, stringsAsFactors = FALSE, check.names = FALSE) + } + } + + out_all <- list() + + # --- Cycle 1 --- + if (length(cycle1_layers) > 0) { + metric1 <- intersect(c(mslsp_metrics, yr_mean), cycle1_layers) + mode1 <- intersect(c(qa1, yr_mode), cycle1_layers) + key1 <- if (length(metric1) > 0) metric1[1] else mode1[1] + cb1 <- make_summarize_poly(metric1, mode1, key1) + res1 <- try( + exactextractr::exact_extract(r_crop[[cycle1_layers]], parcels_tr, fun = cb1, progress = FALSE), + silent = TRUE + ) + if (inherits(res1, "try-error")) { + log_skip("ERROR", "exact_extract_cycle1_failed", paste0("path=", path)) + } + if (!inherits(res1, "try-error") && !is.null(res1)) { + dt1 <- as.data.table(res1) + dt1[, c("parcel_id", "year", "cycle") := list(pid_vec, yr, 1L)] + out_all[[length(out_all) + 1]] <- dt1 + } + } + + # --- Cycle 2 --- + # col_rename strips the trailing "_2" from layer names so output columns match the + # cycle 1 schema. e.g. "OGI_2" -> "OGI_mean", "gupQA_2" -> "gupQA_mode". + # The `cycle` column then distinguishes the two. + if (length(cycle2_layers) > 0) { + strip2 <- function(x) sub("_2$", "", x) + metric2 <- intersect(paste0(mslsp_metrics, "_2"), cycle2_layers) + mode2 <- intersect(qa2, cycle2_layers) + key2 <- if (length(metric2) > 0) metric2[1] else mode2[1] + cb2 <- make_summarize_poly(metric2, mode2, key2, col_rename = strip2) + res2 <- try( + exactextractr::exact_extract(r_crop[[cycle2_layers]], parcels_tr, fun = cb2, progress = FALSE), + silent = TRUE + ) + if (inherits(res2, "try-error")) { + log_skip("ERROR", "exact_extract_cycle2_failed", paste0("path=", path)) + } + if (!inherits(res2, "try-error") && !is.null(res2)) { + dt2 <- as.data.table(res2) + dt2[, c("parcel_id", "year", "cycle") := list(pid_vec, yr, 2L)] + out_all[[length(out_all) + 1]] <- dt2 + } + } + + if (length(out_all) == 0) { + log_skip("WARN", "no_cycle_outputs_written", paste0("path=", path)) + return(NULL) + } + out <- rbindlist(out_all, fill = TRUE) + out[, parcel_id := as.character(parcel_id)] + + # Harmonize EVI scaling across vintages of MSLSP NetCDF files. + # + # Why this is needed: + # - 2018-2019 files (legacy path) encode EVI scaling with NetCDF attributes + # scale_factor/add_offset, which terra typically applies on read. + # - 2020+ files (new path) often encode equivalent information as scale/offset. + # In this workflow, those can come through as raw integer-like values + # (~0..10000 for EVImax/EVIamp, ~0..32766 for EVIarea). + # + # Without harmonization, different years end up on different numeric scales + # and downstream matching/QC behaves inconsistently. + # + # Strategy: + # - Detect raw-like magnitude in extracted parcel summaries. + # - Apply scaling exactly once only when needed. + maybe_scale_evi <- function(dt) { + evi_cols <- intersect(c("EVImax_mean", "EVIamp_mean", "EVIarea_mean"), names(dt)) + if (length(evi_cols) == 0) return(dt) + med_abs <- vapply(evi_cols, function(cc) { + v <- suppressWarnings(as.numeric(dt[[cc]])) + stats::median(abs(v), na.rm = TRUE) + }, numeric(1)) + # Physical EVI values should be O(1), not O(1e3-1e4). + needs_scale <- any(is.finite(med_abs[c("EVImax_mean", "EVIamp_mean")]) & + med_abs[c("EVImax_mean", "EVIamp_mean")] > 10, na.rm = TRUE) + if (!isTRUE(needs_scale)) return(dt) + + scale_f <- c(EVImax = 1e-4, EVIamp = 1e-4, EVIarea = 1e-2) + for (nm in names(scale_f)) { + mcol <- paste0(nm, "_mean") + scol <- paste0(nm, "_sd") + if (mcol %in% names(dt)) dt[[mcol]] <- suppressWarnings(as.numeric(dt[[mcol]])) * scale_f[[nm]] + if (scol %in% names(dt)) dt[[scol]] <- suppressWarnings(as.numeric(dt[[scol]])) * scale_f[[nm]] + } + if (exists("tw_log", mode = "function")) { + tw_log("WARN", "[MSLSP extract] tile=", tile_id, " applied EVI scale harmonization (raw-like magnitude detected)") + } + dt + } + out <- maybe_scale_evi(out) + + out[] +} + +# ============================================================================= +# Combine helpers (used by combine_mslsp_tilepieces.R and the bucket fallback) +# ============================================================================= + +# Core aggregation logic, shared by mslsp_combine and the bucket fallback. +# Aggregates by (parcel_id, year, cycle) using area-weighted statistics. +mslsp_aggregate_tilepieces <- function(dt, year) { + key <- c("parcel_id", "year", "cycle") + if (!all(key %in% names(dt))) return(dt) + dt[, year := as.integer(year)] + + safe_weighted_mean <- function(x, w) { + ok <- is.finite(x) & is.finite(w) & w > 0 + if (!any(ok)) return(NA_real_) + sum(w[ok] * x[ok]) / sum(w[ok]) + } + safe_weighted_sd <- function(mu, sd, w) { + ok <- is.finite(mu) & is.finite(sd) & is.finite(w) & w > 0 + if (!any(ok)) return(NA_real_) + ww <- w[ok]; mm <- mu[ok]; ss <- sd[ok] + m_all <- sum(ww * mm) / sum(ww) + var_all <- sum(ww * (ss^2 + mm^2)) / sum(ww) - m_all^2 + sqrt(pmax(0, var_all)) + } + weighted_mode <- function(x, w) { + ok <- !is.na(x) & is.finite(w) & w > 0 + if (!any(ok)) return(NA_real_) + dd <- data.table(x = x[ok], w = w[ok])[, .(w = sum(w)), by = x][order(-w)] + as.numeric(dd$x[1]) + } + weighted_mode_frac <- function(x, mode_val, w) { + ok <- !is.na(x) & is.finite(w) & w > 0 + if (!any(ok) || is.na(mode_val)) return(NA_real_) + denom <- sum(w[ok]) + if (!is.finite(denom) || denom <= 0) return(NA_real_) + sum(w[ok & x == mode_val]) / denom + } + + agg <- dt[, { + w <- as.numeric(w_valid) + out <- list( + n_valid = as.integer(sum(as.integer(n_valid), na.rm = TRUE)), + w_valid = as.numeric(sum(w, na.rm = TRUE)), + sum_w2 = as.numeric(sum(as.numeric(sum_w2), na.rm = TRUE)), + na_frac = safe_weighted_mean(as.numeric(na_frac), w) + ) + for (m in mslsp_metrics) { + mcol <- paste0(m, "_mean"); scol <- paste0(m, "_sd") + mu <- if (mcol %in% names(.SD)) as.numeric(get(mcol)) else rep(NA_real_, .N) + sd <- if (scol %in% names(.SD)) as.numeric(get(scol)) else rep(NA_real_, .N) + out[[mcol]] <- safe_weighted_mean(mu, w) + out[[scol]] <- safe_weighted_sd(mu, sd, w) + } + for (q in c(mslsp_qa_cat, mslsp_year_fields_mode)) { + qcol <- paste0(q, "_mode"); fqcol <- paste0(q, "_mode_frac") + qx <- if (qcol %in% names(.SD)) as.numeric(get(qcol)) else rep(NA_real_, .N) + qm <- weighted_mode(qx, w) + out[[qcol]] <- qm + out[[fqcol]] <- weighted_mode_frac(qx, qm, w) + } + for (f in mslsp_year_fields_mean) { + mcol <- paste0(f, "_mean"); scol <- paste0(f, "_sd") + mu <- if (mcol %in% names(.SD)) as.numeric(get(mcol)) else rep(NA_real_, .N) + sd <- if (scol %in% names(.SD)) as.numeric(get(scol)) else rep(NA_real_, .N) + out[[mcol]] <- safe_weighted_mean(mu, w) + out[[scol]] <- safe_weighted_sd(mu, sd, w) + } + out + }, by = key] + setcolorder(agg, c(key, setdiff(names(agg), key))) + agg +} + +# ============================================================================= +# Product object +# ============================================================================= + +product_mslsp <- function() { + if (!exists("weighted_stats", mode = "function")) { + stop("Source extract_summary_core.R first (provides weighted_stats, weighted_mode_stats)") + } + if (!exists("mslsp_combine", mode = "function")) { + stop("Source combine_mslsp_tilepieces.R first (provides mslsp_combine)") + } + list( + prep_static = mslsp_prep_static_tilewise, + scene_index = mslsp_scene_index, + scene_index_tile_col = "tile_id", + process_scene = mslsp_process_scene_tilewise, + + # MSLSP loads all geometry upfront in prep_static, so no prepare_tile needed. + # tilewise_core will slice prep$polys[parcel_indices, ] per tile. + prepare_tile = NULL, + + combine = mslsp_combine, + + path_tilepieces = function(out_dir, year, time_key) { + file.path(out_dir, sprintf("tilepieces_year=%d", year)) + }, + path_final_output = function(out_dir, year, time_key) { + file.path(out_dir, sprintf("mslsp_year=%d.parquet", year)) + }, + # path_combine_parts not used (combine handles output directly) + path_combine_parts = function(out_dir, year, time_key) NULL, + + validate_tilepiece = function(dt) { + req <- c("parcel_id", "year", "cycle", "n_valid", "w_valid") + has_req <- all(req %in% names(dt)) + has_met <- any(paste0(mslsp_metrics, "_mean") %in% names(dt)) + has_req && has_met + }, + + # Bucket fallback (used only when combine is not available). + prepare_for_scatter = function(dt, n_buckets) { + dt[, bucket := parcel_id_to_bucket(parcel_id, n_buckets)] + dt + }, + scatter_cols = function(dt) { + cols <- setdiff(names(dt), "bucket") + dt[, ..cols] + }, + aggregate_bucket = function(dt, year) mslsp_aggregate_tilepieces(dt, year), + + empty_tilepiece_schema = function() { + base <- data.table( + parcel_id = character(), year = integer(), cycle = integer(), + n_valid = integer(), w_valid = double(), sum_w2 = double(), na_frac = double() + ) + for (m in mslsp_metrics) { + base[[paste0(m, "_mean")]] <- double() + base[[paste0(m, "_sd")]] <- double() + } + for (q in c(mslsp_qa_cat, mslsp_year_fields_mode)) { + base[[paste0(q, "_mode")]] <- double() + base[[paste0(q, "_mode_frac")]] <- double() + } + for (f in mslsp_year_fields_mean) { + base[[paste0(f, "_mean")]] <- double() + base[[paste0(f, "_sd")]] <- double() + } + base[, tile_id := character()] + base + }, + empty_part_schema = function(year) { + base <- data.table( + parcel_id = character(), year = integer(), cycle = integer(), + n_valid = integer(), w_valid = double(), sum_w2 = double(), na_frac = double() + ) + for (m in mslsp_metrics) { + base[[paste0(m, "_mean")]] <- double() + base[[paste0(m, "_sd")]] <- double() + } + for (q in c(mslsp_qa_cat, mslsp_year_fields_mode)) { + base[[paste0(q, "_mode")]] <- double() + base[[paste0(q, "_mode_frac")]] <- double() + } + for (f in mslsp_year_fields_mean) { + base[[paste0(f, "_mean")]] <- double() + base[[paste0(f, "_sd")]] <- double() + } + base + } + ) +} diff --git a/modules/data.remote/inst/ccmmf/hls/tilewise_ndti_driver.R b/modules/data.remote/inst/ccmmf/hls/tilewise_ndti_driver.R new file mode 100644 index 00000000000..566f9175682 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/tilewise_ndti_driver.R @@ -0,0 +1,88 @@ +#!/usr/bin/env Rscript +# NDTI tilewise driver: prep-static, extract, combine, or all for one year and month(s). +# Uses tilewise_core and NDTI implementation; writes under tillage/ndti_v4.1. +# +# Main inputs: year, command, month integers 1-12, optional overwrite. +# Main outputs: tilepiece CSV.gz and ndti_year=Y_month=MM.parquet per month. +# How to run: Rscript tilewise_ndti_driver.R [months ...] [overwrite] +# Workflow: monitoring workflow NDTI branch (see monitoring_workflow_flowchart.mmd). + +script_dir <- if (length(file_arg <- commandArgs(trailingOnly = FALSE)[grepl("^--file=", + commandArgs(trailingOnly = FALSE))])) { + dirname(sub("^--file=", "", file_arg[1])) +} else "." + +suppressPackageStartupMessages(library(arrow)) +local({ + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp)) + arrow::write_parquet(data.frame(x = 1L), tmp) + arrow::read_parquet(tmp) +}) + +source(file.path(script_dir, "tilewise_ndti_implementation.R")) +source(file.path(script_dir, "combine_ndti_tilepieces.R")) +source(file.path(script_dir, "tilewise_core.R")) + +path_ndti_output <- file.path( + Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management"), + "tillage/ndti_v4.1" +) +product <- product_ndti() + +args <- commandArgs(trailingOnly = TRUE) +is_overwrite <- function(x) tolower(x) %in% c("true", "t", "yes", "y", "overwrite") +overwrite_flag <- any(sapply(args, is_overwrite)) +args <- args[!sapply(args, is_overwrite)] + +if (length(args) < 2) { + stop("Usage: Rscript tilewise_ndti_driver.R [month ...] [overwrite]") +} + +year_arg <- as.integer(args[1]) +command <- tolower(args[2]) + +months_from <- function(x) { + vals <- suppressWarnings(as.integer(x)) + unique(sort(vals[!is.na(vals) & vals >= 1 & vals <= 12])) +} + +months_str <- paste(months_from(args[-(1:2)]), collapse = "-") +if (!nzchar(months_str)) months_str <- "prep" +ts <- format(Sys.time(), "%Y%m%d_%H%M%S") +log_dir <- file.path(path_ndti_output, sprintf("year=%d", year_arg), "logs") +tilewise_log_init(file.path(log_dir, + sprintf("ndti_%s_m%s_%s.log", command, months_str, ts))) + +tw_log("INFO", "NDTI driver year=", year_arg, " command=", command, + " overwrite=", overwrite_flag, + " pid=", Sys.getpid(), + " SGE_TASK_ID=", Sys.getenv("SGE_TASK_ID", "")) + +if (command == "prep-static") { + tilewise_prep_static(year_arg, product, overwrite = overwrite_flag) + +} else if (command == "extract") { + months <- months_from(args[-(1:2)]) + if (length(months) == 0) stop("Usage: ... extract [month ...] [overwrite]") + prep <- tilewise_prep_static(year_arg, product, overwrite = overwrite_flag) + for (m in months) tilewise_run(prep, m, product, overwrite = overwrite_flag) + +} else if (command == "combine") { + months <- months_from(args[-(1:2)]) + if (length(months) == 0) stop("Usage: ... combine [month ...] [overwrite]") + prep <- tilewise_prep_static(year_arg, product) + for (m in months) tilewise_combine(prep, m, product, overwrite = overwrite_flag) + +} else if (command == "all") { + months <- months_from(args[-(1:2)]) + if (length(months) == 0) stop("Usage: ... all [month ...] [overwrite]") + prep <- tilewise_prep_static(year_arg, product, overwrite = overwrite_flag) + for (m in months) { + tilewise_run(prep, m, product, overwrite = overwrite_flag) + tilewise_combine(prep, m, product, overwrite = overwrite_flag) + } + +} else { + stop("Unknown command: ", command, ". Use: prep-static | extract | combine | all") +} diff --git a/modules/data.remote/inst/ccmmf/hls/tilewise_ndti_implementation.R b/modules/data.remote/inst/ccmmf/hls/tilewise_ndti_implementation.R new file mode 100644 index 00000000000..490788e6363 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/hls/tilewise_ndti_implementation.R @@ -0,0 +1,361 @@ +# NDTI product for the tilewise framework (HLS bands, Fmask, per-parcel NDTI). +# Weighted stats: n_eff = w_valid^2/sum_w2; SE_weighted = ndti_sd/sqrt(n_eff). +# +# Main inputs: HLSL/HLSS roots, LandIQ paths, parcel-tile map (env). +# Main outputs: tilepieces and monthly Parquet under tillage/ndti_v4.1. +# How to run: sourced from tilewise_ndti_driver.R. +# Workflow: monitoring workflow NDTI branch. + +suppressPackageStartupMessages({ + library(sf) + library(terra) + library(data.table) + library(stringr) + library(arrow) + library(dplyr) + library(exactextractr) +}) +sf::sf_use_s2(FALSE) + +terra::terraOptions(threads = max(1L, suppressWarnings( + as.integer(Sys.getenv("NDTI_TERRA_THREADS", "8")) +))) + +#### Configuration +ndti_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +ndti_landiq_v4 <- Sys.getenv("CCMMF_LANDIQ_V4", "/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1") +ndti_parcels_gpkg <- file.path(ndti_landiq_v4, "parcels-consolidated.gpkg") +ndti_crops_parq <- file.path(ndti_landiq_v4, "crops_all_years.parq") +ndti_cropcode_csv <- file.path(ndti_management, "LandIQ_cropCode_lookup_table.csv") +ndti_hlsl_base <- Sys.getenv("HLSL_BASE", "/projectnb/dietzelab/XinyuanJi/State_of_California_HLSL") +ndti_hlss_base <- Sys.getenv("HLSS_BASE", "/projectnb/dietzelab/XinyuanJi/State_of_California_HLSS") +ndti_out_root <- file.path(ndti_management, "tillage/ndti_v4.1") +ndti_parcel_tilemap <- Sys.getenv( + "NDTI_PARCEL_TILEMAP", + file.path(ndti_management, "hls_parcel_tile_map_v4.1_years=2016-2024.rds") +) + +#### Path helpers + +# Intermediate per-tile CSV.gz files (one per tile per month) +path_tilepieces <- function(output_dir, year, month) { + file.path(output_dir, sprintf("tilepieces_year=%d_month=%02d", year, month)) +} + +# Final combined output: one parquet per month inside year= directory. +# All monthly files form a Hive-partitioned dataset: arrow::open_dataset(ndti_out_root) +path_monthly_output <- function(output_dir, year, month) { + file.path(output_dir, sprintf("ndti_year=%d_month=%02d.parquet", year, month)) +} + +#### Small utilities + +# Parse MGRS tile ID from HLS filename (e.g. "HLS.L30.T10SFF...." -> "10SFF") +extract_tile_id_from_filename <- function(filenames) { + matched <- str_extract(filenames, "T[0-9A-Z]{5}\\.") + result <- sub("^T|\\.$", "", matched) + result[is.na(matched)] <- NA_character_ + result +} + +sanitize_tile_id_for_filename <- function(tile_id) gsub("[^0-9A-Za-z]+", "_", tile_id) + +# HLS filenames carry a trailing dot on the tile ID ("10SFF."); strip for matching +normalize_tile_id <- function(x) sub("\\.+$", "", trimws(as.character(x))) + +# Fmask bit flags: cloud=bit1, cloud shadow=bit3, snow=bit4 +is_fmask_bad_pixel <- function(fmask_values) { + (bitwAnd(fmask_values, 2L) != 0) | + (bitwAnd(fmask_values, 8L) != 0) | + (bitwAnd(fmask_values, 16L) != 0) +} + +# Append rows to a CSV, writing header only on the first write +append_to_csv <- function(data, filepath) { + if (file.exists(filepath)) { + fwrite(data, filepath, append = TRUE, col.names = FALSE) + } else { + fwrite(data, filepath) + } +} + +# ============================================================================= +# Scene index +# ============================================================================= + +# Build a table of all HLS scenes (date x sensor x tile) for a given year-month. +build_scene_index <- function(year, month, verbose = TRUE) { + month_start <- as.Date(paste(year, month, 1, sep = "-")) + month_end <- seq(month_start, by = "month", length.out = 2L)[2L] - 1L + + list_scenes <- function(dir_path, band_pattern, sensor_name) { + if (!dir.exists(dir_path)) return(data.table()) + files <- list.files(dir_path, pattern = band_pattern, full.names = TRUE) + if (length(files) == 0) return(data.table()) + doy_str <- str_extract(basename(files), "\\d{7}") + dates <- as.Date(doy_str, "%Y%j") + keep <- !is.na(dates) & dates >= month_start & dates <= month_end + if (!any(keep)) return(data.table()) + data.table( + date = dates[keep], + sensor = sensor_name, + tile_id = extract_tile_id_from_filename(basename(files)[keep]), + path = files[keep] + ) + } + + scenes <- rbindlist(list( + list_scenes(file.path(ndti_hlsl_base, year), ".*B06.*\\.tif$", "HLSL"), + list_scenes(file.path(ndti_hlss_base, year), ".*B11.*\\.tif$", "HLSS") + )) + scenes <- scenes[!is.na(tile_id)] + scenes[, tile_id := normalize_tile_id(tile_id)] + if (verbose) message("[scene] ", nrow(scenes), " scenes for ", year, "-", sprintf("%02d", month)) + scenes +} + +#### Geometry loading +load_parcel_geometries <- function(parcel_ids) { + parcel_ids <- unique(as.character(parcel_ids)) + layer_name <- st_layers(ndti_parcels_gpkg)$name[1] + chunks <- split(parcel_ids, ceiling(seq_along(parcel_ids) / 5000L)) + out <- lapply(chunks, function(chunk) { + ids_sql <- paste0("'", gsub("'", "''", chunk, fixed = TRUE), "'", collapse = ",") + st_read(ndti_parcels_gpkg, + query = sprintf('SELECT * FROM "%s" WHERE parcel_id IN (%s)', layer_name, ids_sql), + quiet = TRUE) + }) + parcels <- do.call(rbind, out) + parcels <- st_zm(parcels, drop = TRUE, what = "ZM") + parcels[!st_is_empty(st_geometry(parcels)), ] +} + +load_parcel_tilemap <- function(year = NULL) { + if (!file.exists(ndti_parcel_tilemap)) return(NULL) + dt <- as.data.table(readRDS(ndti_parcel_tilemap)) + if ("year" %in% names(dt)) { + if (is.null(year)) return(NULL) + dt <- dt[year == as.integer(year)][, year := NULL] + } + if (!all(c("parcel_id", "tileIDs") %in% names(dt))) return(NULL) + dt[, parcel_id := as.character(parcel_id)] + dt <- dt[, .(tileIDs = paste(sort(unique(unlist( + strsplit(paste(tileIDs, collapse = ","), ",", fixed = TRUE) + ))), collapse = ",")), by = parcel_id] + setkey(dt, parcel_id) + dt +} + +#### Static prep (cached per year) +ndti_prep_static_tilewise <- function(year, overwrite = FALSE, verbose = TRUE) { + yr <- as.integer(year) + output_dir <- file.path(ndti_out_root, paste0("year=", yr)) + cache_path <- file.path(output_dir, sprintf("ndti_prep_static_year=%d.rds", yr)) + + if (file.exists(cache_path) && !overwrite) { + if (verbose) message("[prep] using cache: ", cache_path) + prep <- readRDS(cache_path) + if (!is.null(prep$polys)) return(prep) + } else { + lookup <- fread(ndti_cropcode_csv) + ag_pairs <- unique(lookup[is_agricultural == TRUE, + .(CLASS = trimws(CLASS), SUBCLASS = as.character(SUBCLASS))]) + ag_classes_filter <- unique(ag_pairs$CLASS) + + pq_result <- arrow::open_dataset(ndti_crops_parq) |> + dplyr::filter(year == !!yr, CLASS %in% !!ag_classes_filter) |> + dplyr::select(parcel_id, CLASS, SUBCLASS) |> + dplyr::collect() + ca_crop <- as.data.table(pq_result) + ca_crop[, CLASS := trimws(as.character(CLASS))] + ca_crop[, SUBCLASS := as.character(SUBCLASS)] + ca_crop <- merge(ca_crop, ag_pairs, by = c("CLASS", "SUBCLASS")) + parcel_ids <- unique(as.character(ca_crop$parcel_id)) + if (length(parcel_ids) == 0) stop("No agricultural parcel_ids found for year ", yr) + if (verbose) message("[prep] agricultural parcels: ", length(parcel_ids)) + + tilemap <- load_parcel_tilemap(yr) + if (is.null(tilemap)) stop( + "Parcel-tile map not found: ", ndti_parcel_tilemap, "\n", + "Build it: Rscript scripts/hls/build_hls_parcel_tile_map.R 2016 2024 overwrite" + ) + + # tile_to_parcel_ids: kept for reference; polys drives tilewise_core + tile_to_parcel_ids <- local({ + pid <- unique(as.character(parcel_ids)) + map_sub <- tilemap[pid, on = "parcel_id", nomatch = 0] + map_sub <- unique(map_sub, by = "parcel_id") + out <- list() + for (i in seq_len(nrow(map_sub))) { + for (tile in strsplit(map_sub$tileIDs[i], ",", fixed = TRUE)[[1]]) { + if (nzchar(tile)) out[[tile]] <- c(out[[tile]], map_sub$parcel_id[i]) + } + } + out + }) + + covered <- unique(unlist(tile_to_parcel_ids, use.names = FALSE)) + n_missing <- length(setdiff(parcel_ids, covered)) + if (n_missing > 0) + message("[prep] ", n_missing, " parcels missing from tile map (dropped) - rebuild map if large") + + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + prep <- list( + year = yr, + out_dir = output_dir, + tile_to_parcel_ids = tile_to_parcel_ids, + base_dirs = list(HLSL = ndti_hlsl_base, HLSS = ndti_hlss_base), + band_other = list(HLSL = c("B06", "B07"), HLSS = c("B11", "B12")), + fmask_suffix = list(HLSL = "B06.tif$", HLSS = "B11.tif$") + ) + saveRDS(prep, cache_path) + if (verbose) message("[prep] saved: ", cache_path) + } + + # Build polys data.table (parcel_id + tile_ids list-column) for tilewise_core. + # Geometry is NOT loaded here; it is loaded lazily per-tile in prepare_tile(). + tile_ids <- names(prep$tile_to_parcel_ids) + parcel_tile <- rbindlist(lapply(tile_ids, function(tid) { + ids <- as.character(prep$tile_to_parcel_ids[[tid]]) + if (length(ids) == 0) return(NULL) + data.table(parcel_id = ids, tile_id = tid) + }), use.names = TRUE, fill = TRUE) + + if (is.null(parcel_tile) || nrow(parcel_tile) == 0) { + prep$polys <- data.table(parcel_id = character(), tile_ids = list()) + } else { + prep$polys <- parcel_tile[, .(tile_ids = list(sort(unique(tile_id)))), by = parcel_id] + } + prep +} + +#### Scene extraction +# parcels_sf must be reprojected to raster CRS. +extract_ndti_scene <- function(band1_path, band2_path, fmask_path, + parcels_sf, parcel_ids, scene_date) { + bbox_vect <- terra::vect(sf::st_as_sfc(sf::st_bbox(parcels_sf))) + band1 <- try(terra::crop(terra::rast(band1_path), bbox_vect), silent = TRUE) + band2 <- try(terra::crop(terra::rast(band2_path), bbox_vect), silent = TRUE) + fmask <- try(terra::crop(terra::rast(fmask_path), bbox_vect), silent = TRUE) + if (inherits(band1, "try-error") || inherits(band2, "try-error") || inherits(fmask, "try-error")) { + return(NULL) + } + + ndti <- (band1 - band2) / (band1 + band2) + ndti[is_fmask_bad_pixel(terra::values(fmask))] <- NA + names(ndti) <- "NDTI" + + summarize_poly <- function(values, cov_fracs) { + pos <- cov_fracs > 0 + ok <- !is.na(values) & pos + w_total <- sum(cov_fracs[pos]) + w_valid <- sum(cov_fracs[ok]) + if (!any(ok)) { + return(as.data.frame(list(ndti_mean = NA_real_, ndti_sd = NA_real_, + n_valid = 0L, w_valid = 0, sum_w2 = 0, na_frac = NA_real_))) + } + v <- values[ok] + w <- cov_fracs[ok] + mu <- sum(w * v) / w_valid + as.data.frame(list( + ndti_mean = mu, + ndti_sd = sqrt(sum(w * (v - mu)^2) / w_valid), + n_valid = sum(ok), + w_valid = w_valid, + sum_w2 = sum(w^2), # n_eff = w_valid^2 / sum_w2 -> SE = ndti_sd / sqrt(n_eff) + na_frac = if (w_total > 0) 1 - w_valid / w_total else NA_real_ + )) + } + + result <- try( + exactextractr::exact_extract(ndti, parcels_sf, fun = summarize_poly, + progress = FALSE), + silent = TRUE + ) + if (inherits(result, "try-error") || is.null(result)) return(NULL) + + dt <- as.data.table(result) + # Assign parcel_id manually - exactextractr preserves row order matching parcels_sf. + dt[, parcel_id := parcels_sf$parcel_id] + dt[, date := scene_date] + dt +} + +#### Product object +product_ndti <- function() { + list( + prep_static = ndti_prep_static_tilewise, + scene_index = build_scene_index, + scene_index_tile_col = "tile_id", + + # Load parcel geometry from GPKG for just this tile's parcels, reprojected + # to the raster CRS. Called once per tile before the scene loop. + prepare_tile = function(prep, tile_id, parcel_ids, scenes_this_tile) { + parcels_sf <- load_parcel_geometries(as.character(parcel_ids)) + if (nrow(parcels_sf) == 0) return(NULL) + # Some scene files can be corrupt/truncated; choose the first readable + # scene to determine CRS so one bad file doesn't fail the whole tile. + raster_crs <- NULL + for (p in scenes_this_tile$path) { + r_try <- try(terra::rast(p), silent = TRUE) + if (inherits(r_try, "try-error")) next + c_try <- try(sf::st_crs(terra::crs(r_try)), silent = TRUE) + if (inherits(c_try, "try-error") || is.null(c_try) || is.na(c_try)) next + raster_crs <- c_try + break + } + if (is.null(raster_crs)) return(NULL) + sf::st_transform(parcels_sf, raster_crs) + }, + + # Extract NDTI stats for one HLS scene. + # tile_parcels is the sf object returned by prepare_tile (already reprojected). + process_scene = function(prep, scene_row, tile_parcels, tile_id) { + sensor <- scene_row$sensor[1] + b1 <- scene_row$path[1] + bands <- list(HLSL = c("B06", "B07"), HLSS = c("B11", "B12"))[[sensor]] + b2 <- sub(bands[1], bands[2], b1, fixed = TRUE) + yr_str <- stringr::str_extract(basename(b1), "\\d{4}") + fmask <- file.path( + prep$base_dirs[[sensor]], paste0(yr_str, "_Fmask"), + sub(prep$fmask_suffix[[sensor]], "Fmask.tif", basename(b1)) + ) + if (!file.exists(b2) || !file.exists(fmask)) return(NULL) + extract_ndti_scene(b1, b2, fmask, tile_parcels, tile_parcels$parcel_id, + scene_row$date[1]) + }, + + # Reads all tilepieces, aggregates across tiles, writes Parquet. + combine = function(prep, time_key, overwrite = FALSE, verbose = TRUE) { + ndti_combine(prep, time_key, overwrite = overwrite, verbose = verbose) + }, + + path_tilepieces = path_tilepieces, + path_final_output = path_monthly_output, + path_combine_parts = function(out_dir, year, time_key) NULL, + + validate_tilepiece = function(dt) { + all(c("parcel_id", "date", "ndti_mean", "ndti_sd", + "n_valid", "w_valid", "sum_w2", "na_frac") %in% names(dt)) + }, + + empty_tilepiece_schema = function() { + data.table( + parcel_id = character(), + date = as.Date(integer(0), origin = "1970-01-01"), + ndti_mean = double(), ndti_sd = double(), + n_valid = integer(), w_valid = double(), + sum_w2 = double(), na_frac = double() + ) + }, + empty_part_schema = function(year) { + data.table( + parcel_id = character(), year = integer(), + date = as.Date(integer(0), origin = "1970-01-01"), + ndti_mean = double(), ndti_sd = double(), + n_valid = integer(), w_valid = double(), + sum_w2 = double(), na_frac = double() + ) + } + ) +} diff --git a/modules/data.remote/inst/ccmmf/phenology/match_landiq_mslsp.R b/modules/data.remote/inst/ccmmf/phenology/match_landiq_mslsp.R new file mode 100644 index 00000000000..dab7d2047da --- /dev/null +++ b/modules/data.remote/inst/ccmmf/phenology/match_landiq_mslsp.R @@ -0,0 +1,605 @@ +#!/usr/bin/env Rscript +# Match LandIQ crop seasons to MSLSP cycles using rule-based assignment (no rank cost). +# Primary rule: ADOY inside [OGI, OGMn]; tie-break by nearest Peak, then cycle 1 before 2. +# CLASS-aware season order: season 2 first when CLASS present; MULTIUSE D/M favors season 1; +# then seasons 3/4. MSLSP cycles follow User Guide (cycle 1 = largest EVI2 amplitude). +# +# Main inputs: CCMMF_MANAGEMENT, CCMMF_LANDIQ_V4, raw_mslsp_v4.1 parquet, crops_all_years.parq. +# Main outputs: assigned_year=Y.parquet and QC tables under phenology/matched_landiq_mslsp_v4.1. +# How to run: Rscript match_landiq_mslsp.R (year from YEAR global or wrapper); see repo docs. +# Workflow: monitoring workflow ASSIGN stage between MSLSP annual parquet and trait lookups. + +suppressPackageStartupMessages({ + library(data.table) + library(arrow) + library(dplyr) + library(lubridate) +}) + +#### Configuration +path_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +path_landiq_v4 <- Sys.getenv("CCMMF_LANDIQ_V4", "/projectnb/dietzelab/ccmmf/LandIQ-harmonized-v4.1") +combined_root <- file.path(path_management, "phenology/raw_mslsp_v4.1") +landiq_parq <- file.path(path_landiq_v4, "crops_all_years.parq") +cropcode_csv <- file.path(path_management, "LandIQ_cropCode_lookup_table.csv") +out_dir <- file.path(path_management, "phenology/matched_landiq_mslsp_v4.1") + +eps_eviamp <- 0.01 +heterogeneity_na_frac_thr <- 0.5 +assign_active_only <- TRUE + +year <- if (exists("YEAR", envir = .GlobalEnv)) get("YEAR", .GlobalEnv) else NULL +do_subset_test <- if (exists("DO_SUBSET_TEST", envir = .GlobalEnv)) get("DO_SUBSET_TEST", .GlobalEnv) else FALSE +sample_per_pft <- if (exists("SAMPLE_PER_PFT", envir = .GlobalEnv)) get("SAMPLE_PER_PFT", .GlobalEnv) else 500L +assign_parcel_ids_file <- if (exists("ASSIGN_PARCEL_IDS_FILE", envir = .GlobalEnv)) get("ASSIGN_PARCEL_IDS_FILE", .GlobalEnv) else Sys.getenv("ASSIGN_PARCEL_IDS_FILE", "") +assign_subset_ids <- if (nzchar(trimws(assign_parcel_ids_file)) && file.exists(assign_parcel_ids_file)) { + ids <- unique(trimws(as.character(fread(assign_parcel_ids_file)$parcel_id))) + ids[nzchar(ids)] +} else character(0) + +#### Helpers +# Normalize DOY for same-year wrap (e.g. OGI=350, OGMn=50). Do NOT use for cross-year DOY. +norm_doy <- function(doy) { + d <- as.numeric(doy) + out <- d + ok <- !is.na(d) + out[ok & d < 1] <- d[ok & d < 1] + 365 + out[ok & d > 365] <- d[ok & d > 365] - 365 + out +} + +# LandIQ: "negative ADOY = prior year (-92 = Oct 1); positive = target year" (metadata). +# MSLSP: DOY "January 1 of target year = 1", valid -181 to 548 (User Guide). Use date comparison. +adoy_in_window <- function(adoy, ogi, ogmn, year) { + yr <- as.integer(year)[1] + n <- max(length(adoy), length(ogi), length(ogmn)) + adoy <- rep(as.numeric(adoy), length.out = n) + ogi <- rep(as.numeric(ogi), length.out = n) + ogmn <- rep(as.numeric(ogmn), length.out = n) + out <- rep(NA, n) + ok <- !is.na(adoy) & !is.na(ogi) & !is.na(ogmn) + if (!any(ok)) return(out) + d0 <- as.Date(sprintf("%d-01-01", yr)) + adoy_date <- d0 + as.integer(round(adoy[ok])) - 1L + ogi_date <- d0 + as.integer(round(ogi[ok])) - 1L + ogmn_date <- d0 + as.integer(round(ogmn[ok])) - 1L + lo <- pmin(ogi_date, ogmn_date) + hi <- pmax(ogi_date, ogmn_date) + out[ok] <- (adoy_date >= lo) & (adoy_date <= hi) + out +} + +is_real_cycle <- function(Peak, OGI, OGMn, w_valid, EVIamp, eps_amp = 0.01) { + ok_dates <- !is.na(Peak) & !is.na(OGI) & !is.na(OGMn) + ok_w <- !is.na(w_valid) & w_valid > 0 + ok_evi <- !is.na(EVIamp) & EVIamp > eps_amp + ok_dates & ok_w & ok_evi +} + +qc_mslsp_qa_pixel_agreement_flag <- function(gup_frac, gdown_frac, thr_good = 0.6, thr_bad = 0.3) { + gf <- suppressWarnings(as.numeric(gup_frac)) + df <- suppressWarnings(as.numeric(gdown_frac)) + m <- pmax(gf, df, na.rm = TRUE) + out <- rep("mixed_qa_pixel_agreement", length(m)) + out[is.na(gf) & is.na(df)] <- "qa_not_available" + out[!is.na(m) & m >= thr_good] <- "high_qa_pixel_agreement" + out[!is.na(m) & m <= thr_bad] <- "low_qa_pixel_agreement" + out +} + +heterogeneity_flag <- function(n_valid, w_valid, na_frac, na_frac_thr = 0.5) { + out <- rep("low_na_frac", length(n_valid)) + out[is.na(n_valid) | is.na(w_valid)] <- "mslsp_metrics_missing" + out[!is.na(na_frac) & na_frac >= na_frac_thr] <- "high_na_frac" + out +} + +qc_cycle_season_counts_label <- function(n_ms, n_li) { + if (is.na(n_ms) || n_ms == 0) return("no_mslsp_data") + if (is.na(n_li) || n_li == 0) return("no_landiq_active") + paste0(n_ms, if (n_ms == 1L) "cycle" else "cycles", "_", n_li, if (n_li == 1L) "season" else "seasons") +} + +qc_mslsp_pixel_availability_flag <- function(w_valid, n_valid) { + has <- !is.na(w_valid) & w_valid > 0 & !is.na(n_valid) & n_valid > 0 + out <- rep("no_pixels_or_invalid", length(w_valid)) + out[has] <- "has_pixels" + out[is.na(w_valid) & is.na(n_valid)] <- "no_mslsp_data_for_cycle" + out +} + +date_dist_days <- function(yr, doy1, doy2) { + n <- max(length(yr), length(doy1), length(doy2)) + yr <- rep(as.integer(yr), length.out = n) + doy1 <- rep(as.numeric(doy1), length.out = n) + doy2 <- rep(as.numeric(doy2), length.out = n) + ok <- !is.na(yr) & !is.na(doy1) & !is.na(doy2) + out <- rep(NA_real_, n) + if (!any(ok)) return(out) + d0 <- as.Date(sprintf("%d-01-01", yr[ok])) + d1 <- d0 + as.integer(round(doy1[ok])) - 1L + d2 <- d0 + as.integer(round(doy2[ok])) - 1L + out[ok] <- as.numeric(d1 - d2) + out +} + +# Convert DOY to Date (handles cross-year: doy < 1 = prior year, doy > 366 = next year) +doy_to_date <- function(year, doy) { + yr <- as.integer(year)[1] + d <- suppressWarnings(as.numeric(doy)[1]) + if (is.na(yr) || is.na(d)) return(as.Date(NA)) + lubridate::ymd(paste0(yr, "-01-01")) + lubridate::days(round(d) - 1L) +} + +load_combined_mslsp <- function(year, root = combined_root) { + year <- as.integer(year) + stopifnot("year must be a valid integer" = !is.na(year)) + path <- file.path(root, paste0("year=", year), sprintf("mslsp_year=%d.parquet", year)) + if (!file.exists(path)) stop("Combined Parquet not found: ", path) + d <- as.data.table(arrow::read_parquet(path)) + stopifnot("Combined file is empty" = nrow(d) > 0L) + stopifnot("Combined data must have parcel_id" = "parcel_id" %in% names(d)) + stopifnot("Combined data must have cycle column" = "cycle" %in% names(d)) + d[, parcel_id := as.character(parcel_id)] + d[, year := as.integer(year)] + setorder(d, parcel_id, year, cycle) + d +} + +col1 <- function(row, name) { + v <- row[[name]] + if (!is.null(v)) return(v[1]) + v <- row[[paste0(name, ".x")]] + if (!is.null(v)) return(v[1]) + NA_real_ +} +opt <- function(row, name) { v <- row[[name]]; if (is.null(v)) NA_real_ else v[1] } + +mslsp_long_to_cycles <- function(long_rows) { + if (nrow(long_rows) == 0) return(list(mslsp_cycles = NULL, keep_reason = "no_mslsp_pixels")) + pid <- long_rows$parcel_id[1] + yr <- long_rows$year[1] + fill <- function(r, prefix) { + m <- r[[paste0(prefix, "_mean")]] + if (is.null(m)) NA_real_ else m + } + mslsp_cycle_list <- vector("list", nrow(long_rows)) + for (j in seq_len(nrow(long_rows))) { + r <- long_rows[j] + cyc <- as.integer(r$cycle[1]) + if (is.na(cyc)) cyc <- j + mslsp_cycle_list[[j]] <- data.table( + parcel_id = pid, year = yr, mslsp_cycle = cyc, mslsp_rank = j, + mslsp_OGI = fill(r, "OGI"), mslsp_50PCGI = fill(r, "50PCGI"), mslsp_OGMx = fill(r, "OGMx"), mslsp_Peak = fill(r, "Peak"), + mslsp_OGD = fill(r, "OGD"), mslsp_50PCGD = fill(r, "50PCGD"), mslsp_OGMn = fill(r, "OGMn"), + mslsp_EVImax = fill(r, "EVImax"), mslsp_EVIamp = fill(r, "EVIamp"), mslsp_EVIarea = fill(r, "EVIarea"), + mslsp_w_valid = col1(r, "w_valid"), mslsp_n_valid = as.integer(col1(r, "n_valid")), mslsp_na_frac = col1(r, "na_frac"), + mslsp_numObs = opt(r, "numObs_mean"), mslsp_NumCycles = opt(r, "NumCycles_mode"), + mslsp_gupQA_mode = opt(r, "gupQA_mode"), mslsp_gupQA_mode_frac = opt(r, "gupQA_mode_frac"), + mslsp_gdownQA_mode = opt(r, "gdownQA_mode"), mslsp_gdownQA_mode_frac = opt(r, "gdownQA_mode_frac") + ) + } + mslsp_cycles <- rbindlist(mslsp_cycle_list) + mslsp_cycles[, cycle_real := is_real_cycle(mslsp_Peak, mslsp_OGI, mslsp_OGMn, mslsp_w_valid, mslsp_EVIamp, eps_amp = eps_eviamp)] + real <- mslsp_cycles[cycle_real == TRUE] + if (nrow(real) == 0) return(list(mslsp_cycles = NULL, keep_reason = "mslsp_cycles_filtered_out")) + real[, mslsp_rank := seq_len(.N)] + list(mslsp_cycles = real, keep_reason = if (nrow(real) >= 2) "both_mslsp_cycles_kept" else "single_mslsp_cycle_only") +} + +is_valid_landiq_class <- function(x) { + x <- trimws(as.character(x)) + !is.na(x) & nzchar(x) & x != "**" +} + +season_component_priority <- function(season, class_code, multiuse = NA_character_) { + s <- suppressWarnings(as.integer(season)) + class_ok <- is_valid_landiq_class(class_code) + mu <- trimws(as.character(multiuse)) + is_distinct_or_mixed <- !is.na(mu) & mu %in% c("D", "M", "d", "m") + + out <- rep(99L, length(s)) + out[class_ok & s == 2L] <- 1L + out[class_ok & s == 1L & is_distinct_or_mixed] <- 2L + out[class_ok & s == 1L & !is_distinct_or_mixed] <- 3L + out[class_ok & s == 3L] <- 4L + out[class_ok & s == 4L] <- 5L + out[!class_ok] <- 20L + fifelse(is.na(s), 9L, s) + out +} + +assignment_class_rollup <- function(DT) { + first_nonempty <- function(x) { + y <- as.character(x) + y <- y[!is.na(y) & nzchar(trimws(y))] + if (length(y) == 0) NA_character_ else y[1] + } + if (any(DT$qc_cycle_season_counts == "no_mslsp_pixels", na.rm = TRUE)) return("no_mslsp_pixels") + if (any(DT$qc_cycle_season_counts == "mslsp_cycles_filtered_out", na.rm = TRUE)) return("mslsp_cycles_filtered_out") + if (any(DT$qc_cycle_season_counts == "no_landiq_active", na.rm = TRUE)) return("no_landiq_active") + if (any(DT$qc_cycle_season_counts == "no_mslsp_cycle_for_season", na.rm = TRUE)) return("no_mslsp_cycle_assigned") + if (any(DT$qc_adoy_vs_cycle == "adoy_outside_cycle", na.rm = TRUE)) return("adoy_outside_cycle_review") + if (any(DT$qc_n_adoy_in_cycle == "multiple_adoy_in_cycle", na.rm = TRUE)) return("multiple_adoy_in_cycle_review") + cc <- first_nonempty(DT$qc_cycle_season_counts) + if (!is.na(cc) && !(cc %in% c("1cycle_1season", "2cycles_2seasons"))) { + if (cc == "2cycles_1season") return("mismatch_2cycles_1season") + if (cc == "1cycle_2seasons") return("mismatch_1cycle_2seasons") + return(paste0("mismatch_", cc)) + } + if (is.na(cc)) return("mismatch_unclassified") + if (any(DT$qc_adoy_vs_cycle == "adoy_inside_cycle", na.rm = TRUE)) return("adoy_inside_and_single") + return("matched_no_adoy") +} + +assign_one_4rows <- function(pid, yr, combined_row, landiq_rows) { + out <- data.table( + parcel_id = pid, year = yr, season = 1L:4L, + assigned_by = "no_match", assigned_woody_tiebreak = FALSE, + mslsp_cycle = NA_integer_, + landiq_PCNT = NA_real_, landiq_ADOY = NA_real_, landiq_PFT = NA_character_, landiq_CLASS = NA_character_, landiq_SUBCLASS = NA_character_, landiq_MULTIUSE = NA_character_, + mslsp_Peak = as.Date(NA), mslsp_OGI = as.Date(NA), mslsp_OGMn = as.Date(NA), + mslsp_50PCGI = as.Date(NA), mslsp_OGMx = as.Date(NA), mslsp_OGD = as.Date(NA), mslsp_50PCGD = as.Date(NA), + mslsp_EVImax = NA_real_, mslsp_EVIamp = NA_real_, mslsp_EVIarea = NA_real_, + mslsp_w_valid = NA_real_, mslsp_n_valid = NA_integer_, mslsp_na_frac = NA_real_, mslsp_numObs = NA_real_, mslsp_NumCycles = NA_real_, + peak_dist_days = NA_real_, + mslsp_gupQA_mode = NA_real_, mslsp_gupQA_mode_frac = NA_real_, mslsp_gdownQA_mode = NA_real_, mslsp_gdownQA_mode_frac = NA_real_, + qc_landiq_season_data = NA_character_, + qc_adoy_vs_cycle = NA_character_, qc_n_adoy_in_cycle = NA_character_, + qc_adoy_status = NA_character_, qc_cycle_status = NA_character_, + qc_adoy_cycle_relation = NA_character_, qc_adoy_multiplicity = NA_character_, + qc_mslsp_pixel_availability = NA_character_, qc_heterogeneity = NA_character_, + qc_cycle_season_counts = NA_character_, qc_mslsp_qa_pixel_agreement = NA_character_, + match_outcome = NA_character_, qc_mslsp_cycles_available = NA_character_ + ) + + for (s in 1:4) { + r <- landiq_rows[season == s] + if (nrow(r)) { + out[season == s, `:=`( + landiq_PCNT = r$PCNT[1], landiq_ADOY = r$ADOY[1], landiq_PFT = r$PFT[1], + landiq_CLASS = if ("CLASS" %in% names(r)) r$CLASS[1] else NA_character_, + landiq_SUBCLASS = if ("SUBCLASS" %in% names(r)) r$SUBCLASS[1] else NA_character_, + landiq_MULTIUSE = if ("MULTIUSE" %in% names(r)) r$MULTIUSE[1] else NA_character_ + )] + pcnt <- r$PCNT[1] + out[season == s, qc_landiq_season_data := fifelse(!is.na(pcnt) & pcnt >= 0, "landiq_season_has_data", NA_character_)] + } else { + out[season == s, qc_landiq_season_data := NA_character_] + } + } + + active <- landiq_rows[!is.na(PCNT) & PCNT >= 0] + n_landiq_active <- nrow(active) + if (n_landiq_active == 0) { + out[, `:=`( + qc_adoy_vs_cycle = NA_character_, qc_n_adoy_in_cycle = NA_character_, + qc_mslsp_pixel_availability = NA_character_, qc_heterogeneity = NA_character_, + qc_mslsp_qa_pixel_agreement = NA_character_, + qc_cycle_season_counts = "no_landiq_active", + qc_mslsp_cycles_available = NA_character_ + )] + out[, match_outcome := "no_landiq_active"] + return(list(assigned = out)) + } + + cyc <- mslsp_long_to_cycles(combined_row) + mslsp_cycles <- cyc$mslsp_cycles + if (is.null(mslsp_cycles) || nrow(mslsp_cycles) == 0) { + out[, `:=`( + qc_adoy_vs_cycle = NA_character_, qc_n_adoy_in_cycle = NA_character_, + qc_mslsp_pixel_availability = NA_character_, qc_heterogeneity = NA_character_, + qc_mslsp_qa_pixel_agreement = NA_character_, + qc_cycle_season_counts = cyc$keep_reason, + qc_mslsp_cycles_available = cyc$keep_reason + )] + out[, match_outcome := cyc$keep_reason] + return(list(assigned = out)) + } + + n_mslsp_used <- nrow(mslsp_cycles) + out[, qc_cycle_season_counts := qc_cycle_season_counts_label(n_mslsp_used, n_landiq_active)] + + landiq_active <- active[, .( + landiq_season = season, + landiq_ADOY = ADOY, + landiq_PFT = if ("PFT" %in% names(active)) PFT else NA_character_, + landiq_CLASS = if ("CLASS" %in% names(active)) CLASS else NA_character_, + landiq_SUBCLASS = if ("SUBCLASS" %in% names(active)) SUBCLASS else NA_character_, + landiq_MULTIUSE = if ("MULTIUSE" %in% names(active)) MULTIUSE else NA_character_ + )] + landiq_active[, landiq_PCNT := active$PCNT] + landiq_active[, component_priority := season_component_priority(landiq_season, landiq_CLASS, landiq_MULTIUSE)] + + mslsp_cycles[, keytmp := 1L] + landiq_active[, keytmp := 1L] + combos <- merge(mslsp_cycles, landiq_active, by = "keytmp", allow.cartesian = TRUE) + combos[, keytmp := NULL] + combos[, has_adoy := !is.na(landiq_ADOY) & !is.na(mslsp_OGI) & !is.na(mslsp_OGMn)] + combos[, adoy_in_window := fifelse(has_adoy, adoy_in_window(landiq_ADOY, mslsp_OGI, mslsp_OGMn, yr), NA)] + combos[, peak_dist_abs := fifelse(has_adoy, abs(as.numeric(date_dist_days(yr, mslsp_Peak, landiq_ADOY))), Inf)] + combos[is.na(peak_dist_abs), peak_dist_abs := Inf] + combos[, is_woody := identical(trimws(as.character(landiq_PFT)), "woody")] + + season_order <- landiq_active[order(component_priority, landiq_season), unique(landiq_season)] + used_cycles <- integer() + chosen_rows <- vector("list", length(season_order)) + n_chosen <- 0L + for (sea in season_order) { + cand <- combos[landiq_season == sea & !(mslsp_cycle %in% used_cycles)] + if (nrow(cand) == 0) next + + in_window <- cand[adoy_in_window == TRUE] + if (nrow(in_window) > 0) { + # Nearest Peak to ADOY, then prefer cycle 1 (MSLSP strongest cycle) + setorder(in_window, peak_dist_abs, mslsp_cycle) + pick <- in_window[1] + pick[, assigned_woody_tiebreak := FALSE] + } else { + # No ADOY: tie-break by season priority and mslsp_cycle (woody and non-woody) + woody <- cand$is_woody[1L] + setorder(cand, peak_dist_abs, mslsp_cycle) + pick <- cand[1] + pick[, assigned_woody_tiebreak := woody] + } + + n_chosen <- n_chosen + 1L + chosen_rows[[n_chosen]] <- pick + used_cycles <- c(used_cycles, as.integer(pick$mslsp_cycle[1])) + } + + chosen <- if (n_chosen > 0) rbindlist(chosen_rows[seq_len(n_chosen)], use.names = TRUE, fill = TRUE) else combos[0] + + landiq_adoy_vec <- active$ADOY + n_in_cycle <- vapply(seq_len(nrow(chosen)), function(i) { + sum(adoy_in_window(landiq_adoy_vec, chosen$mslsp_OGI[i], chosen$mslsp_OGMn[i], yr), na.rm = TRUE) + }, integer(1)) + chosen[, n_adoy_in_cycle := n_in_cycle] + chosen[has_adoy == TRUE & adoy_in_window == TRUE, n_adoy_in_cycle := pmax(n_adoy_in_cycle, 1L)] + chosen[, qc_n_adoy_in_cycle := fifelse(!has_adoy, "no_adoy_recorded", + fifelse(is.na(n_adoy_in_cycle) | n_adoy_in_cycle == 0L, "no_adoy_in_cycle", + fifelse(n_adoy_in_cycle == 1L, "one_adoy_in_cycle", "multiple_adoy_in_cycle")))] + chosen[, assigned_woody_tiebreak := if ("assigned_woody_tiebreak" %in% names(chosen)) assigned_woody_tiebreak else FALSE] + chosen[, qc_adoy_vs_cycle := fcase( + has_adoy & adoy_in_window == TRUE, "adoy_inside_cycle", + has_adoy & (is.na(adoy_in_window) | !adoy_in_window), "adoy_outside_cycle", + !has_adoy & assigned_woody_tiebreak == TRUE, "no_adoy_woody_tiebreak", + default = "no_adoy_recorded" + )] + chosen[, qc_mslsp_pixel_availability := qc_mslsp_pixel_availability_flag(mslsp_w_valid, mslsp_n_valid)] + chosen[, qc_heterogeneity := heterogeneity_flag(mslsp_n_valid, mslsp_w_valid, mslsp_na_frac, na_frac_thr = heterogeneity_na_frac_thr)] + chosen[, qc_mslsp_qa_pixel_agreement := qc_mslsp_qa_pixel_agreement_flag(mslsp_gupQA_mode_frac, mslsp_gdownQA_mode_frac)] + chosen[, qc_cycle_season_counts := qc_cycle_season_counts_label(n_mslsp_used, n_landiq_active)] + chosen[, qc_mslsp_cycles_available := cyc$keep_reason] + + for (k in seq_len(nrow(chosen))) { + sea <- chosen$landiq_season[k] + rr <- which(out$season == sea) + if (length(rr) != 1) next + out[rr, `:=`( + assigned_by = "matched", + assigned_woody_tiebreak = chosen$assigned_woody_tiebreak[k], + mslsp_cycle = chosen$mslsp_cycle[k], + mslsp_OGI = doy_to_date(yr, chosen$mslsp_OGI[k]), mslsp_50PCGI = doy_to_date(yr, chosen$mslsp_50PCGI[k]), mslsp_OGMx = doy_to_date(yr, chosen$mslsp_OGMx[k]), + mslsp_Peak = doy_to_date(yr, chosen$mslsp_Peak[k]), mslsp_OGD = doy_to_date(yr, chosen$mslsp_OGD[k]), mslsp_50PCGD = doy_to_date(yr, chosen$mslsp_50PCGD[k]), mslsp_OGMn = doy_to_date(yr, chosen$mslsp_OGMn[k]), + mslsp_EVImax = chosen$mslsp_EVImax[k], mslsp_EVIamp = chosen$mslsp_EVIamp[k], mslsp_EVIarea = chosen$mslsp_EVIarea[k], + mslsp_w_valid = chosen$mslsp_w_valid[k], mslsp_n_valid = chosen$mslsp_n_valid[k], mslsp_na_frac = chosen$mslsp_na_frac[k], + mslsp_numObs = chosen$mslsp_numObs[k], + peak_dist_days = chosen$peak_dist_abs[k], + mslsp_gupQA_mode = chosen$mslsp_gupQA_mode[k], mslsp_gupQA_mode_frac = chosen$mslsp_gupQA_mode_frac[k], + mslsp_gdownQA_mode = chosen$mslsp_gdownQA_mode[k], mslsp_gdownQA_mode_frac = chosen$mslsp_gdownQA_mode_frac[k], + qc_adoy_vs_cycle = chosen$qc_adoy_vs_cycle[k], + qc_n_adoy_in_cycle = chosen$qc_n_adoy_in_cycle[k], + qc_mslsp_pixel_availability = qc_mslsp_pixel_availability_flag(chosen$mslsp_w_valid[k], chosen$mslsp_n_valid[k]), + qc_heterogeneity = heterogeneity_flag(chosen$mslsp_n_valid[k], chosen$mslsp_w_valid[k], chosen$mslsp_na_frac[k], na_frac_thr = heterogeneity_na_frac_thr), + qc_mslsp_qa_pixel_agreement = qc_mslsp_qa_pixel_agreement_flag(chosen$mslsp_gupQA_mode_frac[k], chosen$mslsp_gdownQA_mode_frac[k]) + )] + } + + out[assigned_by == "no_match", `:=`( + qc_adoy_vs_cycle = NA_character_, qc_n_adoy_in_cycle = NA_character_, + qc_mslsp_pixel_availability = NA_character_, qc_heterogeneity = NA_character_, + qc_mslsp_qa_pixel_agreement = NA_character_, + qc_cycle_season_counts = NA_character_, qc_mslsp_cycles_available = NA_character_ + )] + out[assigned_by == "no_match" & qc_landiq_season_data == "landiq_season_has_data", `:=`( + qc_adoy_vs_cycle = "no_cycle_assigned", + qc_n_adoy_in_cycle = "no_cycle_assigned", + qc_mslsp_pixel_availability = "no_mslsp_data_for_cycle", + qc_heterogeneity = "no_cycle_assigned", + qc_mslsp_qa_pixel_agreement = "no_cycle_assigned", + qc_cycle_season_counts = "no_mslsp_cycle_for_season", + qc_mslsp_cycles_available = "no_cycle_assigned" + )] + + out[, qc_adoy_status := fifelse( + qc_landiq_season_data == "landiq_season_has_data", + fifelse(is.na(landiq_ADOY), "no_landiq_adoy", "has_adoy"), + NA_character_ + )] + out[, qc_cycle_status := fifelse( + qc_landiq_season_data == "landiq_season_has_data", + fifelse(assigned_by == "matched", "has_assigned_cycle", "no_assigned_cycle"), + NA_character_ + )] + out[, qc_adoy_cycle_relation := fifelse( + qc_adoy_status == "has_adoy" & qc_cycle_status == "has_assigned_cycle", + qc_adoy_vs_cycle, + NA_character_ + )] + out[, qc_adoy_multiplicity := fifelse( + qc_adoy_status == "has_adoy" & qc_cycle_status == "has_assigned_cycle", + qc_n_adoy_in_cycle, + NA_character_ + )] + + if ("NumCycles_mode" %in% names(combined_row)) out[, mslsp_NumCycles := combined_row$NumCycles_mode[1]] + out[assigned_by == "matched", qc_mslsp_cycles_available := cyc$keep_reason] + + list(assigned = out) +} + +# Extract QC summary from assigned data (file path or data.table). +# Returns long-format: year, level, qc_dimension, category, pft, n, pct. +# qc_mslsp_cycles_available only at field_year (avoids redundant row-level copy). +# If out_dir is set, writes qc_summary_year=Y.csv. +extract_qc_summary <- function(assigned_path_or_dt, out_dir = NULL) { + if (is.character(assigned_path_or_dt)) { + if (!file.exists(assigned_path_or_dt)) stop("File not found: ", assigned_path_or_dt) + assigned <- as.data.table(arrow::read_parquet(assigned_path_or_dt)) + yr <- as.integer(assigned$year[1]) + } else { + assigned <- as.data.table(assigned_path_or_dt) + yr <- as.integer(assigned$year[1]) + } + pft_col <- names(assigned)[tolower(names(assigned)) %in% c("pft", "landiq_pft")][1] + has_pft <- !is.na(pft_col) + + matched <- assigned[assigned_by == "matched"] + row_dims <- c( + "qc_adoy_vs_cycle", "qc_n_adoy_in_cycle", "qc_cycle_season_counts", + "qc_mslsp_pixel_availability", "qc_heterogeneity", "qc_mslsp_qa_pixel_agreement" + ) + row_dims <- intersect(row_dims, names(matched)) + out <- data.table(year = integer(), level = character(), qc_dimension = character(), category = character(), pft = character(), n = integer(), pct = double()) + if (nrow(matched) > 0 && length(row_dims) > 0) { + for (d in row_dims) { + by_cols <- c(d, if (has_pft) pft_col else NULL) + tab <- matched[, .N, by = by_cols] + tab[, category := fifelse(is.na(get(d)), "(no value)", as.character(get(d)))] + tab[, pft := if (has_pft) fifelse(is.na(get(pft_col)), "(no value)", as.character(get(pft_col))) else "(all)"] + if (has_pft) tab[, (pft_col) := NULL] + tab[, (d) := NULL] + tab[, pct := round(100 * N / sum(N), 1), by = "pft"] + setnames(tab, "N", "n") + tab[, `:=`(year = yr, level = "row", qc_dimension = d)] + setcolorder(tab, c("year", "level", "qc_dimension", "category", "pft", "n", "pct")) + out <- rbind(out, tab) + } + } + fy_cols <- c("match_outcome", "qc_mslsp_cycles_available", if (has_pft) pft_col else NULL) + fy_rollup <- assigned[, lapply(.SD, function(x) x[!is.na(x)][1L]), by = .(parcel_id, year), .SDcols = fy_cols] + if (has_pft) setnames(fy_rollup, pft_col, "pft_rollup") + n_fy <- nrow(fy_rollup) + if (n_fy > 0) { + for (d in c("match_outcome", "qc_mslsp_cycles_available")) { + if (!d %in% names(fy_rollup)) next + by_fy <- c(d, if (has_pft) "pft_rollup" else NULL) + tab <- fy_rollup[, .N, by = by_fy] + tab[, category := fifelse(is.na(get(d)), "(no value)", as.character(get(d)))] + tab[, pft := if (has_pft) fifelse(is.na(pft_rollup), "(no value)", as.character(pft_rollup)) else "(all)"] + if (has_pft) tab[, pft_rollup := NULL] + tab[, (d) := NULL] + if (has_pft) tab[, pct := round(100 * N / sum(N), 1), by = "pft"] else tab[, pct := round(100 * N / n_fy, 1)] + setnames(tab, "N", "n") + tab[, `:=`(year = yr, level = "field_year", qc_dimension = d)] + setcolorder(tab, c("year", "level", "qc_dimension", "category", "pft", "n", "pct")) + out <- rbind(out, tab) + } + } + if (nrow(out) > 0) { + # Sort: level (row then field_year), qc_dimension (fixed order), pft (row/hay/rice/woody/noncrop then other), then n descending + pft_order <- c("row", "hay", "rice", "woody", "noncrop", "(no value)", "(all)") + dim_order <- c( + "qc_adoy_vs_cycle", "qc_n_adoy_in_cycle", "qc_cycle_season_counts", + "qc_mslsp_pixel_availability", "qc_heterogeneity", "qc_mslsp_qa_pixel_agreement", + "match_outcome", "qc_mslsp_cycles_available" + ) + out[, level := factor(level, levels = c("row", "field_year"))] + out[, qc_dimension := factor(qc_dimension, levels = dim_order)] + out[, pft := factor(pft, levels = c(pft_order, setdiff(unique(out$pft), pft_order)))] + setorder(out, level, qc_dimension, pft, -n) + out[, level := as.character(level)] + out[, qc_dimension := as.character(qc_dimension)] + out[, pft := as.character(pft)] + if (length(out_dir) > 0 && nzchar(out_dir)) { + dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + out_file <- file.path(out_dir, paste0("qc_summary_year=", yr, ".csv")) + fwrite(out, out_file) + message("[QC] Summary: ", out_file) + } + } + invisible(out) +} + +summarize_assignment <- function(year, out_path = out_dir) { + yr <- as.integer(year) + assigned_path <- file.path(out_path, paste0("assigned_year=", yr, ".parquet")) + if (!file.exists(assigned_path)) stop("assigned parquet not found: ", assigned_path) + extract_qc_summary(assigned_path, out_path) +} + +run_assignment <- function(year, cr = combined_root, + subset_test = do_subset_test, samp_per_pft = sample_per_pft, + out_path = out_dir) { + yr <- as.integer(year) + stopifnot("year must be a valid integer" = !is.na(yr)) + message("[1/9] Loading combined MSLSP year=", yr) + pheno <- load_combined_mslsp(yr, cr) + if (!"MULTIUSE" %in% names(pheno)) pheno[, MULTIUSE := NA_character_] + message("[2/9] Loading LandIQ year=", yr) + lookup <- fread(cropcode_csv) + ag_pairs <- unique(lookup[is_agricultural == TRUE, .(CLASS = trimws(CLASS), SUBCLASS = as.character(SUBCLASS), PFT)]) + ag_classes <- unique(ag_pairs$CLASS) + landiq <- as.data.table( + arrow::open_dataset(landiq_parq) |> + dplyr::filter(year == !!yr, CLASS %in% !!ag_classes) |> + dplyr::collect() + ) + landiq[, CLASS := trimws(as.character(CLASS))] + landiq[, SUBCLASS := as.character(SUBCLASS)] + # Normalize "no subclass" so CLASS X (fallow) and other ** in lookup match: NA, "", "**" -> "**" + landiq[is.na(SUBCLASS) | trimws(SUBCLASS) == "" | trimws(SUBCLASS) == "**", SUBCLASS := "**"] + landiq[, parcel_id := trimws(as.character(parcel_id))] + if ("MULTIUSE" %in% names(landiq)) landiq[, MULTIUSE := trimws(as.character(MULTIUSE))] + landiq[, PCNT := suppressWarnings(as.numeric(PCNT))] + landiq[, ADOY := suppressWarnings(as.numeric(ADOY))] + landiq[ADOY == 0, ADOY := NA_real_] + landiq[, year := as.integer(year)] + landiq <- merge(landiq, ag_pairs, by = c("CLASS", "SUBCLASS")) + setkey(landiq, parcel_id, year) + setkey(pheno, parcel_id, year) + fys <- if (isTRUE(assign_active_only)) unique(landiq[!is.na(PCNT) & PCNT >= 0, .(parcel_id, year)]) else unique(landiq[, .(parcel_id, year)]) + in_pheno <- unique(pheno[fys, nomatch = 0][, .(parcel_id, year)]) + if (nrow(in_pheno) == 0) stop("No field-years in common between combined MSLSP and LandIQ.") + message("[3/9] Overlap: ", nrow(in_pheno), " field-years in both") + fys <- in_pheno + if (length(assign_subset_ids) > 0) { + fys[, parcel_id := as.character(parcel_id)] + fys <- fys[parcel_id %in% assign_subset_ids] + if (nrow(fys) == 0) { + message("[3/9] No subset parcel_ids in overlap for year ", yr, "; skipping this year.") + return(invisible(list(assigned = data.table()))) + } + out_path <- file.path(out_path, "subsample_n400") + } else if (subset_test) { + n_take <- min(samp_per_pft, nrow(fys)) + fys <- fys[sample(.N, n_take)] + } + setorder(fys, parcel_id, year) + message("[4/9] Assigning ", nrow(fys), " field-years") + pheno_split <- split(pheno, pheno$parcel_id) + landiq_split <- split(landiq, landiq$parcel_id) + results <- lapply(fys$parcel_id, function(pid) { + cr <- pheno_split[[pid]]; if (is.null(cr)) cr <- pheno[0L] + lr <- landiq_split[[pid]]; if (is.null(lr)) lr <- landiq[0L] + assign_one_4rows(pid, yr, cr, lr) + }) + assigned <- rbindlist(lapply(results, `[[`, "assigned"), fill = TRUE) + assigned[, match_outcome := { + ac <- assignment_class_rollup(.SD) + fcase(ac == "matched_no_adoy", "matched_no_adoy", ac == "adoy_inside_and_single", "matched_adoy_validated", default = ac) + }, by = .(parcel_id, year)] + assigned[assigned_by == "no_match" & (is.na(qc_landiq_season_data) | qc_landiq_season_data != "landiq_season_has_data"), match_outcome := NA_character_] + dir.create(out_path, recursive = TRUE, showWarnings = FALSE) + out_assigned <- file.path(out_path, paste0("assigned_year=", yr, ".parquet")) + arrow::write_parquet(assigned, out_assigned) + extract_qc_summary(assigned, out_path) + invisible(list(assigned = assigned)) +} + +if (!is.null(year)) { + run_assignment(year, subset_test = do_subset_test, samp_per_pft = sample_per_pft) +} + +message("Loaded standalone matching script (rule-based + CLASS-aware).") + diff --git a/modules/data.remote/inst/ccmmf/tillage/tillage_metrics.R b/modules/data.remote/inst/ccmmf/tillage/tillage_metrics.R new file mode 100644 index 00000000000..181ad9a617d --- /dev/null +++ b/modules/data.remote/inst/ccmmf/tillage/tillage_metrics.R @@ -0,0 +1,178 @@ +# Tillage metrics from NDTI time series relative to phenology dates (OGI, OGMn). +# For each fallow window between harvest and next planting, interpolates NDTI, smooths, +# and derives tillage effect metrics (dplyr tibble in, tibble out). +# +# Main inputs: ndti_table with parcel_id, year, date, ndti_mean, ndti_sd, n_valid, PFT; +# phenology_table with parcel_id, year, OGI_date, OGMn_date (Date). Assign PFT before +# calling if NDTI lacks it. +# Main outputs: one row per fallow window with tillage_eff and related columns. +# How to run: source() and call tillage_metrics(); or sourced from make_events_statewide.R. +# Workflow: monitoring workflow DERIVE stage before statewide tillage events. + +# Neighbor NDTI obs (n_valid > 0) before/after target_date; pooled SD when +# target day has no obs. Vectorized over rows of q (parcel_id, target_date). +.min_ndti_neighbor_details <- function(rel_dt, q_dt) { + n <- nrow(q_dt) + prev_date <- as.Date(rep(NA_real_, n)) + prev_nv <- rep(NA_real_, n) + prev_ss <- rep(NA_real_, n) + foll_date <- as.Date(rep(NA_real_, n)) + foll_nv <- rep(NA_real_, n) + foll_ss <- rep(NA_real_, n) + + for (pid in unique(q_dt$parcel_id)) { + ix <- which(q_dt$parcel_id == pid) + rp <- rel_dt[rel_dt$parcel_id == pid, , drop = FALSE] + if (nrow(rp) == 0L) { + next + } + o <- order(rp$date) + rp <- rp[o, , drop = FALSE] + di <- as.integer(rp$date) + ti <- as.integer(q_dt$target_date[ix]) + + ip <- findInterval(ti - 1L, di) + hit <- ip >= 1L + if (any(hit)) { + ii <- ix[hit] + jj <- ip[hit] + prev_date[ii] <- rp$date[jj] + prev_nv[ii] <- rp$n_valid[jj] + prev_ss[ii] <- rp$ss[jj] + } + + fi <- findInterval(ti, di) + 1L + ok <- fi <= length(di) & di[fi] > ti + if (any(ok, na.rm = TRUE)) { + ii <- ix[ok] + jj <- fi[ok] + foll_date[ii] <- rp$date[jj] + foll_nv[ii] <- rp$n_valid[jj] + foll_ss[ii] <- rp$ss[jj] + } + } + + list( + prev_date = prev_date, + prev_nv = prev_nv, + prev_ss = prev_ss, + foll_date = foll_date, + foll_nv = foll_nv, + foll_ss = foll_ss + ) +} + +tillage_metrics <- function(ndti_table, phenology_table) { + ndti_work <- dplyr::as_tibble(ndti_table) |> + dplyr::mutate( + date = as.Date(date), + ss = ifelse(!is.na(n_valid) & n_valid > 0, n_valid * (ndti_sd^2), 0) + ) + + pheno_date <- dplyr::transmute( + phenology_table, + parcel_id, + year, + OGI_date, + OGMn_date + ) + + ndti_smooth <- ndti_work |> + dplyr::inner_join(pheno_date, by = c("parcel_id", "year")) |> + dplyr::arrange(parcel_id, date) |> + dplyr::group_by(parcel_id, year, PFT) |> + tidyr::complete(date = seq.Date(min(date), max(date), by = "day")) |> + tidyr::fill(OGMn_date, OGI_date, .direction = "downup") |> + dplyr::mutate( + mean_ndti_filled = zoo::na.approx(ndti_mean, x = date, na.rm = FALSE), + smoothed = as.numeric(stats::filter(mean_ndti_filled, rep(1 / 4, 4), sides = 2)) + ) |> + dplyr::ungroup() + + fallow_periods <- pheno_date |> + dplyr::arrange(parcel_id, OGI_date) |> + dplyr::group_by(parcel_id) |> + dplyr::mutate(fallow_start = OGMn_date, fallow_end = dplyr::lead(OGI_date)) |> + dplyr::filter(!is.na(fallow_end)) |> + dplyr::ungroup() + + joined_fb <- dplyr::inner_join( + ndti_smooth, + fallow_periods, + by = "parcel_id", + suffix = c("_ndti", "_fallow") + ) + yr_col <- if ("year_fallow" %in% names(joined_fb)) "year_fallow" else "year" + + base_metrics <- joined_fb |> + dplyr::filter(date >= fallow_start, date <= fallow_end) |> + dplyr::group_by(parcel_id, fallow_start) |> + dplyr::summarize( + min_idx = which.min(smoothed), + minNDTI_date = date[min_idx], + ndti_on_minNDTI = smoothed[min_idx], + max_pre_idx = which.max(ifelse(date <= minNDTI_date, smoothed, -Inf)), + maxNDTI_pre_date = date[max_pre_idx], + maxNDTI_pre_min = smoothed[max_pre_idx], + ndti_pct_change = ((maxNDTI_pre_min - ndti_on_minNDTI) / maxNDTI_pre_min) * 100, + year = dplyr::first(.data[[yr_col]]), + PFT = dplyr::first(PFT), + OGMn_date = dplyr::first(fallow_start), + .groups = "drop" + ) + + # One row per fallow window: min-day SD / neighbor counts (was rowwise + + # get_metric_details per row; res_max branch was never selected - dropped). + smooth <- dplyr::distinct(dplyr::as_tibble(ndti_smooth), parcel_id, date, .keep_all = TRUE) + rel <- dplyr::filter(smooth, !is.na(n_valid), n_valid > 0) + + q <- data.frame( + parcel_id = base_metrics$parcel_id, + target_date = as.Date(base_metrics$minNDTI_date), + stringsAsFactors = FALSE + ) + row_t <- match( + paste(q$parcel_id, as.integer(q$target_date), sep = "\r"), + paste(smooth$parcel_id, as.integer(smooth$date), sep = "\r") + ) + n_on <- rep(0L, nrow(q)) + okm <- !is.na(row_t) + if (any(okm)) { + nvv <- smooth$n_valid[row_t[okm]] + n_on[okm] <- as.integer(ifelse(is.na(nvv), 0L, nvv)) + } + tgt_sd <- rep(NA_real_, nrow(q)) + tgt_sd[okm] <- smooth$ndti_sd[row_t[okm]] + + nb <- .min_ndti_neighbor_details(rel, q) + sd_out <- ifelse(n_on > 0, tgt_sd, NA_real_) + w0 <- which(n_on == 0L) + if (length(w0) > 0L) { + nv <- ifelse(is.na(nb$prev_nv[w0]), 0, nb$prev_nv[w0]) + + ifelse(is.na(nb$foll_nv[w0]), 0, nb$foll_nv[w0]) + ss <- ifelse(is.na(nb$prev_ss[w0]), 0, nb$prev_ss[w0]) + + ifelse(is.na(nb$foll_ss[w0]), 0, nb$foll_ss[w0]) + pool <- sqrt(ss / nv) + pool[nv <= 0 | !is.finite(pool)] <- NA_real_ + sd_out[w0] <- pool + } + + dplyr::transmute( + base_metrics, + parcel_id, + year, + PFT, + OGMn_date, + max_date = maxNDTI_pre_date, + max_ndti = maxNDTI_pre_min, + min_date = minNDTI_date, + min_ndti = ndti_on_minNDTI, + min_n_valid = n_on, + min_sd = sd_out, + ndti_pct_change, + min_val_date_before = nb$prev_date, + min_val_n_before = nb$prev_nv, + min_val_date_after = nb$foll_date, + min_val_n_after = nb$foll_nv + ) +} diff --git a/modules/data.remote/inst/ccmmf/traits/build_harvest_lookup.R b/modules/data.remote/inst/ccmmf/traits/build_harvest_lookup.R new file mode 100644 index 00000000000..b7aa2166fa4 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/traits/build_harvest_lookup.R @@ -0,0 +1,121 @@ +# Build long-format harvest removal / litter fraction lookup for CCMMF. +# +# Starts from agricultural LandIQ codes with a PFT. Uses placeholder fractions +# by PFT (row, rice, hay, woody, woody_destructive) for AGB_REMOVED, AGB_LITTER, +# BGB_REMOVED, BGB_LITTER. Duplicates every woody row as woody_destructive so +# orchards can use a different harvest scenario. Stacks subclass, class, PFT, +# and global levels so pool_calculations can use the same fallback order as traits. +# +# Writes plant_traits/harvest_lookup_long.rds and .csv under CCMMF_MANAGEMENT. +# +# Run from management repo root: +# Rscript scripts/traits/build_harvest_lookup.R +# Env: CCMMF_MANAGEMENT + +#### Load packages + +suppressPackageStartupMessages({ + library(data.table) + library(dplyr) + library(readr) + library(tibble) + library(tidyr) +}) + +#### Paths and constants (override root with CCMMF_MANAGEMENT) + +path_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +landiq_lookup_csv <- file.path(path_management, "LandIQ_cropCode_lookup_table.csv") +plant_traits_dir <- file.path(path_management, "plant_traits") +out_rds <- file.path(plant_traits_dir, "harvest_lookup_long.rds") +out_csv <- file.path(plant_traits_dir, "harvest_lookup_long.csv") + +# Agricultural LandIQ rows only; need PFT (same filter as build_planting_lookup.R). +load_landiq_mapping <- function(path = landiq_lookup_csv) { + d <- as.data.frame(fread(path)) + d %>% + mutate(SUBCLASS = as.character(SUBCLASS)) %>% + filter(is_agricultural == TRUE, !is.na(PFT)) %>% + transmute(class = CLASS, subclass = SUBCLASS, crop_desc = SUBCLASS_desc, class_desc = CLASS_desc, PFT = PFT) +} + +# Until field-derived fractions exist, these are fixed means; sd_obs NA, n_obs 0. +placeholder_means <- tibble::tribble( + ~PFT, ~AGB_REMOVED, ~AGB_LITTER, ~BGB_REMOVED, ~BGB_LITTER, + "row", 0.80, 0.20, 0.00, 1.00, + "rice", 0.80, 0.20, 0.00, 1.00, + "hay", 0.75, 0.15, 0.00, 0.00, + "woody", 0.15, 0.015, 0.00, 0.00, + "woody_destructive", 0.80, 0.20, 0.00, 1.00 +) + +#### Build tables by level + +dir.create(plant_traits_dir, recursive = TRUE, showWarnings = FALSE) + +cat("Loading LandIQ mapping (agricultural classes only)...\n") +mapping <- load_landiq_mapping(landiq_lookup_csv) + +subclass_valid <- mapping %>% + distinct(class, subclass, crop_desc, PFT) + +# Class level uses CLASS_desc as the human-readable label (subclass left blank). +class_valid <- mapping %>% + distinct(level_id = class, class_desc, PFT) %>% + mutate(class = level_id, crop_desc = class_desc, subclass = NA_character_) %>% + select(level_id, class, subclass, crop_desc, PFT) + +# Harvest lookup distinguishes full orchard removal (woody_destructive) from partial woody harvest. +woody_sub <- subclass_valid %>% filter(PFT == "woody") %>% mutate(PFT = "woody_destructive") +subclass_valid <- bind_rows(subclass_valid, woody_sub) + +woody_cls <- class_valid %>% filter(PFT == "woody") %>% mutate(PFT = "woody_destructive") +class_valid <- bind_rows(class_valid, woody_cls) + +placeholders <- placeholder_means %>% + pivot_longer(-PFT, names_to = "param", values_to = "mean_obs") %>% + mutate(sd_obs = NA_real_, n_obs = 0L) + +tbl_sub <- subclass_valid %>% + left_join(placeholders, by = "PFT", relationship = "many-to-many") %>% + mutate(level = "subclass") + +tbl_class <- class_valid %>% + select(-level_id) %>% + left_join(placeholders, by = "PFT", relationship = "many-to-many") %>% + mutate(level = "class") + +# PFT-only rows: no spatial code; used when subclass and class both miss. +tbl_pft <- placeholders %>% + mutate( + level = "pft", + class = NA_character_, + subclass = NA_character_, + crop_desc = NA_character_ + ) + +# Global row per param: mean of the placeholder PFT values (still placeholders). +tbl_global <- placeholders %>% + group_by(param) %>% + summarise( + mean_obs = mean(mean_obs, na.rm = TRUE), + sd_obs = NA_real_, + n_obs = 0L, + .groups = "drop" + ) %>% + mutate( + level = "global", + class = NA_character_, + subclass = NA_character_, + crop_desc = NA_character_, + PFT = NA_character_ + ) + +harvest_lookup_long <- bind_rows(tbl_sub, tbl_class, tbl_pft, tbl_global) %>% + relocate(level, PFT, class, subclass, crop_desc, param, mean_obs, sd_obs, n_obs) %>% + arrange(level, PFT, class, subclass, param) + +#### Write outputs + +saveRDS(harvest_lookup_long, out_rds) +write_csv(harvest_lookup_long, out_csv) \ No newline at end of file diff --git a/modules/data.remote/inst/ccmmf/traits/build_harvest_lookup_faostat.R b/modules/data.remote/inst/ccmmf/traits/build_harvest_lookup_faostat.R new file mode 100644 index 00000000000..6b961b13a8b --- /dev/null +++ b/modules/data.remote/inst/ccmmf/traits/build_harvest_lookup_faostat.R @@ -0,0 +1,665 @@ +# Build long-format harvest lookup from FAOSTAT-style crop statistics plus LandIQ codes. +# Joins each FAOSTAT item to LandIQ crop_desc, aggregates DRYAD-style variables at subclass, +# then class x PFT, then PFT, then global, and derives harvest removal fractions (mass-balanced +# AGB using HI and yield, or clamped yield). Row levels follow coalesce(subclass, class, pft, +# placeholder, global) so pool_calculations_from_lookup.R can fall back the same way as +# build_harvest_lookup.R. If the Excel file is missing, uses HARVEST_PFT_SUMMARY_CSV only. +# +# Main inputs: CCMMF_MANAGEMENT; HARVEST_FAOSTAT_XLSX (columns item, variable, value); +# HARVEST_PFT_SUMMARY_CSV for PFT-only fallback; LandIQ_cropCode_lookup_table.csv. +# Main outputs: plant_traits/harvest_lookup_long_faostat.rds and .csv (does not overwrite harvest_lookup_long.rds). +# How to run: Rscript scripts/traits/build_harvest_lookup_faostat.R from repo root (set CCMMF_MANAGEMENT). +# Workflow: optional harvest lookup for events; set HARVEST_LOOKUP_RDS to this RDS in make_events_statewide.R. + +#### Load packages + +suppressPackageStartupMessages({ + library(data.table) + library(dplyr) + library(readr) + library(readxl) + library(stringr) + library(tibble) + library(tidyr) +}) + +#### Paths and constants + +path_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +landiq_lookup_csv <- file.path(path_management, "LandIQ_cropCode_lookup_table.csv") +plant_traits_dir <- file.path(path_management, "plant_traits") +out_rds <- file.path(plant_traits_dir, "harvest_lookup_long_faostat.rds") +out_csv <- file.path(plant_traits_dir, "harvest_lookup_long_faostat.csv") + +faostat_xlsx <- Sys.getenv( + "HARVEST_FAOSTAT_XLSX", + "/projectnb/dietzelab/mkim/Harvest Data/new_output.xlsx" +) +pft_summary_csv <- Sys.getenv( + "HARVEST_PFT_SUMMARY_CSV", + "/projectnb/dietzelab/mkim/Harvest Data/harvest_pools_output.csv" +) + +rs_root_shoot <- 0.20 + +harvest_param_names <- c("AGB_REMOVED", "AGB_LITTER", "BGB_REMOVED", "BGB_LITTER") + +placeholder_means_wide <- tibble::tribble( + ~PFT, ~AGB_REMOVED, ~AGB_LITTER, ~BGB_REMOVED, ~BGB_LITTER, + "row", 0.80, 0.20, 0.00, 1.00, + "rice", 0.80, 0.20, 0.00, 1.00, + "hay", 0.75, 0.15, 0.00, 0.00, + "woody", 0.15, 0.015, 0.00, 0.00, + "woody_destructive", 0.80, 0.20, 0.00, 1.00 +) + +req_var_names <- c( + "Maximum_Above_ground_biomass_kg_DM_ha", + "Mean_Grain_yield_kg_DM_ha", + "HI", + "CR_removed_pc", + "Mean_Above_ground_biomass_kg_DM_ha", + "Yield_kg_DM_ha", + "Mean_Harvest_index", + "Ratio_residues_removed_from_field" +) + +#### LandIQ load and FAOSTAT item to crop_desc mapping + +# Agricultural LandIQ rows only; need PFT (same as build_planting_lookup / harvest). +load_landiq_mapping <- function(path = landiq_lookup_csv) { + d <- as.data.frame(fread(path)) + d %>% + mutate(SUBCLASS = as.character(SUBCLASS)) %>% + filter(is_agricultural == TRUE, !is.na(PFT)) %>% + transmute(class = CLASS, subclass = SUBCLASS, crop_desc = SUBCLASS_desc, class_desc = CLASS_desc, PFT = PFT) +} + +norm_item <- function(x) { + x <- tolower(trimws(as.character(x))) + x[!nzchar(x) | is.na(x)] <- NA_character_ + x +} + +# Exact LandIQ SUBCLASS_desc strings for FAOSTAT items that do not match automatically. +manual_item_to_crop <- tibble::tibble( + item_clean = norm_item(c( + "maize", "rice, paddy", "soybeans", "wheat", "barley", "oats", "sunflower seed", + "rapeseed", "cotton", "sugar beets", "potatoes", "tomatoes", "bananas", + "cassava", "coconuts", "triticale", "lentils", "sesame seed", "tea", + "tobacco, unmanufactured", "vetches", "alfalfa" + )), + crop_desc = c( + "Corn (field & sweet)", + "Rice", + "Beans (dry)", + "Wheat", + "Barley", + "Oats", + "Sunflowers", + "Safflower", + "Cotton", + "Sugar beets", + "Potatoes", + "Tomatoes (processing)", + "Miscellaneous deciduous", + "Miscellaneous field", + "Miscellaneous deciduous", + "Miscellaneous grain and hay", + "Beans (dry)", + "Castor beans", + "Miscellaneous field", + "Miscellaneous field", + "Beans (dry)", + "Alfalfa and alfalfa mixtures" + ) +) + +map_item_to_landiq <- function(item_clean, codes_tbl, manual_tbl) { + if (is.na(item_clean)) { + return(tibble::tibble( + crop_desc = NA_character_, class = NA_character_, + subclass = NA_character_, PFT = NA_character_ + )) + } + hit <- manual_tbl %>% dplyr::filter(.data$item_clean == item_clean) + if (nrow(hit) == 1L) { + m <- codes_tbl %>% dplyr::filter(.data$crop_desc == hit$crop_desc[1]) + if (nrow(m) >= 1L) { + return(m %>% dplyr::slice(1) %>% dplyr::select(crop_desc, class, subclass, PFT)) + } + } + hits <- codes_tbl %>% dplyr::filter(.data$norm_crop == item_clean) + if (nrow(hits) == 1L) { + return(hits %>% dplyr::select(crop_desc, class, subclass, PFT)) + } + if (nrow(hits) > 1L) { + return(hits %>% dplyr::arrange(nchar(.data$crop_desc)) %>% dplyr::slice(1) %>% + dplyr::select(crop_desc, class, subclass, PFT)) + } + hits <- codes_tbl %>% dplyr::filter(stringr::str_detect(.data$norm_crop, stringr::fixed(item_clean))) + if (nrow(hits) >= 1L) { + return(hits %>% dplyr::arrange(nchar(.data$crop_desc)) %>% dplyr::slice(1) %>% + dplyr::select(crop_desc, class, subclass, PFT)) + } + words <- unlist(stringr::str_split(item_clean, "\\s+")) + words <- words[nzchar(words)] + if (length(words) > 0L) { + patt <- paste0("\\b(", paste(stringr::str_escape(words), collapse = "|"), ")\\b") + hits <- codes_tbl %>% dplyr::filter(stringr::str_detect(.data$norm_crop, patt)) + if (nrow(hits) >= 1L) { + return(hits %>% dplyr::arrange(nchar(.data$crop_desc)) %>% dplyr::slice(1) %>% + dplyr::select(crop_desc, class, subclass, PFT)) + } + } + tibble::tibble( + crop_desc = NA_character_, class = NA_character_, + subclass = NA_character_, PFT = NA_character_ + ) +} + +#### Harvest fractions from wide trait row (HI, yield, residue) + +harvest_fractions_from_wide <- function(df_wide, pft_for_placeholder, cr_fallback) { + for (cn in req_var_names) { + if (!cn %in% names(df_wide)) { + df_wide[[cn]] <- NA_real_ + } + } + cr_use <- suppressWarnings(as.numeric(df_wide$CR_removed_pc[1])) + if (is.na(cr_use)) { + cr_use <- cr_fallback + } + cr <- dplyr::coalesce( + suppressWarnings(as.numeric(df_wide$CR_removed_pc[1])), + suppressWarnings(as.numeric(df_wide$Ratio_residues_removed_from_field[1])), + cr_use + ) + if (is.na(cr) || !is.finite(cr)) { + cr <- 0 + } + cr <- pmin(pmax(cr, 0), 100) + + agb <- suppressWarnings(as.numeric(df_wide$Mean_Above_ground_biomass_kg_DM_ha[1])) + if (is.na(agb) || !is.finite(agb) || agb <= 0) { + agb <- suppressWarnings(as.numeric(df_wide$Maximum_Above_ground_biomass_kg_DM_ha[1])) + } + hi_raw <- dplyr::coalesce( + suppressWarnings(as.numeric(df_wide$HI[1])), + suppressWarnings(as.numeric(df_wide$Mean_Harvest_index[1])) + ) + yield_obs <- dplyr::coalesce( + suppressWarnings(as.numeric(df_wide$Yield_kg_DM_ha[1])), + suppressWarnings(as.numeric(df_wide$Mean_Grain_yield_kg_DM_ha[1])) + ) + + yield_use <- NA_real_ + residue <- NA_real_ + if (!is.na(hi_raw) && is.finite(hi_raw)) { + hi_c <- pmin(pmax(hi_raw, 0), 1) + yield_use <- hi_c * agb + residue <- agb - yield_use + } else if (!is.na(yield_obs) && is.finite(yield_obs) && !is.na(agb) && agb > 0) { + yield_use <- pmin(pmax(yield_obs, 0), agb) + residue <- agb - yield_use + } + + ph <- placeholder_means_wide %>% dplyr::filter(.data$PFT == pft_for_placeholder) + if (nrow(ph) != 1L) { + ph <- placeholder_means_wide %>% dplyr::filter(.data$PFT == "row") + } + + if (is.na(agb) || !is.finite(agb) || agb <= 0 || is.na(yield_use) || is.na(residue)) { + return(c( + AGB_REMOVED = ph$AGB_REMOVED[1], AGB_LITTER = ph$AGB_LITTER[1], + BGB_REMOVED = ph$BGB_REMOVED[1], BGB_LITTER = ph$BGB_LITTER[1], + source = "placeholder" + )) + } + + agb_rm <- yield_use + residue * (cr / 100) + agb_lit <- residue * (1 - cr / 100) + c( + AGB_REMOVED = agb_rm / agb, AGB_LITTER = agb_lit / agb, + BGB_REMOVED = 0, BGB_LITTER = 1, + source = "calculated" + ) +} + +agg_to_fraction_row <- function(agg_slice, pft_label, cr_fb) { + if (nrow(agg_slice) == 0L) { + return(NULL) + } + wide <- agg_slice %>% + dplyr::select(variable, value) %>% + tidyr::pivot_wider(names_from = variable, values_from = value) + v <- harvest_fractions_from_wide(wide, pft_label, cr_fb) + tibble::as_tibble(as.list(v)) +} + +#### Read FAOSTAT Excel and PFT summary CSV fallback + +read_faostat_excel <- function(path_xlsx) { + if (!nzchar(path_xlsx) || !file.exists(path_xlsx)) { + return(NULL) + } + tryCatch( + readxl::read_excel(path_xlsx), + error = function(e) { + warning("Could not read HARVEST_FAOSTAT_XLSX: ", conditionMessage(e)) + NULL + } + ) +} + +try_pft_summary_csv <- function(path_csv) { + if (!nzchar(path_csv) || !file.exists(path_csv)) { + return(tibble::tibble()) + } + raw <- readr::read_csv(path_csv, show_col_types = FALSE) + pc_cols <- c("AGB_REMOVED_pc", "AGB_LITTER_pc", "BGB_REMOVED_pc", "BGB_LITTER_pc") + if (!all(pc_cols %in% names(raw))) { + stop("HARVEST_PFT_SUMMARY_CSV must contain columns: ", paste(pc_cols, collapse = ", ")) + } + raw %>% + dplyr::mutate( + AGB_REMOVED = .data$AGB_REMOVED_pc / 100, + AGB_LITTER = .data$AGB_LITTER_pc / 100, + BGB_REMOVED = .data$BGB_REMOVED_pc / 100, + BGB_LITTER = .data$BGB_LITTER_pc / 100, + n_crops = if ("n_crops" %in% names(raw)) as.integer(.data$n_crops) else 0L, + source = if ("source" %in% names(raw)) as.character(.data$source) else "csv" + ) %>% + dplyr::select(PFT, AGB_REMOVED, AGB_LITTER, BGB_REMOVED, BGB_LITTER, n_crops, source) +} + +merge_pft_with_placeholders <- function(summary_tbl) { + ph <- placeholder_means_wide %>% dplyr::mutate(PFT = as.character(PFT)) + if (nrow(summary_tbl) == 0) { + return(ph %>% dplyr::mutate(n_crops = 0L, source = "placeholder")) + } + keyed <- summary_tbl %>% + dplyr::mutate(PFT = as.character(PFT)) %>% + dplyr::rename( + AGB_REMOVED_f = AGB_REMOVED, + AGB_LITTER_f = AGB_LITTER, + BGB_REMOVED_f = BGB_REMOVED, + BGB_LITTER_f = BGB_LITTER + ) + ph %>% + dplyr::left_join(keyed, by = "PFT") %>% + dplyr::mutate( + AGB_REMOVED = dplyr::coalesce(.data$AGB_REMOVED_f, .data$AGB_REMOVED), + AGB_LITTER = dplyr::coalesce(.data$AGB_LITTER_f, .data$AGB_LITTER), + BGB_REMOVED = dplyr::coalesce(.data$BGB_REMOVED_f, .data$BGB_REMOVED), + BGB_LITTER = dplyr::coalesce(.data$BGB_LITTER_f, .data$BGB_LITTER), + n_crops = dplyr::coalesce(as.integer(.data$n_crops), 0L), + source = dplyr::coalesce(.data$source, "placeholder") + ) %>% + dplyr::select(PFT, AGB_REMOVED, AGB_LITTER, BGB_REMOVED, BGB_LITTER, n_crops, source) +} + +#### LandIQ subclass and class frames (woody vs woody_destructive) + +# Same woody / woody_destructive duplication as build_harvest_lookup.R. +landiq_subclass_class_frames <- function(mapping_df) { + subclass_valid <- mapping_df %>% dplyr::distinct(class, subclass, crop_desc, PFT) + woody_sub <- subclass_valid %>% dplyr::filter(PFT == "woody") %>% dplyr::mutate(PFT = "woody_destructive") + subclass_valid <- dplyr::bind_rows(subclass_valid, woody_sub) + class_valid <- mapping_df %>% + dplyr::distinct(level_id = class, class_desc, PFT) %>% + dplyr::mutate(class = level_id, crop_desc = class_desc, subclass = NA_character_) %>% + dplyr::select(level_id, class, subclass, crop_desc, PFT) + woody_cls <- class_valid %>% dplyr::filter(PFT == "woody") %>% dplyr::mutate(PFT = "woody_destructive") + class_valid <- dplyr::bind_rows(class_valid, woody_cls) + list(subclass_valid = subclass_valid, class_valid = class_valid) +} + +# Long harvest params from wide PFT table (CSV path or internal); same shape as build_harvest_lookup join. +pft_wide_to_placeholders_long <- function(pft_wide_tbl) { + pft_wide_tbl %>% + tidyr::pivot_longer( + c(AGB_REMOVED, AGB_LITTER, BGB_REMOVED, BGB_LITTER), + names_to = "param", + values_to = "mean_obs" + ) %>% + dplyr::mutate( + sd_obs = NA_real_, + n_obs = as.integer(ifelse(.data$source == "placeholder" | is.na(.data$n_crops), 0L, .data$n_crops)) + ) %>% + dplyr::select(PFT, param, mean_obs, sd_obs, n_obs) +} + +# Stack subclass / class / pft / global from PFT-only long rows (CSV fallback = build_harvest_lookup.R layout). +harvest_long_from_pft_placeholders <- function(mapping_df, placeholders_long) { + skel <- landiq_subclass_class_frames(mapping_df) + tbl_sub <- skel$subclass_valid %>% + dplyr::left_join(placeholders_long, by = "PFT", relationship = "many-to-many") %>% + dplyr::mutate(level = "subclass") + tbl_class <- skel$class_valid %>% + dplyr::select(-level_id) %>% + dplyr::left_join(placeholders_long, by = "PFT", relationship = "many-to-many") %>% + dplyr::mutate(level = "class") + tbl_pft <- placeholders_long %>% + dplyr::mutate(level = "pft", class = NA_character_, subclass = NA_character_, crop_desc = NA_character_) + tbl_global <- placeholders_long %>% + dplyr::group_by(param) %>% + dplyr::summarise(mean_obs = mean(mean_obs, na.rm = TRUE), sd_obs = NA_real_, n_obs = 0L, .groups = "drop") %>% + dplyr::mutate( + level = "global", + class = NA_character_, + subclass = NA_character_, + crop_desc = NA_character_, + PFT = NA_character_ + ) + dplyr::bind_rows(tbl_sub, tbl_class, tbl_pft, tbl_global) %>% + dplyr::relocate(level, PFT, class, subclass, crop_desc, param, mean_obs, sd_obs, n_obs) %>% + dplyr::arrange(level, PFT, class, subclass, param) +} + +write_harvest_lookup_outputs <- function(harvest_lookup_long, rds_path, csv_path) { + saveRDS(harvest_lookup_long, rds_path) + readr::write_csv(harvest_lookup_long, csv_path) + cat("Wrote ", rds_path, "\n", sep = "") + cat("Wrote ", csv_path, "\n", sep = "") +} + +#### Create output directory + +dir.create(plant_traits_dir, recursive = TRUE, showWarnings = FALSE) + +#### Load LandIQ + +cat("Loading LandIQ mapping (agricultural classes only)...\n") +mapping <- load_landiq_mapping(landiq_lookup_csv) +codes <- mapping %>% + dplyr::distinct(crop_desc, class, subclass, PFT) %>% + dplyr::mutate(norm_crop = norm_item(.data$crop_desc)) + +harvest_tbl <- read_faostat_excel(faostat_xlsx) +use_excel <- !is.null(harvest_tbl) && + all(c("item", "variable", "value") %in% names(harvest_tbl)) + +if (!use_excel) { + cat("No usable FAOSTAT Excel; using PFT summary CSV path.\n") + summary_csv <- try_pft_summary_csv(pft_summary_csv) + if (nrow(summary_csv) == 0) { + stop("Need HARVEST_FAOSTAT_XLSX with item/variable/value or HARVEST_PFT_SUMMARY_CSV.") + } + pft_wide <- merge_pft_with_placeholders(summary_csv) + pl <- pft_wide_to_placeholders_long(pft_wide) + harvest_lookup_long <- harvest_long_from_pft_placeholders(mapping, pl) + write_harvest_lookup_outputs(harvest_lookup_long, out_rds, out_csv) + quit(save = "no", status = 0) +} + +cat("Joining FAOSTAT items to LandIQ crop_desc (planting-style identity)...\n") + +cr_fb <- NA_real_ +if ("original_crop" %in% names(harvest_tbl)) { + cr_fb <- harvest_tbl %>% + dplyr::filter(.data$original_crop == "All_crops", .data$variable == "CR_removed_pc") %>% + dplyr::summarise(v = mean(as.numeric(.data$value), na.rm = TRUE)) %>% + dplyr::pull(v) +} + +traits_long <- harvest_tbl %>% + dplyr::mutate( + item_clean = norm_item(.data$item), + value_num = suppressWarnings(as.numeric(.data$value)) + ) %>% + dplyr::filter(!is.na(.data$item_clean), !is.na(.data$variable)) + +uniq_items <- unique(traits_long$item_clean) +item_rows <- vector("list", length(uniq_items)) +for (ii in seq_along(uniq_items)) { + item_rows[[ii]] <- map_item_to_landiq(uniq_items[ii], codes, manual_item_to_crop) %>% + dplyr::mutate(item_clean = uniq_items[ii]) +} +item_map <- dplyr::bind_rows(item_rows) + +n_mapped <- sum(!is.na(item_map$class)) +cat(" Mapped ", n_mapped, " of ", nrow(item_map), " distinct items to a LandIQ subclass.\n", sep = "") + +harvest_join <- traits_long %>% + dplyr::left_join(item_map, by = "item_clean") %>% + dplyr::filter(!is.na(.data$class), !is.na(.data$subclass)) + +cat(" Trait rows after LandIQ join: ", nrow(harvest_join), "\n", sep = "") + +if (nrow(harvest_join) == 0L) { + stop( + "No FAOSTAT rows joined to LandIQ (check item names vs SUBCLASS_desc and manual_item_to_crop). ", + "Or use HARVEST_PFT_SUMMARY_CSV only by removing/renaming the Excel path." + ) +} + +#### Summarize at subclass (mean per variable, like TRY to subclass) + +sub_agg <- harvest_join %>% + dplyr::group_by(class, subclass, crop_desc, PFT, variable) %>% + dplyr::summarise( + value = mean(.data$value_num, na.rm = TRUE), + n_obs = dplyr::n(), + .groups = "drop" + ) + +sub_keys <- sub_agg %>% + dplyr::distinct(class, subclass, crop_desc, PFT) + +cat("Computing harvest fractions per subclass key...\n") +sub_frac_list <- vector("list", nrow(sub_keys)) +for (i in seq_len(nrow(sub_keys))) { + k <- sub_keys[i, , drop = FALSE] + sl <- sub_agg %>% + dplyr::filter( + .data$class == k$class[1], .data$subclass == k$subclass[1], + .data$crop_desc == k$crop_desc[1], .data$PFT == k$PFT[1] + ) + fr <- agg_to_fraction_row(sl, k$PFT[1], cr_fb) + if (!is.null(fr)) { + sub_frac_list[[i]] <- dplyr::bind_cols(k, fr) + } +} +frac_sub <- dplyr::bind_rows(sub_frac_list) + +#### Summarize at class x PFT (re-aggregate raw rows, like planting class level) + +class_agg <- harvest_join %>% + dplyr::group_by(class, PFT, variable) %>% + dplyr::summarise(value = mean(.data$value_num, na.rm = TRUE), n_obs = dplyr::n(), .groups = "drop") + +class_keys <- class_agg %>% dplyr::distinct(class, PFT) + +cat("Computing harvest fractions per class x PFT...\n") +class_frac_list <- vector("list", nrow(class_keys)) +for (i in seq_len(nrow(class_keys))) { + k <- class_keys[i, , drop = FALSE] + sl <- class_agg %>% + dplyr::filter(.data$class == k$class[1], .data$PFT == k$PFT[1]) + fr <- agg_to_fraction_row(sl, k$PFT[1], cr_fb) + if (!is.null(fr)) { + class_frac_list[[i]] <- dplyr::bind_cols( + tibble::tibble(class = k$class[1], PFT = k$PFT[1]), + fr + ) + } +} +frac_class <- dplyr::bind_rows(class_frac_list) + +#### Summarize at PFT and global + +pft_agg <- harvest_join %>% + dplyr::group_by(PFT, variable) %>% + dplyr::summarise(value = mean(.data$value_num, na.rm = TRUE), n_obs = dplyr::n(), .groups = "drop") + +pft_keys <- pft_agg %>% dplyr::distinct(PFT) + +cat("Computing harvest fractions per PFT...\n") +pft_frac_list <- vector("list", nrow(pft_keys)) +for (i in seq_len(nrow(pft_keys))) { + pf <- pft_keys$PFT[i] + sl <- pft_agg %>% dplyr::filter(.data$PFT == pf) + fr <- agg_to_fraction_row(sl, pf, cr_fb) + if (!is.null(fr)) { + pft_frac_list[[i]] <- dplyr::bind_cols(tibble::tibble(PFT = pf), fr) + } +} +frac_pft <- dplyr::bind_rows(pft_frac_list) + +glob_agg <- harvest_join %>% + dplyr::group_by(variable) %>% + dplyr::summarise(value = mean(.data$value_num, na.rm = TRUE), .groups = "drop") +dom_pft <- names(sort(table(harvest_join$PFT), decreasing = TRUE))[1] +if (is.na(dom_pft)) { + dom_pft <- "row" +} +fr_g <- agg_to_fraction_row( + glob_agg %>% dplyr::transmute(variable, value), + dom_pft, + cr_fb +) +if (is.null(fr_g) || nrow(fr_g) == 0L) { + stop("Global harvest fraction row is empty; check FAOSTAT variables.") +} +fr_g_num <- function(nm) suppressWarnings(as.numeric(fr_g[[nm]][1])) + +ph_wide <- placeholder_means_wide %>% dplyr::mutate(PFT = as.character(PFT)) +ph_long <- ph_wide %>% + tidyr::pivot_longer(c(AGB_REMOVED, AGB_LITTER, BGB_REMOVED, BGB_LITTER), + names_to = "param", values_to = "ph" + ) + +frac_sub_l <- frac_sub %>% + tidyr::pivot_longer(c(AGB_REMOVED, AGB_LITTER, BGB_REMOVED, BGB_LITTER), + names_to = "param", values_to = "v_sub" + ) %>% + dplyr::select(class, subclass, crop_desc, PFT, param, v_sub) +frac_class_l <- frac_class %>% + tidyr::pivot_longer(c(AGB_REMOVED, AGB_LITTER, BGB_REMOVED, BGB_LITTER), + names_to = "param", values_to = "v_class" + ) %>% + dplyr::select(class, PFT, param, v_class) +frac_pft_l <- frac_pft %>% + tidyr::pivot_longer(c(AGB_REMOVED, AGB_LITTER, BGB_REMOVED, BGB_LITTER), + names_to = "param", values_to = "v_pft" + ) %>% + dplyr::select(PFT, param, v_pft) + +g_vals <- tibble::tibble( + param = harvest_param_names, + v_glob = c( + fr_g_num("AGB_REMOVED"), fr_g_num("AGB_LITTER"), + fr_g_num("BGB_REMOVED"), fr_g_num("BGB_LITTER") + ) +) + +#### Assemble long lookup (coalesce subclass, class, pft, placeholder, global) + +skel <- landiq_subclass_class_frames(mapping) + +tbl_sub <- skel$subclass_valid %>% + tidyr::crossing(param = harvest_param_names) %>% + dplyr::left_join(frac_sub_l, by = c("class", "subclass", "crop_desc", "PFT", "param")) %>% + dplyr::left_join(frac_class_l, by = c("class", "PFT", "param")) %>% + dplyr::left_join(frac_pft_l, by = c("PFT", "param")) %>% + dplyr::left_join(g_vals, by = "param") %>% + dplyr::left_join(ph_long, by = c("PFT", "param")) %>% + dplyr::mutate( + # Placeholder before global so woody_destructive does not take the global cereal mean. + mean_obs = dplyr::coalesce( + suppressWarnings(as.numeric(.data$v_sub)), + suppressWarnings(as.numeric(.data$v_class)), + suppressWarnings(as.numeric(.data$v_pft)), + suppressWarnings(as.numeric(.data$ph)), + suppressWarnings(as.numeric(.data$v_glob)) + ), + src = dplyr::case_when( + !is.na(.data$v_sub) ~ "subclass", + !is.na(.data$v_class) ~ "class", + !is.na(.data$v_pft) ~ "pft", + !is.na(.data$v_glob) ~ "global", + TRUE ~ "placeholder" + ), + sd_obs = NA_real_, + n_obs = 0L, + level = "subclass" + ) %>% + dplyr::select(level, PFT, class, subclass, crop_desc, param, mean_obs, sd_obs, n_obs, src) + +tbl_class <- skel$class_valid %>% + dplyr::select(-level_id) %>% + tidyr::crossing(param = harvest_param_names) %>% + dplyr::left_join(frac_class_l, by = c("class", "PFT", "param")) %>% + dplyr::left_join(frac_pft_l, by = c("PFT", "param")) %>% + dplyr::left_join(g_vals, by = "param") %>% + dplyr::left_join(ph_long, by = c("PFT", "param")) %>% + dplyr::mutate( + mean_obs = dplyr::coalesce( + suppressWarnings(as.numeric(.data$v_class)), + suppressWarnings(as.numeric(.data$v_pft)), + suppressWarnings(as.numeric(.data$ph)), + suppressWarnings(as.numeric(.data$v_glob)) + ), + src = dplyr::case_when( + !is.na(.data$v_class) ~ "class", + !is.na(.data$v_pft) ~ "pft", + !is.na(.data$v_glob) ~ "global", + TRUE ~ "placeholder" + ), + sd_obs = NA_real_, + n_obs = 0L, + level = "class" + ) %>% + dplyr::select(level, PFT, class, subclass, crop_desc, param, mean_obs, sd_obs, n_obs, src) + +tbl_pft <- tidyr::crossing( + PFT = unique(ph_wide$PFT), + param = harvest_param_names +) %>% + dplyr::left_join(frac_pft_l, by = c("PFT", "param")) %>% + dplyr::left_join(g_vals, by = "param") %>% + dplyr::left_join(ph_long, by = c("PFT", "param")) %>% + dplyr::mutate( + mean_obs = dplyr::coalesce( + suppressWarnings(as.numeric(.data$v_pft)), + suppressWarnings(as.numeric(.data$ph)), + suppressWarnings(as.numeric(.data$v_glob)) + ), + sd_obs = NA_real_, + n_obs = 0L, + level = "pft", + class = NA_character_, + subclass = NA_character_, + crop_desc = NA_character_ + ) %>% + dplyr::select(level, PFT, class, subclass, crop_desc, param, mean_obs, sd_obs, n_obs) + +tbl_global <- tibble::tibble( + param = harvest_param_names, + mean_obs = c( + fr_g_num("AGB_REMOVED"), fr_g_num("AGB_LITTER"), + fr_g_num("BGB_REMOVED"), fr_g_num("BGB_LITTER") + ), + sd_obs = NA_real_, + n_obs = 0L, + level = "global", + class = NA_character_, + subclass = NA_character_, + crop_desc = NA_character_, + PFT = NA_character_ +) %>% + dplyr::select(level, PFT, class, subclass, crop_desc, param, mean_obs, sd_obs, n_obs) + +harvest_lookup_long <- dplyr::bind_rows( + tbl_sub %>% dplyr::select(-src), + tbl_class %>% dplyr::select(-src), + tbl_pft, + tbl_global +) %>% + dplyr::relocate(level, PFT, class, subclass, crop_desc, param, mean_obs, sd_obs, n_obs) %>% + dplyr::arrange(level, PFT, class, subclass, param) + +#### Write RDS and CSV + +write_harvest_lookup_outputs(harvest_lookup_long, out_rds, out_csv) diff --git a/modules/data.remote/inst/ccmmf/traits/build_planting_lookup.R b/modules/data.remote/inst/ccmmf/traits/build_planting_lookup.R new file mode 100644 index 00000000000..69936a7edcf --- /dev/null +++ b/modules/data.remote/inst/ccmmf/traits/build_planting_lookup.R @@ -0,0 +1,342 @@ +# Build long-format planting trait lookup for CCMMF / SIPNET-style pool init. +# +# Reads TRY master_data, maps each observation's species to a LandIQ SUBCLASS_desc +# crop group (genus lookup), converts trait values to common units, then keeps +# only rows that join to an agricultural LandIQ code with a PFT. Aggregates to +# four fallback levels used downstream: subclass (class+subclass), class crossed +# with PFT, PFT-only, and global. TRY SLA traits 3115-3117 are merged into one +# pooled SLA trait (SLA_POOLED) before aggregation. +# +# Writes plant_traits/planting_lookup_long.rds and planting_lookup_long.csv +# under CCMMF_MANAGEMENT (default: .../ccmmf/management). +# +# Run from management repo root: +# Rscript scripts/traits/build_planting_lookup.R +# Env: CCMMF_MANAGEMENT, TRY_MASTER_DATA (path to master_data.RData with object master_data). + +#### Load packages + +suppressPackageStartupMessages({ + library(data.table) + library(dplyr) + library(readr) + library(tibble) +}) + +#### Paths and outputs (CCMMF_MANAGEMENT and TRY_MASTER_DATA override defaults) + +path_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +master_data_path <- Sys.getenv("TRY_MASTER_DATA", "/projectnb/dietzelab/mkim/TRYDataR/master_data.RData") +landiq_lookup_csv <- file.path(path_management, "LandIQ_cropCode_lookup_table.csv") +plant_traits_dir <- file.path(path_management, "plant_traits") +out_rds <- file.path(plant_traits_dir, "planting_lookup_long.rds") +out_csv <- file.path(plant_traits_dir, "planting_lookup_long.csv") + +#### SLA pool: TRY IDs 3115, 3116, 3117 combined into one pseudo-trait + +sla_component_ids <- c(3115, 3116, 3117) +sla_pooled_key <- "SLA_POOLED" +sla_pooled_name <- "SLA (combined: 3115/3116/3117)" + +#### Helper functions + +# Agricultural LandIQ rows only; need PFT (same definition as build_harvest_lookup.R). +load_landiq_mapping <- function(path = landiq_lookup_csv) { + d <- as.data.frame(fread(path)) + d %>% + mutate(SUBCLASS = as.character(SUBCLASS)) %>% + filter(is_agricultural == TRUE, !is.na(PFT)) %>% + transmute(class = CLASS, subclass = SUBCLASS, crop_desc = SUBCLASS_desc, class_desc = CLASS_desc, PFT = PFT) +} + + +# TRY reports mixed units; this maps OrigUnitStr to the unit system we aggregate in. +normalize_units <- function(trait_id, value, unit_str) { + if (is.na(value)) return(NA_real_) + trait_id <- as.numeric(trait_id) + value_num <- suppressWarnings(as.numeric(value)) + + if (trait_id == 14) { + if (is.na(unit_str) || unit_str == "") return(value_num) + if (grepl("mg/g|mg g-1|mg N g-1|g/100g", unit_str, ignore.case = TRUE)) return(value_num) + if (grepl("%", unit_str, ignore.case = TRUE)) return(value_num * 10) + if (grepl("g/kg", unit_str, ignore.case = TRUE)) return(value_num) + return(value_num) + } + + if (trait_id %in% c(3115, 3116, 3117)) { + if (is.na(unit_str) || unit_str == "") return(NA_real_) + if (grepl("mm2/mg|mm2 mg-1|mm\\^2 mg-1", unit_str, ignore.case = TRUE)) return(value_num) + if (grepl("cm2/g|cm2 g-1|cm\\^2 g-1", unit_str, ignore.case = TRUE)) return(value_num * 0.1) + if (grepl("m2/kg|m\\^2 kg-1", unit_str, ignore.case = TRUE)) return(value_num) + if (grepl("g/m2|g m-2|mg/dm2", unit_str, ignore.case = TRUE)) return(NA_real_) + return(NA_real_) + } + + if (trait_id %in% c(3441, 128, 3450, 3952, 3953)) { + if (is.na(unit_str) || unit_str == "") return(value_num) + if (grepl("kg", unit_str, ignore.case = TRUE)) return(value_num * 1000) + if (grepl("mg", unit_str, ignore.case = TRUE)) return(value_num / 1000) + return(value_num) + } + + if (trait_id %in% c(1534, 2005)) { + if (is.na(unit_str) || unit_str == "") return(value_num) + if (grepl("%", unit_str, ignore.case = TRUE)) return(value_num / 100) + if (grepl("g/g|g g-1", unit_str, ignore.case = TRUE)) return(value_num) + return(value_num) + } + + if (trait_id %in% c(146, 165, 2057, 1055)) return(value_num) + if (is.na(unit_str) || unit_str == "") return(value_num) + return(value_num) +} + + +# First word of AccSpeciesName is genus; value must match LandIQ SUBCLASS_desc text. +genus_to_group <- c( + # Berries & small fruits + "Ribes" = "Bush berries", + "Rubus" = "Bush berries", + "Vaccinium" = "Bush berries", + "Fragaria" = "Strawberries", + # Tree fruits + "Malus" = "Apples", + "Pyrus" = "Pears", + "Vitis" = "Wine grapes", + "Persea" = "Avocados", + "Olea" = "Olives", + "Juglans" = "Walnuts", + "Pistacia" = "Pistachios", + "Punica" = "Pomegranates", + "Phoenix" = "Dates", + # Grains & row crops + "Zea" = "Corn, Sorghum or Sudan (grouped for RS only)", + "Sorghum" = "Corn, Sorghum or Sudan (grouped for RS only)", + "Triticum" = "Wheat", + "Oryza" = "Rice", + "Zizania" = "Wild rice", + "Gossypium" = "Cotton", + "Helianthus" = "Sunflowers", + # Legumes & forages + "Medicago" = "Alfalfa and alfalfa mixtures", + "Vigna" = "Beans (dry)", + "Glycine" = "Beans (dry)", + "Phaseolus" = "Beans (dry)", + # Vegetables + "Spinacia" = "Spinach", + "Lactuca" = "Lettuce (all types)", + "Brassica" = "Cole crops (mixture of 22-25)", + "Daucus" = "Carrots", + "Cucurbita" = "Melons, squash, cucumbers", + "Cucumis" = "Melons, squash, cucumbers", + "Capsicum" = "Peppers", + "Solanum" = "Potatoes", + "Ipomoea" = "Sweet potatoes", + "Allium" = "Onions & garlic", + # Miscellaneous + "Prunus" = "Miscellaneous deciduous", + "Agrostis" = "Miscellaneous grasses", + "Festuca" = "Miscellaneous grasses", + "Lolium" = "Miscellaneous grasses", + "Poa" = "Miscellaneous grasses", + "Achillea" = "Mixed pasture", + "Artemisia" = "Mixed pasture", + "Carex" = "Mixed pasture", + "Juncus" = "Mixed pasture", + "Coffea" = "Miscellaneous field", + "unknown" = "Miscellaneous field" +) + +map_species_to_group <- function(species_vec) { + genus <- sub("^(\\w+).*", "\\1", species_vec) + group <- genus_to_group[genus] + group[is.na(group)] <- "NA" + unname(group) +} + + +# Per-level means plus species- and dataset-level diagnostics for the long lookup table. +summarize_level <- function(df, level, level_id_col) { + id_sym <- rlang::sym(level_id_col) + + obs <- df %>% + group_by(!!id_sym, TraitKey, TraitID, TraitName) %>% + summarise( + mean_obs = mean(value_std, na.rm = TRUE), + sd_obs = sd(value_std, na.rm = TRUE), + n_obs = sum(!is.na(value_std)), + n_species = n_distinct(AccSpeciesName), + n_datasets = n_distinct(DatasetID), + .groups = "drop" + ) %>% + rename(level_id = !!id_sym) + + sp <- df %>% + group_by(!!id_sym, TraitKey, TraitID, TraitName, AccSpeciesName) %>% + summarise( + mean_sp = mean(value_std, na.rm = TRUE), + n_obs_sp = sum(!is.na(value_std)), + .groups = "drop" + ) + sp_sum <- sp %>% + group_by(!!id_sym, TraitKey, TraitID, TraitName) %>% + summarise( + mean_species = mean(mean_sp, na.rm = TRUE), + sd_species_mean = sd(mean_sp, na.rm = TRUE), + min_species_n_obs = suppressWarnings(min(n_obs_sp, na.rm = TRUE)), + median_species_n_obs = suppressWarnings(stats::median(n_obs_sp, na.rm = TRUE)), + max_species_n_obs = suppressWarnings(max(n_obs_sp, na.rm = TRUE)), + .groups = "drop" + ) %>% + rename(level_id = !!id_sym) + + ds <- df %>% + group_by(!!id_sym, TraitKey, TraitID, TraitName, DatasetID) %>% + summarise( + mean_ds = mean(value_std, na.rm = TRUE), + n_obs_ds = sum(!is.na(value_std)), + .groups = "drop" + ) + ds_sum <- ds %>% + group_by(!!id_sym, TraitKey, TraitID, TraitName) %>% + summarise( + mean_dataset = mean(mean_ds, na.rm = TRUE), + sd_dataset_mean = sd(mean_ds, na.rm = TRUE), + min_dataset_n_obs = suppressWarnings(min(n_obs_ds, na.rm = TRUE)), + median_dataset_n_obs = suppressWarnings(stats::median(n_obs_ds, na.rm = TRUE)), + max_dataset_n_obs = suppressWarnings(max(n_obs_ds, na.rm = TRUE)), + .groups = "drop" + ) %>% + rename(level_id = !!id_sym) + + obs %>% + left_join(sp_sum, by = c("level_id", "TraitKey", "TraitID", "TraitName")) %>% + left_join(ds_sum, by = c("level_id", "TraitKey", "TraitID", "TraitName")) %>% + mutate(level = level) %>% + relocate(level, level_id, TraitKey, TraitID, TraitName) +} + + +#### Create output directory + +dir.create(plant_traits_dir, recursive = TRUE, showWarnings = FALSE) + +#### Load LandIQ and TRY master_data + +cat("Loading LandIQ mapping (is_agricultural only)...\n") +mapping <- load_landiq_mapping(landiq_lookup_csv) + +cat("Loading master_data...\n") +obj <- load(master_data_path) +if (!("master_data" %in% obj)) { + stop("Expected object 'master_data' in ", master_data_path) +} + +#### Normalize TRY units and map species string to LandIQ crop_desc group + +cat("Normalizing values + mapping species->group...\n") +data_normalized <- master_data %>% + mutate( + value_num = suppressWarnings(as.numeric(OrigValueStr)), + value_std = mapply(normalize_units, TraitID, value_num, OrigUnitStr), + TraitKey = as.character(TraitID), + group = map_species_to_group(AccSpeciesName) + ) %>% + filter(!is.na(value_std)) +cat("Rows after normalization (all traits): ", nrow(data_normalized), "\n", sep = "") + + +#### Restrict TRY rows to species that map into our LandIQ crop_desc list + +group_to_codes <- mapping %>% + select(crop_desc, class, subclass, PFT) %>% + distinct() + +sub_df <- data_normalized %>% + filter(group != "NA") %>% + inner_join(group_to_codes, by = c("group" = "crop_desc")) + +# Same joined TRY rows; we summarize them four ways (different grouping keys below). +class_df <- sub_df +pft_df <- sub_df +glob_df <- sub_df %>% + mutate(GLOBAL = "GLOBAL") + +# Copy SLA component traits under a single TraitKey so SLA pools with one fallback chain. +sub_df_sla <- sub_df %>% + filter(TraitID %in% sla_component_ids) %>% + mutate(TraitKey = sla_pooled_key, TraitID = NA_real_, TraitName = sla_pooled_name) +class_df_sla <- class_df %>% + filter(TraitID %in% sla_component_ids) %>% + mutate(TraitKey = sla_pooled_key, TraitID = NA_real_, TraitName = sla_pooled_name) +pft_df_sla <- pft_df %>% + filter(TraitID %in% sla_component_ids) %>% + mutate(TraitKey = sla_pooled_key, TraitID = NA_real_, TraitName = sla_pooled_name) +glob_df_sla <- glob_df %>% + filter(TraitID %in% sla_component_ids) %>% + mutate(TraitKey = sla_pooled_key, TraitID = NA_real_, TraitName = sla_pooled_name) + +#### Summarize traits at each fallback level (subclass, class x PFT, PFT, global; plus SLA pooled) + +# Subclass key is first letter of class plus numeric subclass (matches LandIQ code layout). +sub_df <- sub_df %>% mutate(subclass_id = paste0(class, subclass)) +sub_df_sla <- sub_df_sla %>% mutate(subclass_id = paste0(class, subclass)) +cat("Summarizing subclass level...\n") +tbl_sub <- bind_rows( + summarize_level(sub_df, level = "subclass", level_id_col = "subclass_id"), + summarize_level(sub_df_sla, level = "subclass", level_id_col = "subclass_id") +) +# Class-level table is keyed by class and PFT together (one trait row per class|PFT). +class_df <- class_df %>% mutate(class_pft = paste0(class, "|", PFT)) +class_df_sla <- class_df_sla %>% mutate(class_pft = paste0(class, "|", PFT)) +cat("Summarizing class level (by class x PFT)...\n") +tbl_class <- bind_rows( + summarize_level(class_df, level = "class", level_id_col = "class_pft"), + summarize_level(class_df_sla, level = "class", level_id_col = "class_pft") +) +cat("Summarizing PFT level...\n") +tbl_pft <- bind_rows( + summarize_level(pft_df, level = "pft", level_id_col = "PFT"), + summarize_level(pft_df_sla, level = "pft", level_id_col = "PFT") +) +cat("Summarizing global level...\n") +tbl_global <- bind_rows( + summarize_level(glob_df, level = "global", level_id_col = "GLOBAL"), + summarize_level(glob_df_sla, level = "global", level_id_col = "GLOBAL") +) + +#### Parse level_id and add LandIQ identity columns for the long output + +# summarize_level() only keeps level_id plus trait stats. Below we split that id and +# left_join LandIQ so each row carries class, subclass, crop_desc, PFT (the same +# names pool_calculations uses). PFT-only and global levels have no LandIQ code; +# we set those columns to NA so bind_rows() stacks one consistent schema. + +tbl_sub_parsed <- tbl_sub %>% + mutate(class = sub("^(.)(.*)$", "\\1", level_id), + subclass = sub("^.(.*)$", "\\1", level_id)) %>% + left_join(mapping %>% distinct(class, subclass, crop_desc, PFT), by = c("class", "subclass")) %>% + select(-level_id) +tbl_class_parsed <- tbl_class %>% + mutate(class = sub("^([^|]+)\\|(.*)$", "\\1", level_id), + PFT = sub("^([^|]+)\\|(.*)$", "\\2", level_id)) %>% + left_join(mapping %>% distinct(class, class_desc), by = "class") %>% + mutate(subclass = NA_character_, crop_desc = class_desc) %>% + select(-class_desc, -level_id) +tbl_pft_parsed <- tbl_pft %>% + mutate(PFT = level_id, class = NA_character_, subclass = NA_character_, crop_desc = NA_character_) %>% + select(-level_id) +tbl_global_parsed <- tbl_global %>% + mutate(class = NA_character_, subclass = NA_character_, crop_desc = NA_character_, PFT = NA_character_) %>% + select(-level_id) + +planting_lookup_long <- bind_rows(tbl_sub_parsed, tbl_class_parsed, tbl_pft_parsed, tbl_global_parsed) %>% + relocate(level, PFT, class, subclass, crop_desc) %>% + arrange(level, PFT, class, subclass, TraitID) + + +#### Write RDS and CSV + +saveRDS(planting_lookup_long, out_rds) +write_csv(planting_lookup_long, out_csv) \ No newline at end of file diff --git a/modules/data.remote/inst/ccmmf/traits/lai_from_mslsp.R b/modules/data.remote/inst/ccmmf/traits/lai_from_mslsp.R new file mode 100644 index 00000000000..29021c35acd --- /dev/null +++ b/modules/data.remote/inst/ccmmf/traits/lai_from_mslsp.R @@ -0,0 +1,112 @@ +# Leaf area index (LAI) from MSLSP EVImax and EVIamp for CCMMF pool initialization. +# +# Mourad et al. (2020): LAI = (max(0, a*sqrt(k*EVI) - b))^2 with fixed a, b. The +# paper uses only the positive part of (a*sqrt(k*EVI) - b) before squaring; EVI is +# floored at 0. k reflects phenological timing (0.15 ~ early-season annuals, +# 0.50 ~ leaf-on on perennials). Row and rice use EVIamp (strong bare season; +# amplitude tracks the crop cycle). Hay and mature woody use EVImax (often green +# year-round; amplitude can be small while peak EVI still reflects canopy). Woody +# with LandIQ CLASS YP (young perennial) uses EVIamp. +# +# Why only CLASS, not SUBCLASS: the only LandIQ branch in this file is woody YP +# vs other woody. Row, rice, and hay do not use CLASS. We do not vary LAI by +# subclass (e.g. T19 vs T4) yet; that would belong in a richer rule table later. +# Pool functions pass CLASS from your data, or set it from CLASS_SUBCLASS using +# lk$mapping when you only supply a code string. +# +# Main entry: compute_lai_from_mslsp(mslsp_EVImax, mslsp_EVIamp, pft, class, +# diagnostics=FALSE). Scalar LAI, or diagnostics=TRUE returns a list (LAI, lai_*). +# +# Downstream: sourced by pool_calculations_from_lookup.R for initialize_planting() (MSLSP path). +# Run alone: source this file from R (requires dplyr); no env vars for this module. +# Style: docs/SCRIPT_REFACTOR_PROMPT.md in ccmmf-phenology (ASCII, simple sections). + +#### Load packages + +suppressPackageStartupMessages({ + library(dplyr) +}) + +lai_mourad_a <- 2.92 +lai_mourad_b <- 0.43 +lai_default_min <- 0 +lai_default_max <- Inf + +compute_lai_from_mslsp <- function(mslsp_EVImax, mslsp_EVIamp, + pft, class = NA_character_, + diagnostics = FALSE) { + if (!diagnostics) { + na_out <- NA_real_ + } else { + na_out <- list( + LAI = NA_real_, lai_rule_id = NA_character_, lai_evi_field_used = NA_character_, + lai_evi_value_used = NA_real_, lai_k = NA_real_, lai_a = NA_real_, lai_b = NA_real_, + lai_min = NA_real_, lai_max = NA_real_ + ) + } + + if (length(pft) != 1L || is.na(pft)) return(na_out) + pft_lc <- tolower(trimws(as.character(pft))) + if (!nzchar(pft_lc)) return(na_out) + + if (length(class) != 1L || is.na(class)) { + class_uc <- NA_character_ + } else { + class_uc <- toupper(trimws(as.character(class))) + if (!nzchar(class_uc)) class_uc <- NA_character_ + } + + rules <- tibble(pft_lc = pft_lc, class_uc = class_uc) %>% + mutate( + rule_id = case_when( + pft_lc == "row" ~ "row_amp", + pft_lc == "rice" ~ "rice_amp", + pft_lc == "hay" ~ "hay_max", + pft_lc == "woody" & !is.na(class_uc) & class_uc == "YP" ~ "woody_yp_amp", + pft_lc == "woody" ~ "woody_max", + TRUE ~ NA_character_ + ), + k = case_when( + pft_lc %in% c("row", "rice") ~ 0.15, + pft_lc %in% c("hay", "woody") ~ 0.50, + TRUE ~ NA_real_ + ), + evi_field = case_when( + pft_lc %in% c("row", "rice") ~ "EVIamp", + pft_lc == "hay" ~ "EVImax", + pft_lc == "woody" & !is.na(class_uc) & class_uc == "YP" ~ "EVIamp", + pft_lc == "woody" ~ "EVImax", + TRUE ~ NA_character_ + ) + ) + + if (is.na(rules$rule_id[1])) return(na_out) + + evi_used <- if (rules$evi_field[1] == "EVIamp") mslsp_EVIamp else mslsp_EVImax + evi_num <- as.numeric(evi_used) + k <- rules$k[1] + a <- lai_mourad_a + b <- lai_mourad_b + lai <- NA_real_ + if (!is.na(evi_num) && !is.na(k)) { + evi_num <- pmax(0, evi_num) + term <- a * sqrt(k * evi_num) - b + lai <- pmax(0, term)^2 + lai <- pmax(lai_default_min, pmin(lai_default_max, lai)) + } + + if (!diagnostics) { + return(lai) + } + list( + LAI = lai, + lai_rule_id = rules$rule_id[1], + lai_evi_field_used = rules$evi_field[1], + lai_evi_value_used = as.numeric(evi_used), + lai_k = k, + lai_a = a, + lai_b = b, + lai_min = lai_default_min, + lai_max = lai_default_max + ) +} diff --git a/modules/data.remote/inst/ccmmf/traits/pool_calculations_from_lookup.R b/modules/data.remote/inst/ccmmf/traits/pool_calculations_from_lookup.R new file mode 100644 index 00000000000..403abf060f2 --- /dev/null +++ b/modules/data.remote/inst/ccmmf/traits/pool_calculations_from_lookup.R @@ -0,0 +1,503 @@ +# Pool calculations from pre-built TRY and harvest lookup tables for SIPNET +# planting and harvest rows. Maps LandIQ crop code and PFT to C and N pools at +# planting (LAI from MSLSP EVI via lai_from_mslsp.R) and to removal +# fractions at harvest. Trait lookup order: subclass > class > PFT > global. +# +# Main inputs: planting_lookup_long.rds, harvest_lookup_long.rds, +# LandIQ_cropCode_lookup_table.csv +# +# Main outputs: Tibbles from initialize_planting() and initialize_harvest_from_lookup() (no writes here). +# Used by make_events_statewide.R for statewide planting events. + +#### Load packages + +suppressPackageStartupMessages({ + library(data.table) + library(dplyr) + library(readr) + library(tibble) +}) + +#### Paths and configuration + +path_management <- Sys.getenv("CCMMF_MANAGEMENT", "/projectnb/dietzelab/ccmmf/management") +plant_traits_dir <- file.path(path_management, "plant_traits") +planting_lookup_rds <- file.path(plant_traits_dir, "planting_lookup_long.rds") +harvest_lookup_rds <- file.path(plant_traits_dir, "harvest_lookup_long.rds") +landiq_lookup_csv <- file.path(path_management, "LandIQ_cropCode_lookup_table.csv") + +source(file.path(path_management, "scripts/traits/lai_from_mslsp.R")) + +sla_pooled_key <- "SLA_POOLED" + +# Infix "default if missing": x %||% y returns y when x is NULL or length-0, else x +# (same idea as ?? in some languages). We use it because trait lookup records do not +# always include every optional field (e.g. n_obs, sd_obs); for diagnostics we still +# want stable columns, so we fall back to NA_* instead of NULL or branching everywhere. +`%||%` <- function(x, y) if (is.null(x) || length(x) == 0) y else x + +#### LandIQ crop table (agricultural rows, key = CLASS pasted with SUBCLASS) + +load_landiq_mapping <- function(path = landiq_lookup_csv) { + d <- as.data.frame(fread(path)) + d %>% + filter(is_agricultural == TRUE) %>% + mutate(CLASS = as.character(CLASS), + SUBCLASS = as.character(SUBCLASS), + key = paste0(CLASS, SUBCLASS)) %>% + relocate(CLASS, SUBCLASS, CLASS_desc, SUBCLASS_desc, PFT, key) +} + +#### Named-vector indexes from RDS long tables (fast lookup by string key) +# +# load_trait_lookup() splits the long table by level (subclass, class, pft, global). +# Each call below gets one of those pieces. Keys are built in the same order +# get_trait_record tries them: subclass, class, pft, global. + +make_trait_index <- function(lookup_df) { + lookup_df <- as.data.frame(lookup_df) + cols <- c( + "mean_obs", "sd_obs", "n_obs", "n_species", "n_datasets", + "mean_species", "sd_species_mean", + "mean_dataset", "sd_dataset_mean", + "replicates_median", "n_replicates_nonNA", "errorRisk_max", + "TraitName" + ) + # Drop any name not in the table (the planting RDS does not have every column listed above). + cols <- intersect(cols, names(lookup_df)) + # No data rows: return empty indexes. Otherwise the tests below can guess wrong on empty vectors. + if (nrow(lookup_df) == 0L) { + idx <- lapply(cols, function(cc) stats::setNames(lookup_df[[cc]], character(0))) + names(idx) <- cols + return(idx) + } + if ("subclass" %in% names(lookup_df) && !all(is.na(lookup_df$subclass))) { + # Subclass: class+subclass|TraitKey (harvest includes PFT in this key; traits do not) + key <- paste0(lookup_df$class, lookup_df$subclass, "|", lookup_df$TraitKey) + } else if ("class" %in% names(lookup_df) && !all(is.na(lookup_df$class))) { + # Class-level rows: class|PFT|TraitKey or class|TraitKey + has_pft <- "PFT" %in% names(lookup_df) + use_pft <- if (has_pft) !is.na(lookup_df$PFT) & nzchar(trimws(as.character(lookup_df$PFT))) else rep(FALSE, nrow(lookup_df)) + lookup_key <- ifelse(use_pft, paste0(lookup_df$class, "|", lookup_df$PFT), dplyr::coalesce(lookup_df$class, "GLOBAL")) + key <- paste0(lookup_key, "|", lookup_df$TraitKey) + } else if ("PFT" %in% names(lookup_df) && "class" %in% names(lookup_df) && all(is.na(lookup_df$class)) && !all(is.na(lookup_df$PFT))) { + # PFT-level rows: PFT|TraitKey + key <- paste0(lookup_df$PFT, "|", lookup_df$TraitKey) + } else if ("PFT" %in% names(lookup_df) && all(is.na(lookup_df$PFT))) { + # Global rows: GLOBAL|TraitKey + key <- paste0("GLOBAL|", lookup_df$TraitKey) + } else { + stop("make_trait_index: unrecognized row layout (check level column in RDS)", call. = FALSE) + } + idx <- lapply(cols, function(cc) stats::setNames(lookup_df[[cc]], key)) + names(idx) <- cols + idx +} + +make_harvest_index <- function(lookup_df) { + lookup_df <- as.data.frame(lookup_df) + cols <- c("mean_obs", "sd_obs", "n_obs") + cols <- intersect(cols, names(lookup_df)) + # No data rows: return empty indexes (same reason as make_trait_index). + if (nrow(lookup_df) == 0L) { + idx <- lapply(cols, function(cc) stats::setNames(lookup_df[[cc]], character(0))) + names(idx) <- cols + return(idx) + } + if ("subclass" %in% names(lookup_df) && !all(is.na(lookup_df$subclass))) { + key <- paste0(lookup_df$class, lookup_df$subclass, "|", lookup_df$PFT, "|", lookup_df$param) + } else if ("class" %in% names(lookup_df) && !all(is.na(lookup_df$class))) { + key <- paste0(lookup_df$class, "|", lookup_df$PFT, "|", lookup_df$param) + } else if ("PFT" %in% names(lookup_df) && "class" %in% names(lookup_df) && all(is.na(lookup_df$class)) && !all(is.na(lookup_df$PFT))) { + key <- paste0(lookup_df$PFT, "|", lookup_df$param) + } else if ("PFT" %in% names(lookup_df) && all(is.na(lookup_df$PFT))) { + key <- paste0("GLOBAL|", lookup_df$param) + } else { + stop("make_harvest_index: unrecognized row layout (check level column in RDS)", call. = FALSE) + } + idx <- lapply(cols, function(cc) stats::setNames(lookup_df[[cc]], key)) + names(idx) <- cols + idx +} + +# One row from an index built above; NULL if key missing or mean_obs is NA. +lookup_index_fetch <- function(idx, key) { + if (is.null(key)) return(NULL) + v <- idx$mean_obs[key] + if (length(v) == 0 || is.na(v[[1]])) return(NULL) + out <- list() + for (nm in names(idx)) out[[nm]] <- idx[[nm]][key][[1]] + out +} + +#### Load planting and harvest RDS; build all indexes and LandIQ mapping + +load_trait_lookup <- function(path = planting_lookup_rds, + harvest_path = harvest_lookup_rds, + landiq_path = landiq_lookup_csv) { + df <- readRDS(path) + harvest_df <- readRDS(harvest_path) + list( + lookup = df, + idx_subclass = make_trait_index(df %>% filter(.data$level == "subclass")), + idx_class = make_trait_index(df %>% filter(.data$level == "class")), + idx_pft = make_trait_index(df %>% filter(.data$level == "pft")), + idx_global = make_trait_index(df %>% filter(.data$level == "global")), + harvest_lookup = harvest_df, + idx_harvest_subclass = make_harvest_index(harvest_df %>% filter(.data$level == "subclass")), + idx_harvest_class = make_harvest_index(harvest_df %>% filter(.data$level == "class")), + idx_harvest_pft = make_harvest_index(harvest_df %>% filter(.data$level == "pft")), + idx_harvest_global = make_harvest_index(harvest_df %>% filter(.data$level == "global")), + mapping = load_landiq_mapping(landiq_path) + ) +} + +#### LandIQ code or CLASS+SUBCLASS to crop fields used in keys + +# Returns class, subclass, crop_desc, class_desc, pft from LandIQ code (e.g. T19). +get_group_class_from_code <- function(code, mapping_df) { + row <- mapping_df %>% filter(.data$key == !!code) + if (nrow(row) == 0) return(list(class = NA_character_, subclass = NA_character_, crop_desc = NA_character_, class_desc = NA_character_, pft = NA_character_)) + pft <- if ("PFT" %in% names(row)) row$PFT[1] else NA_character_ + list(class = row$CLASS[1], subclass = row$SUBCLASS[1], crop_desc = row$SUBCLASS_desc[1], class_desc = row$CLASS_desc[1], pft = pft) +} + +# Same structure as get_group_class_from_code but look up by class and subclass. +get_group_class_from_class_subclass <- function(class, subclass, mapping_df) { + if (is.null(mapping_df)) return(list(class = NA_character_, subclass = NA_character_, crop_desc = NA_character_, class_desc = NA_character_, pft = NA_character_)) + class <- trimws(as.character(class))[1] + subclass <- as.character(subclass)[1] + if (is.na(class) || is.na(subclass) || !nzchar(class) || !nzchar(subclass)) return(list(class = NA_character_, subclass = NA_character_, crop_desc = NA_character_, class_desc = NA_character_, pft = NA_character_)) + row <- mapping_df %>% filter(.data$CLASS == !!class, .data$SUBCLASS == !!subclass) + if (nrow(row) == 0) return(list(class = class, subclass = subclass, crop_desc = NA_character_, class_desc = NA_character_, pft = NA_character_)) + pft <- if ("PFT" %in% names(row)) row$PFT[1] else NA_character_ + list(class = row$CLASS[1], subclass = row$SUBCLASS[1], crop_desc = row$SUBCLASS_desc[1], class_desc = row$CLASS_desc[1], pft = pft) +} + +as_trait_key <- function(trait_id_or_key) { + if (is.character(trait_id_or_key)) return(trait_id_or_key) + as.character(as.numeric(trait_id_or_key)) +} + +#### Trait and harvest values with subclass then class then pft then global fallback + +get_trait_record <- function(lk, subclass, class, trait_id_or_key, pft = NULL) { + tkey <- as_trait_key(trait_id_or_key) + k_sub <- paste0(class, subclass, "|", tkey) + if (is.null(pft) && !is.null(lk$mapping) && "PFT" %in% names(lk$mapping)) { + row <- lk$mapping %>% filter(.data$CLASS == !!class, .data$SUBCLASS == !!subclass) + pft <- if (nrow(row) > 0) row$PFT[1] else NA_character_ + } + k_cls <- if (!is.na(pft) && nzchar(pft)) paste0(class, "|", pft, "|", tkey) else paste0(class, "|", tkey) + k_pft <- if (!is.na(pft) && nzchar(pft)) paste0(pft, "|", tkey) else NULL + k_glb <- paste0("GLOBAL|", tkey) + + r <- lookup_index_fetch(lk$idx_subclass, k_sub) + if (!is.null(r)) return(c(r, list(value = r$mean_obs, src = "subclass"))) + r <- lookup_index_fetch(lk$idx_class, k_cls) + if (!is.null(r)) return(c(r, list(value = r$mean_obs, src = "class"))) + if (!is.null(k_pft) && !is.null(lk$idx_pft)) { + r <- lookup_index_fetch(lk$idx_pft, k_pft) + if (!is.null(r)) return(c(r, list(value = r$mean_obs, src = "pft"))) + } + r <- lookup_index_fetch(lk$idx_global, k_glb) + if (!is.null(r)) return(c(r, list(value = r$mean_obs, src = "global"))) + list(value = NA_real_, src = "none") +} + +# Harvest fallback: subclass -> class -> pft -> global (same as trait lookup). +# lookup_pft: row/rice/hay/woody/woody_destructive (maps from parcel PFT + destructive). +get_harvest_param <- function(lk, subclass, class, lookup_pft, param) { + k_sub <- paste0(class, subclass, "|", lookup_pft, "|", param) + k_cls <- paste0(class, "|", lookup_pft, "|", param) + k_pft <- paste0(lookup_pft, "|", param) + k_glb <- paste0("GLOBAL|", param) + + r <- lookup_index_fetch(lk$idx_harvest_subclass, k_sub) + if (!is.null(r)) return(c(r, list(value = r$mean_obs, src = "subclass"))) + r <- lookup_index_fetch(lk$idx_harvest_class, k_cls) + if (!is.null(r)) return(c(r, list(value = r$mean_obs, src = "class"))) + r <- lookup_index_fetch(lk$idx_harvest_pft, k_pft) + if (!is.null(r)) return(c(r, list(value = r$mean_obs, src = "pft"))) + r <- lookup_index_fetch(lk$idx_harvest_global, k_glb) + if (!is.null(r)) return(c(r, list(value = r$mean_obs, src = "global"))) + list(value = NA_real_, src = "none") +} + +derive_CN_coarse <- function(lk, subclass, class) { + CN_root <- get_trait_record(lk, subclass, class, 1055)$value + CN_fine <- get_trait_record(lk, subclass, class, 2057)$value + f_fine <- get_trait_record(lk, subclass, class, 2005)$value + f_coarse <- get_trait_record(lk, subclass, class, 1534)$value + + if (all(!is.na(c(CN_root, CN_fine, f_fine, f_coarse)))) { + f_root <- f_fine + f_coarse + if (!is.na(f_root) && f_root > 0) { + f_fine_root <- f_fine / f_root + f_coarse_root <- f_coarse / f_root + denom <- (1 / CN_root) - (f_fine_root / CN_fine) + if (!is.na(denom) && denom > 0) { + CN_coarse <- f_coarse_root / denom + if (is.finite(CN_coarse) && CN_coarse > 0) return(CN_coarse) + } + } + } + if (!is.na(CN_root)) return(CN_root) + if (!is.na(CN_fine)) return(CN_fine) + CN_stem <- get_trait_record(lk, subclass, class, 165)$value + if (!is.na(CN_stem)) return(CN_stem) + CN_leaf <- get_trait_record(lk, subclass, class, 146)$value + if (!is.na(CN_leaf)) return(CN_leaf) + NA_real_ +} + +#### Planting: core (LandIQ code plus numeric LAI to C and N pools) + +planting_pools_from_lookup <- function(ID, DATE, code, LAI, PFT, lk, + diagnostics = FALSE) { + gc <- get_group_class_from_code(code, lk$mapping) + subclass <- gc$subclass + class <- gc$class + crop_desc <- gc$crop_desc + class_desc <- gc$class_desc + pft <- gc$pft + + sla_rec <- get_trait_record(lk, subclass, class, sla_pooled_key, pft = pft) + SLA <- sla_rec$value + if (is.na(SLA) || SLA <= 0) { + out <- tibble( + LOC = ID, DATE = DATE, CLASS_SUBCLASS = code, class = class, subclass = subclass, crop_desc = crop_desc, CLASS_DESC = class_desc, + PFT = PFT, LAI = LAI, + C_LEAF = NA_real_, C_STEM = NA_real_, C_FINEROOT = NA_real_, C_COARSEROOT = NA_real_, + N_LEAF = NA_real_, N_STEM = NA_real_, N_FINEROOT = NA_real_, N_COARSEROOT = NA_real_, + ENSEMBLE_SIZE = 1L + ) + if (diagnostics) { + out$sla_src <- sla_rec$src + out$sla_n_obs <- sla_rec$n_obs %||% NA_integer_ + out$sla_sd_obs <- sla_rec$sd_obs %||% NA_real_ + } + return(out) + } + + M_leaf_kgm2 <- LAI / SLA + + r3441 <- get_trait_record(lk, subclass, class, 3441, pft = pft) + r128 <- get_trait_record(lk, subclass, class, 128, pft = pft) + r1534 <- get_trait_record(lk, subclass, class, 1534, pft = pft) + r2005 <- get_trait_record(lk, subclass, class, 2005, pft = pft) + + M_leaf_gplant <- r3441$value + M_stem_gplant <- r128$value + f_coarse <- r1534$value + f_fine <- r2005$value + + f_root <- NA_real_ + if (!is.na(f_coarse) || !is.na(f_fine)) f_root <- dplyr::coalesce(f_coarse, 0) + dplyr::coalesce(f_fine, 0) + + F_stem <- F_fineRoot <- F_coarseRoot <- NA_real_ + if (!any(is.na(c(M_leaf_gplant, M_stem_gplant, f_root))) && f_root > 0 && f_root < 1) { + M_plant_gplant <- (M_leaf_gplant + M_stem_gplant) / (1 - f_root) + M_fine_gplant <- if (!is.na(f_fine)) f_fine * M_plant_gplant else NA_real_ + M_coarse_gplant<- if (!is.na(f_coarse)) f_coarse * M_plant_gplant else NA_real_ + F_stem <- M_stem_gplant / M_leaf_gplant + F_fineRoot <- if (!is.na(M_fine_gplant)) M_fine_gplant / M_leaf_gplant else NA_real_ + F_coarseRoot <- if (!is.na(M_coarse_gplant)) M_coarse_gplant / M_leaf_gplant else NA_real_ + } else if (!any(is.na(c(M_leaf_gplant, M_stem_gplant)))) { + F_stem <- M_stem_gplant / M_leaf_gplant + } + + M_stem_kgm2 <- if (!is.na(F_stem) && !is.na(M_leaf_kgm2)) F_stem * M_leaf_kgm2 else NA_real_ + M_fine_kgm2 <- if (!is.na(F_fineRoot) && !is.na(M_leaf_kgm2)) F_fineRoot * M_leaf_kgm2 else NA_real_ + M_coarse_kgm2 <- if (!is.na(F_coarseRoot) && !is.na(M_leaf_kgm2)) F_coarseRoot * M_leaf_kgm2 else NA_real_ + + C_leaf <- M_leaf_kgm2 * 0.47 + C_stem <- M_stem_kgm2 * 0.47 + C_fineroot <- M_fine_kgm2 * 0.47 + C_coarseroot <- M_coarse_kgm2 * 0.50 + + r14 <- get_trait_record(lk, subclass, class, 14, pft = pft) + r146 <- get_trait_record(lk, subclass, class, 146, pft = pft) + r165 <- get_trait_record(lk, subclass, class, 165, pft = pft) + r1055 <- get_trait_record(lk, subclass, class, 1055, pft = pft) + r2057 <- get_trait_record(lk, subclass, class, 2057, pft = pft) + + Nleaf_mass_mgg <- r14$value + CN_leaf <- r146$value + CN_stem <- r165$value + CN_root <- r1055$value + CN_fine <- r2057$value + + if (!is.na(Nleaf_mass_mgg) && !is.na(M_leaf_kgm2)) { + N_leaf <- M_leaf_kgm2 * (Nleaf_mass_mgg * 1e-6) + } else if (!is.na(CN_leaf) && !is.na(C_leaf)) { + N_leaf <- C_leaf / CN_leaf + } else { + N_leaf <- NA_real_ + } + + if (!is.na(C_stem) && !is.na(CN_stem)) { + N_stem <- C_stem / CN_stem + } else if (!is.na(C_stem) && !is.na(CN_root)) { + N_stem <- C_stem / CN_root + } else if (!is.na(C_stem) && !is.na(CN_leaf)) { + N_stem <- C_stem / CN_leaf + } else { + N_stem <- NA_real_ + } + + CN_fine_use <- CN_fine + if (is.na(CN_fine_use)) CN_fine_use <- CN_root + if (is.na(CN_fine_use)) CN_fine_use <- CN_stem + if (is.na(CN_fine_use)) CN_fine_use <- CN_leaf + N_fineroot <- if (!is.na(C_fineroot) && !is.na(CN_fine_use)) C_fineroot / CN_fine_use else NA_real_ + + CN_coarse <- derive_CN_coarse(lk, subclass, class) + if (is.na(CN_coarse)) { + if (!is.na(CN_root)) CN_coarse <- CN_root + else if (!is.na(CN_stem)) CN_coarse <- CN_stem + else if (!is.na(CN_leaf)) CN_coarse <- CN_leaf + } + N_coarseroot <- if (!is.na(C_coarseroot) && !is.na(CN_coarse)) C_coarseroot / CN_coarse else NA_real_ + + out <- tibble( + LOC = ID, DATE = DATE, CLASS_SUBCLASS = code, class = class, subclass = subclass, crop_desc = crop_desc, CLASS_DESC = class_desc, + PFT = PFT, LAI = LAI, + C_LEAF = C_leaf, C_STEM = C_stem, C_FINEROOT = C_fineroot, C_COARSEROOT = C_coarseroot, + N_LEAF = N_leaf, N_STEM = N_stem, N_FINEROOT = N_fineroot, N_COARSEROOT = N_coarseroot, + ENSEMBLE_SIZE = 1L + ) + + if (diagnostics) { + out$sla_src <- sla_rec$src + out$sla_n_obs <- sla_rec$n_obs %||% NA_integer_ + out$sla_sd_obs <- sla_rec$sd_obs %||% NA_real_ + out$src_14 <- r14$src + out$src_3441 <- r3441$src + out$src_128 <- r128$src + out$src_2005 <- r2005$src + out$src_1534 <- r1534$src + out$src_1055 <- r1055$src + out$src_2057 <- r2057$src + out$src_146 <- r146$src + out$src_165 <- r165$src + out$used_class_any <- any(c(out$src_14, out$src_3441, out$src_128, out$src_2005, out$src_1534, + out$src_1055, out$src_2057, out$src_146, out$src_165) == "class") + out$used_pft_any <- any(c(out$src_14, out$src_3441, out$src_128, out$src_2005, out$src_1534, + out$src_1055, out$src_2057, out$src_146, out$src_165) == "pft") + out$used_global_any <- any(c(out$src_14, out$src_3441, out$src_128, out$src_2005, out$src_1534, + out$src_1055, out$src_2057, out$src_146, out$src_165) == "global") + } + out +} + +#### Planting: initialize_planting (fixed LAI or MSLSP EVI) + +# If LAI is a finite number, use it and ignore mslsp_EVImax and mslsp_EVIamp. +# Else require both EVI fields and compute LAI with compute_lai_from_mslsp(). +# LandIQ key: if class and subclass are both non-empty, code is paste0(class, subclass). +# Otherwise code must be set. Non-empty class overrides the class used in +# compute_lai_from_mslsp when the caller passed only a code string (optional CLASS column). + +initialize_planting <- function( + ID, DATE, PFT, lk, + code = NULL, + class = NA_character_, + subclass = NA_character_, + LAI = NA_real_, + mslsp_EVImax = NULL, + mslsp_EVIamp = NULL, + diagnostics = FALSE) { + + cls <- trimws(as.character(class)[1]) + sub <- as.character(subclass)[1] + code_in <- if (is.null(code)) "" else trimws(as.character(code)[1]) + + if (!is.na(cls) && nzchar(cls) && !is.na(sub) && nzchar(sub)) { + code_chr <- paste0(cls, sub) + } else if (nzchar(code_in)) { + code_chr <- code_in + } else { + stop("initialize_planting: provide LandIQ code or both class and subclass.") + } + + lai_vec <- suppressWarnings(as.numeric(LAI)) + use_explicit_lai <- length(lai_vec) > 0L && is.finite(lai_vec[[1]]) && !is.na(lai_vec[[1]]) + + if (use_explicit_lai) { + return(planting_pools_from_lookup( + ID = ID, DATE = DATE, code = code_chr, LAI = lai_vec[[1]], PFT = PFT, lk = lk, diagnostics = diagnostics + )) + } + + if (is.null(mslsp_EVImax) || is.null(mslsp_EVIamp)) { + stop("initialize_planting: provide finite LAI, or both mslsp_EVImax and mslsp_EVIamp.") + } + mx <- suppressWarnings(as.numeric(mslsp_EVImax)[1]) + ma <- suppressWarnings(as.numeric(mslsp_EVIamp)[1]) + if (is.na(mx) || is.na(ma)) { + stop("initialize_planting: mslsp_EVImax and mslsp_EVIamp must be non-NA when LAI is not given.") + } + + class_for_lai <- cls + if (is.na(class_for_lai) || !nzchar(class_for_lai)) { + class_for_lai <- get_group_class_from_code(code_chr, lk$mapping)$class + } + + lai_diag <- compute_lai_from_mslsp( + mslsp_EVImax = mx, + mslsp_EVIamp = ma, + pft = PFT, + class = class_for_lai, + diagnostics = TRUE + ) + + out <- planting_pools_from_lookup( + ID = ID, DATE = DATE, code = code_chr, LAI = lai_diag$LAI, PFT = PFT, lk = lk, diagnostics = diagnostics + ) + if (diagnostics) { + out$lai_rule_id <- lai_diag$lai_rule_id + out$lai_evi_field_used <- lai_diag$lai_evi_field_used + out$lai_evi_value_used <- lai_diag$lai_evi_value_used + out$lai_k <- lai_diag$lai_k + out$lai_a <- lai_diag$lai_a + out$lai_b <- lai_diag$lai_b + out$lai_min <- lai_diag$lai_min + out$lai_max <- lai_diag$lai_max + } + out +} + +#### Harvest: removal fractions from lookup + +initialize_harvest_from_lookup <- function(ID, DATE, code, PFT, lk, destructive = FALSE) { + gc <- get_group_class_from_code(code, lk$mapping) + subclass <- gc$subclass + class <- gc$class + crop_desc <- gc$crop_desc + class_desc <- gc$class_desc + + # Map parcel PFT to lookup PFT: rice/row->row, hay->hay, woody->woody or woody_destructive + lookup_pft <- dplyr::case_when( + PFT %in% c("rice", "row") ~ "row", + PFT == "hay" ~ "hay", + PFT == "woody" & destructive ~ "woody_destructive", + PFT == "woody" ~ "woody", + TRUE ~ "skip" + ) + if (lookup_pft == "skip") return(NULL) + + r_agb_rem <- get_harvest_param(lk, subclass, class, lookup_pft, "AGB_REMOVED") + r_agb_lit <- get_harvest_param(lk, subclass, class, lookup_pft, "AGB_LITTER") + r_bgb_rem <- get_harvest_param(lk, subclass, class, lookup_pft, "BGB_REMOVED") + r_bgb_lit <- get_harvest_param(lk, subclass, class, lookup_pft, "BGB_LITTER") + + tibble( + LOC = ID, DATE = DATE, CLASS_SUBCLASS = code, class = class, subclass = subclass, crop_desc = crop_desc, CLASS_DESC = class_desc, + PFT = PFT, + AGB_REMOVED = r_agb_rem$value, AGB_LITTER = r_agb_lit$value, + BGB_REMOVED = r_bgb_rem$value, BGB_LITTER = r_bgb_lit$value, + ENSEMBLE_SIZE = 1L + ) +} +