From bf6489a8a15aab97b71d15e69835373daf4e07fd Mon Sep 17 00:00:00 2001 From: James Duncan Date: Tue, 3 Mar 2026 16:07:35 -0800 Subject: [PATCH 1/9] Add SamudrACE CM4 piControl baseline configs --- .../baselines/coupled/cm4-piControl/README.md | 95 +++++ .../coupled/cm4-piControl/evaluate.sh | 42 +++ .../cm4-piControl/evaluator-config-ICx1.yaml | 41 +++ .../finetune-config-template.yaml | 152 ++++++++ .../coupled/cm4-piControl/finetune.sh | 84 +++++ .../cm4-piControl/train-config-template.yaml | 152 ++++++++ .../baselines/coupled/cm4-piControl/train.sh | 86 +++++ .../uncoupled-atmos/ace-evaluator-config.yaml | 29 ++ .../uncoupled-atmos/ace-train-config.yaml | 248 +++++++++++++ .../uncoupled-atmos/run-ace-evaluator.sh | 42 +++ .../uncoupled-atmos/run-ace-train.sh | 43 +++ .../uncoupled-ocean/ace-evaluator-config.yaml | 31 ++ .../uncoupled-ocean/ace-train-config.yaml | 335 ++++++++++++++++++ .../uncoupled-ocean/run-ace-evaluator.sh | 42 +++ .../uncoupled-ocean/run-ace-train.sh | 43 +++ 15 files changed, 1465 insertions(+) create mode 100644 configs/baselines/coupled/cm4-piControl/README.md create mode 100644 configs/baselines/coupled/cm4-piControl/evaluate.sh create mode 100644 configs/baselines/coupled/cm4-piControl/evaluator-config-ICx1.yaml create mode 100644 configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml create mode 100644 configs/baselines/coupled/cm4-piControl/finetune.sh create mode 100644 configs/baselines/coupled/cm4-piControl/train-config-template.yaml create mode 100644 configs/baselines/coupled/cm4-piControl/train.sh create mode 100644 configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-evaluator-config.yaml create mode 100644 configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-train-config.yaml create mode 100644 configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-evaluator.sh create mode 100644 configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-train.sh create mode 100644 
configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-evaluator-config.yaml create mode 100644 configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-train-config.yaml create mode 100644 configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-evaluator.sh create mode 100644 configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-train.sh diff --git a/configs/baselines/coupled/cm4-piControl/README.md b/configs/baselines/coupled/cm4-piControl/README.md new file mode 100644 index 000000000..140a48833 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/README.md @@ -0,0 +1,95 @@ +# Coupled training and evaluation for CM4-piControl + +Self-contained baseline configs and scripts for the SamudrACE coupled +atmosphere-ocean training pipeline on GFDL CM4 piControl data. + +## Pipeline overview + +The pipeline trains uncoupled atmosphere and ocean models independently, then +couples them via freeze-then-optimize (FTO) training, with an optional +joint fine-tuning stage. Each step produces a Beaker dataset whose ID is +plugged into the next script's `EXISTING_RESULTS_*DATASET` variable(s) (`train.sh` takes separate atmosphere and ocean IDs).
+ +``` +Step 1: Uncoupled training (parallel) + uncoupled-atmos/run-ace-train.sh -> atmos checkpoint + uncoupled-ocean/run-ace-train.sh -> ocean checkpoint + +Step 2 (optional): Uncoupled evaluation + uncoupled-atmos/run-ace-evaluator.sh + uncoupled-ocean/run-ace-evaluator.sh + +Step 3: Coupled training (freeze-then-optimize) + train.sh -> coupled checkpoint (atmos frozen, ocean fine-tuned) + +Step 4 (optional): Coupled fine-tuning + finetune.sh -> refined coupled checkpoint (both models trained) + +Step 5: Coupled evaluation + evaluate.sh +``` + +## Directory contents + +| File | Purpose | +|------|---------| +| `uncoupled-atmos/ace-train-config.yaml` | SFNO atmosphere model: architecture, variables, loss weights | +| `uncoupled-atmos/ace-evaluator-config.yaml` | Atmosphere evaluation (58,300 steps = ~40 years at 6h) | +| `uncoupled-ocean/ace-train-config.yaml` | Samudra ocean model: architecture, variables, correctors | +| `uncoupled-ocean/ace-evaluator-config.yaml` | Ocean evaluation (2,920 steps = ~40 years at 5-day) | +| `train-config-template.yaml` | Coupled FTO training: data loaders, optimization, coupled stepper skeleton (atmos frozen, ocean trainable) | +| `finetune-config-template.yaml` | Coupled fine-tuning: lower LR, cosine annealing, both models trainable, loads from coupled checkpoint | +| `evaluator-config-ICx1.yaml` | Coupled evaluation from a single initial condition (year 311) | +| `train.sh` | Generates `coupled-train-config.yaml` and submits coupled training | +| `finetune.sh` | Generates `coupled-finetune-config.yaml` and submits fine-tuning | +| `evaluate.sh` | Submits coupled evaluation | + +## How configs are generated + +The coupled training configs are too large to maintain by hand since they +embed the full stepper definitions for both atmosphere and ocean models. +Instead, `train.sh` and `finetune.sh` generate them automatically: + +1. 
Copy `uncoupled-atmos/ace-train-config.yaml` and + `uncoupled-ocean/ace-train-config.yaml` to temp files +2. Remap stats paths (`statsdata` -> `atmos_stats` / `ocean_stats`) +3. Strip training-specific fields (`loss`, `parameter_init`, etc.) +4. Extract `sea_ice_fraction_name` from the ocean corrector config +5. Merge both steppers into the template (template values win on conflict) +6. Set `ocean_fraction_prediction.sea_ice_fraction_name` + +This requires the Go-based **yq >= 4** ([mikefarah/yq](https://github.com/mikefarah/yq), e.g. `brew install yq`). Note that `pip install yq` installs a different, jq-based tool that does not support the `load()`, `env()` and `*=n` syntax used by these scripts. + +## How to use + +1. **Train uncoupled models** -- run `uncoupled-atmos/run-ace-train.sh` and + `uncoupled-ocean/run-ace-train.sh`. When complete, find the Beaker result + dataset ID for each job. + +2. **Update `train.sh`** -- set `EXISTING_RESULTS_ATMOS_DATASET` and + `EXISTING_RESULTS_OCEAN_DATASET` to the dataset IDs from step 1. + +3. **Run coupled training** -- run `train.sh`. This generates + `coupled-train-config.yaml` and submits the job. + +4. **(Optional) Fine-tune** -- set `EXISTING_RESULTS_DATASET` in + `finetune.sh` to the dataset ID from coupled training, then run it. + +5. **Evaluate** -- set `EXISTING_RESULTS_DATASET` in `evaluate.sh` to the + dataset ID from coupled training (or fine-tuning), then run it. + +6. **(Optional) Evaluate uncoupled models** -- set `EXISTING_RESULTS_DATASET` + in the uncoupled evaluator scripts and run them.
+ +## Key model details + +- **Atmosphere**: SphericalFourierNeuralOperatorNet (SFNO), embed_dim=384, + 8 layers, 6h timestep, 8-level vertical discretization +- **Ocean**: Samudra CNN, ch_width=[200,250,300,400], 5-day timestep, + 19 depth levels for temperature/salinity/velocity +- **Coupled FTO**: 20 epochs, 4 coupled steps, atmosphere frozen, ocean + trained with MSE loss +- **Coupled fine-tuning**: 20 epochs, lr=1e-5 with cosine annealing, + both models trained, loads from coupled checkpoint +- **Data**: CM4 piControl 200-year simulation, train years 151-306, + validation years 306-311, evaluation from year 311 diff --git a/configs/baselines/coupled/cm4-piControl/evaluate.sh b/configs/baselines/coupled/cm4-piControl/evaluate.sh new file mode 100644 index 000000000..e48d3e501 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/evaluate.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e + +JOB_NAME="cm4-piControl-coupled-evaluator" +JOB_GROUP="cm4-piControl-coupled" +EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID from coupled training or fine-tuning +CONFIG_FILENAME="evaluator-config-ICx1.yaml" +SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') +REPO_ROOT=$(git rev-parse --show-toplevel) + +cd "$REPO_ROOT" # so config path is valid no matter where we are running this script + +python -m fme.coupled.validate_config --config_type evaluator $CONFIG_PATH + +gantry run \ + --name $JOB_NAME \ + --task-name $JOB_NAME \ + --description "ACE coupled CM4 piControl evaluator" \ + --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ + --workspace ai2/ace \ + --priority normal \ + --not-preemptible \ + --cluster ai2/ceres-cirrascale \ + --cluster ai2/saturn-cirrascale \ + --weka climate-default:/climate-default \ + --env WANDB_USERNAME=$BEAKER_USERNAME \ + --env WANDB_NAME=$JOB_NAME \ + --env 
WANDB_JOB_TYPE=inference \ + --env WANDB_RUN_GROUP=$JOB_GROUP \ + --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ + --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ + --dataset-secret google-credentials:/tmp/google_application_credentials.json \ + --dataset $EXISTING_RESULTS_DATASET:training_checkpoints/best_inference_ckpt.tar:/ckpt.tar \ + --gpus 1 \ + --shared-memory 50GiB \ + --budget ai2/climate \ + --system-python \ + --install "pip install --no-deps ." \ + -- python -I -m fme.coupled.evaluator $CONFIG_PATH diff --git a/configs/baselines/coupled/cm4-piControl/evaluator-config-ICx1.yaml b/configs/baselines/coupled/cm4-piControl/evaluator-config-ICx1.yaml new file mode 100644 index 000000000..dabe161e6 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/evaluator-config-ICx1.yaml @@ -0,0 +1,41 @@ +experiment_dir: /results +n_coupled_steps: 2920 +coupled_steps_in_memory: 20 +checkpoint_path: /ckpt.tar +data_writer: + ocean: + save_prediction_files: false + save_monthly_files: true + atmosphere: + save_prediction_files: false + save_monthly_files: true +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: ace-samudra-coupled-cm4 + entity: ai2cm +loader: + num_data_workers: 1 + dataset: + ocean: + data_path: /climate-default + file_pattern: 2025-05-14-cm4-piControl-200yr-ocean.zarr + engine: zarr + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-05-13-cm4-piControl-200yr-coupled-sst-sic-6h-interpFalse.zarr + engine: zarr + subset: + # NOTE: this is required to align the atmosphere and ocean start times + start_time: '0151-01-06' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + # NOTE: this is required to align the atmosphere and ocean start times + start_time: '0151-01-06' + start_indices: + times: + - '0311-01-01T00:00:00' diff --git 
a/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml b/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml new file mode 100644 index 000000000..beaa51055 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml @@ -0,0 +1,152 @@ +experiment_dir: /results +save_checkpoint: true +validate_using_ema: true +ema: + decay: 0.999 +max_epochs: 20 +n_coupled_steps: 4 +inference: + n_coupled_steps: 1456 + coupled_steps_in_memory: 8 + loader: + num_data_workers: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0151-01-06' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0151-01-06' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + # NOTE: this is required to align the atmosphere and ocean start times + start_time: '0151-01-06' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + # NOTE: this is required to align the atmosphere and ocean start times + start_time: '0151-01-06' + start_indices: + times: + - '0151-01-06T00:00:00' + - '0171-01-06T00:00:00' + - '0191-01-06T00:00:00' + - '0211-01-06T00:00:00' + - '0231-01-06T00:00:00' + - '0251-01-06T00:00:00' + - '0271-01-06T00:00:00' + - '0291-01-06T00:00:00' + aggregator: + log_zonal_mean_images: false + log_histograms: false +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: ace-samudra-coupled-cm4 + entity: ai2cm +train_loader: + batch_size: 16 + num_data_workers: 4 + prefetch_factor: 1 + dataset: + - ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + 
engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' +validation_loader: + batch_size: 16 + num_data_workers: 4 + prefetch_factor: 1 + dataset: + - ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' +optimization: + enable_automatic_mixed_precision: false + lr: 0.00001 + optimizer_type: FusedAdam + kwargs: + weight_decay: 0.01 + use_gradient_accumulation: true + scheduler: + type: CosineAnnealingLR +stepper: + parameter_init: + checkpoint_path: /ckpt.tar + ocean: + timedelta: 5D + loss_contributions: + n_steps: 4 + weight: 1.0 + stepper: + parameter_init: + weights_path: null # required + atmosphere: + timedelta: 6h + loss_contributions: + n_steps: 2 + 
weight: 1.0 + stepper: + parameter_init: + weights_path: null # required diff --git a/configs/baselines/coupled/cm4-piControl/finetune.sh b/configs/baselines/coupled/cm4-piControl/finetune.sh new file mode 100644 index 000000000..2354941b2 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/finetune.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Coupled fine-tuning: starting from a coupled training checkpoint, fine-tunes +# both atmosphere and ocean models jointly with a cosine-annealing LR schedule. +# Generates the full coupled config by merging stepper definitions from the +# uncoupled training configs into finetune-config-template.yaml. +# Requires yq >= 4. + +set -e + +JOB_NAME="cm4-piControl-coupled-finetune" +JOB_GROUP="cm4-piControl-coupled" +EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID from coupled training (train.sh) +CKPT_TYPE="best_inference_ckpt" + +SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') +REPO_ROOT=$(git rev-parse --show-toplevel) +N_GPUS=4 + +ATMOS_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-atmosphere +OCEAN_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-ocean + +cd "$REPO_ROOT" # so config path is valid no matter where we are running this script + +# --- Generate coupled-finetune-config.yaml from template + uncoupled configs --- + +TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}finetune-config-template.yaml" +CONFIG_PATH="${SCRIPT_PATH}coupled-finetune-config.yaml" + +# Write the remapped copy directly: plain sed + redirect is portable, whereas `sed -i` without a suffix fails on BSD/macOS sed +sed 's/statsdata/atmos_stats/g' "${SCRIPT_PATH}uncoupled-atmos/ace-train-config.yaml" > ./atmos-config.yaml + +# Same remap for the ocean config (statsdata -> ocean_stats) +sed 's/statsdata/ocean_stats/g' "${SCRIPT_PATH}uncoupled-ocean/ace-train-config.yaml" > ./ocean-config.yaml + +yq -i 'del(.stepper.loss, .stepper.optimize_last_step_only, .stepper.n_ensemble, .stepper.parameter_init, .stepper.train_n_forward_step)' ./ocean-config.yaml +yq -i
'del(.stepper.loss, .stepper.optimize_last_step_only, .stepper.n_ensemble, .stepper.parameter_init, .stepper.train_n_forward_step)' ./atmos-config.yaml + +SIC_NAME=$(yq '.stepper.step.config.corrector.config.sea_ice_fraction_correction.sea_ice_fraction_name' ./ocean-config.yaml) +if [[ "$SIC_NAME" == "null" ]]; then + echo "Failed to extract sea_ice_fraction_name from the ocean config" + exit 1 +fi + +cp "$TEMPLATE_CONFIG_PATH" "$CONFIG_PATH" + +yq -i '.stepper.ocean.stepper *=n load("ocean-config.yaml").stepper' "$CONFIG_PATH" +SIC_NAME=$SIC_NAME yq -i '.stepper.ocean_fraction_prediction.sea_ice_fraction_name = env(SIC_NAME)' "$CONFIG_PATH" +yq -i '.stepper.atmosphere.stepper *=n load("atmos-config.yaml").stepper' "$CONFIG_PATH" + +rm ./atmos-config.yaml ./ocean-config.yaml + +# --- Validate and submit --- + +python -m fme.coupled.validate_config "$CONFIG_PATH" --config_type train + +gantry run \ + --name $JOB_NAME \ + --task-name $JOB_NAME \ + --description "ACE coupled CM4 piControl fine-tuning" \ + --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ + --workspace ai2/ace \ + --priority normal \ + --preemptible \ + --cluster ai2/ceres-cirrascale \ + --cluster ai2/saturn-cirrascale \ + --weka climate-default:/climate-default \ + --env WANDB_USERNAME=$BEAKER_USERNAME \ + --env WANDB_NAME=$JOB_NAME \ + --env WANDB_JOB_TYPE=training \ + --env WANDB_RUN_GROUP=$JOB_GROUP \ + --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ + --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ + --dataset-secret google-credentials:/tmp/google_application_credentials.json \ + --dataset $ATMOS_STATS_DATA:/atmos_stats \ + --dataset $OCEAN_STATS_DATA:/ocean_stats \ + --dataset "$EXISTING_RESULTS_DATASET:training_checkpoints/${CKPT_TYPE}.tar:/ckpt.tar" \ + --gpus $N_GPUS \ + --shared-memory 800GiB \ + --budget ai2/climate \ + --system-python \ + --install "pip install --no-deps ." 
\ + -- torchrun --nproc_per_node $N_GPUS -m fme.coupled.train "$CONFIG_PATH" diff --git a/configs/baselines/coupled/cm4-piControl/train-config-template.yaml b/configs/baselines/coupled/cm4-piControl/train-config-template.yaml new file mode 100644 index 000000000..f0d9d28b3 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/train-config-template.yaml @@ -0,0 +1,152 @@ +experiment_dir: /results +save_checkpoint: true +validate_using_ema: true +ema: + decay: 0.999 +max_epochs: 20 +n_coupled_steps: 4 +inference: + n_coupled_steps: 1456 + coupled_steps_in_memory: 8 + loader: + num_data_workers: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0151-01-06' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0151-01-06' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + # NOTE: this is required to align the atmosphere and ocean start times + start_time: '0151-01-06' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + # NOTE: this is required to align the atmosphere and ocean start times + start_time: '0151-01-06' + start_indices: + times: + - '0151-01-06T00:00:00' + - '0171-01-06T00:00:00' + - '0191-01-06T00:00:00' + - '0211-01-06T00:00:00' + - '0231-01-06T00:00:00' + - '0251-01-06T00:00:00' + - '0271-01-06T00:00:00' + - '0291-01-06T00:00:00' + aggregator: + log_zonal_mean_images: false + log_histograms: false +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: ace-samudra-coupled-cm4 + entity: ai2cm +train_loader: + batch_size: 16 + num_data_workers: 4 + prefetch_factor: 1 + dataset: + - ocean: + merge: + - data_path: 
/climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' +validation_loader: + batch_size: 16 + num_data_workers: 4 + prefetch_factor: 1 + dataset: + - ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' +optimization: + enable_automatic_mixed_precision: false + lr: 0.0001 + optimizer_type: FusedAdam + kwargs: + weight_decay: 0.01 + use_gradient_accumulation: true +stepper: + ocean_fraction_prediction: + land_fraction_name: land_fraction + sea_ice_fraction_name_in_atmosphere: sea_ice_fraction + sst_name: sst + ocean: + timedelta: 5D + stepper: + parameter_init: + weights_path: 
/ocean_ckpt.tar + atmosphere: + timedelta: 6h + loss_contributions: + n_steps: 0 + stepper: + parameter_init: + weights_path: /atmos_ckpt.tar + parameters: + - frozen: + include: + - '*' diff --git a/configs/baselines/coupled/cm4-piControl/train.sh b/configs/baselines/coupled/cm4-piControl/train.sh new file mode 100644 index 000000000..9d2621bd5 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/train.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# +# Coupled training (freeze-then-optimize): freezes the atmosphere model and +# fine-tunes the ocean model in the coupled loop. Generates the full coupled +# config by merging stepper definitions from the uncoupled training configs +# into the train-config-template.yaml. Requires yq >= 4. + +set -e + +JOB_NAME="cm4-piControl-coupled-train" +JOB_GROUP="cm4-piControl-coupled" +EXISTING_RESULTS_ATMOS_DATASET="TODO" # beaker dataset ID from uncoupled atmos training +EXISTING_RESULTS_OCEAN_DATASET="TODO" # beaker dataset ID from uncoupled ocean training +ATMOS_CKPT="best_inference_ckpt" +OCEAN_CKPT="best_inference_ckpt" + +SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') +REPO_ROOT=$(git rev-parse --show-toplevel) +N_GPUS=4 + +ATMOS_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-atmosphere +OCEAN_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-ocean + +cd "$REPO_ROOT" # so config path is valid no matter where we are running this script + +# --- Generate coupled-train-config.yaml from template + uncoupled configs --- + +TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}train-config-template.yaml" +CONFIG_PATH="${SCRIPT_PATH}coupled-train-config.yaml" + +# Write the remapped copy directly: plain sed + redirect is portable, whereas `sed -i` without a suffix fails on BSD/macOS sed +sed 's/statsdata/atmos_stats/g' "${SCRIPT_PATH}uncoupled-atmos/ace-train-config.yaml" > ./atmos-config.yaml + +# Same remap for the ocean config (statsdata -> ocean_stats) +sed 's/statsdata/ocean_stats/g' "${SCRIPT_PATH}uncoupled-ocean/ace-train-config.yaml" >
./ocean-config.yaml + +yq -i 'del(.stepper.loss, .stepper.optimize_last_step_only, .stepper.n_ensemble, .stepper.parameter_init, .stepper.train_n_forward_step)' ./ocean-config.yaml +yq -i 'del(.stepper.loss, .stepper.optimize_last_step_only, .stepper.n_ensemble, .stepper.parameter_init, .stepper.train_n_forward_step)' ./atmos-config.yaml + +SIC_NAME=$(yq '.stepper.step.config.corrector.config.sea_ice_fraction_correction.sea_ice_fraction_name' ./ocean-config.yaml) +if [[ "$SIC_NAME" == "null" ]]; then + echo "Failed to extract sea_ice_fraction_name from the ocean config" + exit 1 +fi + +cp "$TEMPLATE_CONFIG_PATH" "$CONFIG_PATH" + +yq -i '.stepper.ocean.stepper *=n load("ocean-config.yaml").stepper' "$CONFIG_PATH" +SIC_NAME=$SIC_NAME yq -i '.stepper.ocean_fraction_prediction.sea_ice_fraction_name = env(SIC_NAME)' "$CONFIG_PATH" +yq -i '.stepper.atmosphere.stepper *=n load("atmos-config.yaml").stepper' "$CONFIG_PATH" + +rm ./atmos-config.yaml ./ocean-config.yaml + +# --- Validate and submit --- + +python -m fme.coupled.validate_config "$CONFIG_PATH" --config_type train + +gantry run \ + --name $JOB_NAME \ + --task-name $JOB_NAME \ + --description "ACE coupled CM4 piControl training (freeze-then-optimize)" \ + --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ + --workspace ai2/ace \ + --priority normal \ + --preemptible \ + --cluster ai2/ceres-cirrascale \ + --cluster ai2/saturn-cirrascale \ + --weka climate-default:/climate-default \ + --env WANDB_USERNAME=$BEAKER_USERNAME \ + --env WANDB_NAME=$JOB_NAME \ + --env WANDB_JOB_TYPE=training \ + --env WANDB_RUN_GROUP=$JOB_GROUP \ + --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ + --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ + --dataset-secret google-credentials:/tmp/google_application_credentials.json \ + --dataset $ATMOS_STATS_DATA:/atmos_stats \ + --dataset $OCEAN_STATS_DATA:/ocean_stats \ + --dataset
"$EXISTING_RESULTS_ATMOS_DATASET:training_checkpoints/${ATMOS_CKPT}.tar:/atmos_ckpt.tar" \ + --dataset "$EXISTING_RESULTS_OCEAN_DATASET:training_checkpoints/${OCEAN_CKPT}.tar:/ocean_ckpt.tar" \ + --gpus $N_GPUS \ + --shared-memory 800GiB \ + --budget ai2/climate \ + --system-python \ + --install "pip install --no-deps ." \ + -- torchrun --nproc_per_node $N_GPUS -m fme.coupled.train "$CONFIG_PATH" diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-evaluator-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-evaluator-config.yaml new file mode 100644 index 000000000..5fd0ac44d --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-evaluator-config.yaml @@ -0,0 +1,29 @@ +experiment_dir: /results +n_forward_steps: 58300 +forward_steps_in_memory: 50 +checkpoint_path: /ckpt.tar +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: ace2-cm4-atmos-noLSM + entity: ai2cm +loader: + start_indices: + times: + - '0311-01-01T06:00:00' + dataset: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + num_data_workers: 4 +aggregator: + log_zonal_mean_images: true + log_histograms: true +data_writer: + save_prediction_files: false + save_monthly_files: true diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-train-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-train-config.yaml new file mode 100644 index 000000000..f2faeac7f --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-train-config.yaml @@ -0,0 +1,248 @@ +experiment_dir: /results +save_checkpoint: true +validate_using_ema: true +ema: + decay: 0.999 +max_epochs: 50 # this is about equivalent to the amount of training for
ACE2-ERA5/ACE2-SHiELD +n_forward_steps: 2 +inference: + n_forward_steps: 29200 + forward_steps_in_memory: 40 + loader: + start_indices: + times: + - '0151-01-01T06:00:00' + - '0171-01-01T06:00:00' + - '0191-01-01T06:00:00' + - '0211-01-01T06:00:00' + - '0231-01-01T06:00:00' + - '0251-01-01T06:00:00' + - '0271-01-01T06:00:00' + - '0291-01-01T06:00:00' + dataset: + merge: + - data_path: /climate-default + file_pattern: 2025-07-11-cm4-piControl-200yr-coupled-threshold-sst_from_ts-atmosphere.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + num_data_workers: 32 + aggregator: + log_histograms: false +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: ace2-cm4-atmos-noLSM + entity: ai2cm +train_loader: + batch_size: 16 + num_data_workers: 32 + prefetch_factor: 2 + dataset: + merge: + - data_path: /climate-default + file_pattern: 2025-07-11-cm4-piControl-200yr-coupled-threshold-sst_from_ts-atmosphere.zarr + engine: zarr + subset: + stop_time: '0306-01-01T06:00:00' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + stop_time: '0306-01-01T06:00:00' +validation_loader: + batch_size: 128 + num_data_workers: 32 + prefetch_factor: 2 + dataset: + merge: + - data_path: /climate-default + file_pattern: 2025-07-11-cm4-piControl-200yr-coupled-threshold-sst_from_ts-atmosphere.zarr + engine: zarr + subset: + start_time: '0306-01-01T06:00:00' + stop_time: '0311-01-01T06:00:00' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0306-01-01T06:00:00' + stop_time: '0311-01-01T06:00:00' +optimization: + use_gradient_accumulation: true + enable_automatic_mixed_precision: false + lr: 0.0001 + optimizer_type: FusedAdam + kwargs: + weight_decay: 0.01 +stepper: + loss: + type: MSE 
+ weights: + air_temperature_0: 0.5 + air_temperature_1: 0.5 + eastward_wind_0: 0.5 + northward_wind_0: 0.5 + specific_total_water_0: 0.5 + specific_total_water_1: 0.25 + specific_total_water_2: 0.5 + PRATEsfc: 0.5 + h500: 10 + TMP850: 5 + Q2m: 0.5 + DLWRFsfc: 2 + ULWRFsfc: 5 + USWRFsfc: 2 + DSWRFsfc: 2 + USWRFtoa: 2 + tendency_of_total_water_path_due_to_advection: 0.5 + step: + type: single_module + config: + builder: + type: SphericalFourierNeuralOperatorNet + config: + embed_dim: 384 + filter_type: linear + hard_thresholding_fraction: 1.0 + use_mlp: true + normalization_layer: instance_norm + num_layers: 8 + operator_type: dhconv + scale_factor: 1 + separable: false + spectral_layers: 3 + spectral_transform: sht + normalization: + network: + global_means_path: /statsdata/centering.nc + global_stds_path: /statsdata/scaling-full-field.nc + residual: + global_means_path: /statsdata/centering.nc + global_stds_path: /statsdata/scaling-residual.nc + ocean: + surface_temperature_name: surface_temperature + ocean_fraction_name: ocean_fraction + interpolate: false + corrector: + conserve_dry_air: true + moisture_budget_correction: advection_and_precipitation + force_positive_names: + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - specific_total_water_3 + - specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - Q2m + - PRATEsfc + - ULWRFsfc + - ULWRFtoa + - DLWRFsfc + - DSWRFsfc + - USWRFsfc + - USWRFtoa + next_step_forcing_names: + - DSWRFtoa + in_names: + - land_fraction + - ocean_fraction + - sea_ice_fraction + - DSWRFtoa + - HGTsfc + - PRESsfc + - surface_temperature + - TMP2m + - Q2m + - UGRD10m + - VGRD10m + - air_temperature_0 + - air_temperature_1 + - air_temperature_2 + - air_temperature_3 + - air_temperature_4 + - air_temperature_5 + - air_temperature_6 + - air_temperature_7 + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - 
specific_total_water_3 + - specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - eastward_wind_0 + - eastward_wind_1 + - eastward_wind_2 + - eastward_wind_3 + - eastward_wind_4 + - eastward_wind_5 + - eastward_wind_6 + - eastward_wind_7 + - northward_wind_0 + - northward_wind_1 + - northward_wind_2 + - northward_wind_3 + - northward_wind_4 + - northward_wind_5 + - northward_wind_6 + - northward_wind_7 + out_names: + - PRESsfc + - surface_temperature + - TMP2m + - Q2m + - UGRD10m + - VGRD10m + - air_temperature_0 + - air_temperature_1 + - air_temperature_2 + - air_temperature_3 + - air_temperature_4 + - air_temperature_5 + - air_temperature_6 + - air_temperature_7 + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - specific_total_water_3 + - specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - eastward_wind_0 + - eastward_wind_1 + - eastward_wind_2 + - eastward_wind_3 + - eastward_wind_4 + - eastward_wind_5 + - eastward_wind_6 + - eastward_wind_7 + - northward_wind_0 + - northward_wind_1 + - northward_wind_2 + - northward_wind_3 + - northward_wind_4 + - northward_wind_5 + - northward_wind_6 + - northward_wind_7 + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - ULWRFsfc + - ULWRFtoa + - DLWRFsfc + - DSWRFsfc + - USWRFsfc + - USWRFtoa + - tendency_of_total_water_path_due_to_advection + - TMP850 + - h500 + # ocean coupling variables (diagnostic for now) + - eastward_surface_wind_stress + - northward_surface_wind_stress diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-evaluator.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-evaluator.sh new file mode 100644 index 000000000..ac01c3e8e --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-evaluator.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e + +JOB_NAME="cm4-piControl-atmos-evaluator" 
+JOB_GROUP="cm4-piControl-atmos" +EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID containing the training checkpoint +CONFIG_FILENAME="ace-evaluator-config.yaml" +SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') +REPO_ROOT=$(git rev-parse --show-toplevel) + +cd $REPO_ROOT # so config path is valid no matter where we are running this script + +python -m fme.ace.validate_config --config_type evaluator $CONFIG_PATH + +gantry run \ + --name $JOB_NAME \ + --task-name $JOB_NAME \ + --description "ACE-CM4 piControl atmosphere evaluator" \ + --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ + --workspace ai2/ace \ + --priority normal \ + --not-preemptible \ + --cluster ai2/ceres-cirrascale \ + --cluster ai2/saturn-cirrascale \ + --weka climate-default:/climate-default \ + --env WANDB_USERNAME=$BEAKER_USERNAME \ + --env WANDB_NAME=$JOB_NAME \ + --env WANDB_JOB_TYPE=inference \ + --env WANDB_RUN_GROUP=$JOB_GROUP \ + --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ + --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ + --dataset-secret google-credentials:/tmp/google_application_credentials.json \ + --dataset $EXISTING_RESULTS_DATASET:training_checkpoints/best_inference_ckpt.tar:/ckpt.tar \ + --gpus 1 \ + --shared-memory 50GiB \ + --budget ai2/climate \ + --system-python \ + --install "pip install --no-deps ." 
\ + -- python -I -m fme.ace.evaluator $CONFIG_PATH diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-train.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-train.sh new file mode 100644 index 000000000..cd91596d7 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-train.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -e + +JOB_NAME="cm4-piControl-atmos-train" +JOB_GROUP="cm4-piControl-atmos" +CONFIG_FILENAME="ace-train-config.yaml" +SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') +REPO_ROOT=$(git rev-parse --show-toplevel) +N_GPUS=4 +STATS_DATA=jamesd/2025-07-11-cm4-piControl-200yr-coupled-stats-atmosphere + +cd $REPO_ROOT # so config path is valid no matter where we are running this script + +python -m fme.ace.validate_config --config_type train $CONFIG_PATH + +gantry run \ + --name $JOB_NAME \ + --task-name $JOB_NAME \ + --description "ACE-CM4 piControl atmosphere training" \ + --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ + --workspace ai2/ace \ + --priority normal \ + --preemptible \ + --cluster ai2/ceres-cirrascale \ + --cluster ai2/saturn-cirrascale \ + --weka climate-default:/climate-default \ + --env WANDB_USERNAME=$BEAKER_USERNAME \ + --env WANDB_NAME=$JOB_NAME \ + --env WANDB_JOB_TYPE=training \ + --env WANDB_RUN_GROUP=$JOB_GROUP \ + --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ + --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ + --dataset-secret google-credentials:/tmp/google_application_credentials.json \ + --dataset $STATS_DATA:/statsdata \ + --gpus $N_GPUS \ + --shared-memory 400GiB \ + --budget ai2/climate \ + --system-python \ + --install "pip install --no-deps ." 
\ + -- torchrun --nproc_per_node $N_GPUS -m fme.ace.train $CONFIG_PATH diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-evaluator-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-evaluator-config.yaml new file mode 100644 index 000000000..c9f5ef92b --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-evaluator-config.yaml @@ -0,0 +1,31 @@ +experiment_dir: /results +n_forward_steps: 2920 +forward_steps_in_memory: 40 +checkpoint_path: /ckpt.tar +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: ace-samudra-cm4 + entity: ai2cm +loader: + start_indices: + times: + - '0311-01-01T00:00:00' + dataset: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-01-27-cm4-piControl-atmosphere-5daily-sfc-only/zarr_v3/data.zarr + engine: zarr + num_data_workers: 1 +data_writer: + save_prediction_files: false + save_monthly_files: true +aggregator: + log_histograms: true diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-train-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-train-config.yaml new file mode 100644 index 000000000..2e2d809f2 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-train-config.yaml @@ -0,0 +1,335 @@ +experiment_dir: /results +save_checkpoint: true +validate_using_ema: true +ema: + decay: 0.999 +max_epochs: 150 +n_forward_steps: 4 +inference: + n_forward_steps: 1440 + forward_steps_in_memory: 40 + loader: + start_indices: + times: + - '0151-01-06T00:00:00' + - '0171-01-06T00:00:00' + - '0191-01-06T00:00:00' + - '0211-01-06T00:00:00' + - '0231-01-06T00:00:00' + - '0251-01-06T00:00:00' + - '0271-01-06T00:00:00' + - '0291-01-06T00:00:00' + dataset: + 
merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-01-27-cm4-piControl-atmosphere-5daily-sfc-only/zarr_v3/data.zarr + engine: zarr + num_data_workers: 8 + aggregator: + log_histograms: false +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: ace-samudra-cm4 + entity: ai2cm +train_loader: + batch_size: 16 + num_data_workers: 8 + dataset: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-01-27-cm4-piControl-atmosphere-5daily-sfc-only/zarr_v3/data.zarr + engine: zarr + subset: + stop_time: '0306-01-01' +validation_loader: + batch_size: 16 + num_data_workers: 8 + dataset: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-01-27-cm4-piControl-atmosphere-5daily-sfc-only/zarr_v3/data.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' +optimization: + use_gradient_accumulation: true + enable_automatic_mixed_precision: false + lr: 0.0001 + optimizer_type: FusedAdam + kwargs: + weight_decay: 0.01 +stepper: + loss: + type: MSE + input_masking: + mask_value: 0 + fill_value: 0.0 + exclude_names_and_prefixes: + - 
land_fraction + step: + type: single_module + config: + builder: + type: Samudra + config: + ch_width: + - 200 + - 250 + - 300 + - 400 + dilation: + - 1 + - 2 + - 4 + - 8 + n_layers: + - 1 + - 1 + - 1 + - 1 + norm: instance + normalization: + network: + global_means_path: /statsdata/centering.nc + global_stds_path: /statsdata/scaling-full-field.nc + corrector: + type: ocean_corrector + config: + force_positive_names: + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - HI + sea_ice_fraction_correction: + sea_ice_fraction_name: ocean_sea_ice_fraction + land_fraction_name: land_fraction + remove_negative_ocean_fraction: false + next_step_forcing_names: + - DLWRFsfc + - DSWRFsfc + - ULWRFsfc + - USWRFsfc + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - eastward_surface_wind_stress + - northward_surface_wind_stress + in_names: + - DLWRFsfc + - DSWRFsfc + - ULWRFsfc + - USWRFsfc + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - eastward_surface_wind_stress + - northward_surface_wind_stress + - land_fraction + - sst + - zos + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - thetao_0 + - thetao_1 + - thetao_2 + - thetao_3 + - thetao_4 + - thetao_5 + - thetao_6 + - thetao_7 + - thetao_8 + - thetao_9 + - thetao_10 + - thetao_11 + - thetao_12 + - thetao_13 + - thetao_14 + - thetao_15 + - thetao_16 + - thetao_17 + - thetao_18 + - uo_0 + - uo_1 + - uo_2 + - uo_3 + - uo_4 + - uo_5 + - uo_6 + - uo_7 + - uo_8 + - uo_9 + - uo_10 + - uo_11 + - uo_12 + - uo_13 + - uo_14 + - uo_15 + - uo_16 + - uo_17 + - uo_18 + - vo_0 + - vo_1 + - vo_2 + - vo_3 + - vo_4 + - vo_5 + - vo_6 + - vo_7 + - vo_8 + - vo_9 + - vo_10 + - vo_11 + - vo_12 + - vo_13 + - vo_14 + - vo_15 + - vo_16 + - vo_17 + - vo_18 + - ocean_sea_ice_fraction + - HI + out_names: + - 
sst + - zos + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - thetao_0 + - thetao_1 + - thetao_2 + - thetao_3 + - thetao_4 + - thetao_5 + - thetao_6 + - thetao_7 + - thetao_8 + - thetao_9 + - thetao_10 + - thetao_11 + - thetao_12 + - thetao_13 + - thetao_14 + - thetao_15 + - thetao_16 + - thetao_17 + - thetao_18 + - uo_0 + - uo_1 + - uo_2 + - uo_3 + - uo_4 + - uo_5 + - uo_6 + - uo_7 + - uo_8 + - uo_9 + - uo_10 + - uo_11 + - uo_12 + - uo_13 + - uo_14 + - uo_15 + - uo_16 + - uo_17 + - uo_18 + - vo_0 + - vo_1 + - vo_2 + - vo_3 + - vo_4 + - vo_5 + - vo_6 + - vo_7 + - vo_8 + - vo_9 + - vo_10 + - vo_11 + - vo_12 + - vo_13 + - vo_14 + - vo_15 + - vo_16 + - vo_17 + - vo_18 + - ocean_sea_ice_fraction + - HI diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-evaluator.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-evaluator.sh new file mode 100644 index 000000000..5dca6cb8c --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-evaluator.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e + +JOB_NAME="cm4-piControl-ocean-evaluator" +JOB_GROUP="cm4-piControl-ocean" +EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID containing the training checkpoint +CONFIG_FILENAME="ace-evaluator-config.yaml" +SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') +REPO_ROOT=$(git rev-parse --show-toplevel) + +cd $REPO_ROOT # so config path is valid no matter where we are running this script + +python -m fme.ace.validate_config --config_type evaluator $CONFIG_PATH + +gantry run \ + --name $JOB_NAME \ + --task-name $JOB_NAME \ + --description "ACE-Samudra CM4 piControl ocean evaluator" \ + --beaker-image "$(cat 
$REPO_ROOT/latest_deps_only_image.txt)" \ + --workspace ai2/ace \ + --priority normal \ + --not-preemptible \ + --cluster ai2/ceres-cirrascale \ + --cluster ai2/saturn-cirrascale \ + --weka climate-default:/climate-default \ + --env WANDB_USERNAME=$BEAKER_USERNAME \ + --env WANDB_NAME=$JOB_NAME \ + --env WANDB_JOB_TYPE=inference \ + --env WANDB_RUN_GROUP=$JOB_GROUP \ + --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ + --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ + --dataset-secret google-credentials:/tmp/google_application_credentials.json \ + --dataset $EXISTING_RESULTS_DATASET:training_checkpoints/best_inference_ckpt.tar:/ckpt.tar \ + --gpus 1 \ + --shared-memory 50GiB \ + --budget ai2/climate \ + --system-python \ + --install "pip install --no-deps ." \ + -- python -I -m fme.ace.evaluator $CONFIG_PATH diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-train.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-train.sh new file mode 100644 index 000000000..da4bcdb92 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-train.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -e + +JOB_NAME="cm4-piControl-ocean-train" +JOB_GROUP="cm4-piControl-ocean" +CONFIG_FILENAME="ace-train-config.yaml" +SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') +REPO_ROOT=$(git rev-parse --show-toplevel) +N_GPUS=2 +STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-ocean + +cd $REPO_ROOT # so config path is valid no matter where we are running this script + +python -m fme.ace.validate_config --config_type train $CONFIG_PATH + +gantry run \ + --name $JOB_NAME \ + --task-name $JOB_NAME \ + --description "ACE-Samudra CM4 piControl ocean training" \ + --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ + 
--workspace ai2/ace \ + --priority normal \ + --preemptible \ + --cluster ai2/ceres-cirrascale \ + --cluster ai2/saturn-cirrascale \ + --weka climate-default:/climate-default \ + --env WANDB_USERNAME=$BEAKER_USERNAME \ + --env WANDB_NAME=$JOB_NAME \ + --env WANDB_JOB_TYPE=training \ + --env WANDB_RUN_GROUP=$JOB_GROUP \ + --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ + --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ + --dataset-secret google-credentials:/tmp/google_application_credentials.json \ + --dataset $STATS_DATA:/statsdata \ + --gpus $N_GPUS \ + --shared-memory 400GiB \ + --budget ai2/climate \ + --system-python \ + --install "pip install --no-deps ." \ + -- torchrun --nproc_per_node $N_GPUS -m fme.ace.train $CONFIG_PATH From 7d2db3f6bf5cb2aa166be11ea82ea4d34f170123 Mon Sep 17 00:00:00 2001 From: James Duncan Date: Thu, 26 Mar 2026 14:23:35 -0700 Subject: [PATCH 2/9] Do some file renaming --- .../uncoupled-atmos/{run-ace-evaluator.sh => evaluate.sh} | 7 ++++--- .../{ace-evaluator-config.yaml => evaluator-config.yaml} | 0 .../{ace-train-config.yaml => train-config.yaml} | 3 ++- .../uncoupled-atmos/{run-ace-train.sh => train.sh} | 4 ++-- .../uncoupled-ocean/{run-ace-evaluator.sh => evaluate.sh} | 7 ++++--- .../{ace-evaluator-config.yaml => evaluator-config.yaml} | 0 .../{ace-train-config.yaml => train-config.yaml} | 3 ++- .../uncoupled-ocean/{run-ace-train.sh => train.sh} | 4 ++-- 8 files changed, 16 insertions(+), 12 deletions(-) rename configs/baselines/coupled/cm4-piControl/uncoupled-atmos/{run-ace-evaluator.sh => evaluate.sh} (85%) rename configs/baselines/coupled/cm4-piControl/uncoupled-atmos/{ace-evaluator-config.yaml => evaluator-config.yaml} (100%) rename configs/baselines/coupled/cm4-piControl/uncoupled-atmos/{ace-train-config.yaml => train-config.yaml} (99%) rename configs/baselines/coupled/cm4-piControl/uncoupled-atmos/{run-ace-train.sh => train.sh} (93%) rename 
configs/baselines/coupled/cm4-piControl/uncoupled-ocean/{run-ace-evaluator.sh => evaluate.sh} (85%) rename configs/baselines/coupled/cm4-piControl/uncoupled-ocean/{ace-evaluator-config.yaml => evaluator-config.yaml} (100%) rename configs/baselines/coupled/cm4-piControl/uncoupled-ocean/{ace-train-config.yaml => train-config.yaml} (99%) rename configs/baselines/coupled/cm4-piControl/uncoupled-ocean/{run-ace-train.sh => train.sh} (93%) diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-evaluator.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluate.sh similarity index 85% rename from configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-evaluator.sh rename to configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluate.sh index ac01c3e8e..26163eac9 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-evaluator.sh +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluate.sh @@ -4,8 +4,8 @@ set -e JOB_NAME="cm4-piControl-atmos-evaluator" JOB_GROUP="cm4-piControl-atmos" -EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID containing the training checkpoint -CONFIG_FILENAME="ace-evaluator-config.yaml" +EXISTING_RESULTS_DATASET="01JXXESTVASYBEKBM1VAWCRV87" # beaker dataset ID containing the training checkpoint +CONFIG_FILENAME="evaluator-config.yaml" SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') @@ -18,12 +18,13 @@ python -m fme.ace.validate_config --config_type evaluator $CONFIG_PATH gantry run \ --name $JOB_NAME \ --task-name $JOB_NAME \ - --description "ACE-CM4 piControl atmosphere evaluator" \ + --description "ACE2 CM4 piControl atmosphere evaluator" \ --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ --workspace ai2/ace \ --priority normal \ --not-preemptible \ --cluster ai2/ceres-cirrascale \ + --cluster 
ai2/jupiter-cirrascale \ --cluster ai2/saturn-cirrascale \ --weka climate-default:/climate-default \ --env WANDB_USERNAME=$BEAKER_USERNAME \ diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-evaluator-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluator-config.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-evaluator-config.yaml rename to configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluator-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-train-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml similarity index 99% rename from configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-train-config.yaml rename to configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml index f2faeac7f..2a3e8db44 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/ace-train-config.yaml +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml @@ -77,7 +77,7 @@ optimization: optimizer_type: FusedAdam kwargs: weight_decay: 0.01 -stepper: +stepper_training: loss: type: MSE weights: @@ -98,6 +98,7 @@ stepper: DSWRFsfc: 2 USWRFtoa: 2 tendency_of_total_water_path_due_to_advection: 0.5 +stepper: step: type: single_module config: diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-train.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh similarity index 93% rename from configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-train.sh rename to configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh index cd91596d7..43d73f2c4 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/run-ace-train.sh +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh @@ -4,7 +4,7 @@ set -e JOB_NAME="cm4-piControl-atmos-train" JOB_GROUP="cm4-piControl-atmos" 
-CONFIG_FILENAME="ace-train-config.yaml" +CONFIG_FILENAME="train-config.yaml" SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') @@ -19,7 +19,7 @@ python -m fme.ace.validate_config --config_type train $CONFIG_PATH gantry run \ --name $JOB_NAME \ --task-name $JOB_NAME \ - --description "ACE-CM4 piControl atmosphere training" \ + --description "ACE2 CM4 piControl atmosphere training" \ --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ --workspace ai2/ace \ --priority normal \ diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-evaluator.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluate.sh similarity index 85% rename from configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-evaluator.sh rename to configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluate.sh index 5dca6cb8c..3b67cb0ba 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-evaluator.sh +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluate.sh @@ -4,8 +4,8 @@ set -e JOB_NAME="cm4-piControl-ocean-evaluator" JOB_GROUP="cm4-piControl-ocean" -EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID containing the training checkpoint -CONFIG_FILENAME="ace-evaluator-config.yaml" +EXISTING_RESULTS_DATASET="01JX4DEKY2A13D6Y95T53DSVCQ" # beaker dataset ID containing the training checkpoint +CONFIG_FILENAME="evaluator-config.yaml" SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') @@ -18,12 +18,13 @@ python -m fme.ace.validate_config --config_type evaluator $CONFIG_PATH gantry run \ --name $JOB_NAME \ --task-name $JOB_NAME \ - --description "ACE-Samudra CM4 piControl ocean evaluator" \ + --description 
"Samudra CM4 piControl ocean evaluator" \ --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ --workspace ai2/ace \ --priority normal \ --not-preemptible \ --cluster ai2/ceres-cirrascale \ + --cluster ai2/jupiter-cirrascale \ --cluster ai2/saturn-cirrascale \ --weka climate-default:/climate-default \ --env WANDB_USERNAME=$BEAKER_USERNAME \ diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-evaluator-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluator-config.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-evaluator-config.yaml rename to configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluator-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-train-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml similarity index 99% rename from configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-train-config.yaml rename to configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml index 2e2d809f2..7a58030e3 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/ace-train-config.yaml +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml @@ -89,9 +89,10 @@ optimization: optimizer_type: FusedAdam kwargs: weight_decay: 0.01 -stepper: +stepper_training: loss: type: MSE +stepper: input_masking: mask_value: 0 fill_value: 0.0 diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-train.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh similarity index 93% rename from configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-train.sh rename to configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh index da4bcdb92..94565c64a 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/run-ace-train.sh +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh @@ 
-4,7 +4,7 @@ set -e JOB_NAME="cm4-piControl-ocean-train" JOB_GROUP="cm4-piControl-ocean" -CONFIG_FILENAME="ace-train-config.yaml" +CONFIG_FILENAME="train-config.yaml" SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') @@ -19,7 +19,7 @@ python -m fme.ace.validate_config --config_type train $CONFIG_PATH gantry run \ --name $JOB_NAME \ --task-name $JOB_NAME \ - --description "ACE-Samudra CM4 piControl ocean training" \ + --description "SamudraI CM4 piControl ocean training" \ --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ --workspace ai2/ace \ --priority normal \ From 1d92e556cc6e437b8c05d64c40fa3f259a6eb609 Mon Sep 17 00:00:00 2001 From: James Duncan Date: Thu, 26 Mar 2026 14:58:59 -0700 Subject: [PATCH 3/9] Update configs and scripts --- .../coupled/cm4-piControl/evaluate.sh | 4 +- .../cm4-piControl/evaluator-config-ICx1.yaml | 41 ---- .../cm4-piControl/evaluator-config.yaml | 39 ++++ .../finetune-config-template.yaml | 167 +++++++++-------- .../coupled/cm4-piControl/finetune.sh | 34 ++-- .../cm4-piControl/train-config-template.yaml | 177 ++++++++++-------- .../baselines/coupled/cm4-piControl/train.sh | 34 ++-- .../uncoupled-atmos/evaluator-config.yaml | 4 +- .../uncoupled-atmos/train-config.yaml | 2 +- .../cm4-piControl/uncoupled-atmos/train.sh | 2 +- .../uncoupled-ocean/evaluator-config.yaml | 4 +- .../uncoupled-ocean/train-config.yaml | 2 +- .../cm4-piControl/uncoupled-ocean/train.sh | 2 +- 13 files changed, 262 insertions(+), 250 deletions(-) delete mode 100644 configs/baselines/coupled/cm4-piControl/evaluator-config-ICx1.yaml create mode 100644 configs/baselines/coupled/cm4-piControl/evaluator-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/evaluate.sh b/configs/baselines/coupled/cm4-piControl/evaluate.sh index e48d3e501..cf296546a 100644 --- 
a/configs/baselines/coupled/cm4-piControl/evaluate.sh +++ b/configs/baselines/coupled/cm4-piControl/evaluate.sh @@ -4,8 +4,8 @@ set -e JOB_NAME="cm4-piControl-coupled-evaluator" JOB_GROUP="cm4-piControl-coupled" -EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID from coupled training or fine-tuning -CONFIG_FILENAME="evaluator-config-ICx1.yaml" +EXISTING_RESULTS_DATASET="01JZHQJXC4EYAPTCSP188YSVC0" # beaker dataset ID from coupled training or fine-tuning +CONFIG_FILENAME="evaluator-config.yaml" SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') diff --git a/configs/baselines/coupled/cm4-piControl/evaluator-config-ICx1.yaml b/configs/baselines/coupled/cm4-piControl/evaluator-config-ICx1.yaml deleted file mode 100644 index dabe161e6..000000000 --- a/configs/baselines/coupled/cm4-piControl/evaluator-config-ICx1.yaml +++ /dev/null @@ -1,41 +0,0 @@ -experiment_dir: /results -n_coupled_steps: 2920 -coupled_steps_in_memory: 20 -checkpoint_path: /ckpt.tar -data_writer: - ocean: - save_prediction_files: false - save_monthly_files: true - atmosphere: - save_prediction_files: false - save_monthly_files: true -logging: - log_to_screen: true - log_to_wandb: true - log_to_file: true - project: ace-samudra-coupled-cm4 - entity: ai2cm -loader: - num_data_workers: 1 - dataset: - ocean: - data_path: /climate-default - file_pattern: 2025-05-14-cm4-piControl-200yr-ocean.zarr - engine: zarr - atmosphere: - merge: - - data_path: /climate-default - file_pattern: 2025-05-13-cm4-piControl-200yr-coupled-sst-sic-6h-interpFalse.zarr - engine: zarr - subset: - # NOTE: this is required to align the atmosphere and ocean start times - start_time: '0151-01-06' - - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - subset: - # NOTE: this is required to align the atmosphere 
and ocean start times - start_time: '0151-01-06' - start_indices: - times: - - '0311-01-01T00:00:00' diff --git a/configs/baselines/coupled/cm4-piControl/evaluator-config.yaml b/configs/baselines/coupled/cm4-piControl/evaluator-config.yaml new file mode 100644 index 000000000..139dbdbc0 --- /dev/null +++ b/configs/baselines/coupled/cm4-piControl/evaluator-config.yaml @@ -0,0 +1,39 @@ +experiment_dir: /results +n_coupled_steps: 2920 +coupled_steps_in_memory: 20 +checkpoint_path: /ckpt.tar +data_writer: + ocean: + save_prediction_files: false + save_monthly_files: false + atmosphere: + save_prediction_files: false + save_monthly_files: false +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: SamudrACE-CM4-piControl + entity: ai2cm +loader: + num_data_workers: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + start_indices: + times: + - '0311-01-01T00:00:00' diff --git a/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml b/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml index beaa51055..3341af255 100644 --- a/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml +++ b/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml @@ -16,27 +16,17 @@ inference: - data_path: /climate-default file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr engine: zarr - subset: - start_time: '0151-01-06' - data_path: /climate-default file_pattern: 
2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr engine: zarr - subset: - start_time: '0151-01-06' atmosphere: merge: - data_path: /climate-default file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr engine: zarr - subset: - # NOTE: this is required to align the atmosphere and ocean start times - start_time: '0151-01-06' - data_path: /climate-default file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr engine: zarr - subset: - # NOTE: this is required to align the atmosphere and ocean start times - start_time: '0151-01-06' start_indices: times: - '0151-01-06T00:00:00' @@ -54,74 +44,74 @@ logging: log_to_screen: true log_to_wandb: true log_to_file: true - project: ace-samudra-coupled-cm4 + project: SamudrACE-CM4-piControl entity: ai2cm train_loader: batch_size: 16 num_data_workers: 4 prefetch_factor: 1 dataset: - - ocean: - merge: - - data_path: /climate-default - file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr - engine: zarr - subset: - start_time: '0151-01-06' - stop_time: '0306-01-01' - - data_path: /climate-default - file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr - engine: zarr - subset: - start_time: '0151-01-06' - stop_time: '0306-01-01' - atmosphere: - merge: - - data_path: /climate-default - file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr - engine: zarr - subset: - start_time: '0151-01-06' - stop_time: '0306-01-01' - - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - subset: - start_time: '0151-01-06' - stop_time: '0306-01-01' + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: 
'0151-01-06' + stop_time: '0306-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' validation_loader: batch_size: 16 num_data_workers: 4 prefetch_factor: 1 dataset: - - ocean: - merge: - - data_path: /climate-default - file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' - - data_path: /climate-default - file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' - atmosphere: - merge: - - data_path: /climate-default - file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' - - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 
2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' optimization: enable_automatic_mixed_precision: false lr: 0.00001 @@ -131,22 +121,47 @@ optimization: use_gradient_accumulation: true scheduler: type: CosineAnnealingLR -stepper: +stepper_training: parameter_init: checkpoint_path: /ckpt.tar ocean: - timedelta: 5D + parameter_init: + weights_path: null # null required + loss: + type: MSE loss_contributions: n_steps: 4 weight: 1.0 - stepper: - parameter_init: - weights_path: null # required atmosphere: - timedelta: 6h + parameter_init: + weights_path: null # null required loss_contributions: n_steps: 2 weight: 1.0 - stepper: - parameter_init: - weights_path: null # required + loss: + type: MSE + weights: + air_temperature_0: 0.5 + air_temperature_1: 0.5 + eastward_wind_0: 0.5 + northward_wind_0: 0.5 + specific_total_water_0: 0.5 + specific_total_water_1: 0.25 + specific_total_water_2: 0.5 + PRATEsfc: 0.5 + h500: 10 + TMP850: 5 + Q2m: 0.5 + DLWRFsfc: 2 + ULWRFsfc: 5 + USWRFsfc: 2 + DSWRFsfc: 2 + USWRFtoa: 2 + tendency_of_total_water_path_due_to_advection: 0.5 +stepper: + ocean: + timedelta: 5D + # stepper added by finetune.sh from uncoupled-ocean/train-config.yaml + atmosphere: + timedelta: 6h + # stepper added by finetune.sh from uncoupled-atmos/train-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/finetune.sh b/configs/baselines/coupled/cm4-piControl/finetune.sh index 2354941b2..08185b69f 100644 --- a/configs/baselines/coupled/cm4-piControl/finetune.sh +++ b/configs/baselines/coupled/cm4-piControl/finetune.sh @@ -1,10 +1,8 @@ #!/bin/bash # -# Coupled fine-tuning: starting from a coupled training checkpoint, fine-tunes -# both atmosphere and ocean models jointly with a cosine-annealing LR schedule. -# Generates the full coupled config by merging stepper definitions from the -# uncoupled training configs into finetune-config-template.yaml. -# Requires yq >= 4. 
+# SamudrACE CM4 piControl training stage 2: starting from the stage 1 +# checkpoint, fine-tunes both atmosphere and ocean models jointly with a +# cosine-annealing LR schedule. set -e @@ -14,6 +12,7 @@ EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID from coupled training (trai CKPT_TYPE="best_inference_ckpt" SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +SCRIPT_PATH=${SCRIPT_PATH%/} BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') REPO_ROOT=$(git rev-parse --show-toplevel) N_GPUS=4 @@ -23,30 +22,21 @@ OCEAN_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-ocean cd "$REPO_ROOT" # so config path is valid no matter where we are running this script -# --- Generate coupled-finetune-config.yaml from template + uncoupled configs --- +# --- Generate finetune-config.yaml from template + uncoupled configs --- -TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}finetune-config-template.yaml" -CONFIG_PATH="${SCRIPT_PATH}coupled-finetune-config.yaml" +TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}/finetune-config-template.yaml" +CONFIG_PATH="${SCRIPT_PATH}/finetune-config.yaml" -cp "${SCRIPT_PATH}uncoupled-atmos/ace-train-config.yaml" ./atmos-config.yaml +cp "${SCRIPT_PATH}/uncoupled-atmos/train-config.yaml" ./atmos-config.yaml sed -i 's/statsdata/atmos_stats/g' ./atmos-config.yaml -cp "${SCRIPT_PATH}uncoupled-ocean/ace-train-config.yaml" ./ocean-config.yaml +cp "${SCRIPT_PATH}/uncoupled-ocean/train-config.yaml" ./ocean-config.yaml sed -i 's/statsdata/ocean_stats/g' ./ocean-config.yaml -yq -i 'del(.stepper.loss, .stepper.optimize_last_step_only, .stepper.n_ensemble, .stepper.parameter_init, .stepper.train_n_forward_step)' ./ocean-config.yaml -yq -i 'del(.stepper.loss, .stepper.optimize_last_step_only, .stepper.n_ensemble, .stepper.parameter_init, .stepper.train_n_forward_step)' ./atmos-config.yaml - -SIC_NAME=$(yq '.stepper.step.config.corrector.config.sea_ice_fraction_correction.sea_ice_fraction_name' 
./ocean-config.yaml) -if [[ "$SIC_NAME" == "null" ]]; then - echo "Failed to extract sea_ice_fraction_name from the ocean config" - exit 1 -fi - cp "$TEMPLATE_CONFIG_PATH" "$CONFIG_PATH" +# update component stepper configs, preserving template values on conflict yq -i '.stepper.ocean.stepper *=n load("ocean-config.yaml").stepper' "$CONFIG_PATH" -SIC_NAME=$SIC_NAME yq -i '.stepper.ocean_fraction_prediction.sea_ice_fraction_name = env(SIC_NAME)' "$CONFIG_PATH" yq -i '.stepper.atmosphere.stepper *=n load("atmos-config.yaml").stepper' "$CONFIG_PATH" rm ./atmos-config.yaml ./ocean-config.yaml @@ -58,13 +48,13 @@ python -m fme.coupled.validate_config "$CONFIG_PATH" --config_type train gantry run \ --name $JOB_NAME \ --task-name $JOB_NAME \ - --description "ACE coupled CM4 piControl fine-tuning" \ + --description "Run SamudrACE CM4 piControl ocean + atmos fine-tuning" \ --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ --workspace ai2/ace \ --priority normal \ --preemptible \ --cluster ai2/ceres-cirrascale \ - --cluster ai2/saturn-cirrascale \ + --cluster ai2/jupiter-cirrascale \ --weka climate-default:/climate-default \ --env WANDB_USERNAME=$BEAKER_USERNAME \ --env WANDB_NAME=$JOB_NAME \ diff --git a/configs/baselines/coupled/cm4-piControl/train-config-template.yaml b/configs/baselines/coupled/cm4-piControl/train-config-template.yaml index f0d9d28b3..657b2115a 100644 --- a/configs/baselines/coupled/cm4-piControl/train-config-template.yaml +++ b/configs/baselines/coupled/cm4-piControl/train-config-template.yaml @@ -16,27 +16,17 @@ inference: - data_path: /climate-default file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr engine: zarr - subset: - start_time: '0151-01-06' - data_path: /climate-default file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr engine: zarr - subset: - start_time: '0151-01-06' atmosphere: merge: - data_path: /climate-default file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr 
engine: zarr - subset: - # NOTE: this is required to align the atmosphere and ocean start times - start_time: '0151-01-06' - data_path: /climate-default file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr engine: zarr - subset: - # NOTE: this is required to align the atmosphere and ocean start times - start_time: '0151-01-06' start_indices: times: - '0151-01-06T00:00:00' @@ -54,74 +44,74 @@ logging: log_to_screen: true log_to_wandb: true log_to_file: true - project: ace-samudra-coupled-cm4 + project: SamudrACE-CM4-piControl entity: ai2cm train_loader: batch_size: 16 num_data_workers: 4 prefetch_factor: 1 dataset: - - ocean: - merge: - - data_path: /climate-default - file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr - engine: zarr - subset: - start_time: '0151-01-06' - stop_time: '0306-01-01' - - data_path: /climate-default - file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr - engine: zarr - subset: - start_time: '0151-01-06' - stop_time: '0306-01-01' - atmosphere: - merge: - - data_path: /climate-default - file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr - engine: zarr - subset: - start_time: '0151-01-06' - stop_time: '0306-01-01' - - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - subset: - start_time: '0151-01-06' - stop_time: '0306-01-01' + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0151-01-06' + 
stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' validation_loader: batch_size: 16 num_data_workers: 4 prefetch_factor: 1 dataset: - - ocean: - merge: - - data_path: /climate-default - file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' - - data_path: /climate-default - file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' - atmosphere: - merge: - - data_path: /climate-default - file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' - - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' optimization: enable_automatic_mixed_precision: false lr: 0.0001 @@ -129,24 +119,53 @@ optimization: kwargs: weight_decay: 0.01 
use_gradient_accumulation: true +stepper_training: + ocean: + parameter_init: + weights_path: /ocean_ckpt.tar + loss: + type: MSE + loss_contributions: + n_steps: 4 + weight: 1.0 + atmosphere: + parameter_init: + weights_path: /atmos_ckpt.tar + parameters: + - frozen: + include: + - '*' + loss_contributions: + n_steps: 0 + loss: + type: MSE + weights: + air_temperature_0: 0.5 + air_temperature_1: 0.5 + eastward_wind_0: 0.5 + northward_wind_0: 0.5 + specific_total_water_0: 0.5 + specific_total_water_1: 0.25 + specific_total_water_2: 0.5 + PRATEsfc: 0.5 + h500: 10 + TMP850: 5 + Q2m: 0.5 + DLWRFsfc: 2 + ULWRFsfc: 5 + USWRFsfc: 2 + DSWRFsfc: 2 + USWRFtoa: 2 + tendency_of_total_water_path_due_to_advection: 0.5 stepper: ocean_fraction_prediction: land_fraction_name: land_fraction + sea_ice_fraction_name: ocean_sea_ice_fraction sea_ice_fraction_name_in_atmosphere: sea_ice_fraction sst_name: sst ocean: timedelta: 5D - stepper: - parameter_init: - weights_path: /ocean_ckpt.tar + # stepper added by train.sh from uncoupled-ocean/train-config.yaml atmosphere: timedelta: 6h - loss_contributions: - n_steps: 0 - stepper: - parameter_init: - weights_path: /atmos_ckpt.tar - parameters: - - frozen: - include: - - '*' + # stepper added by train.sh from uncoupled-atmos/train-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/train.sh b/configs/baselines/coupled/cm4-piControl/train.sh index 9d2621bd5..f3eb52315 100644 --- a/configs/baselines/coupled/cm4-piControl/train.sh +++ b/configs/baselines/coupled/cm4-piControl/train.sh @@ -1,20 +1,19 @@ #!/bin/bash # -# Coupled training (freeze-then-optimize): freezes the atmosphere model and -# fine-tunes the ocean model in the coupled loop. Generates the full coupled -# config by merging stepper definitions from the uncoupled training configs -# into the train-config-template.yaml. Requires yq >= 4. +# SamudrACE CM4 piControl training stage 1: freezes the atmosphere model and +# fine-tunes the ocean model in coupled mode. 
set -e JOB_NAME="cm4-piControl-coupled-train" JOB_GROUP="cm4-piControl-coupled" -EXISTING_RESULTS_ATMOS_DATASET="TODO" # beaker dataset ID from uncoupled atmos training -EXISTING_RESULTS_OCEAN_DATASET="TODO" # beaker dataset ID from uncoupled ocean training +EXISTING_RESULTS_ATMOS_DATASET="01JXXESTVASYBEKBM1VAWCRV87" # beaker dataset ID from uncoupled atmos training +EXISTING_RESULTS_OCEAN_DATASET="01JX4DEKY2A13D6Y95T53DSVCQ" # beaker dataset ID from uncoupled ocean training ATMOS_CKPT="best_inference_ckpt" OCEAN_CKPT="best_inference_ckpt" SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository +SCRIPT_PATH=${SCRIPT_PATH%/} BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') REPO_ROOT=$(git rev-parse --show-toplevel) N_GPUS=4 @@ -24,30 +23,21 @@ OCEAN_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-ocean cd "$REPO_ROOT" # so config path is valid no matter where we are running this script -# --- Generate coupled-train-config.yaml from template + uncoupled configs --- +# --- Generate train-config.yaml from template + uncoupled configs --- -TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}train-config-template.yaml" -CONFIG_PATH="${SCRIPT_PATH}coupled-train-config.yaml" +TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}/train-config-template.yaml" +CONFIG_PATH="${SCRIPT_PATH}/train-config.yaml" -cp "${SCRIPT_PATH}uncoupled-atmos/ace-train-config.yaml" ./atmos-config.yaml +cp "${SCRIPT_PATH}/uncoupled-atmos/train-config.yaml" ./atmos-config.yaml sed -i 's/statsdata/atmos_stats/g' ./atmos-config.yaml -cp "${SCRIPT_PATH}uncoupled-ocean/ace-train-config.yaml" ./ocean-config.yaml +cp "${SCRIPT_PATH}/uncoupled-ocean/train-config.yaml" ./ocean-config.yaml sed -i 's/statsdata/ocean_stats/g' ./ocean-config.yaml -yq -i 'del(.stepper.loss, .stepper.optimize_last_step_only, .stepper.n_ensemble, .stepper.parameter_init, .stepper.train_n_forward_step)' ./ocean-config.yaml -yq -i 'del(.stepper.loss, .stepper.optimize_last_step_only, 
.stepper.n_ensemble, .stepper.parameter_init, .stepper.train_n_forward_step)' ./atmos-config.yaml - -SIC_NAME=$(yq '.stepper.step.config.corrector.config.sea_ice_fraction_correction.sea_ice_fraction_name' ./ocean-config.yaml) -if [[ "$SIC_NAME" == "null" ]]; then - echo "Failed to extract sea_ice_fraction_name from the ocean config" - exit 1 -fi - cp "$TEMPLATE_CONFIG_PATH" "$CONFIG_PATH" +# update component stepper configs, preserving template values on conflict yq -i '.stepper.ocean.stepper *=n load("ocean-config.yaml").stepper' "$CONFIG_PATH" -SIC_NAME=$SIC_NAME yq -i '.stepper.ocean_fraction_prediction.sea_ice_fraction_name = env(SIC_NAME)' "$CONFIG_PATH" yq -i '.stepper.atmosphere.stepper *=n load("atmos-config.yaml").stepper' "$CONFIG_PATH" rm ./atmos-config.yaml ./ocean-config.yaml @@ -59,7 +49,7 @@ python -m fme.coupled.validate_config "$CONFIG_PATH" --config_type train gantry run \ --name $JOB_NAME \ --task-name $JOB_NAME \ - --description "ACE coupled CM4 piControl training (freeze-then-optimize)" \ + --description "Run SamudrACE CM4 piControl ocean-only fine-tuning" \ --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ --workspace ai2/ace \ --priority normal \ diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluator-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluator-config.yaml index 5fd0ac44d..c21fa67cb 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluator-config.yaml +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluator-config.yaml @@ -6,7 +6,7 @@ logging: log_to_screen: true log_to_wandb: true log_to_file: true - project: ace2-cm4-atmos-noLSM + project: SamudrACE-CM4-piControl entity: ai2cm loader: start_indices: @@ -26,4 +26,4 @@ aggregator: log_histograms: true data_writer: save_prediction_files: false - save_monthly_files: true + save_monthly_files: false diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml 
b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml index 2a3e8db44..a03bf034f 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml @@ -34,7 +34,7 @@ logging: log_to_screen: true log_to_wandb: true log_to_file: true - project: ace2-cm4-atmos-noLSM + project: SamudrACE-CM4-piControl entity: ai2cm train_loader: batch_size: 16 diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh index 43d73f2c4..64b514f2f 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh @@ -19,7 +19,7 @@ python -m fme.ace.validate_config --config_type train $CONFIG_PATH gantry run \ --name $JOB_NAME \ --task-name $JOB_NAME \ - --description "ACE2 CM4 piControl atmosphere training" \ + --description "ACE2 CM4 piControl atmosphere pretraining" \ --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ --workspace ai2/ace \ --priority normal \ diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluator-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluator-config.yaml index c9f5ef92b..89e48ab34 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluator-config.yaml +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluator-config.yaml @@ -6,7 +6,7 @@ logging: log_to_screen: true log_to_wandb: true log_to_file: true - project: ace-samudra-cm4 + project: SamudrACE-CM4-piControl entity: ai2cm loader: start_indices: @@ -26,6 +26,6 @@ loader: num_data_workers: 1 data_writer: save_prediction_files: false - save_monthly_files: true + save_monthly_files: false aggregator: log_histograms: true diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml 
b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml index 7a58030e3..2a7a9637e 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml @@ -37,7 +37,7 @@ logging: log_to_screen: true log_to_wandb: true log_to_file: true - project: ace-samudra-cm4 + project: SamudrACE-CM4-piControl entity: ai2cm train_loader: batch_size: 16 diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh index 94565c64a..b5bbb705b 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh @@ -19,7 +19,7 @@ python -m fme.ace.validate_config --config_type train $CONFIG_PATH gantry run \ --name $JOB_NAME \ --task-name $JOB_NAME \ - --description "SamudraI CM4 piControl ocean training" \ + --description "SamudraI CM4 piControl ocean pretraining" \ --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ --workspace ai2/ace \ --priority normal \ From d31d6cc9aa57ed62e0d315434f3961a33a1c2fd6 Mon Sep 17 00:00:00 2001 From: James Duncan Date: Thu, 26 Mar 2026 15:09:05 -0700 Subject: [PATCH 4/9] Update README --- .../baselines/coupled/cm4-piControl/README.md | 99 +++++-------------- 1 file changed, 24 insertions(+), 75 deletions(-) diff --git a/configs/baselines/coupled/cm4-piControl/README.md b/configs/baselines/coupled/cm4-piControl/README.md index 140a48833..c8cf1d6ec 100644 --- a/configs/baselines/coupled/cm4-piControl/README.md +++ b/configs/baselines/coupled/cm4-piControl/README.md @@ -1,31 +1,30 @@ -# Coupled training and evaluation for CM4-piControl +# SamudrACE training and evaluation for CM4 piControl Self-contained baseline configs and scripts for the SamudrACE coupled -atmosphere-ocean training pipeline on GFDL CM4 piControl data. 
+atmosphere-ocean training pipeline on the 200-year GFDL CM4 piControl data. ## Pipeline overview -The pipeline trains uncoupled atmosphere and ocean models independently, then -couples them via freeze-then-optimize (FTO) training, with an optional -joint fine-tuning stage. Each step produces a Beaker dataset whose ID is -plugged into the next script's `EXISTING_RESULTS_DATASET` variable. +The pipeline first trains uncoupled atmosphere and ocean models independently, +then couples them in two stages: ocean-only fine-tuning (`train.sh`) and then +joint ocean-and-atmosphere fine-tuning (`finetune.sh`). ``` -Step 1: Uncoupled training (parallel) - uncoupled-atmos/run-ace-train.sh -> atmos checkpoint - uncoupled-ocean/run-ace-train.sh -> ocean checkpoint +Uncoupled training: + uncoupled-atmos/train.sh -> atmos checkpoint + uncoupled-ocean/train.sh -> ocean checkpoint -Step 2 (optional): Uncoupled evaluation - uncoupled-atmos/run-ace-evaluator.sh - uncoupled-ocean/run-ace-evaluator.sh +Uncoupled evaluation: + uncoupled-atmos/evaluate.sh + uncoupled-ocean/evaluate.sh -Step 3: Coupled training (freeze-then-optimize) +Coupled training stage 1: train.sh -> coupled checkpoint (atmos frozen, ocean fine-tuned) -Step 4 (optional): Coupled fine-tuning +Coupled training stage 2: finetune.sh -> refined coupled checkpoint (both models trained) -Step 5: Coupled evaluation +Coupled evaluation evaluate.sh ``` @@ -33,63 +32,13 @@ Step 5: Coupled evaluation | File | Purpose | |------|---------| -| `uncoupled-atmos/ace-train-config.yaml` | SFNO atmosphere model: architecture, variables, loss weights | -| `uncoupled-atmos/ace-evaluator-config.yaml` | Atmosphere evaluation (58,300 steps = ~40 years at 6h) | -| `uncoupled-ocean/ace-train-config.yaml` | Samudra ocean model: architecture, variables, correctors | -| `uncoupled-ocean/ace-evaluator-config.yaml` | Ocean evaluation (2,920 steps = ~40 years at 5-day) | -| `train-config-template.yaml` | Coupled FTO training: data loaders, 
optimization, coupled stepper skeleton (atmos frozen, ocean trainable) | -| `finetune-config-template.yaml` | Coupled fine-tuning: lower LR, cosine annealing, both models trainable, loads from coupled checkpoint | -| `evaluator-config-ICx1.yaml` | Coupled evaluation from a single initial condition (year 311) | -| `train.sh` | Generates `coupled-train-config.yaml` and submits coupled training | -| `finetune.sh` | Generates `coupled-finetune-config.yaml` and submits fine-tuning | -| `evaluate.sh` | Submits coupled evaluation | - -## How configs are generated - -The coupled training configs are too large to maintain by hand since they -embed the full stepper definitions for both atmosphere and ocean models. -Instead, `train.sh` and `finetune.sh` generate them automatically: - -1. Copy `uncoupled-atmos/ace-train-config.yaml` and - `uncoupled-ocean/ace-train-config.yaml` to temp files -2. Remap stats paths (`statsdata` -> `atmos_stats` / `ocean_stats`) -3. Strip training-specific fields (`loss`, `parameter_init`, etc.) -4. Extract `sea_ice_fraction_name` from the ocean corrector config -5. Merge both steppers into the template (template values win on conflict) -6. Set `ocean_fraction_prediction.sea_ice_fraction_name` - -This requires **yq >= 4** (`brew install yq` or `pip install yq`). - -## How to use - -1. **Train uncoupled models** -- run `uncoupled-atmos/run-ace-train.sh` and - `uncoupled-ocean/run-ace-train.sh`. When complete, find the Beaker result - dataset ID for each job. - -2. **Update `train.sh`** -- set `EXISTING_RESULTS_ATMOS_DATASET` and - `EXISTING_RESULTS_OCEAN_DATASET` to the dataset IDs from step 1. - -3. **Run coupled training** -- run `train.sh`. This generates - `coupled-train-config.yaml` and submits the job. - -4. **(Optional) Fine-tune** -- set `EXISTING_RESULTS_DATASET` in - `finetune.sh` to the dataset ID from coupled training, then run it. - -5. 
**Evaluate** -- set `EXISTING_RESULTS_DATASET` in `evaluate.sh` to the - dataset ID from coupled training (or fine-tuning), then run it. - -6. **(Optional) Evaluate uncoupled models** -- set `EXISTING_RESULTS_DATASET` - in the uncoupled evaluator scripts and run them. - -## Key model details - -- **Atmosphere**: SphericalFourierNeuralOperatorNet (SFNO), embed_dim=384, - 8 layers, 6h timestep, 8-level vertical discretization -- **Ocean**: Samudra CNN, ch_width=[200,250,300,400], 5-day timestep, - 19 depth levels for temperature/salinity/velocity -- **Coupled FTO**: 20 epochs, 4 coupled steps, atmosphere frozen, ocean - trained with MSE loss -- **Coupled fine-tuning**: 20 epochs, lr=1e-5 with cosine annealing, - both models trained, loads from coupled checkpoint -- **Data**: CM4 piControl 200-year simulation, train years 151-306, - validation years 306-311, evaluation from year 311 +| `uncoupled-atmos/train-config.yaml` | ACE2 atmosphere model pretraining config | +| `uncoupled-atmos/evaluator-config.yaml` | ACE2 evaluation (58,300 steps = ~40 years at 6h) | +| `uncoupled-ocean/train-config.yaml` | SamudraI ocean model pretraining config | +| `uncoupled-ocean/evaluator-config.yaml` | SamudraI evaluation (2,920 steps = ~40 years at 5-day) | +| `train-config-template.yaml` | SamudrACE stage 1 training config template | +| `finetune-config-template.yaml` | SamudrACE stage 2 training config template | +| `evaluator-config-ICx1.yaml` | SamudrACE evaluation from a single initial condition (year 311) | +| `train.sh` | Generates `train-config.yaml` and submits SamudrACE stage 1 training | +| `finetune.sh` | Generates `finetune-config.yaml` and submits SamudrACE stage 2 training | +| `evaluate.sh` | SamudrACE evaluation | From 936175d2c6a78c5639c9a4be155a3056a42647a7 Mon Sep 17 00:00:00 2001 From: James Duncan Date: Wed, 8 Apr 2026 13:45:54 -0700 Subject: [PATCH 5/9] Fix uncoupled atmos train config --- .../coupled/cm4-piControl/uncoupled-atmos/train-config.yaml | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml index a03bf034f..2c0919f00 100644 --- a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml +++ b/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml @@ -22,7 +22,7 @@ inference: dataset: merge: - data_path: /climate-default - file_pattern: 2025-07-11-cm4-piControl-200yr-coupled-threshold-sst_from_ts-atmosphere.zarr + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr engine: zarr - data_path: /climate-default file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr @@ -43,7 +43,7 @@ train_loader: dataset: merge: - data_path: /climate-default - file_pattern: 2025-07-11-cm4-piControl-200yr-coupled-threshold-sst_from_ts-atmosphere.zarr + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr engine: zarr subset: stop_time: '0306-01-01T06:00:00' @@ -59,7 +59,7 @@ validation_loader: dataset: merge: - data_path: /climate-default - file_pattern: 2025-07-11-cm4-piControl-200yr-coupled-threshold-sst_from_ts-atmosphere.zarr + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr engine: zarr subset: start_time: '0306-01-01T06:00:00' From 4463ae8bca74f5822278a7de17d4b567a8213e4e Mon Sep 17 00:00:00 2001 From: Elynn Wu Date: Wed, 8 Apr 2026 14:42:03 -0700 Subject: [PATCH 6/9] delete old configs --- configs/baselines/cm4-atmosphere/README.md | 6 - .../cm4-atmosphere/ace-evaluator-config.yaml | 24 -- .../cm4-atmosphere/ace-train-config.yaml | 256 ------------- .../cm4-atmosphere/run-ace-evaluator.sh | 43 --- .../baselines/cm4-atmosphere/run-ace-train.sh | 44 --- .../cm4-ocean/ace-evaluator-config.yaml | 31 -- .../baselines/cm4-ocean/ace-train-config.yaml | 339 ------------------ 
.../baselines/cm4-ocean/run-ace-evaluator.sh | 71 ---- configs/baselines/cm4-ocean/run-ace-train.sh | 61 ---- .../coupled-inference-no-target/config.yaml | 33 -- .../run-coupled-inference.sh | 43 --- 11 files changed, 951 deletions(-) delete mode 100644 configs/baselines/cm4-atmosphere/README.md delete mode 100644 configs/baselines/cm4-atmosphere/ace-evaluator-config.yaml delete mode 100644 configs/baselines/cm4-atmosphere/ace-train-config.yaml delete mode 100755 configs/baselines/cm4-atmosphere/run-ace-evaluator.sh delete mode 100755 configs/baselines/cm4-atmosphere/run-ace-train.sh delete mode 100644 configs/baselines/cm4-ocean/ace-evaluator-config.yaml delete mode 100644 configs/baselines/cm4-ocean/ace-train-config.yaml delete mode 100644 configs/baselines/cm4-ocean/run-ace-evaluator.sh delete mode 100644 configs/baselines/cm4-ocean/run-ace-train.sh delete mode 100644 configs/baselines/cm4/coupled-inference-no-target/config.yaml delete mode 100755 configs/baselines/cm4/coupled-inference-no-target/run-coupled-inference.sh diff --git a/configs/baselines/cm4-atmosphere/README.md b/configs/baselines/cm4-atmosphere/README.md deleted file mode 100644 index c855afb7b..000000000 --- a/configs/baselines/cm4-atmosphere/README.md +++ /dev/null @@ -1,6 +0,0 @@ -### CM4-atmosphere training and evaluation baseline configuration - -Update 2025-07-24: Configurations were updated for climate skill, consistency with other datasets, and training throughput performance based on the following studies: - -- [Throughput](https://docs.google.com/presentation/d/1m4AuXaAUIVbD61WMduWuOf9xsXVezBKdBj1d_6XThK4/edit?slide=id.g36f849ac55b_1_2#slide=id.g36f849ac55b_1_2) -- [Model skill](https://docs.google.com/presentation/d/1tzPKqRfSo0QBk4-nm059jheQ3GZ5txe6Om5ccUJ51xg/edit?slide=id.g370d36c5b09_0_8#slide=id.g370d36c5b09_0_8) \ No newline at end of file diff --git a/configs/baselines/cm4-atmosphere/ace-evaluator-config.yaml b/configs/baselines/cm4-atmosphere/ace-evaluator-config.yaml deleted 
file mode 100644 index ca5f9cf63..000000000 --- a/configs/baselines/cm4-atmosphere/ace-evaluator-config.yaml +++ /dev/null @@ -1,24 +0,0 @@ -experiment_dir: /results -n_forward_steps: 58300 -forward_steps_in_memory: 50 -checkpoint_path: /ckpt.tar -logging: - log_to_screen: true - log_to_wandb: true - log_to_file: true - project: ace - entity: ai2cm -loader: - start_indices: - times: - - '0311-01-01T06:00:00' - dataset: - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - num_data_workers: 4 -aggregator: - log_zonal_mean_images: false -data_writer: - save_prediction_files: false - save_monthly_files: false diff --git a/configs/baselines/cm4-atmosphere/ace-train-config.yaml b/configs/baselines/cm4-atmosphere/ace-train-config.yaml deleted file mode 100644 index be5c1261a..000000000 --- a/configs/baselines/cm4-atmosphere/ace-train-config.yaml +++ /dev/null @@ -1,256 +0,0 @@ -experiment_dir: /results -save_checkpoint: true -validate_using_ema: true -ema: - decay: 0.999 -max_epochs: 50 # this is about equivalent to the amount of trainign for ACE2-ERA5/ACE2-SHiELD -n_forward_steps: 2 -inference: - n_forward_steps: 7300 # 5 years - forward_steps_in_memory: 50 - loader: - start_indices: - times: - # total of 120 years with 5yr rollouts - - "0301-01-01T06:00:00" - - "0301-03-01T00:00:00" - - "0301-04-29T00:00:00" - - "0301-06-26T18:00:00" - - "0301-08-24T18:00:00" - - "0301-10-22T12:00:00" - - "0301-12-20T12:00:00" - - "0302-02-17T06:00:00" - - "0302-04-17T06:00:00" - - "0302-06-15T00:00:00" - - "0302-08-13T00:00:00" - - "0302-10-10T18:00:00" - - "0302-12-08T18:00:00" - - "0303-02-05T12:00:00" - - "0303-04-05T12:00:00" - - "0303-06-03T06:00:00" - - "0303-08-01T06:00:00" - - "0303-09-29T00:00:00" - - "0303-11-27T00:00:00" - - "0304-01-24T18:00:00" - - "0304-03-24T18:00:00" - - "0304-05-22T12:00:00" - - "0304-07-20T12:00:00" - - "0304-09-17T06:00:00" - - "0304-11-15T06:00:00" - - "0305-01-13T00:00:00" - 
- "0305-03-13T00:00:00" - - "0305-05-10T18:00:00" - - "0305-07-08T18:00:00" - - "0305-09-05T12:00:00" - - "0305-11-03T12:00:00" - - "0306-01-01T06:00:00" - dataset: - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - num_data_workers: 4 - aggregator: - log_histograms: true - time_mean_reference_data: /statsdata/time-mean.nc -logging: - log_to_screen: true - log_to_wandb: true - log_to_file: true - project: ace2-cm4 - entity: ai2cm -train_loader: - batch_size: 16 - num_data_workers: 16 - prefetch_factor: 4 - dataset: - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - subset: - stop_time: '0301-01-01T06:00:00' -validation_loader: - batch_size: 64 - num_data_workers: 16 - prefetch_factor: 4 - dataset: - data_path: /climate-default - file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr - engine: zarr - subset: - start_time: '0301-01-01T06:00:00' - stop_time: '0311-01-01T06:00:00' -optimization: - use_gradient_accumulation: true - enable_automatic_mixed_precision: false - lr: 0.0001 - optimizer_type: AdamW - kwargs: - fused: true - weight_decay: 0.01 -stepper_training: - loss: - type: MSE - weights: - air_temperature_0: 0.5 - air_temperature_1: 0.5 - eastward_wind_0: 0.5 - northward_wind_0: 0.5 - specific_total_water_0: 0.5 - specific_total_water_1: 0.25 - specific_total_water_2: 0.5 - PRATEsfc: 0.5 - h500: 10 - TMP850: 5 - Q2m: 0.5 - DLWRFsfc: 2 - ULWRFsfc: 5 - USWRFsfc: 2 - DSWRFsfc: 2 - USWRFtoa: 2 - tendency_of_total_water_path_due_to_advection: 0.5 -stepper: - step: - type: single_module - config: - builder: - type: NoiseConditionedSFNO - config: - embed_dim: 384 - noise_embed_dim: 0 - filter_type: linear - use_mlp: true - num_layers: 8 - operator_type: dhconv - separable: false - spectral_layers: 3 - spectral_transform: sht - normalization: - network: - global_means_path: 
/statsdata/centering.nc - global_stds_path: /statsdata/scaling-full-field.nc - residual: - global_means_path: /statsdata/centering.nc - global_stds_path: /statsdata/scaling-residual.nc - ocean: - surface_temperature_name: surface_temperature - ocean_fraction_name: ocean_fraction - corrector: - conserve_dry_air: true - moisture_budget_correction: advection_and_precipitation - force_positive_names: - - specific_total_water_0 - - specific_total_water_1 - - specific_total_water_2 - - specific_total_water_3 - - specific_total_water_4 - - specific_total_water_5 - - specific_total_water_6 - - specific_total_water_7 - - Q2m - - PRATEsfc - - ULWRFsfc - - ULWRFtoa - - DLWRFsfc - - DSWRFsfc - - USWRFsfc - - USWRFtoa - next_step_forcing_names: - - DSWRFtoa - in_names: - - land_fraction - - ocean_fraction - - sea_ice_fraction - - DSWRFtoa - - HGTsfc - - PRESsfc - - surface_temperature - - TMP2m - - Q2m - - UGRD10m - - VGRD10m - - air_temperature_0 - - air_temperature_1 - - air_temperature_2 - - air_temperature_3 - - air_temperature_4 - - air_temperature_5 - - air_temperature_6 - - air_temperature_7 - - specific_total_water_0 - - specific_total_water_1 - - specific_total_water_2 - - specific_total_water_3 - - specific_total_water_4 - - specific_total_water_5 - - specific_total_water_6 - - specific_total_water_7 - - eastward_wind_0 - - eastward_wind_1 - - eastward_wind_2 - - eastward_wind_3 - - eastward_wind_4 - - eastward_wind_5 - - eastward_wind_6 - - eastward_wind_7 - - northward_wind_0 - - northward_wind_1 - - northward_wind_2 - - northward_wind_3 - - northward_wind_4 - - northward_wind_5 - - northward_wind_6 - - northward_wind_7 - out_names: - - PRESsfc - - surface_temperature - - TMP2m - - Q2m - - UGRD10m - - VGRD10m - - air_temperature_0 - - air_temperature_1 - - air_temperature_2 - - air_temperature_3 - - air_temperature_4 - - air_temperature_5 - - air_temperature_6 - - air_temperature_7 - - specific_total_water_0 - - specific_total_water_1 - - specific_total_water_2 - - 
specific_total_water_3 - - specific_total_water_4 - - specific_total_water_5 - - specific_total_water_6 - - specific_total_water_7 - - eastward_wind_0 - - eastward_wind_1 - - eastward_wind_2 - - eastward_wind_3 - - eastward_wind_4 - - eastward_wind_5 - - eastward_wind_6 - - eastward_wind_7 - - northward_wind_0 - - northward_wind_1 - - northward_wind_2 - - northward_wind_3 - - northward_wind_4 - - northward_wind_5 - - northward_wind_6 - - northward_wind_7 - - LHTFLsfc - - SHTFLsfc - - PRATEsfc - - ULWRFsfc - - ULWRFtoa - - DLWRFsfc - - DSWRFsfc - - USWRFsfc - - USWRFtoa - - tendency_of_total_water_path_due_to_advection - - TMP850 - - h500 - # ocean coupling variables (diagnostic for now) - - eastward_surface_wind_stress - - northward_surface_wind_stress \ No newline at end of file diff --git a/configs/baselines/cm4-atmosphere/run-ace-evaluator.sh b/configs/baselines/cm4-atmosphere/run-ace-evaluator.sh deleted file mode 100755 index adfd730a6..000000000 --- a/configs/baselines/cm4-atmosphere/run-ace-evaluator.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -set -e - -JOB_NAME="ace2-cm4-atmosphere-evaluator" -JOB_GROUP="ace2-cm4-atmosphere" -EXISTING_RESULTS_DATASET="01K0JF7H54WVDF5FGSAFAP04GJ" # this contains the checkpoint to use for inference -CONFIG_FILENAME="ace-evaluator-config.yaml" -SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository -CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" - # since we use a service account API key for wandb, we use the beaker username to set the wandb username -BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') -REPO_ROOT=$(git rev-parse --show-toplevel) - -cd $REPO_ROOT # so config path is valid no matter where we are running this script - -python -m fme.ace.validate_config --config_type evaluator $CONFIG_PATH - -gantry run \ - --name $JOB_NAME \ - --task-name $JOB_NAME \ - --description 'Run ACE evaluator for CM4 atmosphere data' \ - --beaker-image "$(cat 
$REPO_ROOT/latest_deps_only_image.txt)" \ - --workspace ai2/ace \ - --priority normal \ - --not-preemptible \ - --cluster ai2/saturn-cirrascale \ - --cluster ai2/ceres-cirrascale \ - --env WANDB_USERNAME=$BEAKER_USERNAME \ - --env WANDB_NAME=$JOB_NAME \ - --env WANDB_JOB_TYPE=inference \ - --env WANDB_RUN_GROUP=$JOB_GROUP \ - --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ - --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ - --dataset-secret google-credentials:/tmp/google_application_credentials.json \ - --dataset $EXISTING_RESULTS_DATASET:training_checkpoints/best_inference_ckpt.tar:/ckpt.tar \ - --gpus 1 \ - --shared-memory 50GiB \ - --weka climate-default:/climate-default \ - --budget ai2/climate \ - --system-python \ - --install "pip install --no-deps ." \ - -- python -I -m fme.ace.evaluator $CONFIG_PATH diff --git a/configs/baselines/cm4-atmosphere/run-ace-train.sh b/configs/baselines/cm4-atmosphere/run-ace-train.sh deleted file mode 100755 index d08f948c6..000000000 --- a/configs/baselines/cm4-atmosphere/run-ace-train.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -set -e - -JOB_NAME="ace2-cm4-atmosphere-train" -JOB_GROUP="ace2-cm4-atmosphere" -CONFIG_FILENAME="ace-train-config.yaml" -SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository -CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" - # since we use a service account API key for wandb, we use the beaker username to set the wandb username -BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') -REPO_ROOT=$(git rev-parse --show-toplevel) -N_GPUS=8 - -cd $REPO_ROOT # so config path is valid no matter where we are running this script - -python -m fme.ace.validate_config --config_type train $CONFIG_PATH - -gantry run \ - --name $JOB_NAME \ - --task-name $JOB_NAME \ - --description 'Run ACE training for CM4 atmosphere data' \ - --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ - --workspace ai2/ace \ - --priority 
normal \ - --preemptible \ - --cluster ai2/ceres-cirrascale \ - --cluster ai2/saturn-cirrascale \ - --cluster ai2/titan-cirrascale \ - --env WANDB_USERNAME=$BEAKER_USERNAME \ - --env WANDB_NAME=$JOB_NAME \ - --env WANDB_JOB_TYPE=training \ - --env WANDB_RUN_GROUP=$JOB_GROUP \ - --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ - --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ - --dataset-secret google-credentials:/tmp/google_application_credentials.json \ - --dataset jamesd/2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr-stats:/statsdata \ - --gpus $N_GPUS \ - --shared-memory 400GiB \ - --weka climate-default:/climate-default \ - --budget ai2/climate \ - --system-python \ - --install "pip install --no-deps ." \ - -- torchrun --nproc_per_node $N_GPUS -m fme.ace.train $CONFIG_PATH diff --git a/configs/baselines/cm4-ocean/ace-evaluator-config.yaml b/configs/baselines/cm4-ocean/ace-evaluator-config.yaml deleted file mode 100644 index e5205f0d3..000000000 --- a/configs/baselines/cm4-ocean/ace-evaluator-config.yaml +++ /dev/null @@ -1,31 +0,0 @@ -experiment_dir: /results -n_forward_steps: 730 -forward_steps_in_memory: 40 -checkpoint_path: /ckpt.tar -logging: - log_to_screen: true - log_to_wandb: true - log_to_file: true - project: ace - entity: ai2cm -loader: - start_indices: - times: - - '0311-01-03T12:00:00' - dataset: - merge: - - data_path: /climate-default - file_pattern: 2025-08-22-cm4-piControl-200yr-coupled-ocean.zarr - engine: zarr - - data_path: /climate-default - file_pattern: 2025-08-07-cm4-piControl-200yr-ocean.zarr - engine: zarr - - data_path: /climate-default - file_pattern: 2025-08-07-cm4-piControl-200yr-5daily-sfc-flux.zarr - engine: zarr - num_data_workers: 1 -data_writer: - save_prediction_files: false - save_monthly_files: true -aggregator: - log_histograms: true diff --git a/configs/baselines/cm4-ocean/ace-train-config.yaml b/configs/baselines/cm4-ocean/ace-train-config.yaml deleted file mode 100644 index 
c0495d2ac..000000000 --- a/configs/baselines/cm4-ocean/ace-train-config.yaml +++ /dev/null @@ -1,339 +0,0 @@ -experiment_dir: /results -save_checkpoint: true -validate_using_ema: true -ema: - decay: 0.999 -max_epochs: 50 -n_forward_steps: 4 -inference: - n_forward_steps: 1440 - forward_steps_in_memory: 40 - loader: - start_indices: - times: - - '0151-01-03T12:00:00' - - '0171-01-03T12:00:00' - - '0191-01-03T12:00:00' - - '0211-01-03T12:00:00' - - '0231-01-03T12:00:00' - - '0251-01-03T12:00:00' - - '0271-01-03T12:00:00' - - '0291-01-03T12:00:00' - dataset: - merge: - - data_path: /climate-default - file_pattern: 2025-08-22-cm4-piControl-200yr-coupled-ocean.zarr - engine: zarr - - data_path: /climate-default - file_pattern: 2025-08-07-cm4-piControl-200yr-ocean.zarr - engine: zarr - - data_path: /climate-default - file_pattern: 2025-08-07-cm4-piControl-200yr-5daily-sfc-flux.zarr - engine: zarr - num_data_workers: 8 - aggregator: - log_histograms: false -logging: - log_to_screen: true - log_to_wandb: true - log_to_file: true - project: ace-samudra-cm4 - entity: ai2cm -train_loader: - batch_size: 16 - num_data_workers: 8 - dataset: - merge: - - data_path: /climate-default - file_pattern: 2025-08-22-cm4-piControl-200yr-coupled-ocean.zarr - engine: zarr - subset: - stop_time: '0306-01-01' - - data_path: /climate-default - file_pattern: 2025-08-07-cm4-piControl-200yr-ocean.zarr - engine: zarr - subset: - stop_time: '0306-01-01' - - data_path: /climate-default - file_pattern: 2025-08-07-cm4-piControl-200yr-5daily-sfc-flux.zarr - engine: zarr - subset: - stop_time: '0306-01-01' -validation_loader: - batch_size: 16 - num_data_workers: 8 - dataset: - merge: - - data_path: /climate-default - file_pattern: 2025-08-22-cm4-piControl-200yr-coupled-ocean.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' - - data_path: /climate-default - file_pattern: 2025-08-07-cm4-piControl-200yr-ocean.zarr - engine: zarr - subset: - start_time: '0306-01-01' - 
stop_time: '0311-01-01' - - data_path: /climate-default - file_pattern: 2025-08-07-cm4-piControl-200yr-5daily-sfc-flux.zarr - engine: zarr - subset: - start_time: '0306-01-01' - stop_time: '0311-01-01' -optimization: - use_gradient_accumulation: true - enable_automatic_mixed_precision: false - lr: 0.0001 - optimizer_type: AdamW - kwargs: - fused: true - weight_decay: 0.01 -stepper_training: - loss: - type: MSE -stepper: - input_masking: - mask_value: 0 - fill_value: 0.0 - exclude_names_and_prefixes: - - land_fraction - - sea_surface_fraction - step: - type: single_module - config: - builder: - type: Samudra - config: - ch_width: - - 200 - - 250 - - 300 - - 400 - dilation: - - 1 - - 2 - - 4 - - 8 - n_layers: - - 1 - - 1 - - 1 - - 1 - norm: instance - normalization: - network: - global_means_path: /statsdata/centering.nc - global_stds_path: /statsdata/scaling-full-field.nc - corrector: - type: ocean_corrector - config: - force_positive_names: - - so_0 - - so_1 - - so_2 - - so_3 - - so_4 - - so_5 - - so_6 - - so_7 - - so_8 - - so_9 - - so_10 - - so_11 - - so_12 - - so_13 - - so_14 - - so_15 - - so_16 - - so_17 - - so_18 - - HI - sea_ice_fraction_correction: - sea_ice_fraction_name: ocean_sea_ice_fraction - land_fraction_name: land_fraction - remove_negative_ocean_fraction: false - zero_where_ice_free_names: - - HI - next_step_forcing_names: - - DLWRFsfc - - DSWRFsfc - - LHTFLsfc - - SHTFLsfc - - PRATEsfc - - eastward_surface_wind_stress - - northward_surface_wind_stress - in_names: - - DLWRFsfc - - DSWRFsfc - - LHTFLsfc - - SHTFLsfc - - PRATEsfc - - eastward_surface_wind_stress - - northward_surface_wind_stress - - land_fraction - - sea_surface_fraction - - deptho - - hfgeou - - sst - - zos - - so_0 - - so_1 - - so_2 - - so_3 - - so_4 - - so_5 - - so_6 - - so_7 - - so_8 - - so_9 - - so_10 - - so_11 - - so_12 - - so_13 - - so_14 - - so_15 - - so_16 - - so_17 - - so_18 - - thetao_0 - - thetao_1 - - thetao_2 - - thetao_3 - - thetao_4 - - thetao_5 - - thetao_6 - - 
thetao_7 - - thetao_8 - - thetao_9 - - thetao_10 - - thetao_11 - - thetao_12 - - thetao_13 - - thetao_14 - - thetao_15 - - thetao_16 - - thetao_17 - - thetao_18 - - uo_0 - - uo_1 - - uo_2 - - uo_3 - - uo_4 - - uo_5 - - uo_6 - - uo_7 - - uo_8 - - uo_9 - - uo_10 - - uo_11 - - uo_12 - - uo_13 - - uo_14 - - uo_15 - - uo_16 - - uo_17 - - uo_18 - - vo_0 - - vo_1 - - vo_2 - - vo_3 - - vo_4 - - vo_5 - - vo_6 - - vo_7 - - vo_8 - - vo_9 - - vo_10 - - vo_11 - - vo_12 - - vo_13 - - vo_14 - - vo_15 - - vo_16 - - vo_17 - - vo_18 - - ocean_sea_ice_fraction - - HI - out_names: - - sst - - zos - - so_0 - - so_1 - - so_2 - - so_3 - - so_4 - - so_5 - - so_6 - - so_7 - - so_8 - - so_9 - - so_10 - - so_11 - - so_12 - - so_13 - - so_14 - - so_15 - - so_16 - - so_17 - - so_18 - - thetao_0 - - thetao_1 - - thetao_2 - - thetao_3 - - thetao_4 - - thetao_5 - - thetao_6 - - thetao_7 - - thetao_8 - - thetao_9 - - thetao_10 - - thetao_11 - - thetao_12 - - thetao_13 - - thetao_14 - - thetao_15 - - thetao_16 - - thetao_17 - - thetao_18 - - uo_0 - - uo_1 - - uo_2 - - uo_3 - - uo_4 - - uo_5 - - uo_6 - - uo_7 - - uo_8 - - uo_9 - - uo_10 - - uo_11 - - uo_12 - - uo_13 - - uo_14 - - uo_15 - - uo_16 - - uo_17 - - uo_18 - - vo_0 - - vo_1 - - vo_2 - - vo_3 - - vo_4 - - vo_5 - - vo_6 - - vo_7 - - vo_8 - - vo_9 - - vo_10 - - vo_11 - - vo_12 - - vo_13 - - vo_14 - - vo_15 - - vo_16 - - vo_17 - - vo_18 - - ocean_sea_ice_fraction - - HI diff --git a/configs/baselines/cm4-ocean/run-ace-evaluator.sh b/configs/baselines/cm4-ocean/run-ace-evaluator.sh deleted file mode 100644 index 707d78331..000000000 --- a/configs/baselines/cm4-ocean/run-ace-evaluator.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -set -e - -CONFIG_FILENAME="ace-evaluator-config.yaml" -SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository -CONFIG_PATH=$SCRIPT_PATH/$CONFIG_FILENAME - # since we use a service account API key for wandb, we use the beaker username to set the wandb username -BEAKER_USERNAME=$(beaker 
account whoami --format=json | jq -r '.[0].name') -WANDB_USERNAME=${WANDB_USERNAME:-${BEAKER_USERNAME}} -REPO_ROOT=$(git rev-parse --show-toplevel) - -cd $REPO_ROOT # so config path is valid no matter where we are running this script - -python -m fme.ace.validate_config --config_type evaluator $CONFIG_PATH - -while read TRAIN_EXPER; do - JOB_GROUP=$(echo "$TRAIN_EXPER" | cut -d'|' -f1) - RS=$(echo "$TRAIN_EXPER" | cut -d'|' -f2) - EXPER_ID=$(echo "$TRAIN_EXPER" | cut -d'|' -f3) - STATUS=$(echo "$TRAIN_EXPER" | cut -d'|' -f4) - CKPT=$(echo "$TRAIN_EXPER" | cut -d"|" -f5) - OVERRIDE_ARGS=$(echo "$TRAIN_EXPER" | cut -d"|" -f6) - if [[ "$STATUS" == "training" ]] || [[ "$STATUS" == "skip" ]]; then - continue - fi - - EXISTING_RESULTS_DATASET=$(beaker experiment get $EXPER_ID --format json | jq '.[].jobs[-1].result' | grep "beaker" | cut -d'"' -f4) - echo - echo "Launching evaluator job:" - echo " - Group: ${JOB_GROUP}" - echo " - Random seed iteration: ${RS}" - echo " - Checkpoint: ${CKPT}" - echo " - Training experiment ID: ${EXPER_ID}" - echo " - Training results dataset ID: ${EXISTING_RESULTS_DATASET}" - echo " - --override args: ${OVERRIDE_ARGS}" - - echo - python -m fme.ace.validate_config --config_type evaluator $CONFIG_PATH --override $OVERRIDE_ARGS - echo - - JOB_NAME="${JOB_GROUP}-evaluator_${CKPT}-rs${RS}" - DESCRIPTION="ACE-Samudra CM4 baseline evaluator of RS${RS} ${CKPT}" - gantry run \ - --name "${JOB_NAME}" \ - --description "${DESCRIPTION}" \ - --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ - --workspace ai2/ace \ - --priority high \ - --not-preemptible \ - --cluster ai2/ceres \ - --cluster ai2/jupiter \ - --cluster ai2/neptune \ - --cluster ai2/saturn \ - --env WANDB_USERNAME=$WANDB_USERNAME \ - --env WANDB_NAME=$JOB_NAME \ - --env WANDB_JOB_TYPE=inference \ - --env WANDB_RUN_GROUP=$JOB_GROUP \ - --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ - --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ - 
--dataset-secret google-credentials:/tmp/google_application_credentials.json \ - --dataset "${EXISTING_RESULTS_DATASET}:training_checkpoints/${CKPT}.tar:/ckpt.tar" \ - --gpus 1 \ - --shared-memory 20GiB \ - --weka climate-default:/climate-default \ - --budget ai2/climate \ - --system-python \ - --install "pip install --no-deps ." \ - -- python -I -m fme.ace.evaluator $CONFIG_PATH --override $OVERRIDE_ARGS - echo -done <"${SCRIPT_PATH}/experiments.txt" diff --git a/configs/baselines/cm4-ocean/run-ace-train.sh b/configs/baselines/cm4-ocean/run-ace-train.sh deleted file mode 100644 index 78a738caf..000000000 --- a/configs/baselines/cm4-ocean/run-ace-train.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -set -e - -CONFIG_FILENAME="ace-train-config.yaml" -SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository -CONFIG_PATH=$SCRIPT_PATH/$CONFIG_FILENAME - # since we use a service account API key for wandb, we use the beaker username to set the wandb username -BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') -REPO_ROOT=$(git rev-parse --show-toplevel) -N_GPUS=4 -PRIORITY="high" -WORKSPACE="ai2/ace" - -cd $REPO_ROOT # so config path is valid no matter where we are running this script - -JOB_GROUP="2025-08-28-baseline" # update when training a new baseline -RS=1 -JOB_NAME="${JOB_GROUP}-rs${RS}-train" -echo "Job name: ${JOB_NAME}" - -OVERRIDE_ARGS= # add group-specific overrides here, e.g. lr, max_epochs, etc. 
-STATS_DATA=jamesd/2025-08-22-cm4-piControl-200yr-coupled-stats-ocean - -python -m fme.ace.validate_config --config_type train $CONFIG_PATH - -EXPERIMENT_ID=$( - gantry run $ALLOW_DIRTY \ - --name $JOB_NAME \ - --description "ACE-Saumdra CM4 baseline training RS${RS}" \ - --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ - --workspace $WORKSPACE \ - --priority $PRIORITY \ - --preemptible \ - --cluster ai2/ceres \ - --cluster ai2/jupiter \ - --cluster ai2/neptune \ - --cluster ai2/saturn \ - --env WANDB_USERNAME=$BEAKER_USERNAME \ - --env WANDB_NAME=$JOB_NAME \ - --env WANDB_JOB_TYPE=training \ - --env WANDB_RUN_GROUP=$JOB_GROUP \ - --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ - --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ - --dataset-secret google-credentials:/tmp/google_application_credentials.json \ - --dataset $STATS_DATA:/statsdata \ - --gpus $N_GPUS \ - --shared-memory 400GiB \ - --weka climate-default:/climate-default \ - --budget ai2/climate \ - --system-python \ - --install "pip install --no-deps ." 
\ - -- torchrun --nproc_per_node $N_GPUS -m fme.ace.train $CONFIG_PATH --override $OVERRIDE_ARGS | - tee /dev/tty | - grep beaker.org | - cut -d/ -f5 -) -# remove or change 'training' once completed in order to submit an evaluator job -echo "${JOB_GROUP}|${RS}|${EXPERIMENT_ID}|training|best_inference_ckpt" >> $SCRIPT_PATH/experiments.txt -echo -sleep 1 diff --git a/configs/baselines/cm4/coupled-inference-no-target/config.yaml b/configs/baselines/cm4/coupled-inference-no-target/config.yaml deleted file mode 100644 index b8de631db..000000000 --- a/configs/baselines/cm4/coupled-inference-no-target/config.yaml +++ /dev/null @@ -1,33 +0,0 @@ -experiment_dir: /results -n_coupled_steps: 140 -coupled_steps_in_memory: 20 -checkpoint_path: /ckpt.tar -data_writer: - ocean: - save_prediction_files: false - save_monthly_files: true - atmosphere: - save_prediction_files: false - save_monthly_files: true -forcing_loader: - atmosphere: - dataset: - data_path: /climate-default/2025-08-29-coupled-inference-no-target-data-example/CM4 - file_pattern: cm4-atmosphere-forcing-1yr.zarr - engine: zarr - n_repeats: 3 - num_data_workers: 1 -initial_condition: - ocean: - path: /climate-default/2025-08-29-coupled-inference-no-target-data-example/CM4/cm4-ocean-ic.nc - atmosphere: - path: /climate-default/2025-08-29-coupled-inference-no-target-data-example/CM4/cm4-atmosphere-ic.nc - start_indices: - first: 0 - n_initial_conditions: 2 -logging: - log_to_screen: true - log_to_wandb: true - log_to_file: true - project: coupled-emulator - entity: ai2cm \ No newline at end of file diff --git a/configs/baselines/cm4/coupled-inference-no-target/run-coupled-inference.sh b/configs/baselines/cm4/coupled-inference-no-target/run-coupled-inference.sh deleted file mode 100755 index 31f7b2bc5..000000000 --- a/configs/baselines/cm4/coupled-inference-no-target/run-coupled-inference.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -set -e - 
-JOB_NAME="ace-samudra-coupled-CM4-inference-no-target-with-dummy-forcing-dataset" -JOB_GROUP="inference-no-target" -EXISTING_RESULTS_DATASET="01JZHQJXC4EYAPTCSP188YSVC0" # this contains the checkpoint to use for inference -CONFIG_FILENAME="config.yaml" -SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository -CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" - # since we use a service account API key for wandb, we use the beaker username to set the wandb username -BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') -WANDB_USERNAME=${WANDB_USERNAME:-${BEAKER_USERNAME}} -REPO_ROOT=$(git rev-parse --show-toplevel) - -cd $REPO_ROOT # so config path is valid no matter where we are running this script - -python -m fme.coupled.validate_config --config_type inference $CONFIG_PATH - -gantry run \ - --name $JOB_NAME \ - --task-name $JOB_NAME \ - --description 'Run ACE-Samudra inference no target' \ - --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \ - --workspace ai2/ace \ - --priority normal \ - --not-preemptible \ - --cluster ai2/ceres \ - --env WANDB_USERNAME=$WANDB_USERNAME \ - --env WANDB_NAME=$JOB_NAME \ - --env WANDB_JOB_TYPE=inference \ - --env WANDB_RUN_GROUP=$JOB_GROUP \ - --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \ - --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \ - --dataset-secret google-credentials:/tmp/google_application_credentials.json \ - --dataset $EXISTING_RESULTS_DATASET:training_checkpoints/best_inference_ckpt.tar:/ckpt.tar \ - --gpus 1 \ - --shared-memory 50GiB \ - --weka climate-default:/climate-default \ - --budget ai2/climate \ - --system-python \ - --install "pip install --no-deps ." 
\ - -- python -I -m fme.coupled.inference $CONFIG_PATH From 4a8b5dc2a99d7d990ef7292ff8c801f07149ecc4 Mon Sep 17 00:00:00 2001 From: Elynn Wu Date: Wed, 8 Apr 2026 14:42:40 -0700 Subject: [PATCH 7/9] update dir name --- configs/baselines/{coupled => }/cm4-piControl/README.md | 0 configs/baselines/{coupled => }/cm4-piControl/evaluate.sh | 0 .../baselines/{coupled => }/cm4-piControl/evaluator-config.yaml | 0 .../{coupled => }/cm4-piControl/finetune-config-template.yaml | 0 configs/baselines/{coupled => }/cm4-piControl/finetune.sh | 0 .../{coupled => }/cm4-piControl/train-config-template.yaml | 0 configs/baselines/{coupled => }/cm4-piControl/train.sh | 0 .../{coupled => }/cm4-piControl/uncoupled-atmos/evaluate.sh | 0 .../cm4-piControl/uncoupled-atmos/evaluator-config.yaml | 0 .../{coupled => }/cm4-piControl/uncoupled-atmos/train-config.yaml | 0 .../{coupled => }/cm4-piControl/uncoupled-atmos/train.sh | 0 .../{coupled => }/cm4-piControl/uncoupled-ocean/evaluate.sh | 0 .../cm4-piControl/uncoupled-ocean/evaluator-config.yaml | 0 .../{coupled => }/cm4-piControl/uncoupled-ocean/train-config.yaml | 0 .../{coupled => }/cm4-piControl/uncoupled-ocean/train.sh | 0 15 files changed, 0 insertions(+), 0 deletions(-) rename configs/baselines/{coupled => }/cm4-piControl/README.md (100%) rename configs/baselines/{coupled => }/cm4-piControl/evaluate.sh (100%) rename configs/baselines/{coupled => }/cm4-piControl/evaluator-config.yaml (100%) rename configs/baselines/{coupled => }/cm4-piControl/finetune-config-template.yaml (100%) rename configs/baselines/{coupled => }/cm4-piControl/finetune.sh (100%) rename configs/baselines/{coupled => }/cm4-piControl/train-config-template.yaml (100%) rename configs/baselines/{coupled => }/cm4-piControl/train.sh (100%) rename configs/baselines/{coupled => }/cm4-piControl/uncoupled-atmos/evaluate.sh (100%) rename configs/baselines/{coupled => }/cm4-piControl/uncoupled-atmos/evaluator-config.yaml (100%) rename configs/baselines/{coupled => 
}/cm4-piControl/uncoupled-atmos/train-config.yaml (100%) rename configs/baselines/{coupled => }/cm4-piControl/uncoupled-atmos/train.sh (100%) rename configs/baselines/{coupled => }/cm4-piControl/uncoupled-ocean/evaluate.sh (100%) rename configs/baselines/{coupled => }/cm4-piControl/uncoupled-ocean/evaluator-config.yaml (100%) rename configs/baselines/{coupled => }/cm4-piControl/uncoupled-ocean/train-config.yaml (100%) rename configs/baselines/{coupled => }/cm4-piControl/uncoupled-ocean/train.sh (100%) diff --git a/configs/baselines/coupled/cm4-piControl/README.md b/configs/baselines/cm4-piControl/README.md similarity index 100% rename from configs/baselines/coupled/cm4-piControl/README.md rename to configs/baselines/cm4-piControl/README.md diff --git a/configs/baselines/coupled/cm4-piControl/evaluate.sh b/configs/baselines/cm4-piControl/evaluate.sh similarity index 100% rename from configs/baselines/coupled/cm4-piControl/evaluate.sh rename to configs/baselines/cm4-piControl/evaluate.sh diff --git a/configs/baselines/coupled/cm4-piControl/evaluator-config.yaml b/configs/baselines/cm4-piControl/evaluator-config.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/evaluator-config.yaml rename to configs/baselines/cm4-piControl/evaluator-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml b/configs/baselines/cm4-piControl/finetune-config-template.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml rename to configs/baselines/cm4-piControl/finetune-config-template.yaml diff --git a/configs/baselines/coupled/cm4-piControl/finetune.sh b/configs/baselines/cm4-piControl/finetune.sh similarity index 100% rename from configs/baselines/coupled/cm4-piControl/finetune.sh rename to configs/baselines/cm4-piControl/finetune.sh diff --git a/configs/baselines/coupled/cm4-piControl/train-config-template.yaml 
b/configs/baselines/cm4-piControl/train-config-template.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/train-config-template.yaml rename to configs/baselines/cm4-piControl/train-config-template.yaml diff --git a/configs/baselines/coupled/cm4-piControl/train.sh b/configs/baselines/cm4-piControl/train.sh similarity index 100% rename from configs/baselines/coupled/cm4-piControl/train.sh rename to configs/baselines/cm4-piControl/train.sh diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluate.sh b/configs/baselines/cm4-piControl/uncoupled-atmos/evaluate.sh similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluate.sh rename to configs/baselines/cm4-piControl/uncoupled-atmos/evaluate.sh diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluator-config.yaml b/configs/baselines/cm4-piControl/uncoupled-atmos/evaluator-config.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-atmos/evaluator-config.yaml rename to configs/baselines/cm4-piControl/uncoupled-atmos/evaluator-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml b/configs/baselines/cm4-piControl/uncoupled-atmos/train-config.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train-config.yaml rename to configs/baselines/cm4-piControl/uncoupled-atmos/train-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh b/configs/baselines/cm4-piControl/uncoupled-atmos/train.sh similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-atmos/train.sh rename to configs/baselines/cm4-piControl/uncoupled-atmos/train.sh diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluate.sh b/configs/baselines/cm4-piControl/uncoupled-ocean/evaluate.sh similarity index 100% rename from 
configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluate.sh rename to configs/baselines/cm4-piControl/uncoupled-ocean/evaluate.sh diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluator-config.yaml b/configs/baselines/cm4-piControl/uncoupled-ocean/evaluator-config.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-ocean/evaluator-config.yaml rename to configs/baselines/cm4-piControl/uncoupled-ocean/evaluator-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml b/configs/baselines/cm4-piControl/uncoupled-ocean/train-config.yaml similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train-config.yaml rename to configs/baselines/cm4-piControl/uncoupled-ocean/train-config.yaml diff --git a/configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh b/configs/baselines/cm4-piControl/uncoupled-ocean/train.sh similarity index 100% rename from configs/baselines/coupled/cm4-piControl/uncoupled-ocean/train.sh rename to configs/baselines/cm4-piControl/uncoupled-ocean/train.sh From 5f1b7ca7adac10a4ab445a5ef813d442657a6693 Mon Sep 17 00:00:00 2001 From: Elynn Wu Date: Wed, 8 Apr 2026 15:06:56 -0700 Subject: [PATCH 8/9] update unit test --- configs/baselines/cm4-piControl/README.md | 2 + .../cm4-piControl/finetune-config.yaml | 560 +++++++++++++++++ configs/baselines/cm4-piControl/finetune.sh | 10 +- .../baselines/cm4-piControl/train-config.yaml | 564 ++++++++++++++++++ configs/baselines/cm4-piControl/train.sh | 10 +- configs/test_configs.py | 39 +- 6 files changed, 1172 insertions(+), 13 deletions(-) create mode 100644 configs/baselines/cm4-piControl/finetune-config.yaml create mode 100644 configs/baselines/cm4-piControl/train-config.yaml diff --git a/configs/baselines/cm4-piControl/README.md b/configs/baselines/cm4-piControl/README.md index c8cf1d6ec..519292e4c 100644 --- a/configs/baselines/cm4-piControl/README.md +++ 
b/configs/baselines/cm4-piControl/README.md @@ -37,7 +37,9 @@ Coupled evaluation | `uncoupled-ocean/train-config.yaml` | SamudraI ocean model pretraining config | | `uncoupled-ocean/evaluator-config.yaml` | SamudraI evaluation (2,920 steps = ~40 years at 5-day) | | `train-config-template.yaml` | SamudrACE stage 1 training config template | +| `train-config.yaml` | Example SamudrACE stage 1 training config, generated by `train.sh` | | `finetune-config-template.yaml` | SamudrACE stage 2 training config template | +| `finetune-config.yaml` | Example SamudrACE stage 2 training config, generated by `finetune.sh` | | `evaluator-config-ICx1.yaml` | SamudrACE evaluation from a single initial condition (year 311) | | `train.sh` | Generates `train-config.yaml` and submits SamudrACE stage 1 training | | `finetune.sh` | Generates `finetune-config.yaml` and submits SamudrACE stage 2 training | diff --git a/configs/baselines/cm4-piControl/finetune-config.yaml b/configs/baselines/cm4-piControl/finetune-config.yaml new file mode 100644 index 000000000..727705d83 --- /dev/null +++ b/configs/baselines/cm4-piControl/finetune-config.yaml @@ -0,0 +1,560 @@ +experiment_dir: /results +save_checkpoint: true +validate_using_ema: true +ema: + decay: 0.999 +max_epochs: 20 +n_coupled_steps: 4 +inference: + n_coupled_steps: 1456 + coupled_steps_in_memory: 8 + loader: + num_data_workers: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + start_indices: + times: + - '0151-01-06T00:00:00' + - '0171-01-06T00:00:00' 
+ - '0191-01-06T00:00:00' + - '0211-01-06T00:00:00' + - '0231-01-06T00:00:00' + - '0251-01-06T00:00:00' + - '0271-01-06T00:00:00' + - '0291-01-06T00:00:00' + aggregator: + log_zonal_mean_images: false + log_histograms: false +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: SamudrACE-CM4-piControl + entity: ai2cm +train_loader: + batch_size: 16 + num_data_workers: 4 + prefetch_factor: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' +validation_loader: + batch_size: 16 + num_data_workers: 4 + prefetch_factor: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 
2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' +optimization: + enable_automatic_mixed_precision: false + lr: 0.00001 + optimizer_type: FusedAdam + kwargs: + weight_decay: 0.01 + use_gradient_accumulation: true + scheduler: + type: CosineAnnealingLR +stepper_training: + parameter_init: + checkpoint_path: /ckpt.tar + ocean: + parameter_init: + weights_path: null # null required + loss: + type: MSE + loss_contributions: + n_steps: 4 + weight: 1.0 + atmosphere: + parameter_init: + weights_path: null # null required + loss_contributions: + n_steps: 2 + weight: 1.0 + loss: + type: MSE + weights: + air_temperature_0: 0.5 + air_temperature_1: 0.5 + eastward_wind_0: 0.5 + northward_wind_0: 0.5 + specific_total_water_0: 0.5 + specific_total_water_1: 0.25 + specific_total_water_2: 0.5 + PRATEsfc: 0.5 + h500: 10 + TMP850: 5 + Q2m: 0.5 + DLWRFsfc: 2 + ULWRFsfc: 5 + USWRFsfc: 2 + DSWRFsfc: 2 + USWRFtoa: 2 + tendency_of_total_water_path_due_to_advection: 0.5 +stepper: + ocean: + timedelta: 5D + # stepper added by finetune.sh from uncoupled-ocean/train-config.yaml + + stepper: + input_masking: + mask_value: 0 + fill_value: 0.0 + exclude_names_and_prefixes: + - land_fraction + step: + type: single_module + config: + builder: + type: Samudra + config: + ch_width: + - 200 + - 250 + - 300 + - 400 + dilation: + - 1 + - 2 + - 4 + - 8 + n_layers: + - 1 + - 1 + - 1 + - 1 + norm: instance + normalization: + network: + global_means_path: /ocean_stats/centering.nc + global_stds_path: /ocean_stats/scaling-full-field.nc + corrector: + type: ocean_corrector + config: + force_positive_names: + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - HI + sea_ice_fraction_correction: + sea_ice_fraction_name: ocean_sea_ice_fraction + land_fraction_name: land_fraction + 
remove_negative_ocean_fraction: false + next_step_forcing_names: + - DLWRFsfc + - DSWRFsfc + - ULWRFsfc + - USWRFsfc + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - eastward_surface_wind_stress + - northward_surface_wind_stress + in_names: + - DLWRFsfc + - DSWRFsfc + - ULWRFsfc + - USWRFsfc + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - eastward_surface_wind_stress + - northward_surface_wind_stress + - land_fraction + - sst + - zos + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - thetao_0 + - thetao_1 + - thetao_2 + - thetao_3 + - thetao_4 + - thetao_5 + - thetao_6 + - thetao_7 + - thetao_8 + - thetao_9 + - thetao_10 + - thetao_11 + - thetao_12 + - thetao_13 + - thetao_14 + - thetao_15 + - thetao_16 + - thetao_17 + - thetao_18 + - uo_0 + - uo_1 + - uo_2 + - uo_3 + - uo_4 + - uo_5 + - uo_6 + - uo_7 + - uo_8 + - uo_9 + - uo_10 + - uo_11 + - uo_12 + - uo_13 + - uo_14 + - uo_15 + - uo_16 + - uo_17 + - uo_18 + - vo_0 + - vo_1 + - vo_2 + - vo_3 + - vo_4 + - vo_5 + - vo_6 + - vo_7 + - vo_8 + - vo_9 + - vo_10 + - vo_11 + - vo_12 + - vo_13 + - vo_14 + - vo_15 + - vo_16 + - vo_17 + - vo_18 + - ocean_sea_ice_fraction + - HI + out_names: + - sst + - zos + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - thetao_0 + - thetao_1 + - thetao_2 + - thetao_3 + - thetao_4 + - thetao_5 + - thetao_6 + - thetao_7 + - thetao_8 + - thetao_9 + - thetao_10 + - thetao_11 + - thetao_12 + - thetao_13 + - thetao_14 + - thetao_15 + - thetao_16 + - thetao_17 + - thetao_18 + - uo_0 + - uo_1 + - uo_2 + - uo_3 + - uo_4 + - uo_5 + - uo_6 + - uo_7 + - uo_8 + - uo_9 + - uo_10 + - uo_11 + - uo_12 + - uo_13 + - uo_14 + - uo_15 + - uo_16 + - uo_17 + - uo_18 + - vo_0 + - vo_1 + - vo_2 + - vo_3 + - vo_4 + - vo_5 + - vo_6 + - vo_7 + - vo_8 + - vo_9 + - 
vo_10 + - vo_11 + - vo_12 + - vo_13 + - vo_14 + - vo_15 + - vo_16 + - vo_17 + - vo_18 + - ocean_sea_ice_fraction + - HI + atmosphere: + timedelta: 6h + # stepper added by finetune.sh from uncoupled-atmos/train-config.yaml + + stepper: + step: + type: single_module + config: + builder: + type: SphericalFourierNeuralOperatorNet + config: + embed_dim: 384 + filter_type: linear + hard_thresholding_fraction: 1.0 + use_mlp: true + normalization_layer: instance_norm + num_layers: 8 + operator_type: dhconv + scale_factor: 1 + separable: false + spectral_layers: 3 + spectral_transform: sht + normalization: + network: + global_means_path: /atmos_stats/centering.nc + global_stds_path: /atmos_stats/scaling-full-field.nc + residual: + global_means_path: /atmos_stats/centering.nc + global_stds_path: /atmos_stats/scaling-residual.nc + ocean: + surface_temperature_name: surface_temperature + ocean_fraction_name: ocean_fraction + interpolate: false + corrector: + conserve_dry_air: true + moisture_budget_correction: advection_and_precipitation + force_positive_names: + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - specific_total_water_3 + - specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - Q2m + - PRATEsfc + - ULWRFsfc + - ULWRFtoa + - DLWRFsfc + - DSWRFsfc + - USWRFsfc + - USWRFtoa + next_step_forcing_names: + - DSWRFtoa + in_names: + - land_fraction + - ocean_fraction + - sea_ice_fraction + - DSWRFtoa + - HGTsfc + - PRESsfc + - surface_temperature + - TMP2m + - Q2m + - UGRD10m + - VGRD10m + - air_temperature_0 + - air_temperature_1 + - air_temperature_2 + - air_temperature_3 + - air_temperature_4 + - air_temperature_5 + - air_temperature_6 + - air_temperature_7 + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - specific_total_water_3 + - specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - 
eastward_wind_0 + - eastward_wind_1 + - eastward_wind_2 + - eastward_wind_3 + - eastward_wind_4 + - eastward_wind_5 + - eastward_wind_6 + - eastward_wind_7 + - northward_wind_0 + - northward_wind_1 + - northward_wind_2 + - northward_wind_3 + - northward_wind_4 + - northward_wind_5 + - northward_wind_6 + - northward_wind_7 + out_names: + - PRESsfc + - surface_temperature + - TMP2m + - Q2m + - UGRD10m + - VGRD10m + - air_temperature_0 + - air_temperature_1 + - air_temperature_2 + - air_temperature_3 + - air_temperature_4 + - air_temperature_5 + - air_temperature_6 + - air_temperature_7 + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - specific_total_water_3 + - specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - eastward_wind_0 + - eastward_wind_1 + - eastward_wind_2 + - eastward_wind_3 + - eastward_wind_4 + - eastward_wind_5 + - eastward_wind_6 + - eastward_wind_7 + - northward_wind_0 + - northward_wind_1 + - northward_wind_2 + - northward_wind_3 + - northward_wind_4 + - northward_wind_5 + - northward_wind_6 + - northward_wind_7 + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - ULWRFsfc + - ULWRFtoa + - DLWRFsfc + - DSWRFsfc + - USWRFsfc + - USWRFtoa + - tendency_of_total_water_path_due_to_advection + - TMP850 + - h500 + # ocean coupling variables (diagnostic for now) + - eastward_surface_wind_stress + - northward_surface_wind_stress diff --git a/configs/baselines/cm4-piControl/finetune.sh b/configs/baselines/cm4-piControl/finetune.sh index 08185b69f..171d557b0 100644 --- a/configs/baselines/cm4-piControl/finetune.sh +++ b/configs/baselines/cm4-piControl/finetune.sh @@ -11,10 +11,10 @@ JOB_GROUP="cm4-piControl-coupled" EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID from coupled training (train.sh) CKPT_TYPE="best_inference_ckpt" -SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository -SCRIPT_PATH=${SCRIPT_PATH%/} -BEAKER_USERNAME=$(beaker account 
whoami --format=json | jq -r '.[0].name') REPO_ROOT=$(git rev-parse --show-toplevel) +SCRIPT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SCRIPT_PATH=${SCRIPT_PATH#$REPO_ROOT/} +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') N_GPUS=4 ATMOS_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-atmosphere @@ -28,10 +28,10 @@ TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}/finetune-config-template.yaml" CONFIG_PATH="${SCRIPT_PATH}/finetune-config.yaml" cp "${SCRIPT_PATH}/uncoupled-atmos/train-config.yaml" ./atmos-config.yaml -sed -i 's/statsdata/atmos_stats/g' ./atmos-config.yaml +sed -i'' -e 's/statsdata/atmos_stats/g' ./atmos-config.yaml cp "${SCRIPT_PATH}/uncoupled-ocean/train-config.yaml" ./ocean-config.yaml -sed -i 's/statsdata/ocean_stats/g' ./ocean-config.yaml +sed -i'' -e 's/statsdata/ocean_stats/g' ./ocean-config.yaml cp "$TEMPLATE_CONFIG_PATH" "$CONFIG_PATH" diff --git a/configs/baselines/cm4-piControl/train-config.yaml b/configs/baselines/cm4-piControl/train-config.yaml new file mode 100644 index 000000000..999cf08be --- /dev/null +++ b/configs/baselines/cm4-piControl/train-config.yaml @@ -0,0 +1,564 @@ +experiment_dir: /results +save_checkpoint: true +validate_using_ema: true +ema: + decay: 0.999 +max_epochs: 20 +n_coupled_steps: 4 +inference: + n_coupled_steps: 1456 + coupled_steps_in_memory: 8 + loader: + num_data_workers: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + start_indices: + times: + - '0151-01-06T00:00:00' + - 
'0171-01-06T00:00:00' + - '0191-01-06T00:00:00' + - '0211-01-06T00:00:00' + - '0231-01-06T00:00:00' + - '0251-01-06T00:00:00' + - '0271-01-06T00:00:00' + - '0291-01-06T00:00:00' + aggregator: + log_zonal_mean_images: false + log_histograms: false +logging: + log_to_screen: true + log_to_wandb: true + log_to_file: true + project: SamudrACE-CM4-piControl + entity: ai2cm +train_loader: + batch_size: 16 + num_data_workers: 4 + prefetch_factor: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' + - data_path: /climate-default + file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0151-01-06' + stop_time: '0306-01-01' +validation_loader: + batch_size: 16 + num_data_workers: 4 + prefetch_factor: 1 + dataset: + ocean: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + atmosphere: + merge: + - data_path: /climate-default + file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' + - data_path: /climate-default + file_pattern: 
2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr + engine: zarr + subset: + start_time: '0306-01-01' + stop_time: '0311-01-01' +optimization: + enable_automatic_mixed_precision: false + lr: 0.0001 + optimizer_type: FusedAdam + kwargs: + weight_decay: 0.01 + use_gradient_accumulation: true +stepper_training: + ocean: + parameter_init: + weights_path: /ocean_ckpt.tar + loss: + type: MSE + loss_contributions: + n_steps: 4 + weight: 1.0 + atmosphere: + parameter_init: + weights_path: /atmos_ckpt.tar + parameters: + - frozen: + include: + - '*' + loss_contributions: + n_steps: 0 + loss: + type: MSE + weights: + air_temperature_0: 0.5 + air_temperature_1: 0.5 + eastward_wind_0: 0.5 + northward_wind_0: 0.5 + specific_total_water_0: 0.5 + specific_total_water_1: 0.25 + specific_total_water_2: 0.5 + PRATEsfc: 0.5 + h500: 10 + TMP850: 5 + Q2m: 0.5 + DLWRFsfc: 2 + ULWRFsfc: 5 + USWRFsfc: 2 + DSWRFsfc: 2 + USWRFtoa: 2 + tendency_of_total_water_path_due_to_advection: 0.5 +stepper: + ocean_fraction_prediction: + land_fraction_name: land_fraction + sea_ice_fraction_name: ocean_sea_ice_fraction + sea_ice_fraction_name_in_atmosphere: sea_ice_fraction + sst_name: sst + ocean: + timedelta: 5D + # stepper added by train.sh from uncoupled-ocean/train-config.yaml + + stepper: + input_masking: + mask_value: 0 + fill_value: 0.0 + exclude_names_and_prefixes: + - land_fraction + step: + type: single_module + config: + builder: + type: Samudra + config: + ch_width: + - 200 + - 250 + - 300 + - 400 + dilation: + - 1 + - 2 + - 4 + - 8 + n_layers: + - 1 + - 1 + - 1 + - 1 + norm: instance + normalization: + network: + global_means_path: /ocean_stats/centering.nc + global_stds_path: /ocean_stats/scaling-full-field.nc + corrector: + type: ocean_corrector + config: + force_positive_names: + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - HI + 
sea_ice_fraction_correction: + sea_ice_fraction_name: ocean_sea_ice_fraction + land_fraction_name: land_fraction + remove_negative_ocean_fraction: false + next_step_forcing_names: + - DLWRFsfc + - DSWRFsfc + - ULWRFsfc + - USWRFsfc + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - eastward_surface_wind_stress + - northward_surface_wind_stress + in_names: + - DLWRFsfc + - DSWRFsfc + - ULWRFsfc + - USWRFsfc + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - eastward_surface_wind_stress + - northward_surface_wind_stress + - land_fraction + - sst + - zos + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - thetao_0 + - thetao_1 + - thetao_2 + - thetao_3 + - thetao_4 + - thetao_5 + - thetao_6 + - thetao_7 + - thetao_8 + - thetao_9 + - thetao_10 + - thetao_11 + - thetao_12 + - thetao_13 + - thetao_14 + - thetao_15 + - thetao_16 + - thetao_17 + - thetao_18 + - uo_0 + - uo_1 + - uo_2 + - uo_3 + - uo_4 + - uo_5 + - uo_6 + - uo_7 + - uo_8 + - uo_9 + - uo_10 + - uo_11 + - uo_12 + - uo_13 + - uo_14 + - uo_15 + - uo_16 + - uo_17 + - uo_18 + - vo_0 + - vo_1 + - vo_2 + - vo_3 + - vo_4 + - vo_5 + - vo_6 + - vo_7 + - vo_8 + - vo_9 + - vo_10 + - vo_11 + - vo_12 + - vo_13 + - vo_14 + - vo_15 + - vo_16 + - vo_17 + - vo_18 + - ocean_sea_ice_fraction + - HI + out_names: + - sst + - zos + - so_0 + - so_1 + - so_2 + - so_3 + - so_4 + - so_5 + - so_6 + - so_7 + - so_8 + - so_9 + - so_10 + - so_11 + - so_12 + - so_13 + - so_14 + - so_15 + - so_16 + - so_17 + - so_18 + - thetao_0 + - thetao_1 + - thetao_2 + - thetao_3 + - thetao_4 + - thetao_5 + - thetao_6 + - thetao_7 + - thetao_8 + - thetao_9 + - thetao_10 + - thetao_11 + - thetao_12 + - thetao_13 + - thetao_14 + - thetao_15 + - thetao_16 + - thetao_17 + - thetao_18 + - uo_0 + - uo_1 + - uo_2 + - uo_3 + - uo_4 + - uo_5 + - uo_6 + - uo_7 + - uo_8 + - uo_9 + - uo_10 + - uo_11 + - uo_12 + - uo_13 + - uo_14 + - uo_15 + - uo_16 + 
- uo_17 + - uo_18 + - vo_0 + - vo_1 + - vo_2 + - vo_3 + - vo_4 + - vo_5 + - vo_6 + - vo_7 + - vo_8 + - vo_9 + - vo_10 + - vo_11 + - vo_12 + - vo_13 + - vo_14 + - vo_15 + - vo_16 + - vo_17 + - vo_18 + - ocean_sea_ice_fraction + - HI + atmosphere: + timedelta: 6h + # stepper added by train.sh from uncoupled-atmos/train-config.yaml + + stepper: + step: + type: single_module + config: + builder: + type: SphericalFourierNeuralOperatorNet + config: + embed_dim: 384 + filter_type: linear + hard_thresholding_fraction: 1.0 + use_mlp: true + normalization_layer: instance_norm + num_layers: 8 + operator_type: dhconv + scale_factor: 1 + separable: false + spectral_layers: 3 + spectral_transform: sht + normalization: + network: + global_means_path: /atmos_stats/centering.nc + global_stds_path: /atmos_stats/scaling-full-field.nc + residual: + global_means_path: /atmos_stats/centering.nc + global_stds_path: /atmos_stats/scaling-residual.nc + ocean: + surface_temperature_name: surface_temperature + ocean_fraction_name: ocean_fraction + interpolate: false + corrector: + conserve_dry_air: true + moisture_budget_correction: advection_and_precipitation + force_positive_names: + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - specific_total_water_3 + - specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - Q2m + - PRATEsfc + - ULWRFsfc + - ULWRFtoa + - DLWRFsfc + - DSWRFsfc + - USWRFsfc + - USWRFtoa + next_step_forcing_names: + - DSWRFtoa + in_names: + - land_fraction + - ocean_fraction + - sea_ice_fraction + - DSWRFtoa + - HGTsfc + - PRESsfc + - surface_temperature + - TMP2m + - Q2m + - UGRD10m + - VGRD10m + - air_temperature_0 + - air_temperature_1 + - air_temperature_2 + - air_temperature_3 + - air_temperature_4 + - air_temperature_5 + - air_temperature_6 + - air_temperature_7 + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - specific_total_water_3 + - 
specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - eastward_wind_0 + - eastward_wind_1 + - eastward_wind_2 + - eastward_wind_3 + - eastward_wind_4 + - eastward_wind_5 + - eastward_wind_6 + - eastward_wind_7 + - northward_wind_0 + - northward_wind_1 + - northward_wind_2 + - northward_wind_3 + - northward_wind_4 + - northward_wind_5 + - northward_wind_6 + - northward_wind_7 + out_names: + - PRESsfc + - surface_temperature + - TMP2m + - Q2m + - UGRD10m + - VGRD10m + - air_temperature_0 + - air_temperature_1 + - air_temperature_2 + - air_temperature_3 + - air_temperature_4 + - air_temperature_5 + - air_temperature_6 + - air_temperature_7 + - specific_total_water_0 + - specific_total_water_1 + - specific_total_water_2 + - specific_total_water_3 + - specific_total_water_4 + - specific_total_water_5 + - specific_total_water_6 + - specific_total_water_7 + - eastward_wind_0 + - eastward_wind_1 + - eastward_wind_2 + - eastward_wind_3 + - eastward_wind_4 + - eastward_wind_5 + - eastward_wind_6 + - eastward_wind_7 + - northward_wind_0 + - northward_wind_1 + - northward_wind_2 + - northward_wind_3 + - northward_wind_4 + - northward_wind_5 + - northward_wind_6 + - northward_wind_7 + - LHTFLsfc + - SHTFLsfc + - PRATEsfc + - ULWRFsfc + - ULWRFtoa + - DLWRFsfc + - DSWRFsfc + - USWRFsfc + - USWRFtoa + - tendency_of_total_water_path_due_to_advection + - TMP850 + - h500 + # ocean coupling variables (diagnostic for now) + - eastward_surface_wind_stress + - northward_surface_wind_stress diff --git a/configs/baselines/cm4-piControl/train.sh b/configs/baselines/cm4-piControl/train.sh index f3eb52315..ad604abf4 100644 --- a/configs/baselines/cm4-piControl/train.sh +++ b/configs/baselines/cm4-piControl/train.sh @@ -12,10 +12,10 @@ EXISTING_RESULTS_OCEAN_DATASET="01JX4DEKY2A13D6Y95T53DSVCQ" # beaker dataset ID ATMOS_CKPT="best_inference_ckpt" OCEAN_CKPT="best_inference_ckpt" -SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the 
root of the repository -SCRIPT_PATH=${SCRIPT_PATH%/} -BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') REPO_ROOT=$(git rev-parse --show-toplevel) +SCRIPT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SCRIPT_PATH=${SCRIPT_PATH#$REPO_ROOT/} +BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') N_GPUS=4 ATMOS_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-atmosphere @@ -29,10 +29,10 @@ TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}/train-config-template.yaml" CONFIG_PATH="${SCRIPT_PATH}/train-config.yaml" cp "${SCRIPT_PATH}/uncoupled-atmos/train-config.yaml" ./atmos-config.yaml -sed -i 's/statsdata/atmos_stats/g' ./atmos-config.yaml +sed -i'' -e 's/statsdata/atmos_stats/g' ./atmos-config.yaml cp "${SCRIPT_PATH}/uncoupled-ocean/train-config.yaml" ./ocean-config.yaml -sed -i 's/statsdata/ocean_stats/g' ./ocean-config.yaml +sed -i'' -e 's/statsdata/ocean_stats/g' ./ocean-config.yaml cp "$TEMPLATE_CONFIG_PATH" "$CONFIG_PATH" diff --git a/configs/test_configs.py b/configs/test_configs.py index 38a103b00..00e1b3504 100644 --- a/configs/test_configs.py +++ b/configs/test_configs.py @@ -4,6 +4,10 @@ import yaml import fme +from fme.coupled.inference.evaluator import ( + InferenceEvaluatorConfig as CoupledInferenceEvaluatorConfig, +) +from fme.coupled.train.train_config import TrainConfig as CoupledTrainConfig from fme.downscaling.evaluator import EvaluatorConfig from fme.downscaling.train import TrainerConfig as DownscalingTrainConfig @@ -14,7 +18,10 @@ def get_yaml_files(pattern, exclude=None): """Get all files matching the pattern in the directory and subdirectories.""" paths = list(EXAMPLES_DIRECTORY.rglob(pattern)) if exclude is not None: - paths = [p for p in paths if exclude not in str(p)] + if isinstance(exclude, str): + exclude = [exclude] + for exc in exclude: + paths = [p for p in paths if exc not in str(p)] paths = [p for p in paths if "experiments/" not in str(p)] return paths @@ -32,7 +39,14 @@ def 
validate_config(file_path, config_class): def test_train_configs_are_valid(): - train_files = get_yaml_files("*train*.yaml", exclude="baselines/downscaling") + train_files = get_yaml_files( + "*train*.yaml", + exclude=[ + "baselines/downscaling", + "cm4-piControl/train-config", + "cm4-piControl/finetune-config", + ], + ) assert len(train_files) > 0, "No train files found" for file in train_files: validate_config(file, fme.ace.TrainConfig) @@ -40,13 +54,32 @@ def test_train_configs_are_valid(): def test_evaluator_configs_are_valid(): evaluator_files = get_yaml_files( - "*evaluator*.yaml", exclude="baselines/downscaling" + "*evaluator*.yaml", + exclude=["baselines/downscaling", "cm4-piControl/evaluator-config"], ) assert len(evaluator_files) > 0, "No evaluator files found" for file in evaluator_files: validate_config(file, fme.ace.InferenceEvaluatorConfig) +def test_coupled_train_configs_are_valid(): + train_files = get_yaml_files( + "**/cm4-piControl/train-config.yaml", + ) + get_yaml_files( + "**/cm4-piControl/finetune-config.yaml", + ) + assert len(train_files) > 0, "No coupled train files found" + for file in train_files: + validate_config(file, CoupledTrainConfig) + + +def test_coupled_evaluator_configs_are_valid(): + evaluator_files = get_yaml_files("**/cm4-piControl/*evaluator*.yaml") + assert len(evaluator_files) > 0, "No coupled evaluator files found" + for file in evaluator_files: + validate_config(file, CoupledInferenceEvaluatorConfig) + + def test_downscaling_train_configs_are_valid(): downscaling_files = get_yaml_files("**/downscaling/*train*.yaml") + get_yaml_files( "**/downscaling-hiro-global/*train*.yaml" From bd45c4d75db672f08d9fa49876936bd40dedfdf5 Mon Sep 17 00:00:00 2001 From: James Duncan Date: Wed, 8 Apr 2026 16:01:27 -0700 Subject: [PATCH 9/9] Fix uncoupled atmos config and stats --- .../cm4-piControl/finetune-config.yaml | 4 ++-- .../baselines/cm4-piControl/train-config.yaml | 4 ++-- .../uncoupled-atmos/train-config.yaml | 18 
+++++++++--------- .../cm4-piControl/uncoupled-atmos/train.sh | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/configs/baselines/cm4-piControl/finetune-config.yaml b/configs/baselines/cm4-piControl/finetune-config.yaml index 727705d83..13c2acde0 100644 --- a/configs/baselines/cm4-piControl/finetune-config.yaml +++ b/configs/baselines/cm4-piControl/finetune-config.yaml @@ -437,7 +437,7 @@ stepper: ocean: surface_temperature_name: surface_temperature ocean_fraction_name: ocean_fraction - interpolate: false + interpolate: true corrector: conserve_dry_air: true moisture_budget_correction: advection_and_precipitation @@ -462,6 +462,7 @@ stepper: - DSWRFtoa in_names: - land_fraction + - lake_fraction - ocean_fraction - sea_ice_fraction - DSWRFtoa @@ -555,6 +556,5 @@ stepper: - tendency_of_total_water_path_due_to_advection - TMP850 - h500 - # ocean coupling variables (diagnostic for now) - eastward_surface_wind_stress - northward_surface_wind_stress diff --git a/configs/baselines/cm4-piControl/train-config.yaml b/configs/baselines/cm4-piControl/train-config.yaml index 999cf08be..ba9723691 100644 --- a/configs/baselines/cm4-piControl/train-config.yaml +++ b/configs/baselines/cm4-piControl/train-config.yaml @@ -441,7 +441,7 @@ stepper: ocean: surface_temperature_name: surface_temperature ocean_fraction_name: ocean_fraction - interpolate: false + interpolate: true corrector: conserve_dry_air: true moisture_budget_correction: advection_and_precipitation @@ -466,6 +466,7 @@ stepper: - DSWRFtoa in_names: - land_fraction + - lake_fraction - ocean_fraction - sea_ice_fraction - DSWRFtoa @@ -559,6 +560,5 @@ stepper: - tendency_of_total_water_path_due_to_advection - TMP850 - h500 - # ocean coupling variables (diagnostic for now) - eastward_surface_wind_stress - northward_surface_wind_stress diff --git a/configs/baselines/cm4-piControl/uncoupled-atmos/train-config.yaml b/configs/baselines/cm4-piControl/uncoupled-atmos/train-config.yaml index 2c0919f00..e91cd21b2 
100644 --- a/configs/baselines/cm4-piControl/uncoupled-atmos/train-config.yaml +++ b/configs/baselines/cm4-piControl/uncoupled-atmos/train-config.yaml @@ -3,7 +3,7 @@ save_checkpoint: true validate_using_ema: true ema: decay: 0.999 -max_epochs: 50 # this is about equivalent to the amount of trainign for ACE2-ERA5/ACE2-SHiELD +max_epochs: 50 n_forward_steps: 2 inference: n_forward_steps: 29200 @@ -27,7 +27,7 @@ inference: - data_path: /climate-default file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr engine: zarr - num_data_workers: 32 + num_data_workers: 4 aggregator: log_histograms: false logging: @@ -38,8 +38,8 @@ logging: entity: ai2cm train_loader: batch_size: 16 - num_data_workers: 32 - prefetch_factor: 2 + num_data_workers: 4 + prefetch_factor: 4 dataset: merge: - data_path: /climate-default @@ -53,9 +53,9 @@ train_loader: subset: stop_time: '0306-01-01T06:00:00' validation_loader: - batch_size: 128 - num_data_workers: 32 - prefetch_factor: 2 + batch_size: 64 + num_data_workers: 4 + prefetch_factor: 4 dataset: merge: - data_path: /climate-default @@ -126,7 +126,7 @@ stepper: ocean: surface_temperature_name: surface_temperature ocean_fraction_name: ocean_fraction - interpolate: false + interpolate: true corrector: conserve_dry_air: true moisture_budget_correction: advection_and_precipitation @@ -151,6 +151,7 @@ stepper: - DSWRFtoa in_names: - land_fraction + - lake_fraction - ocean_fraction - sea_ice_fraction - DSWRFtoa @@ -244,6 +245,5 @@ stepper: - tendency_of_total_water_path_due_to_advection - TMP850 - h500 - # ocean coupling variables (diagnostic for now) - eastward_surface_wind_stress - northward_surface_wind_stress diff --git a/configs/baselines/cm4-piControl/uncoupled-atmos/train.sh b/configs/baselines/cm4-piControl/uncoupled-atmos/train.sh index 64b514f2f..da02ef703 100644 --- a/configs/baselines/cm4-piControl/uncoupled-atmos/train.sh +++ b/configs/baselines/cm4-piControl/uncoupled-atmos/train.sh @@ -10,7 +10,7 @@ 
CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}" BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name') REPO_ROOT=$(git rev-parse --show-toplevel) N_GPUS=4 -STATS_DATA=jamesd/2025-07-11-cm4-piControl-200yr-coupled-stats-atmosphere +STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-atmosphere cd $REPO_ROOT # so config path is valid no matter where we are running this script