ai2cm · jpdunc23 · Mar 4, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/configs/baselines/coupled/cm4-piControl/README.md b/configs/baselines/coupled/cm4-piControl/README.md
@@ -0,0 +1,44 @@
+# SamudrACE training and evaluation for CM4 piControl
+
+Self-contained baseline configs and scripts for the SamudrACE coupled
+atmosphere-ocean training pipeline on the 200-year GFDL CM4 piControl data.
+
+## Pipeline overview
+
+The pipeline first trains uncoupled atmosphere and ocean models independently,
+then couples them in two stages: ocean-only fine-tuning (`train.sh`) and then
+joint ocean-and-atmosphere fine-tuning (`finetune.sh`).
+
+```
+Uncoupled training:
+  uncoupled-atmos/train.sh   -> atmos checkpoint
+  uncoupled-ocean/train.sh   -> ocean checkpoint
+
+Uncoupled evaluation:
+  uncoupled-atmos/evaluate.sh
+  uncoupled-ocean/evaluate.sh
+
+Coupled training stage 1:
+  train.sh  -> coupled checkpoint (atmos frozen, ocean fine-tuned)
+
+Coupled training stage 2:
+  finetune.sh  -> refined coupled checkpoint (both models trained)
+
+Coupled evaluation:
+  evaluate.sh
+```
+
+## Directory contents
+
+| File | Purpose |
+|------|---------|
+| `uncoupled-atmos/train-config.yaml` | ACE2 atmosphere model pretraining config |
+| `uncoupled-atmos/evaluator-config.yaml` | ACE2 evaluation (58,300 steps = ~40 years at 6h) |
+| `uncoupled-ocean/train-config.yaml` | SamudraI ocean model pretraining config |
+| `uncoupled-ocean/evaluator-config.yaml` | SamudraI evaluation (2,920 steps = ~40 years at 5-day) |
+| `train-config-template.yaml` | SamudrACE stage 1 training config template |
+| `finetune-config-template.yaml` | SamudrACE stage 2 training config template |
+| `evaluator-config.yaml` | SamudrACE evaluation from a single initial condition (year 311) |
+| `train.sh` | Generates `train-config.yaml` and submits SamudrACE stage 1 training |
+| `finetune.sh` | Generates `finetune-config.yaml` and submits SamudrACE stage 2 training |
+| `evaluate.sh` | Validates `evaluator-config.yaml` and submits SamudrACE evaluation |
diff --git a/configs/baselines/coupled/cm4-piControl/evaluate.sh b/configs/baselines/coupled/cm4-piControl/evaluate.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Validates the SamudrACE CM4 piControl evaluator config and submits a coupled
+# evaluation job via gantry, mounting the checkpoint from an existing beaker
+# results dataset at /ckpt.tar.
+
+set -eo pipefail  # pipefail: a failing `beaker` call must not be masked by jq
+
+JOB_NAME="cm4-piControl-coupled-evaluator"
+JOB_GROUP="cm4-piControl-coupled"
+EXISTING_RESULTS_DATASET="01JZHQJXC4EYAPTCSP188YSVC0"  # beaker dataset ID from coupled training or fine-tuning
+CONFIG_FILENAME="evaluator-config.yaml"
+# Resolve paths from this script's location, not the caller's working directory.
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)
+SCRIPT_PATH=$(git -C "$SCRIPT_DIR" rev-parse --show-prefix)  # relative to the root of the repository; ends in "/"
+CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}"
+BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name')
+REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)
+
+cd "$REPO_ROOT"  # so config path is valid no matter where we are running this script
+
+# Fail locally on an invalid config before submitting a job.
+python -m fme.coupled.validate_config --config_type evaluator "$CONFIG_PATH"
+
+gantry run \
+    --name "$JOB_NAME" \
+    --task-name "$JOB_NAME" \
+    --description "ACE coupled CM4 piControl evaluator" \
+    --beaker-image "$(cat "$REPO_ROOT/latest_deps_only_image.txt")" \
+    --workspace ai2/ace \
+    --priority normal \
+    --not-preemptible \
+    --cluster ai2/ceres-cirrascale \
+    --cluster ai2/saturn-cirrascale \
+    --weka climate-default:/climate-default \
+    --env "WANDB_USERNAME=$BEAKER_USERNAME" \
+    --env "WANDB_NAME=$JOB_NAME" \
+    --env WANDB_JOB_TYPE=inference \
+    --env "WANDB_RUN_GROUP=$JOB_GROUP" \
+    --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \
+    --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \
+    --dataset-secret google-credentials:/tmp/google_application_credentials.json \
+    --dataset "${EXISTING_RESULTS_DATASET}:training_checkpoints/best_inference_ckpt.tar:/ckpt.tar" \
+    --gpus 1 \
+    --shared-memory 50GiB \
+    --budget ai2/climate \
+    --system-python \
+    --install "pip install --no-deps ." \
+    -- python -I -m fme.coupled.evaluator "$CONFIG_PATH"
diff --git a/configs/baselines/coupled/cm4-piControl/evaluator-config.yaml b/configs/baselines/coupled/cm4-piControl/evaluator-config.yaml
@@ -0,0 +1,39 @@
+experiment_dir: /results
+n_coupled_steps: 2920
+coupled_steps_in_memory: 20
+checkpoint_path: /ckpt.tar
+data_writer:
+  ocean:
+    save_prediction_files: false
+    save_monthly_files: false
+  atmosphere:
+    save_prediction_files: false
+    save_monthly_files: false
+logging:
+  log_to_screen: true
+  log_to_wandb: true
+  log_to_file: true
+  project: SamudrACE-CM4-piControl
+  entity: ai2cm
+loader:
+  num_data_workers: 1
+  dataset:
+    ocean:
+      merge:
+      - data_path: /climate-default
+        file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr
+        engine: zarr
+      - data_path: /climate-default
+        file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr
+        engine: zarr
+    atmosphere:
+      merge:
+      - data_path: /climate-default
+        file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr
+        engine: zarr
+      - data_path: /climate-default
+        file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr
+        engine: zarr
+  start_indices:
+    times:
+      - '0311-01-01T00:00:00'
diff --git a/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml b/configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml
@@ -0,0 +1,167 @@
+experiment_dir: /results
+save_checkpoint: true
+validate_using_ema: true
+ema:
+  decay: 0.999
+max_epochs: 20
+n_coupled_steps: 4
+inference:
+  n_coupled_steps: 1456
+  coupled_steps_in_memory: 8
+  loader:
+    num_data_workers: 1
+    dataset:
+      ocean:
+        merge:
+        - data_path: /climate-default
+          file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr
+          engine: zarr
+        - data_path: /climate-default
+          file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr
+          engine: zarr
+      atmosphere:
+        merge:
+        - data_path: /climate-default
+          file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr
+          engine: zarr
+        - data_path: /climate-default
+          file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr
+          engine: zarr
+    start_indices:
+      times:
+        - '0151-01-06T00:00:00'
+        - '0171-01-06T00:00:00'
+        - '0191-01-06T00:00:00'
+        - '0211-01-06T00:00:00'
+        - '0231-01-06T00:00:00'
+        - '0251-01-06T00:00:00'
+        - '0271-01-06T00:00:00'
+        - '0291-01-06T00:00:00'
+  aggregator:
+    log_zonal_mean_images: false
+    log_histograms: false
+logging:
+  log_to_screen: true
+  log_to_wandb: true
+  log_to_file: true
+  project: SamudrACE-CM4-piControl
+  entity: ai2cm
+train_loader:
+  batch_size: 16
+  num_data_workers: 4
+  prefetch_factor: 1
+  dataset:
+    ocean:
+      merge:
+      - data_path: /climate-default
+        file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr
+        engine: zarr
+        subset:
+          start_time: '0151-01-06'
+          stop_time: '0306-01-01'
+      - data_path: /climate-default
+        file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr
+        engine: zarr
+        subset:
+          start_time: '0151-01-06'
+          stop_time: '0306-01-01'
+    atmosphere:
+      merge:
+      - data_path: /climate-default
+        file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr
+        engine: zarr
+        subset:
+          start_time: '0151-01-06'
+          stop_time: '0306-01-01'
+      - data_path: /climate-default
+        file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr
+        engine: zarr
+        subset:
+          start_time: '0151-01-06'
+          stop_time: '0306-01-01'
+validation_loader:
+  batch_size: 16
+  num_data_workers: 4
+  prefetch_factor: 1
+  dataset:
+    ocean:
+      merge:
+      - data_path: /climate-default
+        file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr
+        engine: zarr
+        subset:
+          start_time: '0306-01-01'
+          stop_time: '0311-01-01'
+      - data_path: /climate-default
+        file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr
+        engine: zarr
+        subset:
+          start_time: '0306-01-01'
+          stop_time: '0311-01-01'
+    atmosphere:
+      merge:
+      - data_path: /climate-default
+        file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr
+        engine: zarr
+        subset:
+          start_time: '0306-01-01'
+          stop_time: '0311-01-01'
+      - data_path: /climate-default
+        file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr
+        engine: zarr
+        subset:
+          start_time: '0306-01-01'
+          stop_time: '0311-01-01'
+optimization:
+  enable_automatic_mixed_precision: false
+  lr: 0.00001
+  optimizer_type: FusedAdam
+  kwargs:
+    weight_decay: 0.01
+  use_gradient_accumulation: true
+  scheduler:
+    type: CosineAnnealingLR
+stepper_training:
+  parameter_init:
+    checkpoint_path: /ckpt.tar
+  ocean:
+    parameter_init:
+      weights_path: null # null required
+    loss:
+      type: MSE
+    loss_contributions:
+      n_steps: 4
+      weight: 1.0
+  atmosphere:
+    parameter_init:
+      weights_path: null # null required
+    loss_contributions:
+      n_steps: 2
+      weight: 1.0
+    loss:
+      type: MSE
+      weights:
+        air_temperature_0: 0.5
+        air_temperature_1: 0.5
+        eastward_wind_0: 0.5
+        northward_wind_0: 0.5
+        specific_total_water_0: 0.5
+        specific_total_water_1: 0.25
+        specific_total_water_2: 0.5
+        PRATEsfc: 0.5
+        h500: 10
+        TMP850: 5
+        Q2m: 0.5
+        DLWRFsfc: 2
+        ULWRFsfc: 5
+        USWRFsfc: 2
+        DSWRFsfc: 2
+        USWRFtoa: 2
+        tendency_of_total_water_path_due_to_advection: 0.5
+stepper:
+  ocean:
+    timedelta: 5D
+    # stepper added by finetune.sh from uncoupled-ocean/train-config.yaml
+  atmosphere:
+    timedelta: 6h
+    # stepper added by finetune.sh from uncoupled-atmos/train-config.yaml
diff --git a/configs/baselines/coupled/cm4-piControl/finetune.sh b/configs/baselines/coupled/cm4-piControl/finetune.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+#
+# SamudrACE CM4 piControl training stage 2: starting from the stage 1
+# checkpoint, fine-tunes both atmosphere and ocean models jointly with a
+# cosine-annealing LR schedule.
+
+set -e
+
+JOB_NAME="cm4-piControl-coupled-finetune"
+JOB_GROUP="cm4-piControl-coupled"
+EXISTING_RESULTS_DATASET="TODO"  # beaker dataset ID from coupled training (train.sh)
+CKPT_TYPE="best_inference_ckpt"
+
+SCRIPT_PATH=$(git rev-parse --show-prefix)  # relative to the root of the repository
+SCRIPT_PATH=${SCRIPT_PATH%/}
+BEAKER_USERNAME=$(beaker account whoami --format=json | jq -r '.[0].name')
+REPO_ROOT=$(git rev-parse --show-toplevel)
+N_GPUS=4
+
+ATMOS_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-atmosphere
+OCEAN_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-ocean
+
+cd "$REPO_ROOT"  # so config path is valid no matter where we are running this script
+
+# --- Generate finetune-config.yaml from template + uncoupled configs ---
+
+TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}/finetune-config-template.yaml"
+CONFIG_PATH="${SCRIPT_PATH}/finetune-config.yaml"
+
+cp "${SCRIPT_PATH}/uncoupled-atmos/train-config.yaml" ./atmos-config.yaml
+sed -i 's/statsdata/atmos_stats/g' ./atmos-config.yaml
+
+cp "${SCRIPT_PATH}/uncoupled-ocean/train-config.yaml" ./ocean-config.yaml
+sed -i 's/statsdata/ocean_stats/g' ./ocean-config.yaml
+
+cp "$TEMPLATE_CONFIG_PATH" "$CONFIG_PATH"
+
+# update component stepper configs, preserving template values on conflict
+yq -i '.stepper.ocean.stepper *=n load("ocean-config.yaml").stepper' "$CONFIG_PATH"
+yq -i '.stepper.atmosphere.stepper *=n load("atmos-config.yaml").stepper' "$CONFIG_PATH"
+
+rm ./atmos-config.yaml ./ocean-config.yaml
+
+# --- Validate and submit ---
+
+python -m fme.coupled.validate_config "$CONFIG_PATH" --config_type train
+
+gantry run \
+    --name $JOB_NAME \
+    --task-name $JOB_NAME \
+    --description "Run SamudrACE CM4 piControl ocean + atmos fine-tuning" \
+    --beaker-image "$(cat $REPO_ROOT/latest_deps_only_image.txt)" \
+    --workspace ai2/ace \
+    --priority normal \
+    --preemptible \
+    --cluster ai2/ceres-cirrascale \
+    --cluster ai2/jupiter-cirrascale \
+    --weka climate-default:/climate-default \
+    --env WANDB_USERNAME=$BEAKER_USERNAME \
+    --env WANDB_NAME=$JOB_NAME \
+    --env WANDB_JOB_TYPE=training \
+    --env WANDB_RUN_GROUP=$JOB_GROUP \
+    --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \
+    --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \
+    --dataset-secret google-credentials:/tmp/google_application_credentials.json \
+    --dataset $ATMOS_STATS_DATA:/atmos_stats \
+    --dataset $OCEAN_STATS_DATA:/ocean_stats \
+    --dataset "$EXISTING_RESULTS_DATASET:training_checkpoints/${CKPT_TYPE}.tar:/ckpt.tar" \
+    --gpus $N_GPUS \
+    --shared-memory 800GiB \
+    --budget ai2/climate \
+    --system-python \
+    --install "pip install --no-deps ." \
+    -- torchrun --nproc_per_node $N_GPUS -m fme.coupled.train "$CONFIG_PATH"