Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions configs/baselines/coupled/cm4-piControl/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# SamudrACE training and evaluation for CM4 piControl

Self-contained baseline configs and scripts for the SamudrACE coupled
atmosphere-ocean training pipeline on the 200-year GFDL CM4 piControl data.

## Pipeline overview

The pipeline first trains uncoupled atmosphere and ocean models independently,
then couples them in two stages: ocean-only fine-tuning (`train.sh`) and then
joint ocean-and-atmosphere fine-tuning (`finetune.sh`).

```
Uncoupled training:
uncoupled-atmos/train.sh -> atmos checkpoint
uncoupled-ocean/train.sh -> ocean checkpoint

Uncoupled evaluation:
uncoupled-atmos/evaluate.sh
uncoupled-ocean/evaluate.sh

Coupled training stage 1:
train.sh -> coupled checkpoint (atmos frozen, ocean fine-tuned)

Coupled training stage 2:
finetune.sh -> refined coupled checkpoint (both models trained)

Coupled evaluation:
evaluate.sh
```

## Directory contents

| File | Purpose |
|------|---------|
| `uncoupled-atmos/train-config.yaml` | ACE2 atmosphere model pretraining config |
| `uncoupled-atmos/evaluator-config.yaml` | ACE2 evaluation (58,300 steps = ~40 years at 6h) |
| `uncoupled-ocean/train-config.yaml` | SamudraI ocean model pretraining config |
| `uncoupled-ocean/evaluator-config.yaml` | SamudraI evaluation (2,920 steps = ~40 years at 5-day) |
| `train-config-template.yaml` | SamudrACE stage 1 training config template |
| `finetune-config-template.yaml` | SamudrACE stage 2 training config template |
| `evaluator-config.yaml` | SamudrACE evaluation from a single initial condition (year 311) |
| `train.sh` | Generates `train-config.yaml` and submits SamudrACE stage 1 training |
| `finetune.sh` | Generates `finetune-config.yaml` and submits SamudrACE stage 2 training |
| `evaluate.sh` | Validates `evaluator-config.yaml` and submits SamudrACE evaluation |
42 changes: 42 additions & 0 deletions configs/baselines/coupled/cm4-piControl/evaluate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
#
# SamudrACE CM4 piControl coupled evaluation.
#
# Validates evaluator-config.yaml locally with fme.coupled.validate_config,
# then submits a single-GPU fme.coupled.evaluator job through gantry, mounting
# training_checkpoints/best_inference_ckpt.tar from EXISTING_RESULTS_DATASET
# at /ckpt.tar.
#
# Tools invoked: git, beaker, jq, python, gantry.

# -u and pipefail so that an unset variable or a failing `beaker` call (left
# side of the pipe below) aborts the script instead of yielding an empty value.
set -euo pipefail

JOB_NAME="cm4-piControl-coupled-evaluator"
JOB_GROUP="cm4-piControl-coupled"
EXISTING_RESULTS_DATASET="01JZHQJXC4EYAPTCSP188YSVC0" # beaker dataset ID from coupled training or fine-tuning
CONFIG_FILENAME="evaluator-config.yaml"

# Resolve everything from this script's own location rather than the caller's
# working directory: `git rev-parse --show-prefix` reports the prefix of the
# directory it runs in, so it has to run inside the script directory for
# CONFIG_PATH to be right when the script is launched from elsewhere.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" && pwd)
SCRIPT_PATH=$(git -C "$SCRIPT_DIR" rev-parse --show-prefix) # relative to the root of the repository; empty or ends with "/"
CONFIG_PATH="${SCRIPT_PATH}${CONFIG_FILENAME}"
# jq -e exits non-zero when the name is null/missing, so a bad whoami response
# stops the script here.
BEAKER_USERNAME=$(beaker account whoami --format=json | jq -er '.[0].name')
REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)

cd "$REPO_ROOT" # CONFIG_PATH is relative to the repository root

python -m fme.coupled.validate_config --config_type evaluator "$CONFIG_PATH"

gantry run \
  --name "$JOB_NAME" \
  --task-name "$JOB_NAME" \
  --description "ACE coupled CM4 piControl evaluator" \
  --beaker-image "$(cat "$REPO_ROOT/latest_deps_only_image.txt")" \
  --workspace ai2/ace \
  --priority normal \
  --not-preemptible \
  --cluster ai2/ceres-cirrascale \
  --cluster ai2/saturn-cirrascale \
  --weka climate-default:/climate-default \
  --env "WANDB_USERNAME=$BEAKER_USERNAME" \
  --env "WANDB_NAME=$JOB_NAME" \
  --env WANDB_JOB_TYPE=inference \
  --env "WANDB_RUN_GROUP=$JOB_GROUP" \
  --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \
  --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \
  --dataset-secret google-credentials:/tmp/google_application_credentials.json \
  --dataset "${EXISTING_RESULTS_DATASET}:training_checkpoints/best_inference_ckpt.tar:/ckpt.tar" \
  --gpus 1 \
  --shared-memory 50GiB \
  --budget ai2/climate \
  --system-python \
  --install "pip install --no-deps ." \
  -- python -I -m fme.coupled.evaluator "$CONFIG_PATH"
39 changes: 39 additions & 0 deletions configs/baselines/coupled/cm4-piControl/evaluator-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# SamudrACE coupled evaluator config for CM4 piControl. evaluate.sh validates
# this file and passes it to fme.coupled.evaluator.
experiment_dir: /results
# NOTE(review): presumably ~40 years if one coupled step spans 5 days
# (2920 * 5 / 365 = 40) - confirm against the coupled step length.
n_coupled_steps: 2920
coupled_steps_in_memory: 20
# Container path at which evaluate.sh mounts best_inference_ckpt.tar.
checkpoint_path: /ckpt.tar
# No prediction or monthly files are written for either component.
data_writer:
ocean:
save_prediction_files: false
save_monthly_files: false
atmosphere:
save_prediction_files: false
save_monthly_files: false
logging:
log_to_screen: true
log_to_wandb: true
log_to_file: true
project: SamudrACE-CM4-piControl
entity: ai2cm
# Each component merges two zarr stores found under /climate-default, the path
# where evaluate.sh mounts the climate-default weka bucket.
loader:
num_data_workers: 1
dataset:
ocean:
merge:
- data_path: /climate-default
file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr
engine: zarr
- data_path: /climate-default
file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr
engine: zarr
atmosphere:
merge:
- data_path: /climate-default
file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr
engine: zarr
- data_path: /climate-default
file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr
engine: zarr
# A single initial condition: 1 January of model year 311.
start_indices:
times:
- '0311-01-01T00:00:00'
167 changes: 167 additions & 0 deletions configs/baselines/coupled/cm4-piControl/finetune-config-template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# Stage-2 SamudrACE fine-tuning config template. finetune.sh copies this file
# to finetune-config.yaml and merges the `stepper` sections of the uncoupled
# ocean and atmosphere train configs into .stepper.ocean.stepper and
# .stepper.atmosphere.stepper before validating and submitting the job.
experiment_dir: /results
save_checkpoint: true
validate_using_ema: true
ema:
decay: 0.999
max_epochs: 20
n_coupled_steps: 4
# Inference settings used during training. The eight start times below are
# spaced 20 model years apart, from year 0151 through year 0291.
# NOTE(review): 1456 coupled steps is presumably ~20 years if one coupled step
# spans the 5D ocean timedelta (1456 * 5 / 365 ~= 19.9) - confirm.
inference:
n_coupled_steps: 1456
coupled_steps_in_memory: 8
loader:
num_data_workers: 1
dataset:
ocean:
merge:
- data_path: /climate-default
file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr
engine: zarr
- data_path: /climate-default
file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr
engine: zarr
atmosphere:
merge:
- data_path: /climate-default
file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr
engine: zarr
- data_path: /climate-default
file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr
engine: zarr
start_indices:
times:
- '0151-01-06T00:00:00'
- '0171-01-06T00:00:00'
- '0191-01-06T00:00:00'
- '0211-01-06T00:00:00'
- '0231-01-06T00:00:00'
- '0251-01-06T00:00:00'
- '0271-01-06T00:00:00'
- '0291-01-06T00:00:00'
aggregator:
log_zonal_mean_images: false
log_histograms: false
# Logging destinations: screen, file, and Weights & Biases under the
# ai2cm / SamudrACE-CM4-piControl project.
logging:
log_to_screen: true
log_to_wandb: true
log_to_file: true
project: SamudrACE-CM4-piControl
entity: ai2cm
# Training data: every merged zarr store (two per component) is subset to the
# same window, 0151-01-06 through 0306-01-01.
train_loader:
batch_size: 16
num_data_workers: 4
prefetch_factor: 1
dataset:
ocean:
merge:
- data_path: /climate-default
file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr
engine: zarr
subset:
start_time: '0151-01-06'
stop_time: '0306-01-01'
- data_path: /climate-default
file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr
engine: zarr
subset:
start_time: '0151-01-06'
stop_time: '0306-01-01'
atmosphere:
merge:
- data_path: /climate-default
file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr
engine: zarr
subset:
start_time: '0151-01-06'
stop_time: '0306-01-01'
- data_path: /climate-default
file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr
engine: zarr
subset:
start_time: '0151-01-06'
stop_time: '0306-01-01'
# Validation data: the same zarr stores as training, subset to 0306-01-01
# through 0311-01-01, i.e. starting where the training window stops.
validation_loader:
batch_size: 16
num_data_workers: 4
prefetch_factor: 1
dataset:
ocean:
merge:
- data_path: /climate-default
file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-ocean.zarr
engine: zarr
subset:
start_time: '0306-01-01'
stop_time: '0311-01-01'
- data_path: /climate-default
file_pattern: 2025-04-16-cm4-piControl-ocean-200yr-dataset.zarr
engine: zarr
subset:
start_time: '0306-01-01'
stop_time: '0311-01-01'
atmosphere:
merge:
- data_path: /climate-default
file_pattern: 2025-06-03-cm4-piControl-200yr-coupled-interpolate_sst-atmosphere.zarr
engine: zarr
subset:
start_time: '0306-01-01'
stop_time: '0311-01-01'
- data_path: /climate-default
file_pattern: 2025-03-21-CM4-piControl-atmosphere-land-1deg-8layer-200yr.zarr
engine: zarr
subset:
start_time: '0306-01-01'
stop_time: '0311-01-01'
# Optimizer settings for the joint fine-tuning stage: FusedAdam at lr 1e-5
# with weight decay, and the cosine-annealing schedule that finetune.sh's
# header describes.
optimization:
enable_automatic_mixed_precision: false
lr: 0.00001
optimizer_type: FusedAdam
kwargs:
weight_decay: 0.01
use_gradient_accumulation: true
scheduler:
type: CosineAnnealingLR
# Training-time stepper settings. /ckpt.tar is the container path at which
# finetune.sh mounts the stage-1 checkpoint from the existing results dataset.
stepper_training:
parameter_init:
checkpoint_path: /ckpt.tar
ocean:
parameter_init:
weights_path: null # null required
loss:
type: MSE
loss_contributions:
n_steps: 4
weight: 1.0
atmosphere:
parameter_init:
weights_path: null # null required
loss_contributions:
n_steps: 2
weight: 1.0
loss:
type: MSE
# Per-variable weights for the atmosphere MSE loss.
weights:
air_temperature_0: 0.5
air_temperature_1: 0.5
eastward_wind_0: 0.5
northward_wind_0: 0.5
specific_total_water_0: 0.5
specific_total_water_1: 0.25
specific_total_water_2: 0.5
PRATEsfc: 0.5
h500: 10
TMP850: 5
Q2m: 0.5
DLWRFsfc: 2
ULWRFsfc: 5
USWRFsfc: 2
DSWRFsfc: 2
USWRFtoa: 2
tendency_of_total_water_path_due_to_advection: 0.5
# Component steppers. Only the time steps are set here; finetune.sh fills in
# each component's `stepper` key with yq's `*=n` merge, which its own comment
# describes as preserving template values on conflict.
stepper:
ocean:
timedelta: 5D
# stepper added by finetune.sh from uncoupled-ocean/train-config.yaml
atmosphere:
timedelta: 6h
# stepper added by finetune.sh from uncoupled-atmos/train-config.yaml
74 changes: 74 additions & 0 deletions configs/baselines/coupled/cm4-piControl/finetune.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/bin/bash
#
# SamudrACE CM4 piControl training stage 2: starting from the stage 1
# checkpoint, fine-tunes both atmosphere and ocean models jointly with a
# cosine-annealing LR schedule.
#
# Steps:
#   1. Build finetune-config.yaml next to this script from
#      finetune-config-template.yaml plus the `stepper` sections of the
#      uncoupled atmosphere and ocean train configs.
#   2. Validate the generated config locally.
#   3. Submit the multi-GPU training job with gantry.
#
# Tools invoked: git, beaker, jq, sed, yq, python, gantry.

# -u and pipefail so that an unset variable or a failing `beaker` call (left
# side of the pipe below) aborts the script instead of yielding an empty value.
set -euo pipefail

JOB_NAME="cm4-piControl-coupled-finetune"
JOB_GROUP="cm4-piControl-coupled"
EXISTING_RESULTS_DATASET="TODO" # beaker dataset ID from coupled training (train.sh)
CKPT_TYPE="best_inference_ckpt"

# Resolve everything from this script's own location rather than the caller's
# working directory: `git rev-parse --show-prefix` reports the prefix of the
# directory it runs in, so it has to run inside the script directory for the
# config paths to be right when the script is launched from elsewhere.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" && pwd)
SCRIPT_PATH=$(git -C "$SCRIPT_DIR" rev-parse --show-prefix) # relative to the root of the repository
SCRIPT_PATH=${SCRIPT_PATH%/}
SCRIPT_PATH=${SCRIPT_PATH:-.} # empty prefix (repo root) must not turn "${SCRIPT_PATH}/x" into "/x"
# jq -e exits non-zero when the name is null/missing, so a bad whoami response
# stops the script here.
BEAKER_USERNAME=$(beaker account whoami --format=json | jq -er '.[0].name')
REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)
N_GPUS=4

ATMOS_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-atmosphere
OCEAN_STATS_DATA=jamesd/2025-06-03-cm4-piControl-200yr-coupled-stats-ocean

cd "$REPO_ROOT" # config paths below are relative to the repository root

# --- Generate finetune-config.yaml from template + uncoupled configs ---

TEMPLATE_CONFIG_PATH="${SCRIPT_PATH}/finetune-config-template.yaml"
CONFIG_PATH="${SCRIPT_PATH}/finetune-config.yaml"

# Scratch copies live in a private temp dir that is removed on every exit
# path, so a failed yq/sed step leaves nothing behind in the repository root.
TMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TMP_DIR"' EXIT
ATMOS_CONFIG="${TMP_DIR}/atmos-config.yaml"
OCEAN_CONFIG="${TMP_DIR}/ocean-config.yaml"

# Replace every `statsdata` with the per-component name used for the stats
# dataset mounts passed to gantry below (/atmos_stats, /ocean_stats).
# Writing through a redirect avoids the GNU/BSD `sed -i` incompatibility.
sed 's/statsdata/atmos_stats/g' "${SCRIPT_PATH}/uncoupled-atmos/train-config.yaml" > "$ATMOS_CONFIG"
sed 's/statsdata/ocean_stats/g' "${SCRIPT_PATH}/uncoupled-ocean/train-config.yaml" > "$OCEAN_CONFIG"

cp "$TEMPLATE_CONFIG_PATH" "$CONFIG_PATH"

# update component stepper configs, preserving template values on conflict
# (scratch file paths are handed to yq through the environment via strenv)
OCEAN_CONFIG="$OCEAN_CONFIG" \
  yq -i '.stepper.ocean.stepper *=n load(strenv(OCEAN_CONFIG)).stepper' "$CONFIG_PATH"
ATMOS_CONFIG="$ATMOS_CONFIG" \
  yq -i '.stepper.atmosphere.stepper *=n load(strenv(ATMOS_CONFIG)).stepper' "$CONFIG_PATH"

# --- Validate and submit ---

python -m fme.coupled.validate_config "$CONFIG_PATH" --config_type train

# Refuse to submit with the placeholder dataset ID; the generated config above
# is still written and validated so it can be inspected.
if [[ "$EXISTING_RESULTS_DATASET" == "TODO" ]]; then
  echo "error: set EXISTING_RESULTS_DATASET to the beaker dataset ID from coupled training (train.sh) before submitting" >&2
  exit 1
fi

gantry run \
  --name "$JOB_NAME" \
  --task-name "$JOB_NAME" \
  --description "Run SamudrACE CM4 piControl ocean + atmos fine-tuning" \
  --beaker-image "$(cat "$REPO_ROOT/latest_deps_only_image.txt")" \
  --workspace ai2/ace \
  --priority normal \
  --preemptible \
  --cluster ai2/ceres-cirrascale \
  --cluster ai2/jupiter-cirrascale \
  --weka climate-default:/climate-default \
  --env "WANDB_USERNAME=$BEAKER_USERNAME" \
  --env "WANDB_NAME=$JOB_NAME" \
  --env WANDB_JOB_TYPE=training \
  --env "WANDB_RUN_GROUP=$JOB_GROUP" \
  --env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \
  --env-secret WANDB_API_KEY=wandb-api-key-ai2cm-sa \
  --dataset-secret google-credentials:/tmp/google_application_credentials.json \
  --dataset "${ATMOS_STATS_DATA}:/atmos_stats" \
  --dataset "${OCEAN_STATS_DATA}:/ocean_stats" \
  --dataset "${EXISTING_RESULTS_DATASET}:training_checkpoints/${CKPT_TYPE}.tar:/ckpt.tar" \
  --gpus "$N_GPUS" \
  --shared-memory 800GiB \
  --budget ai2/climate \
  --system-python \
  --install "pip install --no-deps ." \
  -- torchrun --nproc_per_node "$N_GPUS" -m fme.coupled.train "$CONFIG_PATH"
Loading
Loading