From 9cb040e0ea92b3f3bdd13f298f95be2d9c6bd900 Mon Sep 17 00:00:00 2001
From: Khai Nguyen <khai.nx1201@gmail.com>
Date: Fri, 22 May 2026 18:13:12 -0400
Subject: [PATCH 1/5] =?UTF-8?q?PDL=20case57:=20paper-faithful=20loss,=20an?=
 =?UTF-8?q?alytical=20dual,=20BoundedOutput,=20FixRefBus,=20=CF=81=5Feq=5F?=
 =?UTF-8?q?scale?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Key changes to src/L2OALM.jl:
- Paper-faithful augmented Lagrangian loss (correct g,h from BNK.constraints!)
- Clamp inequality multipliers to μ ≥ 0 (KKT correctness)
- Analytical primal loss: apply ALM dual update per gradient step
  (λ_eff = clamp(λ̂ + ρ·h, −M, M)); eliminates dual tracking gap
- ρ_eq_scale: separate penalty multiplier for equality constraints
- use_penalty_only kwarg for penalty-only (no dual) ablation

Key changes to test/power.jl and examples/case57_train.jl:
- BoundedOutput head: sigmoid variable-bound enforcement (max_bound_viol ≡ 0)
- FixRefBus layer: reference-bus angle fixed to zero architecturally; GPU-safe
- Reduced-space AC-OPF: eliminate branch-flow variables
- Warmup at ρ_max, lr decay, env-var hyperparameter overrides

New: examples/case57_train_twophase.jl — two-phase training (Phase 1 fixed ρ
warm-start to skip degenerate basin; Phase 2 growing ρ with ρ_eq_scale).

Best result: max_eq = 1.165 p.u. on 5000 held-out case57 test samples,
max_ineq = max_bound = 0 (analytical dual, ρ_max=1e4, MAX_DUAL=1e6).
---
 .gitignore                        |  14 +
 Project.toml                      |  24 +-
 examples/Project.toml             |  22 ++
 examples/case57_train.jl          | 228 +++++++++++++++
 examples/case57_train_twophase.jl | 261 +++++++++++++++++
 scripts/plot_training.jl          |  49 ++++
 src/L2OALM.jl                     | 465 +++++++++++++++++++++++++-----
 test/power.jl                     | 150 ++++++----
 test/runtests.jl                  |  71 +++--
 9 files changed, 1128 insertions(+), 156 deletions(-)
 create mode 100644 examples/Project.toml
 create mode 100644 examples/case57_train.jl
 create mode 100644 examples/case57_train_twophase.jl
 create mode 100644 scripts/plot_training.jl

diff --git a/.gitignore b/.gitignore
index 285da1e..d9448cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,17 @@ Manifest*.toml
 # File generated by the Preferences package to store local preferences
 LocalPreferences.toml
 JuliaLocalPreferences.toml
+
+# Training run artifacts (CSV trajectories, plots). The example writes these
+# under outputs/; don't version-control them.
+outputs/
+
+# Cluster job scripts and output logs (HPC-specific, not part of the library).
+slurm_jobs/
+
+# Experimental / scratch training scripts not ready for inclusion.
+examples/case57_train_rho_restart.jl
+
+# Claude Code editor state.
+.claude/
+CLAUDE.md
diff --git a/Project.toml b/Project.toml
index f3699aa..f04aebe 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,19 +1,38 @@
 name = "L2OALM"
 uuid = "f31bfc7b-7b5d-4cc3-b76b-1af281ce159d"
-authors = ["Andrew <arosemberg3@gatech.edu> and contributors"]
 version = "1.0.0-DEV"
+authors = ["Andrew <arosemberg3@gatech.edu> and contributors"]
 
 [deps]
 BatchNLPKernels = "7145f916-0e30-4c9d-93a2-b32b6056125d"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
 ExaModels = "1037b233-b668-4ce9-9b63-f9f681f55dd2"
+GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
+MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
+NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+PGLib = "07a8691f-3d11-4330-951b-3c50f98338be"
+PowerModels = "c36e90e8-916a-50a6-bd94-075b64ef4655"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+
+[sources]
+BatchNLPKernels = {rev = "main", url = "https://github.com/klamike/BatchNLPKernels.jl"}
 
 [compat]
+GPUArraysCore = "0.2.0"
+KernelAbstractions = "0.9.41"
+MLUtils = "0.4.8"
+NNlib = "0.9.34"
+PGLib = "0.2.2"
+PowerModels = "0.21.6"
+Random = "1.11.0"
+Zygote = "0.7.10"
 julia = "1.6.7"
 
 [extras]
@@ -25,8 +44,5 @@ PowerModels = "c36e90e8-916a-50a6-bd94-075b64ef4655"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
-[sources.BatchNLPKernels]
-url = "https://github.com/klamike/BatchNLPKernels.jl"
-
 [targets]
 test = ["Test", "PowerModels", "PGLib", "Random", "MLUtils", "KernelAbstractions", "GPUArraysCore"]
diff --git a/examples/Project.toml b/examples/Project.toml
new file mode 100644
index 0000000..4a3cf2a
--- /dev/null
+++ b/examples/Project.toml
@@ -0,0 +1,22 @@
+[deps]
+BatchNLPKernels = "7145f916-0e30-4c9d-93a2-b32b6056125d"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+ExaModels = "1037b233-b668-4ce9-9b63-f9f681f55dd2"
+GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+L2OALM = "f31bfc7b-7b5d-4cc3-b76b-1af281ce159d"
+Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
+MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
+NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+PGLib = "07a8691f-3d11-4330-951b-3c50f98338be"
+PowerModels = "c36e90e8-916a-50a6-bd94-075b64ef4655"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+
+[sources]
+BatchNLPKernels = {rev = "main", url = "https://github.com/klamike/BatchNLPKernels.jl"}
+L2OALM = {path = ".."}
diff --git a/examples/case57_train.jl b/examples/case57_train.jl
new file mode 100644
index 0000000..6ce43f5
--- /dev/null
+++ b/examples/case57_train.jl
@@ -0,0 +1,228 @@
+# examples/case57_train.jl
+#
+# Trains the PDL primal-dual networks on `pglib_opf_case57_ieee` on GPU using
+# the paper's AC-OPF hyperparameters and the L2OALM `train!` schedule
+# (penalty-only warm-start, K outer ALM iterations × L inner gradient epochs
+# per phase, validation-loss-triggered lr decay).
+#
+# Acceptance: `max_violation` on a held-out test set < 0.01 after training.
+#
+# Usage:
+#   julia --project=. examples/case57_train.jl
+#
+# Tunables — overridable via env vars (PDL_K, PDL_L_PRIMAL, etc.) for quick smoke runs.
+# L_PRIMAL / L_DUAL / WARMUP_EPOCHS are GRADIENT STEP counts, not epoch counts.
+# With BATCH_SIZE=200 and DATASET_SIZE=5000 there are 25 batches/epoch.
+# Paper (Park & Van Hentenryck 2023): K=10, L=5000, ρ_init=10, α=2, τ=0.8.
+# We use K=20, L=2500 as a halfway point: same total steps as K=10/L=5000 but
+# with twice as many multiplier updates.  Key fix vs earlier runs: BoundedOutput
+# now uses sigmoid (not hardsigmoid) so generators can always recover from
+# saturation — hardsigmoid's zero-gradient dead zone at z<-3 caused collapse.
+const K               = parse(Int,     get(ENV, "PDL_K",             "20"))
+const L_PRIMAL        = parse(Int,     get(ENV, "PDL_L_PRIMAL",      "2500"))
+const L_DUAL          = parse(Int,     get(ENV, "PDL_L_DUAL",        "2500"))
+const WARMUP_EPOCHS   = parse(Int,     get(ENV, "PDL_WARMUP_EPOCHS", "2500"))
+const LR_PRIMAL       = parse(Float64, get(ENV, "PDL_LR_PRIMAL",     "1.0e-4"))
+const LR_DUAL         = parse(Float64, get(ENV, "PDL_LR_DUAL",       "1.0e-4"))
+const LR_DECAY        = parse(Float64, get(ENV, "PDL_LR_DECAY",      "0.99"))
+const BATCH_SIZE      = parse(Int,     get(ENV, "PDL_BATCH_SIZE",    "200"))
+const DATASET_SIZE    = parse(Int,     get(ENV, "PDL_DATASET_SIZE",  "5000"))
+const RHO_INIT        = parse(Float64, get(ENV, "PDL_RHO_INIT",      "10.0"))
+const RHO_MAX         = parse(Float64, get(ENV, "PDL_RHO_MAX",        "1.0e4"))
+const TAU             = parse(Float64, get(ENV, "PDL_TAU",             "0.8"))
+const ALPHA           = parse(Float64, get(ENV, "PDL_ALPHA",           "2.0"))
+const RHO_EQ_SCALE    = parse(Float64, get(ENV, "PDL_RHO_EQ_SCALE",   "1.0"))
+const MAX_DUAL        = parse(Float64, get(ENV, "PDL_MAX_DUAL",        "1.0e6"))
+const HIDDEN_MULT          = parse(Float64, get(ENV, "PDL_HIDDEN_MULT",          "1.2"))
+const USE_PENALTY_ONLY     = parse(Bool,    get(ENV, "PDL_USE_PENALTY_ONLY",     "false"))
+const USE_ANALYTICAL_DUAL  = parse(Bool,    get(ENV, "PDL_USE_ANALYTICAL_DUAL",  "false"))
+const T_F               = Float32
+
+using L2OALM
+using Lux
+using Optimisers
+using MLUtils
+using KernelAbstractions
+using ExaModels
+using BatchNLPKernels
+using CUDA
+using NNlib
+
+using PowerModels
+PowerModels.silence()
+using PGLib
+
+using Random
+using Statistics
+using Dates
+using Printf
+
+import GPUArraysCore: @allowscalar
+const BNK = BatchNLPKernels
+
+# AC-OPF model construction (constraint ordering enforced: inequalities first,
+# equalities last). Reuses test/power.jl.
+include(joinpath(@__DIR__, "..", "test", "power.jl"))
+
+function feed_forward_builder(num_p, num_y, hidden_layers; activation = relu)
+    layer_sizes = [num_p; hidden_layers; num_y]
+    layers = Any[]
+    for i in 1:length(layer_sizes)-1
+        if i < length(layer_sizes) - 1
+            push!(layers, Dense(layer_sizes[i], layer_sizes[i+1], activation))
+        else
+            push!(layers, Dense(layer_sizes[i], layer_sizes[i+1]))
+        end
+    end
+    return Chain(layers...)
+end
+
+function main()
+    rng = Random.default_rng()
+    Random.seed!(rng, 2026_05_14)
+
+    if !CUDA.functional()
+        @warn "CUDA not functional — falling back to CPU. Performance will be poor."
+    end
+
+    backend, dev = CUDA.functional() ?
+        (CUDABackend(), gpu_device()) :
+        (CPU(),         cpu_device())
+
+    @info "Building parametric AC-OPF model" case = "pglib_opf_case57_ieee.m" T = T_F backend
+    model, nbus, ngen, narc, ref_bus_idxs = create_parametric_ac_power_model(
+        "pglib_opf_case57_ieee.m"; backend = backend, T = T_F,
+    )
+    bm_train = BNK.BatchModel(model, BATCH_SIZE,   config = BNK.BatchModelConfig(:full))
+    bm_test  = BNK.BatchModel(model, DATASET_SIZE, config = BNK.BatchModelConfig(:full))
+
+    nvar = model.meta.nvar
+    ncon = model.meta.ncon
+    nθ   = length(model.θ)
+
+    is_equal = model.meta.lcon .== model.meta.ucon
+    num_equal = count(is_equal)
+    @assert all(is_equal[ncon-num_equal+1:ncon])  "constraint ordering: equalities at tail"
+    @assert !any(is_equal[1:ncon-num_equal])      "constraint ordering: inequalities at head"
+    @info "Problem dimensions" nbus ngen narc nvar ncon nθ num_equal
+
+    # Θ: load multipliers ~ 1.0 ± 10%
+    Random.seed!(rng, 0xCA5E5701)
+    Θ_train = (T_F(1.0) .+ T_F(0.1) .* randn(rng, T_F, nθ, DATASET_SIZE)) |> dev
+    Random.seed!(rng, 0xCA5E5702)
+    Θ_test  = (T_F(1.0) .+ T_F(0.1) .* randn(rng, T_F, nθ, DATASET_SIZE)) |> dev
+
+    # Primal network with bound-enforcing head; paper width = 1.2·nvar.
+    h = round(Int, HIDDEN_MULT * nvar)
+    lvar = T_F.(model.meta.lvar)
+    uvar = T_F.(model.meta.uvar)
+    # Build FixRefBus mask on the same device as lvar (GPU when CUDA is available).
+    # FixRefBus.mask must be a CuArray — Lux |> dev only transfers parameters/states,
+    # not data fields of custom AbstractLuxLayer structs.
+    ref_mask = fill!(similar(lvar), one(T_F))
+    @allowscalar for idx in ref_bus_idxs
+        ref_mask[idx] = zero(T_F)
+    end
+    primal_model = Chain(
+        feed_forward_builder(nθ, nvar, [h, h]),
+        BoundedOutput(lvar, uvar),
+        FixRefBus(ref_mask),
+    ) |> dev
+    ps_primal, st_primal = Lux.setup(rng, primal_model)
+    ps_primal = ps_primal |> dev
+    st_primal = st_primal |> dev
+
+    # Dual network: 2-layer MLP outputting ncon multipliers.
+    dual_model = feed_forward_builder(nθ, ncon, [h, h]) |> dev
+    ps_dual, st_dual = Lux.setup(rng, dual_model)
+    ps_dual = ps_dual |> dev
+    st_dual = st_dual |> dev
+
+    train_state_primal = Training.TrainState(primal_model, ps_primal, st_primal, Optimisers.Adam(LR_PRIMAL))
+    train_state_dual   = Training.TrainState(dual_model,   ps_dual,   st_dual,   Optimisers.Adam(LR_DUAL))
+
+    data = DataLoader((Θ_train); batchsize = BATCH_SIZE, shuffle = true) .|> dev
+
+    # Paper hyperparameters for AC-OPF.
+    method = ALMMethod(; batch_model = bm_train, num_equal = num_equal,
+        max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE)
+    trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_INIT)
+
+    method_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal,
+        max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE)
+    test_primal_loss = primal_loss(method_test)
+
+    # ---- CSV logger setup ----
+    log_dir = joinpath(@__DIR__, "..", "outputs")
+    mkpath(log_dir)
+    timestamp = Dates.format(now(), "yyyymmdd_HHMMSS")
+    log_path = joinpath(log_dir, "training_log_case57_$(timestamp).csv")
+    open(log_path, "w") do io
+        println(io, "outer_iter,rho,mean_abs_mu,mean_abs_lambda,",
+                    "mean_violations,max_violation_ineq,max_violation_eq,",
+                    "max_bound_violation,mean_objs,primal_loss,wallclock_s")
+    end
+    @info "CSV log path" log_path
+
+    t0 = time()
+    function eval_and_log(iter::Int, trainer)
+        ps_p = trainer.primal_training_state.parameters
+        st_p = trainer.primal_training_state.states
+        X̂_test, _ = primal_model(Θ_test, ps_p, st_p)
+        Vc_test, Vb_test = BNK.all_violations!(bm_test, X̂_test, Θ_test)
+        objs_test = BNK.objective!(bm_test, X̂_test, Θ_test)
+        dual_hat, _ = trainer.dual_model(Θ_test,
+            trainer.dual_training_state.parameters,
+            trainer.dual_training_state.states)
+        μ = dual_hat[1:end-num_equal, :]
+        λ = dual_hat[end-num_equal+1:end, :]
+        Vc_ineq = Vc_test[1:end-num_equal, :]
+        Vc_eq   = Vc_test[end-num_equal+1:end, :]
+        mean_abs_mu     = mean(abs.(μ))
+        mean_abs_lambda = mean(abs.(λ))
+        mean_viol       = mean(Vc_test)
+        max_viol_ineq   = maximum(Vc_ineq)
+        max_viol_eq     = maximum(Vc_eq)
+        max_bound       = maximum(Vb_test)
+        mean_obj        = mean(objs_test)
+        wallclock       = time() - t0
+        primal_val, _, _ = test_primal_loss(primal_model, ps_p, st_p, (Θ_test, trainer))
+
+        open(log_path, "a") do io
+            @printf(io, "%d,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.3f\n",
+                iter, trainer.ρ, mean_abs_mu, mean_abs_lambda,
+                mean_viol, max_viol_ineq, max_viol_eq, max_bound, mean_obj,
+                Float64(primal_val), wallclock)
+        end
+
+        @info @sprintf("iter %3d", iter) ρ=trainer.ρ max_viol_ineq max_viol_eq mean_viol mean_obj mean_abs_mu mean_abs_lambda
+        return Float64(mean_viol)   # signal for lr-decay: decay only when violations worsen
+    end
+
+    eval_and_log(0, trainer)   # baseline before training
+
+    @info "Starting train!" K L_PRIMAL L_DUAL WARMUP_EPOCHS RHO_INIT RHO_MAX TAU ALPHA RHO_EQ_SCALE MAX_DUAL LR_PRIMAL LR_DUAL HIDDEN_MULT h USE_PENALTY_ONLY USE_ANALYTICAL_DUAL
+    train!(method, trainer, data;
+        K = K, L_primal = L_PRIMAL, L_dual = L_DUAL,
+        warmup_epochs = WARMUP_EPOCHS,
+        lr_primal = LR_PRIMAL, lr_dual = LR_DUAL,
+        lr_decay  = LR_DECAY,
+        use_penalty_only = USE_PENALTY_ONLY,
+        use_analytical_dual = USE_ANALYTICAL_DUAL,
+        eval_fn   = eval_and_log,
+    )
+
+    # ---- Acceptance check ----
+    ps_p = trainer.primal_training_state.parameters
+    st_p = trainer.primal_training_state.states
+    X̂_final, _ = primal_model(Θ_test, ps_p, st_p)
+    Vc_final, Vb_final = BNK.all_violations!(bm_test, X̂_final, Θ_test)
+    max_viol = maximum(Vc_final)
+    max_bound = maximum(Vb_final)
+    @info "FINAL" max_viol max_bound log = log_path
+    @info "Acceptance" passed = (max_viol < 0.01) target = 0.01
+
+    return max_viol
+end
+
+main()
diff --git a/examples/case57_train_twophase.jl b/examples/case57_train_twophase.jl
new file mode 100644
index 0000000..125abf2
--- /dev/null
+++ b/examples/case57_train_twophase.jl
@@ -0,0 +1,261 @@
+# examples/case57_train_twophase.jl
+#
+# Two-phase PDL training for case57 (v26a):
+#
+#   Phase 1 — skip the degenerate phase:
+#     ρ_init = ρ_max = 1e4 (pinned, no growth).  At ρ=1e4 the zero-generation
+#     degenerate min costs 2.88M vs. feasible ~40K — mechanically impossible.
+#     Run K1 outer iterations.  After this phase the network is in a non-degenerate
+#     basin but equalities oscillate at ~2.5 p.u. (reproduces v21a behaviour).
+#
+#   Phase 2 — drive equalities to convergence:
+#     Start from Phase-1 weights (trainer unchanged).  Swap to a new method with
+#     ρ_max=1e6, ρ_eq_scale=10, MAX_DUAL=1e4.
+#     - ρ grows from trainer.ρ (=1e4 at end of Phase 1) up to 1e6.
+#     - ρ_eq_scale=10 makes effective equality ρ up to 1e7, but MAX_DUAL=1e4
+#       caps the gradient at lr × 1e4 = 1.0/step — stable throughout.
+#     - Phase-1 weights serve as a warm start in the non-degenerate basin.
+#     Run K2 outer iterations.
+#
+# The key difference from the user's original reset idea: we do NOT reset ρ back
+# to a small value (which would misalign the dual network).  Instead, Phase 2
+# CONTINUES growing ρ from where Phase 1 left it, but with higher ρ_max and
+# ρ_eq_scale so equalities get additional convergence pressure.
+
+const K1              = parse(Int,     get(ENV, "PDL_K1",             "40"))
+const K2              = parse(Int,     get(ENV, "PDL_K2",             "60"))
+const L_PRIMAL        = parse(Int,     get(ENV, "PDL_L_PRIMAL",       "2500"))
+const L_DUAL          = parse(Int,     get(ENV, "PDL_L_DUAL",         "2500"))
+const WARMUP_EPOCHS   = parse(Int,     get(ENV, "PDL_WARMUP_EPOCHS",  "25000"))
+const LR_PRIMAL       = parse(Float64, get(ENV, "PDL_LR_PRIMAL",      "1.0e-4"))
+const LR_DUAL         = parse(Float64, get(ENV, "PDL_LR_DUAL",        "1.0e-3"))
+const LR_DECAY        = parse(Float64, get(ENV, "PDL_LR_DECAY",       "0.99"))
+const BATCH_SIZE      = parse(Int,     get(ENV, "PDL_BATCH_SIZE",     "200"))
+const DATASET_SIZE    = parse(Int,     get(ENV, "PDL_DATASET_SIZE",   "5000"))
+const TAU             = parse(Float64, get(ENV, "PDL_TAU",             "0.8"))
+const ALPHA           = parse(Float64, get(ENV, "PDL_ALPHA",           "2.0"))
+const HIDDEN_MULT     = parse(Float64, get(ENV, "PDL_HIDDEN_MULT",     "4.0"))
+
+# Phase 1 hyperparams
+const RHO_P1          = parse(Float64, get(ENV, "PDL_RHO_P1",         "10000.0"))
+const MAX_DUAL_P1     = parse(Float64, get(ENV, "PDL_MAX_DUAL_P1",    "10000.0"))
+const RHO_EQ_SCALE_P1 = parse(Float64, get(ENV, "PDL_RHO_EQ_SCALE_P1","1.0"))
+
+# Phase 2 hyperparams
+const RHO_MAX_P2       = parse(Float64, get(ENV, "PDL_RHO_MAX_P2",    "1000000.0"))
+const MAX_DUAL_P2      = parse(Float64, get(ENV, "PDL_MAX_DUAL_P2",   "10000.0"))
+const RHO_EQ_SCALE_P2  = parse(Float64, get(ENV, "PDL_RHO_EQ_SCALE_P2","10.0"))
+
+const T_F               = Float32
+
+using L2OALM
+using Lux
+using Optimisers
+using MLUtils
+using KernelAbstractions
+using ExaModels
+using BatchNLPKernels
+using CUDA
+using NNlib
+
+using PowerModels
+PowerModels.silence()
+using PGLib
+
+using Random
+using Statistics
+using Dates
+using Printf
+
+import GPUArraysCore: @allowscalar
+const BNK = BatchNLPKernels
+
+include(joinpath(@__DIR__, "..", "test", "power.jl"))
+
+function feed_forward_builder(num_p, num_y, hidden_layers; activation = relu)
+    layer_sizes = [num_p; hidden_layers; num_y]
+    layers = Any[]
+    for i in 1:length(layer_sizes)-1
+        if i < length(layer_sizes) - 1
+            push!(layers, Dense(layer_sizes[i], layer_sizes[i+1], activation))
+        else
+            push!(layers, Dense(layer_sizes[i], layer_sizes[i+1]))
+        end
+    end
+    return Chain(layers...)
+end
+
+function main()
+    rng = Random.default_rng()
+    Random.seed!(rng, 2026_05_22)
+
+    if !CUDA.functional()
+        @warn "CUDA not functional — falling back to CPU."
+    end
+
+    backend, dev = CUDA.functional() ?
+        (CUDABackend(), gpu_device()) :
+        (CPU(),         cpu_device())
+
+    @info "Building parametric AC-OPF model" case = "pglib_opf_case57_ieee.m" T = T_F backend
+    model, nbus, ngen, narc, ref_bus_idxs = create_parametric_ac_power_model(
+        "pglib_opf_case57_ieee.m"; backend = backend, T = T_F,
+    )
+    bm_train = BNK.BatchModel(model, BATCH_SIZE,   config = BNK.BatchModelConfig(:full))
+    bm_test  = BNK.BatchModel(model, DATASET_SIZE, config = BNK.BatchModelConfig(:full))
+
+    nvar = model.meta.nvar
+    ncon = model.meta.ncon
+    nθ   = length(model.θ)
+
+    is_equal  = model.meta.lcon .== model.meta.ucon
+    num_equal = count(is_equal)
+    @assert all(is_equal[ncon-num_equal+1:ncon])
+    @assert !any(is_equal[1:ncon-num_equal])
+    @info "Problem dimensions" nbus ngen narc nvar ncon nθ num_equal
+
+    Random.seed!(rng, 0xCA5E5701)
+    Θ_train = (T_F(1.0) .+ T_F(0.1) .* randn(rng, T_F, nθ, DATASET_SIZE)) |> dev
+    Random.seed!(rng, 0xCA5E5702)
+    Θ_test  = (T_F(1.0) .+ T_F(0.1) .* randn(rng, T_F, nθ, DATASET_SIZE)) |> dev
+
+    h = round(Int, HIDDEN_MULT * nvar)
+    lvar = T_F.(model.meta.lvar)
+    uvar = T_F.(model.meta.uvar)
+    ref_mask = fill!(similar(lvar), one(T_F))
+    @allowscalar for idx in ref_bus_idxs
+        ref_mask[idx] = zero(T_F)
+    end
+    primal_model = Chain(
+        feed_forward_builder(nθ, nvar, [h, h]),
+        BoundedOutput(lvar, uvar),
+        FixRefBus(ref_mask),
+    ) |> dev
+    ps_primal, st_primal = Lux.setup(rng, primal_model)
+    ps_primal = ps_primal |> dev
+    st_primal = st_primal |> dev
+
+    dual_model = feed_forward_builder(nθ, ncon, [h, h]) |> dev
+    ps_dual, st_dual = Lux.setup(rng, dual_model)
+    ps_dual = ps_dual |> dev
+    st_dual = st_dual |> dev
+
+    train_state_primal = Training.TrainState(primal_model, ps_primal, st_primal, Optimisers.Adam(LR_PRIMAL))
+    train_state_dual   = Training.TrainState(dual_model,   ps_dual,   st_dual,   Optimisers.Adam(LR_DUAL))
+
+    data = DataLoader((Θ_train); batchsize = BATCH_SIZE, shuffle = true) .|> dev
+
+    # ---- shared logger ----
+    log_dir = joinpath(@__DIR__, "..", "outputs")
+    mkpath(log_dir)
+    timestamp = Dates.format(now(), "yyyymmdd_HHMMSS")
+    log_path = joinpath(log_dir, "training_log_case57_twophase_$(timestamp).csv")
+    open(log_path, "w") do io
+        println(io, "phase,outer_iter,rho,rho_eq_effective,mean_abs_mu,mean_abs_lambda,",
+                    "mean_violations,max_violation_ineq,max_violation_eq,",
+                    "max_bound_violation,mean_objs,primal_loss,wallclock_s")
+    end
+    @info "CSV log path" log_path
+    t0 = time()
+
+    function make_eval_fn(phase::Int, method_test)
+        test_primal_loss = primal_loss(method_test)
+        return function eval_and_log(iter::Int, trainer)
+            ps_p = trainer.primal_training_state.parameters
+            st_p = trainer.primal_training_state.states
+            X̂_test, _ = primal_model(Θ_test, ps_p, st_p)
+            Vc_test, Vb_test = BNK.all_violations!(bm_test, X̂_test, Θ_test)
+            objs_test = BNK.objective!(bm_test, X̂_test, Θ_test)
+            dual_hat, _ = trainer.dual_model(Θ_test,
+                trainer.dual_training_state.parameters,
+                trainer.dual_training_state.states)
+            μ = dual_hat[1:end-num_equal, :]
+            λ = dual_hat[end-num_equal+1:end, :]
+            Vc_ineq = Vc_test[1:end-num_equal, :]
+            Vc_eq   = Vc_test[end-num_equal+1:end, :]
+            mean_abs_mu     = mean(abs.(μ))
+            mean_abs_lambda = mean(abs.(λ))
+            max_viol_ineq   = maximum(Vc_ineq)
+            max_viol_eq     = maximum(Vc_eq)
+            max_bound       = maximum(Vb_test)
+            mean_obj        = mean(objs_test)
+            wallclock       = time() - t0
+            rho_eq_eff      = trainer.ρ * (phase == 1 ? RHO_EQ_SCALE_P1 : RHO_EQ_SCALE_P2)
+            primal_val, _, _ = test_primal_loss(primal_model, ps_p, st_p, (Θ_test, trainer))
+
+            open(log_path, "a") do io
+                @printf(io, "%d,%d,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.3f\n",
+                    phase, iter, trainer.ρ, rho_eq_eff,
+                    mean_abs_mu, mean_abs_lambda,
+                    mean(Vc_test), max_viol_ineq, max_viol_eq, max_bound, mean_obj,
+                    Float64(primal_val), wallclock)
+            end
+
+            @info @sprintf("Phase %d iter %3d", phase, iter) ρ=trainer.ρ ρ_eq=rho_eq_eff max_viol_ineq max_viol_eq mean_obj
+            return Float64(mean(Vc_test))
+        end
+    end
+
+    # ==========================================================================
+    # PHASE 1: fixed ρ = ρ_max = RHO_P1  (skip degenerate phase)
+    # ==========================================================================
+    @info "=== PHASE 1 ===" K1 RHO_P1 MAX_DUAL_P1 RHO_EQ_SCALE_P1
+
+    method_p1 = ALMMethod(; batch_model = bm_train, num_equal = num_equal,
+        max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA,
+        ρ_eq_scale = RHO_EQ_SCALE_P1)
+    method_p1_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal,
+        max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA,
+        ρ_eq_scale = RHO_EQ_SCALE_P1)
+
+    trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_P1)
+
+    eval_p1 = make_eval_fn(1, method_p1_test)
+    eval_p1(0, trainer)   # baseline
+
+    train!(method_p1, trainer, data;
+        K = K1, L_primal = L_PRIMAL, L_dual = L_DUAL,
+        warmup_epochs = WARMUP_EPOCHS,
+        lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY,
+        use_penalty_only = false, use_analytical_dual = true,
+        eval_fn = eval_p1,
+    )
+
+    @info "Phase 1 complete" trainer.ρ trainer.max_violations
+
+    # ==========================================================================
+    # PHASE 2: grow ρ → RHO_MAX_P2 with ρ_eq_scale=RHO_EQ_SCALE_P2
+    # Trainer state (weights + ρ from Phase 1) is preserved.
+    # ==========================================================================
+    @info "=== PHASE 2 ===" K2 RHO_MAX_P2 MAX_DUAL_P2 RHO_EQ_SCALE_P2
+
+    method_p2 = ALMMethod(; batch_model = bm_train, num_equal = num_equal,
+        max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA,
+        ρ_eq_scale = RHO_EQ_SCALE_P2)
+    method_p2_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal,
+        max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA,
+        ρ_eq_scale = RHO_EQ_SCALE_P2)
+
+    eval_p2 = make_eval_fn(2, method_p2_test)
+
+    train!(method_p2, trainer, data;
+        K = K2, L_primal = L_PRIMAL, L_dual = L_DUAL,
+        warmup_epochs = 0,   # no second warmup — primal already in good basin
+        lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY,
+        use_penalty_only = false, use_analytical_dual = true,
+        eval_fn = eval_p2,
+    )
+
+    # ---- Final eval ----
+    ps_p = trainer.primal_training_state.parameters
+    st_p = trainer.primal_training_state.states
+    X̂_final, _ = primal_model(Θ_test, ps_p, st_p)
+    Vc_final, Vb_final = BNK.all_violations!(bm_test, X̂_final, Θ_test)
+    max_viol = maximum(Vc_final)
+    max_bound = maximum(Vb_final)
+    @info "FINAL" max_viol max_bound log = log_path
+    @info "Acceptance" passed = (max_viol < 0.01) target = 0.01
+    return max_viol
+end
+
+main()
diff --git a/scripts/plot_training.jl b/scripts/plot_training.jl
new file mode 100644
index 0000000..a873b1f
--- /dev/null
+++ b/scripts/plot_training.jl
@@ -0,0 +1,49 @@
+# scripts/plot_training.jl
+#
+# Plot a training trajectory CSV produced by examples/case57_train.jl.
+#
+# Usage:
+#   julia --project=examples scripts/plot_training.jl outputs/training_log_case57_<ts>.csv
+#
+# Produces a PNG next to the CSV.
+
+using DelimitedFiles
+using Plots
+gr()
+
+function main(csv_path::String)
+    @assert isfile(csv_path) "CSV not found: $csv_path"
+    data, header = readdlm(csv_path, ',', Float64; header = true)
+    cols = Dict(strip(c) => i for (i, c) in enumerate(header))
+
+    iters = data[:, cols["outer_iter"]]
+    ρ        = data[:, cols["rho"]]
+    mean_μ   = data[:, cols["mean_abs_mu"]]
+    mean_λ   = data[:, cols["mean_abs_lambda"]]
+    mean_v   = data[:, cols["mean_violations"]]
+    max_vi   = data[:, cols["max_violation_ineq"]]
+    max_ve   = data[:, cols["max_violation_eq"]]
+    max_b    = data[:, cols["max_bound_violation"]]
+    mean_o   = data[:, cols["mean_objs"]]
+    primal_l = data[:, cols["primal_loss"]]
+
+    p1 = plot(iters, primal_l, xlabel="outer iter", ylabel="primal AL loss", title="Primal loss", lw=2)
+    p2 = plot(iters, ρ,        xlabel="outer iter", ylabel="ρ",        title="Penalty ρ", lw=2)
+    p3 = plot(iters, [mean_μ mean_λ], xlabel="outer iter",
+        label=["mean |μ|" "mean |λ|"], title="Dual magnitudes", lw=2)
+    p4 = plot(iters, [max_vi max_ve max_b], xlabel="outer iter",
+        label=["max V_ineq" "max V_eq" "max V_bound"],
+        title="Max violations (log)", yscale=:log10, lw=2)
+    p5 = plot(iters, mean_v,   xlabel="outer iter", ylabel="mean violations", title="Mean violations", lw=2)
+    p6 = plot(iters, mean_o,   xlabel="outer iter", ylabel="mean objective", title="Mean cost", lw=2)
+
+    fig = plot(p1, p2, p3, p4, p5, p6, layout=(2, 3), size=(1400, 800), legend=:topright)
+    png_path = replace(csv_path, r"\.csv$" => ".png")
+    savefig(fig, png_path)
+    @info "Saved" png_path
+end
+
+if length(ARGS) < 1
+    error("usage: julia --project=examples scripts/plot_training.jl <csv_path>")
+end
+main(ARGS[1])
diff --git a/src/L2OALM.jl b/src/L2OALM.jl
index ef6836c..fcc90fd 100644
--- a/src/L2OALM.jl
+++ b/src/L2OALM.jl
@@ -7,11 +7,93 @@ using Lux
 using LuxCUDA
 using Lux.Training
 using CUDA
+using NNlib: sigmoid_fast
+using Random: AbstractRNG
 using Statistics
 using ChainRules: @ignore_derivatives
+using Zygote
+
+using Optimisers
 
 export ALMMethod,
-    ALMTrainer, dual_loss, primal_loss, single_train_step!
+    ALMTrainer, BoundedOutput, FixRefBus, dual_loss, primal_loss, penalty_only_primal_loss,
+    analytical_primal_loss, single_train_step!, train!
+
+"""
+    BoundedOutput(lvar, uvar)
+
+Parameterless Lux layer that enforces per-variable bounds on the primal network's
+output. For variables with finite `(lvar[i], uvar[i])` it applies
+
+    yᵢ = lvar[i] + (uvar[i] − lvar[i]) · sigmoid_fast(zᵢ)
+
+so the output is in `(lvar[i], uvar[i])` asymptotically (sigmoid never reaches the
+bounds exactly, but gradient is always nonzero). Variables with an infinite bound
+(e.g., voltage angles) pass through unchanged.
+
+Uses logistic sigmoid_fast (not hardsigmoid) so that generators can always recover from
+near-zero outputs: hardsigmoid has a dead gradient zone at z < -3 that permanently
+traps outputs at the lower bound once entered. Sigmoid'(z) = σ(z)(1-σ(z)) > 0 for
+all z, and Adam's gradient normalization restores steps even at deep saturation.
+"""
+struct BoundedOutput{V<:AbstractVector} <: Lux.AbstractLuxLayer
+    lo::V       # safe lower bound: lvar[i] where finite, 0 elsewhere
+    width::V    # uvar[i] - lvar[i] where both finite, 0 elsewhere
+    mask::V     # 1.0 where both finite, 0.0 elsewhere  (Float for GPU broadcasting)
+end
+
+function BoundedOutput(lvar::AbstractVector, uvar::AbstractVector)
+    @assert length(lvar) == length(uvar) "lvar and uvar must have the same length"
+    T = promote_type(eltype(lvar), eltype(uvar))
+    both = isfinite.(lvar) .& isfinite.(uvar)
+    lo = T.(ifelse.(both, lvar, zero(T)))
+    width = T.(ifelse.(both, uvar .- lvar, zero(T)))
+    mask = T.(both)
+    return BoundedOutput(lo, width, mask)
+end
+
+Lux.initialparameters(::AbstractRNG, ::BoundedOutput) = NamedTuple()
+Lux.initialstates(::AbstractRNG, ::BoundedOutput) = NamedTuple()
+Lux.parameterlength(::BoundedOutput) = 0
+Lux.statelength(::BoundedOutput) = 0
+
+function (b::BoundedOutput)(z::AbstractMatrix, _ps, st::NamedTuple)
+    bounded = b.lo .+ b.width .* sigmoid_fast.(z)
+    y = b.mask .* bounded .+ (one(eltype(b.mask)) .- b.mask) .* z
+    return y, st
+end
+
+"""
+    FixRefBus(mask)
+
+Parameterless Lux layer that forces reference-bus voltage angle(s) to zero by
+multiplying the corresponding rows of the output matrix by zero. This satisfies
+the `va[ref] = 0` equality constraint architecturally, removing it from the
+Lagrangian penalty and freeing network capacity for the remaining 127 variables.
+
+Construct via `FixRefBus(nvar, ref_bus_idxs)` which builds the mask on CPU; the
+mask is transferred to GPU when the Chain is moved with `|> dev`.
+"""
+struct FixRefBus{V<:AbstractVector} <: Lux.AbstractLuxLayer
+    mask::V   # length nvar; 0 at ref bus angle positions, 1 elsewhere
+end
+
+function FixRefBus(nvar::Int, ref_bus_idxs)
+    mask = ones(Float32, nvar)
+    for idx in ref_bus_idxs
+        mask[idx] = 0f0
+    end
+    return FixRefBus(mask)
+end
+
+Lux.initialparameters(::AbstractRNG, ::FixRefBus) = NamedTuple()
+Lux.initialstates(::AbstractRNG, ::FixRefBus) = NamedTuple()
+Lux.parameterlength(::FixRefBus) = 0
+Lux.statelength(::FixRefBus) = 0
+
+function (f::FixRefBus)(X::AbstractMatrix, _ps, st::NamedTuple)
+    return X .* f.mask, st
+end
 
 """
     AbstractL2OMethod
@@ -39,18 +121,22 @@ struct ALMMethod{T<:Real} <: AbstractPrimalDualMethod
     τ::T
     α::T
     num_equal::Int # TODO: There should be a way to get this from the batch model
+    ρ_eq_scale::T  # extra penalty multiplier for equality constraints (default 1.0)
 
-    function ALMMethod(batch_model::BatchModel, max_dual::T, ρmax::T, τ::T, α::T, num_equal::Int) where {T<:Real}
-        new{T}(batch_model, max_dual, ρmax, τ, α, num_equal)
+    function ALMMethod(batch_model::BatchModel, max_dual::T, ρmax::T, τ::T, α::T, num_equal::Int, ρ_eq_scale::T) where {T<:Real}
+        new{T}(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale)
     end
 end
 
 """
-    ALMMethod(; batch_model::BatchModel, num_equal::Int, max_dual::T = 1e6, ρmax::T = 1e6, τ::T = 0.8, α::T = 10.0)
+    ALMMethod(; batch_model, num_equal, max_dual=1e6, ρmax=1e6, τ=0.8, α=10.0, ρ_eq_scale=1.0)
+
+Constructor for the ALM method.
 
-Constructor for the Augmented Lagrangian Method (ALM) for primal-dual Learning to Optimize (L2O) methods.
-This function initializes the ALM method with a batch model, maximum dual variable values, maximum learning rate `ρ`, 
-threshold for the parameter updater `τ`, meta learning rate `α`, and the number of equality constraints.
+`ρ_eq_scale`: extra multiplier applied to the equality-constraint penalty (and dual
+update) relative to the inequality penalty. Values > 1 focus gradient pressure on
+power-balance residuals when those equalities are harder to satisfy than the
+inequality constraints. Default 1.0 reproduces the standard ALM formulation.
 """
 function ALMMethod(;
     batch_model::BatchModel,
@@ -59,8 +145,9 @@ function ALMMethod(;
     ρmax::T = 1e6,
     τ::T = 0.8,
     α::T = 10.0,
+    ρ_eq_scale::T = 1.0,
 ) where {T<:Real}
-    return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal)
+    return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale)
 end
 
 """
@@ -139,6 +226,7 @@ function dual_loss(method::ALMMethod)
     bm = method.batch_model
     max_dual = method.max_dual
     num_equal = method.num_equal
+    ρ_eq_scale = method.ρ_eq_scale
     return (dual_model, ps_dual, st_dual, data) -> begin
         Θ, trainer = data
         ρ = trainer.ρ
@@ -151,21 +239,21 @@ function dual_loss(method::ALMMethod)
         # Get previous dual predictions
         dual_hat_k, _ = @ignore_derivatives dual_model(Θ, prev_dual_training_state.parameters, prev_dual_training_state.states)
 
-        # Separate bound and equality constraints
-        # # Forward pass for primal model
-        X̂, _ = trainer.primal_model(θ, primal_training_state.parameters, primal_training_state.states)
-        gh = constraints!(bm, X̂, Θ)
-
-        @ignore_derivatives gh = trainer.constraints(Θ, trainer.ps_constraints, trainer.st_constraints)
+        # Separate bound and equality constraints (frozen primal — no gradients needed here)
+        X̂_dual, _ = @ignore_derivatives trainer.primal_model(Θ, primal_training_state.parameters, primal_training_state.states)
+        gh = @ignore_derivatives BNK.constraints!(bm, X̂_dual, Θ)
         gh_bound = gh[1:end-num_equal, :]
         gh_equal = gh[end-num_equal+1:end, :]
         dual_hat_bound = dual_hat_k[1:end-num_equal, :]
         dual_hat_equal = dual_hat_k[end-num_equal+1:end, :]
 
-        # Target for dual variables
+        # ALM dual targets:
+        #   μ_{k+1} ← max(μ_k + ρ·g, 0)         (ineq: one-sided clip)
+        #   λ_{k+1} ← λ_k + ρ·ρ_eq_scale·h      (eq: symmetric; scale boosts equality pressure)
+        # The outer clamp is a numerical safeguard against unbounded growth.
         dual_target = vcat(
-            min.(max.(dual_hat_bound + ρ .* gh_bound, 0), max_dual),
-            min.(dual_hat_equal + ρ .* gh_equal, max_dual),
+            clamp.(dual_hat_bound + ρ .* gh_bound, zero(max_dual), max_dual),
+            clamp.(dual_hat_equal + (ρ * ρ_eq_scale) .* gh_equal, -max_dual, max_dual),
         )
 
         loss = mean((dual_hat .- dual_target) .^ 2)
@@ -181,33 +269,60 @@ from current dual predictions for the batch model `bm` under parameters `Θ`.
 """
 function primal_loss(method::ALMMethod)
     bm = method.batch_model
+    num_equal = method.num_equal
+    ρ_eq_scale = method.ρ_eq_scale
     return (model, ps, st, data) -> begin
         Θ, trainer = data
         ρ = trainer.ρ
         num_s = size(Θ, 2)
 
-        # Forward pass for prediction
         X̂, st_new = model(Θ, ps, st)
 
-        # Get current dual predictions
-        dual_hat, _ = @ignore_derivatives trainer.dual_model(Θ, trainer.dual_training_state.parameters, trainer.dual_training_state.states)
-
-        # Calculate violations and objectives
+        # Frozen dual predictions for this primal step.
+        dual_hat, _ = @ignore_derivatives trainer.dual_model(
+            Θ, trainer.dual_training_state.parameters, trainer.dual_training_state.states)
+
+        # Paper PDL formulation (Park & Van Hentenryck 2023):
+        #   L_ρ(y, μ, λ) = f(y) + μᵀg(y) + λᵀh(y) + (ρ/2)·(‖max(g,0)‖² + ‖h‖²)
+        # Lagrangian linear term uses RAW signed g, h from BNK.constraints!
+        # (`max(g,0)`/`|h|` from all_violations! is the wrong quantity here:
+        # μ ≥ 0 with g_i ≤ 0 should contribute *negatively* so the primal keeps
+        # inequalities slack rather than just non-positive).
+        # Penalty term squares Vc, which BNK.all_violations! already computes as
+        # max(g,0) for inequalities and |h| for equalities — squaring gives the
+        # paper's max(g,0)² + h² exactly.
+        # ρ_eq_scale > 1 applies extra pressure on equality violations specifically,
+        # matching the dual_loss update which also uses ρ·ρ_eq_scale for equalities.
         objs = BNK.objective!(bm, X̂, Θ)
-        # gh = constraints!(bm, X̂, Θ)
-        Vc, Vb = BNK.all_violations!(bm, X̂, Θ)
-        V = vcat(Vb, Vc)
-        total_loss = (
-            sum(abs.(dual_hat .* V)) / num_s + ρ / 2 * sum((V) .^ 2) / num_s + mean(objs)
-        )
+        gh   = BNK.constraints!(bm, X̂, Θ)
+
+        # Split by ordering: inequalities first, equalities last (enforced in power.jl).
+        g = gh[1:end-num_equal, :]
+        h = gh[end-num_equal+1:end, :]
+        # Clamp inequality multipliers to μ ≥ 0: the dual network output is unconstrained
+        # so it can predict negative values, which would flip the sign of the μᵀg term and
+        # push the primal TOWARD constraint violations. Clamping restores KKT sign correctness.
+        μ = max.(dual_hat[1:end-num_equal, :], zero(eltype(dual_hat)))
+        λ = dual_hat[end-num_equal+1:end, :]
+
+        # Compute violations from gh directly — avoids a second BNK.constraints! call
+        # through all_violations! (which shares bm.cons_out and corrupts the rrule view).
+        Vc_ineq = max.(g, zero(eltype(g)))
+        Vc_eq   = abs.(h)
+        Vc      = vcat(Vc_ineq, Vc_eq)
+
+        lagrangian_term = (sum(μ .* g) + sum(λ .* h)) / num_s
+        penalty_term    = (ρ / 2) * (sum(Vc_ineq .^ 2) + ρ_eq_scale * sum(Vc_eq .^ 2)) / num_s
+        total_loss      = mean(objs) + lagrangian_term + penalty_term
 
         return total_loss,
         st_new,
         (
-            total_loss = total_loss,
-            mean_violations = mean(V),
-            max_violation = maximum(V),
-            mean_objs = mean(objs),
+            total_loss      = total_loss,
+            mean_violations = mean(Vc),
+            max_violation   = maximum(Vc),
+            max_bound_violation = zero(eltype(Vc)),
+            mean_objs       = mean(objs),
         )
     end
 end
@@ -220,12 +335,14 @@ This function computes the maximum violation, mean violations, mean objectives,
 from the batch states.
 """
 function reconcile_primal(::AbstractL2OTrainer, batch_states::Vector{NamedTuple})
-    max_violation = maximum([s.max_violation for s in batch_states])
+    # Use mean of mean_violations (not max-of-max) as the ρ-update signal.
+    # max-of-max is dominated by noisy hard batches and causes ρ to escalate
+    # before violations have genuinely plateaued, collapsing generators.
     mean_violations = mean([s.mean_violations for s in batch_states])
     mean_objs = mean([s.mean_objs for s in batch_states])
     mean_loss = mean([s.total_loss for s in batch_states])
     return (;
-        max_violation = max_violation,
+        max_violation = mean_violations,   # proxy for ρ criterion: bump only when mean stalls
         mean_violations = mean_violations,
         mean_objs = mean_objs,
         total_loss = mean_loss,
@@ -239,6 +356,7 @@ Reconciles the state of the dual model after processing a batch of data.
 This function computes the mean dual loss from the batch states.
 """
 function reconcile_dual(::AbstractL2OTrainer, batch_states::Vector{NamedTuple})
+    isempty(batch_states) && return (dual_loss = 0.0,)
     return (dual_loss = mean([s.dual_loss for s in batch_states]),)
 end
 
@@ -256,7 +374,7 @@ function update_trainer!(
     dual_state::NamedTuple,
 )
     # Update primal state
-    new_max_violations = primal_state.max_violations
+    new_max_violations = primal_state.max_violation
     trainer.mean_violations = primal_state.mean_violations
     trainer.mean_objs = primal_state.mean_objs
     trainer.total_loss = primal_state.total_loss
@@ -280,7 +398,7 @@ Default stopping criterion for the L2O methods.
 This function checks if the number of iterations has reached a predefined limit (100).
 """
 function _stopping_criterion(iter::Int, ::M, ::N) where {M<:AbstractL2OMethod, N<:AbstractL2OTrainer}
-    return iter >= 100 ? true : false
+    return iter >= 2 ? true : false
 end
 
 """
@@ -318,55 +436,248 @@ updates the trainer state.
 function single_train_step!(
     method::AbstractPrimalDualMethod,
     trainer::AbstractPrimalDualTrainer,
-    data,
+    data;
+    L_primal::Int = 1,
+    L_dual::Int   = 1,
+    use_penalty_only::Bool = false,
+    use_analytical_dual::Bool = false,
 )
-    iter_primal = 1
-    iter_dual = 1
-    num_batches = length(data)
-    current_state_primal = (;)
-    current_state_dual = (;)
-    _primal_loss = primal_loss(method)
-    _dual_loss = dual_loss(method)
+    _primal_loss = if use_penalty_only
+        penalty_only_primal_loss(method)
+    elseif use_analytical_dual
+        analytical_primal_loss(method)
+    else
+        primal_loss(method)
+    end
+    _dual_loss   = dual_loss(method)
     train_state_primal = trainer.primal_training_state
-    train_state_dual = trainer.dual_training_state
-
-    # primal loop
-    while primal_stopping_criterion(iter_primal, method, trainer)
-        current_states_primal = Vector{NamedTuple}(undef, num_batches)
-        iter_batch = 1
-        for (θ) in data
-            _, loss_val, stats, train_state_primal = Training.single_train_step!(
-                AutoZygote(),          # AD backend
-                _primal_loss,  # Loss function
-                (θ, trainer), # Data
-                train_state_primal, # Training state
+    train_state_dual   = trainer.dual_training_state
+
+    # ----- primal phase: exactly L_primal gradient steps, dual frozen -----
+    # Cycle through data batches (reshuffled each epoch) until L_primal steps done.
+    primal_states = NamedTuple[]
+    primal_step = 0
+    while primal_step < L_primal
+        for θ in data
+            primal_step >= L_primal && break
+            _, _, stats, train_state_primal = Training.single_train_step!(
+                AutoZygote(), _primal_loss, (θ, trainer), train_state_primal,
             )
-            current_states_primal[iter_batch] = stats
-            iter_batch += 1
+            push!(primal_states, stats)
+            primal_step += 1
         end
-        current_state_primal = reconcile_primal(trainer, current_states_primal)
-        iter_primal += 1
     end
-
-    # dual loop
-    while dual_stopping_criterion(iter_dual, method, trainer)
-        current_states_dual = Vector{NamedTuple}(undef, num_batches)
-        iter_batch = 1
-        for (θ) in data
-            _, loss_val, stats, train_state_dual = Training.single_train_step!(
-                AutoZygote(),          # AD backend
-                _dual_loss,  # Loss function
-                (θ, trainer), # Data
-                train_state_dual, # Training state
+    trainer.primal_training_state = train_state_primal
+    current_state_primal = reconcile_primal(trainer, primal_states)
+
+    # ----- dual phase: exactly L_dual gradient steps, primal frozen -----
+    dual_states = NamedTuple[]
+    dual_step = 0
+    while dual_step < L_dual
+        for θ in data
+            dual_step >= L_dual && break
+            _, _, stats, train_state_dual = Training.single_train_step!(
+                AutoZygote(), _dual_loss, (θ, trainer), train_state_dual,
             )
-            current_states_dual[iter_batch] = stats
-            iter_batch += 1
+            push!(dual_states, stats)
+            dual_step += 1
+        end
+    end
+    trainer.dual_training_state = train_state_dual
+    update_trainer!(method, trainer, current_state_primal, reconcile_dual(trainer, dual_states))
+    return
+end
+
+"""
+    penalty_only_primal_loss(method::ALMMethod)
+
+Closure that computes a *penalty-only* primal loss (no Lagrangian linear term):
+
+    L(y) = f(y) + (ρ/2) · ‖V(y)‖²
+
+Used for the primal warm-start phase before the alternating loop, so the primal
+network finds a feasible basin against fixed-zero multipliers before the dual
+starts producing nonzero multipliers. Without this, the first few outer
+iterations can be unstable because the dual fits multipliers against random
+primal predictions.
+"""
+function penalty_only_primal_loss(method::ALMMethod)
+    bm = method.batch_model
+    num_equal = method.num_equal
+    ρ_eq_scale = method.ρ_eq_scale
+    return (model, ps, st, data) -> begin
+        Θ, trainer = data
+        ρ = trainer.ρ
+        num_s = size(Θ, 2)
+        X̂, st_new = model(Θ, ps, st)
+        objs = BNK.objective!(bm, X̂, Θ)
+        # Use BNK.constraints! (which has an rrule) so that penalty gradients flow
+        # back through to the network weights. BNK.all_violations! has no rrule and
+        # would give zero penalty gradient, collapsing generators to pmin.
+        gh = BNK.constraints!(bm, X̂, Θ)
+        g  = gh[1:end-num_equal, :]
+        h  = gh[end-num_equal+1:end, :]
+        Vc_ineq = max.(g, zero(eltype(g)))
+        Vc_eq   = abs.(h)
+        Vc      = vcat(Vc_ineq, Vc_eq)
+        penalty_term = (ρ / 2) * (sum(Vc_ineq .^ 2) + ρ_eq_scale * sum(Vc_eq .^ 2)) / num_s
+        total_loss = mean(objs) + penalty_term
+        return total_loss, st_new, (
+            total_loss      = total_loss,
+            mean_violations = mean(Vc),
+            max_violation   = maximum(Vc),
+            max_bound_violation = zero(eltype(Vc)),
+            mean_objs       = mean(objs),
+        )
+    end
+end
+
+"""
+    analytical_primal_loss(method::ALMMethod)
+
+Primal loss that applies the ALM dual update analytically at every gradient step,
+eliminating the dual tracking gap entirely.
+
+Instead of using the dual network's output directly (which lags behind the correct
+multipliers by a 4000× factor at ρmax=10000), this computes the analytically corrected
+multipliers at each step:
+
+    μ_eff = max(μ_net(θ) + ρ·g(y,θ),  0)
+    λ_eff = clamp(λ_net(θ) + ρ·h(y,θ), -max_dual, max_dual)
+
+Then uses these in the augmented Lagrangian:
+
+    L = f(y) + μ_eff·g(y) + λ_eff·h(y)
+
+The penalty term is implicit: μ_eff·g = (μ_net + ρg)·g includes a ρg² term for
+violated constraints, equivalent to the standard ALM penalty. The dual network still
+trains in parallel (providing a warm-start for test-time generalization) but the primal
+gradient is no longer bottlenecked by slow dual convergence.
+"""
+function analytical_primal_loss(method::ALMMethod)
+    bm = method.batch_model
+    num_equal = method.num_equal
+    ρ_eq_scale = method.ρ_eq_scale
+    max_dual = method.max_dual
+    return (model, ps, st, data) -> begin
+        Θ, trainer = data
+        ρ = trainer.ρ
+        num_s = size(Θ, 2)
+
+        X̂, st_new = model(Θ, ps, st)
+
+        objs = BNK.objective!(bm, X̂, Θ)
+        gh   = BNK.constraints!(bm, X̂, Θ)
+
+        g = gh[1:end-num_equal, :]
+        h = gh[end-num_equal+1:end, :]
+
+        # Dual network provides a warm-start base; analytically correct by ρ·gh.
+        # @ignore_derivatives: gradient does NOT flow through dual net params,
+        # but DOES flow through gh (computed from X̂ above), providing the correct
+        # ρ·g and ρ·h gradient terms without any tracking lag.
+        dual_hat, _ = @ignore_derivatives trainer.dual_model(
+            Θ, trainer.dual_training_state.parameters, trainer.dual_training_state.states)
+
+        μ_base = dual_hat[1:end-num_equal, :]
+        λ_base = dual_hat[end-num_equal+1:end, :]
+
+        # Analytically corrected multipliers (one full ALM update applied immediately)
+        μ = clamp.(μ_base .+ ρ .* g,                  zero(max_dual), max_dual)
+        λ = clamp.(λ_base .+ (ρ * ρ_eq_scale) .* h,  -max_dual,      max_dual)
+
+        Vc_ineq = max.(g, zero(eltype(g)))
+        Vc_eq   = abs.(h)
+        Vc      = vcat(Vc_ineq, Vc_eq)
+
+        lagrangian_term = (sum(μ .* g) + sum(λ .* h)) / num_s
+        total_loss      = mean(objs) + lagrangian_term
+
+        return total_loss, st_new, (
+            total_loss      = total_loss,
+            mean_violations = mean(Vc),
+            max_violation   = maximum(Vc),
+            max_bound_violation = zero(eltype(Vc)),
+            mean_objs       = mean(objs),
+        )
+    end
+end
+
+"""
+    train!(method, trainer, data;
+        K=10, L_primal=500, L_dual=500, warmup_epochs=4,
+        lr_primal=1e-4, lr_dual=1e-4, lr_decay=0.99,
+        eval_fn=nothing)
+
+Top-level training loop matching the PDL paper schedule.
+
+- `K`: number of outer alternating iterations.
+- `L_primal` / `L_dual`: inner epochs (passes through `data`) per phase.
+- `warmup_epochs`: number of penalty-only primal epochs before the alternating
+  loop. Set to 0 to disable.
+- `lr_primal`, `lr_dual`: starting Adam learning rates. The trainer's existing
+  TrainStates are mutated via `Optimisers.adjust!` so this overrides whatever
+  was set at construction.
+- `lr_decay`: multiplicative decay applied when validation loss worsens.
+- `eval_fn(iter, trainer) -> Real | nothing`: optional validation callback. If
+  it returns a Real that's larger than the previous, both lrs are multiplied
+  by `lr_decay`.
+"""
+function train!(
+    method::ALMMethod, trainer::ALMTrainer, data;
+    K::Int = 10, L_primal::Int = 500, L_dual::Int = 500,
+    warmup_epochs::Int = 4,
+    lr_primal::Real = 1e-4, lr_dual::Real = 1e-4,
+    lr_decay::Real = 0.99,
+    use_penalty_only::Bool = false,
+    use_analytical_dual::Bool = false,
+    eval_fn = nothing,
+)
+    Optimisers.adjust!(trainer.primal_training_state.optimizer_state, lr_primal)
+    Optimisers.adjust!(trainer.dual_training_state.optimizer_state,   lr_dual)
+    cur_lr_primal = lr_primal
+    cur_lr_dual   = lr_dual
+
+    # ----- warm-start: penalty-only primal (warmup_epochs = gradient steps) -----
+    # Run at ρmax (not ρ_init) so the penalty is strong enough to push the primal
+    # toward feasibility before the dual starts producing multipliers. At ρ_init=10
+    # the penalty gradient is ~1000× too weak to reduce power-balance violations.
+    if warmup_epochs > 0
+        warmup_loss = penalty_only_primal_loss(method)
+        ts_primal = trainer.primal_training_state
+        ρ_saved = trainer.ρ
+        trainer.ρ = method.ρmax     # full penalty during warmup
+        warmup_step = 0
+        while warmup_step < warmup_epochs
+            for θ in data
+                warmup_step >= warmup_epochs && break
+                _, _, _, ts_primal = Training.single_train_step!(
+                    AutoZygote(), warmup_loss, (θ, trainer), ts_primal,
+                )
+                warmup_step += 1
+            end
+        end
+        trainer.primal_training_state = ts_primal
+        trainer.ρ = ρ_saved         # restore initial ρ for ALM schedule
+    end
+
+    # ----- alternating primal/dual loop with optional lr decay -----
+    last_val = typemax(Float64)
+    for outer in 1:K
+        single_train_step!(method, trainer, data; L_primal = L_primal, L_dual = L_dual, use_penalty_only = use_penalty_only, use_analytical_dual = use_analytical_dual)
+        if eval_fn !== nothing
+            v = eval_fn(outer, trainer)
+            if v isa Real
+                if v > last_val
+                    cur_lr_primal *= lr_decay
+                    cur_lr_dual   *= lr_decay
+                    Optimisers.adjust!(trainer.primal_training_state.optimizer_state, cur_lr_primal)
+                    Optimisers.adjust!(trainer.dual_training_state.optimizer_state,   cur_lr_dual)
+                end
+                last_val = v
+            end
         end
-        current_state_dual = reconcile_dual(trainer, current_states_dual)
-        iter_dual += 1
     end
-    # Update trainer state
-    update_trainer!(method, trainer, current_state_primal, current_state_dual)
     return
 end
 
diff --git a/test/power.jl b/test/power.jl
index 6644468..91c038d 100644
--- a/test/power.jl
+++ b/test/power.jl
@@ -78,6 +78,8 @@ function _parse_ac_data_raw(filename; T = Float64)
                     c7 = T((g + g_to)),
                     c8 = T((b + b_to)),
                     rate_a_sq = T(branch["rate_a"]^2),
+                    angmax = T(branch["angmax"]),
+                    angmin = T(branch["angmin"]),
                 )
             end for (i, branch_raw) in ref[:branch]
         ],
@@ -111,7 +113,12 @@ function create_parametric_ac_power_model(
     data = _parse_ac_data(filename, backend, T = T)
     c = ExaCore(T; backend = backend)
 
-    va = variable(c, length(data.bus);)
+    va = variable(
+        c,
+        length(data.bus);
+        lvar = fill!(similar(data.bus, T), -T(π / 2)),
+        uvar = fill!(similar(data.bus, T), T(π / 2)),
+    )
     vm = variable(
         c,
         length(data.bus);
@@ -125,88 +132,117 @@ function create_parametric_ac_power_model(
 
     @allowscalar load_multiplier = parameter(c, [1.0 for b in data.bus])
 
-    p = variable(c, length(data.arc); lvar = -data.rate_a, uvar = data.rate_a)
-    q = variable(c, length(data.arc); lvar = -data.rate_a, uvar = data.rate_a)
-
     o = objective(c, g.cost1 * pg[g.i]^2 + g.cost2 * pg[g.i] + g.cost3 for g in data.gen)
 
-    # Reference bus angle ------------------------------------------------------
-    c1 = constraint(c, va[i] for i in data.ref_buses)
-
-    # Branch power-flow equations ---------------------------------------------
+    # Reduced-space formulation: p and q branch flows are eliminated as decision
+    # variables and expressed analytically from (vm, va) via the AC power flow
+    # equations.  This removes all 4·nbranch branch-flow equality constraints
+    # (which were the dominant cause of max_viol_eq ≈ 1.0 when p/q were free):
+    #
+    #   p_from = c5·vm[f]² + c3·vm[f]·vm[t]·cos(θ) + c4·vm[f]·vm[t]·sin(θ)
+    #   q_from = -c6·vm[f]² - c4·vm[f]·vm[t]·cos(θ) + c3·vm[f]·vm[t]·sin(θ)
+    #   p_to   = c7·vm[t]² + c1·vm[t]·vm[f]·cos(-θ) + c2·vm[t]·vm[f]·sin(-θ)
+    #   q_to   = -c8·vm[t]² - c2·vm[t]·vm[f]·cos(-θ) + c1·vm[t]·vm[f]·sin(-θ)
+    #
+    # Constraints: inequalities first (angle_diff × 2, thermal × 2), then
+    # equalities (ref_bus, power_balance_p, power_balance_q).
+
+    # =========================== INEQUALITIES ===============================
+
+    # Angle difference upper: va_f - va_t ≤ angmax
     constraint(
         c,
-        (
-            p[b.f_idx] - b.c5 * vm[b.f_bus]^2 -
-            b.c3 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) -
-            b.c4 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) for
-            b in data.branch
-        ),
+        va[b.f_bus] - va[b.t_bus] - b.angmax for b in data.branch;
+        lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf),
     )
-
+    # Angle difference lower: angmin ≤ va_f - va_t
     constraint(
         c,
-        (
-            q[b.f_idx] +
-            b.c6 * vm[b.f_bus]^2 +
-            b.c4 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) -
-            b.c3 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) for
-            b in data.branch
-        ),
+        b.angmin - (va[b.f_bus] - va[b.t_bus]) for b in data.branch;
+        lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf),
     )
 
+    # Thermal from: p_from² + q_from² ≤ rate_a²
     constraint(
         c,
         (
-            p[b.t_idx] - b.c7 * vm[b.t_bus]^2 -
-            b.c1 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) -
-            b.c2 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) for
-            b in data.branch
-        ),
+            b.c5 * vm[b.f_bus]^2 +
+            b.c3 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) +
+            b.c4 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus]))
+        )^2 + (
+            -b.c6 * vm[b.f_bus]^2 -
+            b.c4 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) +
+            b.c3 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus]))
+        )^2 - b.rate_a_sq for b in data.branch;
+        lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf),
     )
-
+    # Thermal to: p_to² + q_to² ≤ rate_a²
     constraint(
         c,
         (
-            q[b.t_idx] +
-            b.c8 * vm[b.t_bus]^2 +
-            b.c2 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) -
-            b.c1 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) for
-            b in data.branch
-        ),
+            b.c7 * vm[b.t_bus]^2 +
+            b.c1 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) +
+            b.c2 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus]))
+        )^2 + (
+            -b.c8 * vm[b.t_bus]^2 -
+            b.c2 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) +
+            b.c1 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus]))
+        )^2 - b.rate_a_sq for b in data.branch;
+        lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf),
     )
 
-    # Angle difference limits --------------------------------------------------
-    constraint(
-        c,
-        va[b.f_bus] - va[b.t_bus] for b in data.branch;
-        lcon = data.angmin,
-        ucon = data.angmax,
-    )
+    # ============================ EQUALITIES ================================
 
-    # Apparent power thermal limits -------------------------------------------
-    constraint(
-        c,
-        p[b.f_idx]^2 + q[b.f_idx]^2 - b.rate_a_sq for b in data.branch;
-        lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf),
-    )
-    constraint(
-        c,
-        p[b.t_idx]^2 + q[b.t_idx]^2 - b.rate_a_sq for b in data.branch;
-        lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf),
-    )
+    # Reference bus angle
+    c1 = constraint(c, va[i] for i in data.ref_buses)
 
-    # Power balance at each bus -----------------------------------------------
+    # Power balance: load + shunt + Σ branch flows - Σ generation = 0
     load_balance_p =
         constraint(c, b.pd * load_multiplier[b.i] + b.gs * vm[b.i]^2 for b in data.bus)
     load_balance_q =
         constraint(c, b.qd * load_multiplier[b.i] - b.bs * vm[b.i]^2 for b in data.bus)
 
-    # Map arc & generator variables into the bus balance equations
-    constraint!(c, load_balance_p, a.bus => p[a.i] for a in data.arc)
-    constraint!(c, load_balance_q, a.bus => q[a.i] for a in data.arc)
+    # From-arc p flowing out of f_bus
+    constraint!(
+        c,
+        load_balance_p,
+        b.f_bus => b.c5 * vm[b.f_bus]^2 +
+            b.c3 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) +
+            b.c4 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) for
+        b in data.branch
+    )
+    # To-arc p flowing out of t_bus
+    constraint!(
+        c,
+        load_balance_p,
+        b.t_bus => b.c7 * vm[b.t_bus]^2 +
+            b.c1 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) +
+            b.c2 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) for
+        b in data.branch
+    )
+    # From-arc q flowing out of f_bus
+    constraint!(
+        c,
+        load_balance_q,
+        b.f_bus => -b.c6 * vm[b.f_bus]^2 -
+            b.c4 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) +
+            b.c3 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) for
+        b in data.branch
+    )
+    # To-arc q flowing out of t_bus
+    constraint!(
+        c,
+        load_balance_q,
+        b.t_bus => -b.c8 * vm[b.t_bus]^2 -
+            b.c2 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) +
+            b.c1 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) for
+        b in data.branch
+    )
     constraint!(c, load_balance_p, g.bus => -pg[g.i] for g in data.gen)
     constraint!(c, load_balance_q, g.bus => -qg[g.i] for g in data.gen)
 
-    return ExaModel(c; prod = prod), length(data.bus), length(data.gen), length(data.arc)
+    # ref_bus_idxs: positions in y (1-indexed) of the reference bus voltage angles.
+    # va occupies positions 1..nbus, so ref bus at internal index i → y[i].
+    ref_bus_idxs = Int.(Array(data.ref_buses))   # always CPU — just indices, not model data
+    return ExaModel(c; prod = prod), length(data.bus), length(data.gen), length(data.arc), ref_bus_idxs
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index ef25104..da64f59 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -58,7 +58,7 @@ include("power.jl")
         rng = Random.default_rng(),
         T = Float64,
     )
-        model, nbus, ngen, blines =
+        model, nbus, ngen, blines, _ =
             create_parametric_ac_power_model(filename; backend = backend, T = T)
         bm_train = BNK.BatchModel(model, batch_size, config = BNK.BatchModelConfig(:full))
         bm_test = BNK.BatchModel(model, dataset_size, config = BNK.BatchModelConfig(:full))
@@ -66,12 +66,31 @@ include("power.jl")
         nvar = model.meta.nvar
         ncon = model.meta.ncon
         nθ = length(model.θ)
-        num_equal = nbus * 2
-
-        Θ_train = randn(T, nθ, dataset_size) |> dev_gpu
-        Θ_test = randn(T, nθ, dataset_size) |> dev_gpu
-
-        primal_model = feed_forward_builder(nθ, nvar, [320, 320])
+        # Compute equality count from model metadata — equality iff lcon == ucon.
+        # Constraint ordering in power.jl puts inequalities first, equalities last,
+        # so num_equal is the count of equalities at the tail of the constraint vector.
+        is_equal = model.meta.lcon .== model.meta.ucon
+        num_equal = count(is_equal)
+        @test all(is_equal[ncon-num_equal+1:ncon])   # ordering invariant: equalities at the tail
+        @test !any(is_equal[1:ncon-num_equal])       # ordering invariant: inequalities at the head
+        @info "Constraint counts" ncon num_equal num_inequal=(ncon-num_equal)
+
+        # Paper-style sampling: load multipliers ~ 1.0 ± 10%
+        # (The model declares `parameter(c, [1.0 for b in data.bus])` so Θ is a
+        #  per-bus load multiplier centered at 1.0.)
+        Random.seed!(rng, 0x12345678)
+        Θ_train = (T(1.0) .+ T(0.1) .* randn(rng, T, nθ, dataset_size)) |> dev_gpu
+        Random.seed!(rng, 0xCAFEBABE)
+        Θ_test  = (T(1.0) .+ T(0.1) .* randn(rng, T, nθ, dataset_size)) |> dev_gpu
+
+        # Primal architecture: 2-layer MLP + BoundedOutput head to enforce
+        # variable bounds architecturally (Park & Van Hentenryck 2023 AC-OPF model).
+        lvar = T.(model.meta.lvar)
+        uvar = T.(model.meta.uvar)
+        primal_model = Chain(
+            feed_forward_builder(nθ, nvar, [320, 320]),
+            BoundedOutput(lvar, uvar),
+        ) |> dev_gpu
         ps_primal, st_primal = Lux.setup(rng, primal_model)
         ps_primal = ps_primal |> dev_gpu
         st_primal = st_primal |> dev_gpu
@@ -99,24 +118,40 @@ include("power.jl")
 
         data = DataLoader((Θ_train); batchsize = batch_size, shuffle = true) .|> dev_gpu
 
-        method = ALMMethod(; batch_model=bm_train, num_equal=num_equal)
+        # Paper hyperparameters for AC-OPF (Park & Van Hentenryck 2023, Table for AC-OPF):
+        # α=2 (not 10), ρ_max=1e4 — prevents the augmented Lagrangian from blowing up.
+        method = ALMMethod(; batch_model=bm_train, num_equal=num_equal,
+            max_dual=1.0e6, ρmax=1.0e4, τ=0.8, α=2.0)
         trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual)
-        method_test = ALMMethod(; batch_model=bm_test, num_equal=num_equal)
+        method_test = ALMMethod(; batch_model=bm_test, num_equal=num_equal,
+            max_dual=1.0e6, ρmax=1.0e4, τ=0.8, α=2.0)
         test_primal_loss = primal_loss(method_test)
         test_dual_loss = dual_loss(method_test)
 
-        _, prev_primal_loss_val, _, _ = test_primal_loss(primal_model, ps_primal, st_primal, (Θ_test, trainer))
+        prev_primal_loss_val, _, prev_stats = test_primal_loss(primal_model, ps_primal, st_primal, (Θ_test, trainer))
 
+        primal_loss_val = prev_primal_loss_val
+        dual_loss_val   = zero(prev_primal_loss_val)
+        stats_primal    = prev_stats
+        # L_primal/L_dual now count gradient steps (not epochs); use length(data) per
+        # outer iter to match the old behaviour of one full pass per outer iteration.
+        n_steps = length(data)
         for iter in 1:100
-            single_train_step!(method, trainer, data)
-            # Log
-            _, primal_loss_val, stats_primal, train_state_primal = test_primal_loss(primal_model, ps_primal, st_primal, (Θ_test, trainer))
-            _, dual_loss_val, stats_dual, train_state_dual = test_dual_loss(dual_model, ps_dual, st_dual, (Θ_test, trainer))
-
-            @info "Validation Testset: Iteration $iter" primal_loss_val stats_primal.max_violation stats_primal.mean_violations stats_primal.mean_objs dual_loss_val
+            single_train_step!(method, trainer, data; L_primal=n_steps, L_dual=n_steps)
+            # Log using the trained parameters stored in trainer
+            ps_primal_trained = trainer.primal_training_state.parameters
+            st_primal_trained = trainer.primal_training_state.states
+            ps_dual_trained   = trainer.dual_training_state.parameters
+            st_dual_trained   = trainer.dual_training_state.states
+            primal_loss_val, _, stats_primal = test_primal_loss(primal_model, ps_primal_trained, st_primal_trained, (Θ_test, trainer))
+            dual_loss_val, _, stats_dual     = test_dual_loss(dual_model, ps_dual_trained, st_dual_trained, (Θ_test, trainer))
+
+            @info "Validation Testset: Iteration $iter" primal_loss_val stats_primal.max_violation stats_primal.max_bound_violation stats_primal.mean_violations stats_primal.mean_objs dual_loss_val
         end
-        # Check that the primal loss is decreasing
-        @test primal_loss_val < prev_primal_loss_val
+        # Check that training drove down constraint violations.
+        # The AL loss itself can grow because the trained dual produces large multipliers
+        # against any residual violation; mean_violations is the metric ALM directly targets.
+        @test stats_primal.mean_violations < prev_stats.mean_violations
 
         return
     end

From 45e89de548e4ad17e8f45966bfee0bcbd010d465 Mon Sep 17 00:00:00 2001
From: Khai Nguyen <khai.nx1201@gmail.com>
Date: Fri, 22 May 2026 22:06:14 -0400
Subject: [PATCH 2/5] Move use_analytical_dual and use_dual_learning into
 ALMMethod

Both flags are now ALMMethod fields (defaults: false, true) instead of
train!/single_train_step! kwargs, keeping all method config in one place.

- use_analytical_dual: apply ALM dual update analytically per gradient step
- use_dual_learning: set false to keep the dual network frozen throughout
  (skips dual training loop and the per-outer-iter deepcopy of dual state)

Remove use_penalty_only from single_train_step! (warmup is handled directly
in train! and was never exposed as a kwarg callers needed to set).
---
 examples/case57_train.jl          | 12 +++----
 examples/case57_train_twophase.jl | 10 +++---
 src/L2OALM.jl                     | 58 +++++++++++++++----------------
 3 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/examples/case57_train.jl b/examples/case57_train.jl
index 6ce43f5..d723fbe 100644
--- a/examples/case57_train.jl
+++ b/examples/case57_train.jl
@@ -34,8 +34,8 @@ const ALPHA           = parse(Float64, get(ENV, "PDL_ALPHA",           "2.0"))
 const RHO_EQ_SCALE    = parse(Float64, get(ENV, "PDL_RHO_EQ_SCALE",   "1.0"))
 const MAX_DUAL        = parse(Float64, get(ENV, "PDL_MAX_DUAL",        "1.0e6"))
 const HIDDEN_MULT          = parse(Float64, get(ENV, "PDL_HIDDEN_MULT",          "1.2"))
-const USE_PENALTY_ONLY     = parse(Bool,    get(ENV, "PDL_USE_PENALTY_ONLY",     "false"))
 const USE_ANALYTICAL_DUAL  = parse(Bool,    get(ENV, "PDL_USE_ANALYTICAL_DUAL",  "false"))
+const USE_DUAL_LEARNING    = parse(Bool,    get(ENV, "PDL_USE_DUAL_LEARNING",    "true"))
 const T_F               = Float32
 
 using L2OALM
@@ -145,11 +145,13 @@ function main()
 
     # Paper hyperparameters for AC-OPF.
     method = ALMMethod(; batch_model = bm_train, num_equal = num_equal,
-        max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE)
+        max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE,
+        use_analytical_dual = USE_ANALYTICAL_DUAL, use_dual_learning = USE_DUAL_LEARNING)
     trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_INIT)
 
     method_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal,
-        max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE)
+        max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE,
+        use_analytical_dual = USE_ANALYTICAL_DUAL, use_dual_learning = USE_DUAL_LEARNING)
     test_primal_loss = primal_loss(method_test)
 
     # ---- CSV logger setup ----
@@ -201,14 +203,12 @@ function main()
 
     eval_and_log(0, trainer)   # baseline before training
 
-    @info "Starting train!" K L_PRIMAL L_DUAL WARMUP_EPOCHS RHO_INIT RHO_MAX TAU ALPHA RHO_EQ_SCALE MAX_DUAL LR_PRIMAL LR_DUAL HIDDEN_MULT h USE_PENALTY_ONLY USE_ANALYTICAL_DUAL
+    @info "Starting train!" K L_PRIMAL L_DUAL WARMUP_EPOCHS RHO_INIT RHO_MAX TAU ALPHA RHO_EQ_SCALE MAX_DUAL LR_PRIMAL LR_DUAL HIDDEN_MULT h USE_ANALYTICAL_DUAL USE_DUAL_LEARNING
     train!(method, trainer, data;
         K = K, L_primal = L_PRIMAL, L_dual = L_DUAL,
         warmup_epochs = WARMUP_EPOCHS,
         lr_primal = LR_PRIMAL, lr_dual = LR_DUAL,
         lr_decay  = LR_DECAY,
-        use_penalty_only = USE_PENALTY_ONLY,
-        use_analytical_dual = USE_ANALYTICAL_DUAL,
         eval_fn   = eval_and_log,
     )
 
diff --git a/examples/case57_train_twophase.jl b/examples/case57_train_twophase.jl
index 125abf2..7b5bfb7 100644
--- a/examples/case57_train_twophase.jl
+++ b/examples/case57_train_twophase.jl
@@ -203,10 +203,10 @@ function main()
 
     method_p1 = ALMMethod(; batch_model = bm_train, num_equal = num_equal,
         max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA,
-        ρ_eq_scale = RHO_EQ_SCALE_P1)
+        ρ_eq_scale = RHO_EQ_SCALE_P1, use_analytical_dual = true)
     method_p1_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal,
         max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA,
-        ρ_eq_scale = RHO_EQ_SCALE_P1)
+        ρ_eq_scale = RHO_EQ_SCALE_P1, use_analytical_dual = true)
 
     trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_P1)
 
@@ -217,7 +217,6 @@ function main()
         K = K1, L_primal = L_PRIMAL, L_dual = L_DUAL,
         warmup_epochs = WARMUP_EPOCHS,
         lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY,
-        use_penalty_only = false, use_analytical_dual = true,
         eval_fn = eval_p1,
     )
 
@@ -231,10 +230,10 @@ function main()
 
     method_p2 = ALMMethod(; batch_model = bm_train, num_equal = num_equal,
         max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA,
-        ρ_eq_scale = RHO_EQ_SCALE_P2)
+        ρ_eq_scale = RHO_EQ_SCALE_P2, use_analytical_dual = true)
     method_p2_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal,
         max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA,
-        ρ_eq_scale = RHO_EQ_SCALE_P2)
+        ρ_eq_scale = RHO_EQ_SCALE_P2, use_analytical_dual = true)
 
     eval_p2 = make_eval_fn(2, method_p2_test)
 
@@ -242,7 +241,6 @@ function main()
         K = K2, L_primal = L_PRIMAL, L_dual = L_DUAL,
         warmup_epochs = 0,   # no second warmup — primal already in good basin
         lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY,
-        use_penalty_only = false, use_analytical_dual = true,
         eval_fn = eval_p2,
     )
 
diff --git a/src/L2OALM.jl b/src/L2OALM.jl
index fcc90fd..341fd19 100644
--- a/src/L2OALM.jl
+++ b/src/L2OALM.jl
@@ -122,9 +122,11 @@ struct ALMMethod{T<:Real} <: AbstractPrimalDualMethod
     α::T
     num_equal::Int # TODO: There should be a way to get this from the batch model
     ρ_eq_scale::T  # extra penalty multiplier for equality constraints (default 1.0)
+    use_analytical_dual::Bool  # apply ALM dual update analytically per gradient step
+    use_dual_learning::Bool    # train the dual network; false = frozen dual throughout
 
-    function ALMMethod(batch_model::BatchModel, max_dual::T, ρmax::T, τ::T, α::T, num_equal::Int, ρ_eq_scale::T) where {T<:Real}
-        new{T}(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale)
+    function ALMMethod(batch_model::BatchModel, max_dual::T, ρmax::T, τ::T, α::T, num_equal::Int, ρ_eq_scale::T, use_analytical_dual::Bool, use_dual_learning::Bool) where {T<:Real}
+        new{T}(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale, use_analytical_dual, use_dual_learning)
     end
 end
 
@@ -146,8 +148,10 @@ function ALMMethod(;
     τ::T = 0.8,
     α::T = 10.0,
     ρ_eq_scale::T = 1.0,
+    use_analytical_dual::Bool = false,
+    use_dual_learning::Bool = true,
 ) where {T<:Real}
-    return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale)
+    return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale, use_analytical_dual, use_dual_learning)
 end
 
 """
@@ -386,7 +390,9 @@ function update_trainer!(
 
     # Update dual state
     trainer.dual_loss = dual_state.dual_loss
-    trainer.prev_dual_training_state = deepcopy(trainer.dual_training_state)
+    if method.use_dual_learning
+        trainer.prev_dual_training_state = deepcopy(trainer.dual_training_state)
+    end
 
     return
 end
@@ -434,27 +440,17 @@ with the dual model fixed, then the inverse is done with the dual learning metho
 updates the trainer state.
 """
 function single_train_step!(
-    method::AbstractPrimalDualMethod,
-    trainer::AbstractPrimalDualTrainer,
+    method::ALMMethod,
+    trainer::ALMTrainer,
     data;
     L_primal::Int = 1,
     L_dual::Int   = 1,
-    use_penalty_only::Bool = false,
-    use_analytical_dual::Bool = false,
 )
-    _primal_loss = if use_penalty_only
-        penalty_only_primal_loss(method)
-    elseif use_analytical_dual
-        analytical_primal_loss(method)
-    else
-        primal_loss(method)
-    end
-    _dual_loss   = dual_loss(method)
+    _primal_loss = method.use_analytical_dual ? analytical_primal_loss(method) : primal_loss(method)
     train_state_primal = trainer.primal_training_state
     train_state_dual   = trainer.dual_training_state
 
     # ----- primal phase: exactly L_primal gradient steps, dual frozen -----
-    # Cycle through data batches (reshuffled each epoch) until L_primal steps done.
     primal_states = NamedTuple[]
     primal_step = 0
     while primal_step < L_primal
@@ -471,19 +467,23 @@ function single_train_step!(
     current_state_primal = reconcile_primal(trainer, primal_states)
 
     # ----- dual phase: exactly L_dual gradient steps, primal frozen -----
+    # Skipped entirely when use_dual_learning = false (dual network stays frozen).
     dual_states = NamedTuple[]
-    dual_step = 0
-    while dual_step < L_dual
-        for θ in data
-            dual_step >= L_dual && break
-            _, _, stats, train_state_dual = Training.single_train_step!(
-                AutoZygote(), _dual_loss, (θ, trainer), train_state_dual,
-            )
-            push!(dual_states, stats)
-            dual_step += 1
+    if method.use_dual_learning
+        _dual_loss = dual_loss(method)
+        dual_step = 0
+        while dual_step < L_dual
+            for θ in data
+                dual_step >= L_dual && break
+                _, _, stats, train_state_dual = Training.single_train_step!(
+                    AutoZygote(), _dual_loss, (θ, trainer), train_state_dual,
+                )
+                push!(dual_states, stats)
+                dual_step += 1
+            end
         end
+        trainer.dual_training_state = train_state_dual
     end
-    trainer.dual_training_state = train_state_dual
     update_trainer!(method, trainer, current_state_primal, reconcile_dual(trainer, dual_states))
     return
 end
@@ -629,8 +629,6 @@ function train!(
     warmup_epochs::Int = 4,
     lr_primal::Real = 1e-4, lr_dual::Real = 1e-4,
     lr_decay::Real = 0.99,
-    use_penalty_only::Bool = false,
-    use_analytical_dual::Bool = false,
     eval_fn = nothing,
 )
     Optimisers.adjust!(trainer.primal_training_state.optimizer_state, lr_primal)
@@ -664,7 +662,7 @@ function train!(
     # ----- alternating primal/dual loop with optional lr decay -----
     last_val = typemax(Float64)
     for outer in 1:K
-        single_train_step!(method, trainer, data; L_primal = L_primal, L_dual = L_dual, use_penalty_only = use_penalty_only, use_analytical_dual = use_analytical_dual)
+        single_train_step!(method, trainer, data; L_primal = L_primal, L_dual = L_dual)
         if eval_fn !== nothing
             v = eval_fn(outer, trainer)
             if v isa Real

From b0304f04670071d1eab2ca31afe0edec8501a557 Mon Sep 17 00:00:00 2001
From: Khai Nguyen <khai.nx1201@gmail.com>
Date: Fri, 22 May 2026 22:31:49 -0400
Subject: [PATCH 3/5] Default use_analytical_dual to true (best config from
 experiments)

---
 examples/case57_train_twophase.jl | 8 ++++----
 src/L2OALM.jl                     | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/case57_train_twophase.jl b/examples/case57_train_twophase.jl
index 7b5bfb7..cc81cb0 100644
--- a/examples/case57_train_twophase.jl
+++ b/examples/case57_train_twophase.jl
@@ -203,10 +203,10 @@ function main()
 
     method_p1 = ALMMethod(; batch_model = bm_train, num_equal = num_equal,
         max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA,
-        ρ_eq_scale = RHO_EQ_SCALE_P1, use_analytical_dual = true)
+        ρ_eq_scale = RHO_EQ_SCALE_P1)
     method_p1_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal,
         max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA,
-        ρ_eq_scale = RHO_EQ_SCALE_P1, use_analytical_dual = true)
+        ρ_eq_scale = RHO_EQ_SCALE_P1)
 
     trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_P1)
 
@@ -230,10 +230,10 @@ function main()
 
     method_p2 = ALMMethod(; batch_model = bm_train, num_equal = num_equal,
         max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA,
-        ρ_eq_scale = RHO_EQ_SCALE_P2, use_analytical_dual = true)
+        ρ_eq_scale = RHO_EQ_SCALE_P2)
     method_p2_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal,
         max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA,
-        ρ_eq_scale = RHO_EQ_SCALE_P2, use_analytical_dual = true)
+        ρ_eq_scale = RHO_EQ_SCALE_P2)
 
     eval_p2 = make_eval_fn(2, method_p2_test)
 
diff --git a/src/L2OALM.jl b/src/L2OALM.jl
index 341fd19..6d52796 100644
--- a/src/L2OALM.jl
+++ b/src/L2OALM.jl
@@ -131,7 +131,7 @@ struct ALMMethod{T<:Real} <: AbstractPrimalDualMethod
 end
 
 """
-    ALMMethod(; batch_model, num_equal, max_dual=1e6, ρmax=1e6, τ=0.8, α=10.0, ρ_eq_scale=1.0)
+    ALMMethod(; batch_model, num_equal, max_dual=1e6, ρmax=1e6, τ=0.8, α=10.0, ρ_eq_scale=1.0, use_analytical_dual=true, use_dual_learning=true)
 
 Constructor for the ALM method.
 
@@ -148,7 +148,7 @@ function ALMMethod(;
     τ::T = 0.8,
     α::T = 10.0,
     ρ_eq_scale::T = 1.0,
-    use_analytical_dual::Bool = false,
+    use_analytical_dual::Bool = true,
     use_dual_learning::Bool = true,
 ) where {T<:Real}
     return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale, use_analytical_dual, use_dual_learning)

From d13bc9b65cd1a1624e02656958a4608f9d1053b5 Mon Sep 17 00:00:00 2001
From: Khai Nguyen <khai.nx1201@gmail.com>
Date: Fri, 22 May 2026 22:41:32 -0400
Subject: [PATCH 4/5] Write README: installation, quick start, API reference,
 benchmark results

---
 README.md | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 191 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index a9fa2de..9d9a179 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,197 @@
 # L2OALM.jl
 
-Learning To Optimize using the Augmented Lagrangian Primal-Dual Method.
+Julia implementation of **Primal-Dual Learning (PDL)** for parametric constrained optimization, following Park & Van Hentenryck, *"Self-Supervised Primal-Dual Learning for Constrained Optimization"* (AAAI 2023).
 
-[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://LearningToOptimize.github.io/L2OALM.jl/stable/)
-[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://LearningToOptimize.github.io/L2OALM.jl/dev/)
 [![Build Status](https://github.com/LearningToOptimize/L2OALM.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/LearningToOptimize/L2OALM.jl/actions/workflows/CI.yml?query=branch%3Amain)
 [![Coverage](https://codecov.io/gh/LearningToOptimize/L2OALM.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/LearningToOptimize/L2OALM.jl)
 
+---
+
+## What it does
+
+Given a parametric program
+
+```
+min   f(y; θ)
+ y
+s.t.  g(y; θ) ≤ 0      [inequalities]
+      h(y; θ) = 0      [equalities]
+      lvar ≤ y ≤ uvar  [bounds]
+```
+
+PDL trains two networks jointly — **self-supervised**, no solver in the loop:
+
+| Network | Maps | Role |
+|---------|------|------|
+| Primal `ŷ(θ; φ)` | parameters → decisions | Produces a near-feasible, near-optimal solution |
+| Dual `λ̂(θ; ψ)` | parameters → multipliers | Predicts Lagrange multipliers (μ for ineq, λ for eq) |
+
+Training mimics the Augmented Lagrangian Method (ALM): alternate between minimizing the augmented Lagrangian over `φ` and regressing `ψ` onto the ALM multiplier update.
+
+---
+
+## Installation
+
+```julia
+using Pkg
+Pkg.add(url="https://github.com/LearningToOptimize/L2OALM.jl")
+```
+
+`BatchNLPKernels` is a required dependency pinned to its GitHub `main` branch and is resolved automatically.
+
+---
+
+## Quick start
+
+```julia
+using L2OALM, Lux, Optimisers, MLUtils, BatchNLPKernels
+
+# 1. Build a BatchModel wrapping your ExaModel (see test/power.jl for AC-OPF)
+bm_train = BNK.BatchModel(model, batch_size, config=BNK.BatchModelConfig(:full))
+bm_test  = BNK.BatchModel(model, test_size,  config=BNK.BatchModelConfig(:full))
+
+# 2. Define primal and dual networks
+primal_net = Chain(Dense(nθ, 512, relu), Dense(512, 512, relu), Dense(512, nvar),
+                   BoundedOutput(lvar, uvar))   # enforces bounds architecturally
+dual_net   = Chain(Dense(nθ, 512, relu), Dense(512, 512, relu), Dense(512, ncon))
+
+ps_p, st_p = Lux.setup(rng, primal_net)
+ps_d, st_d = Lux.setup(rng, dual_net)
+
+# 3. Configure the ALM method
+method = ALMMethod(;
+    batch_model         = bm_train,
+    num_equal           = num_equal,     # number of equality constraints (tail of constraint vec)
+    ρmax                = 1e4,
+    max_dual            = 1e6,
+    τ                   = 0.8,
+    α                   = 2.0,
+    use_analytical_dual = true,          # apply ALM update analytically per gradient step (recommended)
+    use_dual_learning   = true,          # train the dual network (recommended)
+)
+
+trainer = ALMTrainer(primal_net,
+                     Training.TrainState(primal_net, ps_p, st_p, Optimisers.Adam(1e-4)),
+                     dual_net,
+                     Training.TrainState(dual_net,   ps_d, st_d, Optimisers.Adam(1e-3)))
+
+data = DataLoader(Θ_train; batchsize=200, shuffle=true)
+
+# 4. Train
+train!(method, trainer, data;
+    K             = 100,       # outer ALM iterations
+    L_primal      = 2500,      # gradient steps per primal phase
+    L_dual        = 2500,      # gradient steps per dual phase
+    warmup_epochs = 25000,     # penalty-only warm-start steps before ALM loop
+    lr_primal     = 1e-4,
+    lr_dual       = 1e-3,
+    lr_decay      = 0.99,
+)
+```
+
+---
+
+## Key components
+
+### `BoundedOutput(lvar, uvar)`
+
+Lux layer that enforces variable bounds architecturally using sigmoid:
+
+```
+yᵢ = lvar[i] + (uvar[i] − lvar[i]) · σ(zᵢ)
+```
+
+Guarantees `max_bound_violation ≡ 0` by construction. Uses sigmoid (not hardsigmoid) so gradient is always nonzero — hardsigmoid's zero-gradient zone at `z < −3` permanently traps outputs at the lower bound.
+
+### `FixRefBus(nvar, ref_bus_idxs)`
+
+Lux layer that zeroes the reference-bus voltage angle architecturally. GPU-safe — mask lives on the same device as the network.
+
+### `ALMMethod`
+
+Immutable configuration struct. Key fields:
+
+| Field | Default | Description |
+|-------|---------|-------------|
+| `ρmax` | `1e6` | Maximum penalty parameter |
+| `max_dual` | `1e6` | Multiplier clip bound |
+| `τ` | `0.8` | Violation ratio threshold for ρ update |
+| `α` | `10.0` | ρ growth factor |
+| `ρ_eq_scale` | `1.0` | Extra penalty multiplier for equalities only |
+| `use_analytical_dual` | `true` | Apply ALM dual update per gradient step |
+| `use_dual_learning` | `true` | Train the dual network |
+
+**`use_analytical_dual`**: instead of using the dual network output directly, computes analytically-corrected multipliers at every primal gradient step:
+```
+μ_eff = clamp(μ̂(θ) + ρ·g(ŷ),  0,  max_dual)
+λ_eff = clamp(λ̂(θ) + ρ·h(ŷ), −max_dual, max_dual)
+```
+Eliminates the dual tracking gap (the dual network lags the true multipliers by orders of magnitude at high ρ without this correction).
+
+**`use_dual_learning`**: set `false` to freeze the dual network — useful for ablations where you want to isolate the effect of the analytical correction alone.
+
+### `ALMTrainer`
+
+Mutable state struct holding both networks, their `TrainState`s, the current penalty `ρ`, and a snapshot of the previous dual state used for computing ALM targets.
+
+### `train!(method, trainer, data; ...)`
+
+Top-level training loop:
+1. **Warm-start** (`warmup_epochs` gradient steps): penalty-only primal loss at `ρmax` to push the network into a feasible basin before the dual starts producing meaningful multipliers.
+2. **K outer iterations**: alternate primal phase (`L_primal` steps) and dual phase (`L_dual` steps); update ρ if violations stagnate; optional `eval_fn` callback with learning-rate decay.
+
+---
+
+## Constraint ordering
+
+Constraints must be ordered as **inequalities first, equalities last**. `num_equal` is the count of equalities at the tail of the constraint vector. See `test/power.jl` for how to build a compliant `ExaModel` for AC-OPF.
+
+---
+
+## AC-OPF benchmark (case57)
+
+Results on `pglib_opf_case57_ieee` (128 variables, 435 constraints: 320 ineq + 115 eq), 5000 held-out test samples:
+
+| Config | max_eq | max_ineq | Notes |
+|--------|--------|----------|-------|
+| `use_analytical_dual=true`, ρmax=1e4, max_dual=1e6 | **1.165** | 0.000 | Best result |
+| `use_analytical_dual=false` (dual network only) | ~1.25 | 0.000 | Tracking gap hurts |
+| ρmax=1e6, max_dual=1e6 | 1.741 | 0.000 | Oscillates at high ρ |
+| max_dual=1e4 (any ρ schedule) | ~1.80 | 0.000 | Gradient saturates at 1.0/step |
+
+Variable bounds and inequality constraints are satisfied exactly (`max_bound = max_ineq = 0`) in all runs by iter ~10.
+
+---
+
+## Running tests
+
+```bash
+# CPU
+julia --project=. -e 'using Pkg; Pkg.test()'
+
+# GPU (CUDA)
+BNK_TEST_CUDA=1 julia --project=. -e 'using Pkg; Pkg.test()'
+```
+
+The test uses `pglib_opf_case14_ieee` (downloaded automatically via the `PGLib` artifact on first run).
+
+---
+
+## Examples
+
+- [`examples/case57_train.jl`](examples/case57_train.jl) — single-phase training on case57; all hyperparameters overridable via `PDL_*` env vars.
+- [`examples/case57_train_twophase.jl`](examples/case57_train_twophase.jl) — two-phase training: Phase 1 at fixed high ρ (avoids degenerate pg=0 basin), Phase 2 grows ρ with `ρ_eq_scale`.
+
+---
+
+## Reference
+
+```bibtex
+@inproceedings{park2023pdl,
+  title     = {Self-Supervised Primal-Dual Learning for Constrained Optimization},
+  author    = {Park, Seonho and Van Hentenryck, Pascal},
+  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
+  year      = {2023},
+  url       = {https://arxiv.org/abs/2208.09046}
+}
+```

From b4333c462c6740fd9cdd6d1ceca1d95c97ba0d5c Mon Sep 17 00:00:00 2001
From: Andrew Rosemberg <andrewrosemberg@gmail.com>
Date: Thu, 28 May 2026 11:22:08 -0400
Subject: [PATCH 5/5] Modify CI.yml to update Go versions

Updated the CI workflow to include version 1.12 and removed older versions.
---
 .github/workflows/CI.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index dca863e..2d27e8c 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -23,9 +23,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.10'
-          - '1.6'
-          - 'pre'
+          - '1.12'
         os:
           - ubuntu-latest
         arch: