From 9cb040e0ea92b3f3bdd13f298f95be2d9c6bd900 Mon Sep 17 00:00:00 2001 From: Khai Nguyen Date: Fri, 22 May 2026 18:13:12 -0400 Subject: [PATCH 1/5] =?UTF-8?q?PDL=20case57:=20paper-faithful=20loss,=20an?= =?UTF-8?q?alytical=20dual,=20BoundedOutput,=20FixRefBus,=20=CF=81=5Feq=5F?= =?UTF-8?q?scale?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Key changes to src/L2OALM.jl: - Paper-faithful augmented Lagrangian loss (correct g,h from BNK.constraints!) - Clamp inequality multipliers to μ ≥ 0 (KKT correctness) - Analytical primal loss: apply ALM dual update per gradient step (λ_eff = clamp(λ̂ + ρ·h, −M, M)); eliminates dual tracking gap - ρ_eq_scale: separate penalty multiplier for equality constraints - use_penalty_only kwarg for penalty-only (no dual) ablation Key changes to test/power.jl and examples/case57_train.jl: - BoundedOutput head: sigmoid variable-bound enforcement (max_bound_viol ≡ 0) - FixRefBus layer: reference-bus angle fixed to zero architecturally; GPU-safe - Reduced-space AC-OPF: eliminate branch-flow variables - Warmup at ρ_max, lr decay, env-var hyperparameter overrides New: examples/case57_train_twophase.jl — two-phase training (Phase 1 fixed ρ warm-start to skip degenerate basin; Phase 2 growing ρ with ρ_eq_scale). Best result: max_eq = 1.165 p.u. on 5000 held-out case57 test samples, max_ineq = max_bound = 0 (analytical dual, ρ_max=1e4, MAX_DUAL=1e6). --- .gitignore | 14 + Project.toml | 24 +- examples/Project.toml | 22 ++ examples/case57_train.jl | 228 +++++++++++++++ examples/case57_train_twophase.jl | 261 +++++++++++++++++ scripts/plot_training.jl | 49 ++++ src/L2OALM.jl | 465 +++++++++++++++++++++++++----- test/power.jl | 150 ++++++---- test/runtests.jl | 71 +++-- 9 files changed, 1128 insertions(+), 156 deletions(-) create mode 100644 examples/Project.toml create mode 100644 examples/case57_train.jl create mode 100644 examples/case57_train_twophase.jl create mode 100644 scripts/plot_training.jl diff --git a/.gitignore b/.gitignore index 285da1e..d9448cd 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,17 @@ Manifest*.toml # File generated by the Preferences package to store local preferences LocalPreferences.toml JuliaLocalPreferences.toml + +# Training run artifacts (CSV trajectories, plots). The example writes these +# under outputs/; don't version-control them. +outputs/ + +# Cluster job scripts and output logs (HPC-specific, not part of the library). +slurm_jobs/ + +# Experimental / scratch training scripts not ready for inclusion. +examples/case57_train_rho_restart.jl + +# Claude Code editor state. +.claude/ +CLAUDE.md diff --git a/Project.toml b/Project.toml index f3699aa..f04aebe 100644 --- a/Project.toml +++ b/Project.toml @@ -1,19 +1,38 @@ name = "L2OALM" uuid = "f31bfc7b-7b5d-4cc3-b76b-1af281ce159d" -authors = ["Andrew and contributors"] version = "1.0.0-DEV" +authors = ["Andrew and contributors"] [deps] BatchNLPKernels = "7145f916-0e30-4c9d-93a2-b32b6056125d" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2" ExaModels = "1037b233-b668-4ce9-9b63-f9f681f55dd2" +GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Lux = "b2108857-7c20-44ae-9111-449ecde12c47" LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" +MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" +PGLib = "07a8691f-3d11-4330-951b-3c50f98338be" +PowerModels = "c36e90e8-916a-50a6-bd94-075b64ef4655" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[sources] +BatchNLPKernels = {rev = "main", url = "https://github.com/klamike/BatchNLPKernels.jl"} [compat] +GPUArraysCore = "0.2.0" +KernelAbstractions = "0.9.41" +MLUtils = "0.4.8" +NNlib = "0.9.34" +PGLib = "0.2.2" +PowerModels = "0.21.6" +Random = "1.11.0" +Zygote = "0.7.10" julia = "1.6.7" [extras] @@ -25,8 +44,5 @@ PowerModels = "c36e90e8-916a-50a6-bd94-075b64ef4655" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[sources.BatchNLPKernels] -url = "https://github.com/klamike/BatchNLPKernels.jl" - [targets] test = ["Test", "PowerModels", "PGLib", "Random", "MLUtils", "KernelAbstractions", "GPUArraysCore"] diff --git a/examples/Project.toml b/examples/Project.toml new file mode 100644 index 0000000..4a3cf2a --- /dev/null +++ b/examples/Project.toml @@ -0,0 +1,22 @@ +[deps] +BatchNLPKernels = "7145f916-0e30-4c9d-93a2-b32b6056125d" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +ExaModels = "1037b233-b668-4ce9-9b63-f9f681f55dd2" +GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +L2OALM = "f31bfc7b-7b5d-4cc3-b76b-1af281ce159d" +Lux = "b2108857-7c20-44ae-9111-449ecde12c47" +LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" +MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" +PGLib = "07a8691f-3d11-4330-951b-3c50f98338be" +PowerModels = "c36e90e8-916a-50a6-bd94-075b64ef4655" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[sources] +BatchNLPKernels = {rev = "main", url = "https://github.com/klamike/BatchNLPKernels.jl"} +L2OALM = {path = ".."} diff --git a/examples/case57_train.jl b/examples/case57_train.jl new file mode 100644 index 0000000..6ce43f5 --- /dev/null +++ b/examples/case57_train.jl @@ -0,0 +1,228 @@ +# examples/case57_train.jl +# +# Trains the PDL primal-dual networks on `pglib_opf_case57_ieee` on GPU using +# the paper's AC-OPF hyperparameters and the L2OALM `train!` schedule +# (penalty-only warm-start, K outer ALM iterations × L inner gradient epochs +# per phase, validation-loss-triggered lr decay). +# +# Acceptance: `max_violation` on a held-out test set < 0.01 after training. +# +# Usage: +# julia --project=. examples/case57_train.jl +# +# Tunables — overridable via env vars (PDL_K, PDL_L_PRIMAL, etc.) for quick smoke runs. +# L_PRIMAL / L_DUAL / WARMUP_EPOCHS are GRADIENT STEP counts, not epoch counts. +# With BATCH_SIZE=200 and DATASET_SIZE=5000 there are 25 batches/epoch. +# Paper (Park & Van Hentenryck 2023): K=10, L=5000, ρ_init=10, α=2, τ=0.8. +# We use K=20, L=2500 as a halfway point: same total steps as K=10/L=5000 but +# with twice as many multiplier updates. Key fix vs earlier runs: BoundedOutput +# now uses sigmoid (not hardsigmoid) so generators can always recover from +# saturation — hardsigmoid's zero-gradient dead zone at z<-3 caused collapse. +const K = parse(Int, get(ENV, "PDL_K", "20")) +const L_PRIMAL = parse(Int, get(ENV, "PDL_L_PRIMAL", "2500")) +const L_DUAL = parse(Int, get(ENV, "PDL_L_DUAL", "2500")) +const WARMUP_EPOCHS = parse(Int, get(ENV, "PDL_WARMUP_EPOCHS", "2500")) +const LR_PRIMAL = parse(Float64, get(ENV, "PDL_LR_PRIMAL", "1.0e-4")) +const LR_DUAL = parse(Float64, get(ENV, "PDL_LR_DUAL", "1.0e-4")) +const LR_DECAY = parse(Float64, get(ENV, "PDL_LR_DECAY", "0.99")) +const BATCH_SIZE = parse(Int, get(ENV, "PDL_BATCH_SIZE", "200")) +const DATASET_SIZE = parse(Int, get(ENV, "PDL_DATASET_SIZE", "5000")) +const RHO_INIT = parse(Float64, get(ENV, "PDL_RHO_INIT", "10.0")) +const RHO_MAX = parse(Float64, get(ENV, "PDL_RHO_MAX", "1.0e4")) +const TAU = parse(Float64, get(ENV, "PDL_TAU", "0.8")) +const ALPHA = parse(Float64, get(ENV, "PDL_ALPHA", "2.0")) +const RHO_EQ_SCALE = parse(Float64, get(ENV, "PDL_RHO_EQ_SCALE", "1.0")) +const MAX_DUAL = parse(Float64, get(ENV, "PDL_MAX_DUAL", "1.0e6")) +const HIDDEN_MULT = parse(Float64, get(ENV, "PDL_HIDDEN_MULT", "1.2")) +const USE_PENALTY_ONLY = parse(Bool, get(ENV, "PDL_USE_PENALTY_ONLY", "false")) +const USE_ANALYTICAL_DUAL = parse(Bool, get(ENV, "PDL_USE_ANALYTICAL_DUAL", "false")) +const T_F = Float32 + +using L2OALM +using Lux +using Optimisers +using MLUtils +using KernelAbstractions +using ExaModels +using BatchNLPKernels +using CUDA +using NNlib + +using PowerModels +PowerModels.silence() +using PGLib + +using Random +using Statistics +using Dates +using Printf + +import GPUArraysCore: @allowscalar +const BNK = BatchNLPKernels + +# AC-OPF model construction (constraint ordering enforced: inequalities first, +# equalities last). Reuses test/power.jl. +include(joinpath(@__DIR__, "..", "test", "power.jl")) + +function feed_forward_builder(num_p, num_y, hidden_layers; activation = relu) + layer_sizes = [num_p; hidden_layers; num_y] + layers = Any[] + for i in 1:length(layer_sizes)-1 + if i < length(layer_sizes) - 1 + push!(layers, Dense(layer_sizes[i], layer_sizes[i+1], activation)) + else + push!(layers, Dense(layer_sizes[i], layer_sizes[i+1])) + end + end + return Chain(layers...) +end + +function main() + rng = Random.default_rng() + Random.seed!(rng, 2026_05_14) + + if !CUDA.functional() + @warn "CUDA not functional — falling back to CPU. Performance will be poor." + end + + backend, dev = CUDA.functional() ? + (CUDABackend(), gpu_device()) : + (CPU(), cpu_device()) + + @info "Building parametric AC-OPF model" case = "pglib_opf_case57_ieee.m" T = T_F backend + model, nbus, ngen, narc, ref_bus_idxs = create_parametric_ac_power_model( + "pglib_opf_case57_ieee.m"; backend = backend, T = T_F, + ) + bm_train = BNK.BatchModel(model, BATCH_SIZE, config = BNK.BatchModelConfig(:full)) + bm_test = BNK.BatchModel(model, DATASET_SIZE, config = BNK.BatchModelConfig(:full)) + + nvar = model.meta.nvar + ncon = model.meta.ncon + nθ = length(model.θ) + + is_equal = model.meta.lcon .== model.meta.ucon + num_equal = count(is_equal) + @assert all(is_equal[ncon-num_equal+1:ncon]) "constraint ordering: equalities at tail" + @assert !any(is_equal[1:ncon-num_equal]) "constraint ordering: inequalities at head" + @info "Problem dimensions" nbus ngen narc nvar ncon nθ num_equal + + # Θ: load multipliers ~ 1.0 ± 10% + Random.seed!(rng, 0xCA5E5701) + Θ_train = (T_F(1.0) .+ T_F(0.1) .* randn(rng, T_F, nθ, DATASET_SIZE)) |> dev + Random.seed!(rng, 0xCA5E5702) + Θ_test = (T_F(1.0) .+ T_F(0.1) .* randn(rng, T_F, nθ, DATASET_SIZE)) |> dev + + # Primal network with bound-enforcing head; paper width = 1.2·nvar. + h = round(Int, HIDDEN_MULT * nvar) + lvar = T_F.(model.meta.lvar) + uvar = T_F.(model.meta.uvar) + # Build FixRefBus mask on the same device as lvar (GPU when CUDA is available). + # FixRefBus.mask must be a CuArray — Lux |> dev only transfers parameters/states, + # not data fields of custom AbstractLuxLayer structs. + ref_mask = fill!(similar(lvar), one(T_F)) + @allowscalar for idx in ref_bus_idxs + ref_mask[idx] = zero(T_F) + end + primal_model = Chain( + feed_forward_builder(nθ, nvar, [h, h]), + BoundedOutput(lvar, uvar), + FixRefBus(ref_mask), + ) |> dev + ps_primal, st_primal = Lux.setup(rng, primal_model) + ps_primal = ps_primal |> dev + st_primal = st_primal |> dev + + # Dual network: 2-layer MLP outputting ncon multipliers. + dual_model = feed_forward_builder(nθ, ncon, [h, h]) |> dev + ps_dual, st_dual = Lux.setup(rng, dual_model) + ps_dual = ps_dual |> dev + st_dual = st_dual |> dev + + train_state_primal = Training.TrainState(primal_model, ps_primal, st_primal, Optimisers.Adam(LR_PRIMAL)) + train_state_dual = Training.TrainState(dual_model, ps_dual, st_dual, Optimisers.Adam(LR_DUAL)) + + data = DataLoader((Θ_train); batchsize = BATCH_SIZE, shuffle = true) .|> dev + + # Paper hyperparameters for AC-OPF. + method = ALMMethod(; batch_model = bm_train, num_equal = num_equal, + max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE) + trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_INIT) + + method_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal, + max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE) + test_primal_loss = primal_loss(method_test) + + # ---- CSV logger setup ---- + log_dir = joinpath(@__DIR__, "..", "outputs") + mkpath(log_dir) + timestamp = Dates.format(now(), "yyyymmdd_HHMMSS") + log_path = joinpath(log_dir, "training_log_case57_$(timestamp).csv") + open(log_path, "w") do io + println(io, "outer_iter,rho,mean_abs_mu,mean_abs_lambda,", + "mean_violations,max_violation_ineq,max_violation_eq,", + "max_bound_violation,mean_objs,primal_loss,wallclock_s") + end + @info "CSV log path" log_path + + t0 = time() + function eval_and_log(iter::Int, trainer) + ps_p = trainer.primal_training_state.parameters + st_p = trainer.primal_training_state.states + X̂_test, _ = primal_model(Θ_test, ps_p, st_p) + Vc_test, Vb_test = BNK.all_violations!(bm_test, X̂_test, Θ_test) + objs_test = BNK.objective!(bm_test, X̂_test, Θ_test) + dual_hat, _ = trainer.dual_model(Θ_test, + trainer.dual_training_state.parameters, + trainer.dual_training_state.states) + μ = dual_hat[1:end-num_equal, :] + λ = dual_hat[end-num_equal+1:end, :] + Vc_ineq = Vc_test[1:end-num_equal, :] + Vc_eq = Vc_test[end-num_equal+1:end, :] + mean_abs_mu = mean(abs.(μ)) + mean_abs_lambda = mean(abs.(λ)) + mean_viol = mean(Vc_test) + max_viol_ineq = maximum(Vc_ineq) + max_viol_eq = maximum(Vc_eq) + max_bound = maximum(Vb_test) + mean_obj = mean(objs_test) + wallclock = time() - t0 + primal_val, _, _ = test_primal_loss(primal_model, ps_p, st_p, (Θ_test, trainer)) + + open(log_path, "a") do io + @printf(io, "%d,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.3f\n", + iter, trainer.ρ, mean_abs_mu, mean_abs_lambda, + mean_viol, max_viol_ineq, max_viol_eq, max_bound, mean_obj, + Float64(primal_val), wallclock) + end + + @info @sprintf("iter %3d", iter) ρ=trainer.ρ max_viol_ineq max_viol_eq mean_viol mean_obj mean_abs_mu mean_abs_lambda + return Float64(mean_viol) # signal for lr-decay: decay only when violations worsen + end + + eval_and_log(0, trainer) # baseline before training + + @info "Starting train!" K L_PRIMAL L_DUAL WARMUP_EPOCHS RHO_INIT RHO_MAX TAU ALPHA RHO_EQ_SCALE MAX_DUAL LR_PRIMAL LR_DUAL HIDDEN_MULT h USE_PENALTY_ONLY USE_ANALYTICAL_DUAL + train!(method, trainer, data; + K = K, L_primal = L_PRIMAL, L_dual = L_DUAL, + warmup_epochs = WARMUP_EPOCHS, + lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, + lr_decay = LR_DECAY, + use_penalty_only = USE_PENALTY_ONLY, + use_analytical_dual = USE_ANALYTICAL_DUAL, + eval_fn = eval_and_log, + ) + + # ---- Acceptance check ---- + ps_p = trainer.primal_training_state.parameters + st_p = trainer.primal_training_state.states + X̂_final, _ = primal_model(Θ_test, ps_p, st_p) + Vc_final, Vb_final = BNK.all_violations!(bm_test, X̂_final, Θ_test) + max_viol = maximum(Vc_final) + max_bound = maximum(Vb_final) + @info "FINAL" max_viol max_bound log = log_path + @info "Acceptance" passed = (max_viol < 0.01) target = 0.01 + + return max_viol +end + +main() diff --git a/examples/case57_train_twophase.jl b/examples/case57_train_twophase.jl new file mode 100644 index 0000000..125abf2 --- /dev/null +++ b/examples/case57_train_twophase.jl @@ -0,0 +1,261 @@ +# examples/case57_train_twophase.jl +# +# Two-phase PDL training for case57 (v26a): +# +# Phase 1 — skip the degenerate phase: +# ρ_init = ρ_max = 1e4 (pinned, no growth). At ρ=1e4 the zero-generation +# degenerate min costs 2.88M vs. feasible ~40K — mechanically impossible. +# Run K1 outer iterations. After this phase the network is in a non-degenerate +# basin but equalities oscillate at ~2.5 p.u. (reproduces v21a behaviour). +# +# Phase 2 — drive equalities to convergence: +# Start from Phase-1 weights (trainer unchanged). Swap to a new method with +# ρ_max=1e6, ρ_eq_scale=10, MAX_DUAL=1e4. +# - ρ grows from trainer.ρ (=1e4 at end of Phase 1) up to 1e6. +# - ρ_eq_scale=10 makes effective equality ρ up to 1e7, but MAX_DUAL=1e4 +# caps the gradient at lr × 1e4 = 1.0/step — stable throughout. +# - Phase-1 weights serve as a warm start in the non-degenerate basin. +# Run K2 outer iterations. +# +# The key difference from the user's original reset idea: we do NOT reset ρ back +# to a small value (which would misalign the dual network). Instead, Phase 2 +# CONTINUES growing ρ from where Phase 1 left it, but with higher ρ_max and +# ρ_eq_scale so equalities get additional convergence pressure. + +const K1 = parse(Int, get(ENV, "PDL_K1", "40")) +const K2 = parse(Int, get(ENV, "PDL_K2", "60")) +const L_PRIMAL = parse(Int, get(ENV, "PDL_L_PRIMAL", "2500")) +const L_DUAL = parse(Int, get(ENV, "PDL_L_DUAL", "2500")) +const WARMUP_EPOCHS = parse(Int, get(ENV, "PDL_WARMUP_EPOCHS", "25000")) +const LR_PRIMAL = parse(Float64, get(ENV, "PDL_LR_PRIMAL", "1.0e-4")) +const LR_DUAL = parse(Float64, get(ENV, "PDL_LR_DUAL", "1.0e-3")) +const LR_DECAY = parse(Float64, get(ENV, "PDL_LR_DECAY", "0.99")) +const BATCH_SIZE = parse(Int, get(ENV, "PDL_BATCH_SIZE", "200")) +const DATASET_SIZE = parse(Int, get(ENV, "PDL_DATASET_SIZE", "5000")) +const TAU = parse(Float64, get(ENV, "PDL_TAU", "0.8")) +const ALPHA = parse(Float64, get(ENV, "PDL_ALPHA", "2.0")) +const HIDDEN_MULT = parse(Float64, get(ENV, "PDL_HIDDEN_MULT", "4.0")) + +# Phase 1 hyperparams +const RHO_P1 = parse(Float64, get(ENV, "PDL_RHO_P1", "10000.0")) +const MAX_DUAL_P1 = parse(Float64, get(ENV, "PDL_MAX_DUAL_P1", "10000.0")) +const RHO_EQ_SCALE_P1 = parse(Float64, get(ENV, "PDL_RHO_EQ_SCALE_P1","1.0")) + +# Phase 2 hyperparams +const RHO_MAX_P2 = parse(Float64, get(ENV, "PDL_RHO_MAX_P2", "1000000.0")) +const MAX_DUAL_P2 = parse(Float64, get(ENV, "PDL_MAX_DUAL_P2", "10000.0")) +const RHO_EQ_SCALE_P2 = parse(Float64, get(ENV, "PDL_RHO_EQ_SCALE_P2","10.0")) + +const T_F = Float32 + +using L2OALM +using Lux +using Optimisers +using MLUtils +using KernelAbstractions +using ExaModels +using BatchNLPKernels +using CUDA +using NNlib + +using PowerModels +PowerModels.silence() +using PGLib + +using Random +using Statistics +using Dates +using Printf + +import GPUArraysCore: @allowscalar +const BNK = BatchNLPKernels + +include(joinpath(@__DIR__, "..", "test", "power.jl")) + +function feed_forward_builder(num_p, num_y, hidden_layers; activation = relu) + layer_sizes = [num_p; hidden_layers; num_y] + layers = Any[] + for i in 1:length(layer_sizes)-1 + if i < length(layer_sizes) - 1 + push!(layers, Dense(layer_sizes[i], layer_sizes[i+1], activation)) + else + push!(layers, Dense(layer_sizes[i], layer_sizes[i+1])) + end + end + return Chain(layers...) +end + +function main() + rng = Random.default_rng() + Random.seed!(rng, 2026_05_22) + + if !CUDA.functional() + @warn "CUDA not functional — falling back to CPU." + end + + backend, dev = CUDA.functional() ? + (CUDABackend(), gpu_device()) : + (CPU(), cpu_device()) + + @info "Building parametric AC-OPF model" case = "pglib_opf_case57_ieee.m" T = T_F backend + model, nbus, ngen, narc, ref_bus_idxs = create_parametric_ac_power_model( + "pglib_opf_case57_ieee.m"; backend = backend, T = T_F, + ) + bm_train = BNK.BatchModel(model, BATCH_SIZE, config = BNK.BatchModelConfig(:full)) + bm_test = BNK.BatchModel(model, DATASET_SIZE, config = BNK.BatchModelConfig(:full)) + + nvar = model.meta.nvar + ncon = model.meta.ncon + nθ = length(model.θ) + + is_equal = model.meta.lcon .== model.meta.ucon + num_equal = count(is_equal) + @assert all(is_equal[ncon-num_equal+1:ncon]) + @assert !any(is_equal[1:ncon-num_equal]) + @info "Problem dimensions" nbus ngen narc nvar ncon nθ num_equal + + Random.seed!(rng, 0xCA5E5701) + Θ_train = (T_F(1.0) .+ T_F(0.1) .* randn(rng, T_F, nθ, DATASET_SIZE)) |> dev + Random.seed!(rng, 0xCA5E5702) + Θ_test = (T_F(1.0) .+ T_F(0.1) .* randn(rng, T_F, nθ, DATASET_SIZE)) |> dev + + h = round(Int, HIDDEN_MULT * nvar) + lvar = T_F.(model.meta.lvar) + uvar = T_F.(model.meta.uvar) + ref_mask = fill!(similar(lvar), one(T_F)) + @allowscalar for idx in ref_bus_idxs + ref_mask[idx] = zero(T_F) + end + primal_model = Chain( + feed_forward_builder(nθ, nvar, [h, h]), + BoundedOutput(lvar, uvar), + FixRefBus(ref_mask), + ) |> dev + ps_primal, st_primal = Lux.setup(rng, primal_model) + ps_primal = ps_primal |> dev + st_primal = st_primal |> dev + + dual_model = feed_forward_builder(nθ, ncon, [h, h]) |> dev + ps_dual, st_dual = Lux.setup(rng, dual_model) + ps_dual = ps_dual |> dev + st_dual = st_dual |> dev + + train_state_primal = Training.TrainState(primal_model, ps_primal, st_primal, Optimisers.Adam(LR_PRIMAL)) + train_state_dual = Training.TrainState(dual_model, ps_dual, st_dual, Optimisers.Adam(LR_DUAL)) + + data = DataLoader((Θ_train); batchsize = BATCH_SIZE, shuffle = true) .|> dev + + # ---- shared logger ---- + log_dir = joinpath(@__DIR__, "..", "outputs") + mkpath(log_dir) + timestamp = Dates.format(now(), "yyyymmdd_HHMMSS") + log_path = joinpath(log_dir, "training_log_case57_twophase_$(timestamp).csv") + open(log_path, "w") do io + println(io, "phase,outer_iter,rho,rho_eq_effective,mean_abs_mu,mean_abs_lambda,", + "mean_violations,max_violation_ineq,max_violation_eq,", + "max_bound_violation,mean_objs,primal_loss,wallclock_s") + end + @info "CSV log path" log_path + t0 = time() + + function make_eval_fn(phase::Int, method_test) + test_primal_loss = primal_loss(method_test) + return function eval_and_log(iter::Int, trainer) + ps_p = trainer.primal_training_state.parameters + st_p = trainer.primal_training_state.states + X̂_test, _ = primal_model(Θ_test, ps_p, st_p) + Vc_test, Vb_test = BNK.all_violations!(bm_test, X̂_test, Θ_test) + objs_test = BNK.objective!(bm_test, X̂_test, Θ_test) + dual_hat, _ = trainer.dual_model(Θ_test, + trainer.dual_training_state.parameters, + trainer.dual_training_state.states) + μ = dual_hat[1:end-num_equal, :] + λ = dual_hat[end-num_equal+1:end, :] + Vc_ineq = Vc_test[1:end-num_equal, :] + Vc_eq = Vc_test[end-num_equal+1:end, :] + mean_abs_mu = mean(abs.(μ)) + mean_abs_lambda = mean(abs.(λ)) + max_viol_ineq = maximum(Vc_ineq) + max_viol_eq = maximum(Vc_eq) + max_bound = maximum(Vb_test) + mean_obj = mean(objs_test) + wallclock = time() - t0 + rho_eq_eff = trainer.ρ * (phase == 1 ? RHO_EQ_SCALE_P1 : RHO_EQ_SCALE_P2) + primal_val, _, _ = test_primal_loss(primal_model, ps_p, st_p, (Θ_test, trainer)) + + open(log_path, "a") do io + @printf(io, "%d,%d,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.6e,%.3f\n", + phase, iter, trainer.ρ, rho_eq_eff, + mean_abs_mu, mean_abs_lambda, + mean(Vc_test), max_viol_ineq, max_viol_eq, max_bound, mean_obj, + Float64(primal_val), wallclock) + end + + @info @sprintf("Phase %d iter %3d", phase, iter) ρ=trainer.ρ ρ_eq=rho_eq_eff max_viol_ineq max_viol_eq mean_obj + return Float64(mean(Vc_test)) + end + end + + # ========================================================================== + # PHASE 1: fixed ρ = ρ_max = RHO_P1 (skip degenerate phase) + # ========================================================================== + @info "=== PHASE 1 ===" K1 RHO_P1 MAX_DUAL_P1 RHO_EQ_SCALE_P1 + + method_p1 = ALMMethod(; batch_model = bm_train, num_equal = num_equal, + max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA, + ρ_eq_scale = RHO_EQ_SCALE_P1) + method_p1_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal, + max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA, + ρ_eq_scale = RHO_EQ_SCALE_P1) + + trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_P1) + + eval_p1 = make_eval_fn(1, method_p1_test) + eval_p1(0, trainer) # baseline + + train!(method_p1, trainer, data; + K = K1, L_primal = L_PRIMAL, L_dual = L_DUAL, + warmup_epochs = WARMUP_EPOCHS, + lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY, + use_penalty_only = false, use_analytical_dual = true, + eval_fn = eval_p1, + ) + + @info "Phase 1 complete" trainer.ρ trainer.max_violations + + # ========================================================================== + # PHASE 2: grow ρ → RHO_MAX_P2 with ρ_eq_scale=RHO_EQ_SCALE_P2 + # Trainer state (weights + ρ from Phase 1) is preserved. + # ========================================================================== + @info "=== PHASE 2 ===" K2 RHO_MAX_P2 MAX_DUAL_P2 RHO_EQ_SCALE_P2 + + method_p2 = ALMMethod(; batch_model = bm_train, num_equal = num_equal, + max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA, + ρ_eq_scale = RHO_EQ_SCALE_P2) + method_p2_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal, + max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA, + ρ_eq_scale = RHO_EQ_SCALE_P2) + + eval_p2 = make_eval_fn(2, method_p2_test) + + train!(method_p2, trainer, data; + K = K2, L_primal = L_PRIMAL, L_dual = L_DUAL, + warmup_epochs = 0, # no second warmup — primal already in good basin + lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY, + use_penalty_only = false, use_analytical_dual = true, + eval_fn = eval_p2, + ) + + # ---- Final eval ---- + ps_p = trainer.primal_training_state.parameters + st_p = trainer.primal_training_state.states + X̂_final, _ = primal_model(Θ_test, ps_p, st_p) + Vc_final, Vb_final = BNK.all_violations!(bm_test, X̂_final, Θ_test) + max_viol = maximum(Vc_final) + max_bound = maximum(Vb_final) + @info "FINAL" max_viol max_bound log = log_path + @info "Acceptance" passed = (max_viol < 0.01) target = 0.01 + return max_viol +end + +main() diff --git a/scripts/plot_training.jl b/scripts/plot_training.jl new file mode 100644 index 0000000..a873b1f --- /dev/null +++ b/scripts/plot_training.jl @@ -0,0 +1,49 @@ +# scripts/plot_training.jl +# +# Plot a training trajectory CSV produced by examples/case57_train.jl. +# +# Usage: +# julia --project=examples scripts/plot_training.jl outputs/training_log_case57_.csv +# +# Produces a PNG next to the CSV. + +using DelimitedFiles +using Plots +gr() + +function main(csv_path::String) + @assert isfile(csv_path) "CSV not found: $csv_path" + data, header = readdlm(csv_path, ',', Float64; header = true) + cols = Dict(strip(c) => i for (i, c) in enumerate(header)) + + iters = data[:, cols["outer_iter"]] + ρ = data[:, cols["rho"]] + mean_μ = data[:, cols["mean_abs_mu"]] + mean_λ = data[:, cols["mean_abs_lambda"]] + mean_v = data[:, cols["mean_violations"]] + max_vi = data[:, cols["max_violation_ineq"]] + max_ve = data[:, cols["max_violation_eq"]] + max_b = data[:, cols["max_bound_violation"]] + mean_o = data[:, cols["mean_objs"]] + primal_l = data[:, cols["primal_loss"]] + + p1 = plot(iters, primal_l, xlabel="outer iter", ylabel="primal AL loss", title="Primal loss", lw=2) + p2 = plot(iters, ρ, xlabel="outer iter", ylabel="ρ", title="Penalty ρ", lw=2) + p3 = plot(iters, [mean_μ mean_λ], xlabel="outer iter", + label=["mean |μ|" "mean |λ|"], title="Dual magnitudes", lw=2) + p4 = plot(iters, [max_vi max_ve max_b], xlabel="outer iter", + label=["max V_ineq" "max V_eq" "max V_bound"], + title="Max violations (log)", yscale=:log10, lw=2) + p5 = plot(iters, mean_v, xlabel="outer iter", ylabel="mean violations", title="Mean violations", lw=2) + p6 = plot(iters, mean_o, xlabel="outer iter", ylabel="mean objective", title="Mean cost", lw=2) + + fig = plot(p1, p2, p3, p4, p5, p6, layout=(2, 3), size=(1400, 800), legend=:topright) + png_path = replace(csv_path, r"\.csv$" => ".png") + savefig(fig, png_path) + @info "Saved" png_path +end + +if length(ARGS) < 1 + error("usage: julia --project=examples scripts/plot_training.jl ") +end +main(ARGS[1]) diff --git a/src/L2OALM.jl b/src/L2OALM.jl index ef6836c..fcc90fd 100644 --- a/src/L2OALM.jl +++ b/src/L2OALM.jl @@ -7,11 +7,93 @@ using Lux using LuxCUDA using Lux.Training using CUDA +using NNlib: sigmoid_fast +using Random: AbstractRNG using Statistics using ChainRules: @ignore_derivatives +using Zygote + +using Optimisers export ALMMethod, - ALMTrainer, dual_loss, primal_loss, single_train_step! + ALMTrainer, BoundedOutput, FixRefBus, dual_loss, primal_loss, penalty_only_primal_loss, + analytical_primal_loss, single_train_step!, train! + +""" + BoundedOutput(lvar, uvar) + +Parameterless Lux layer that enforces per-variable bounds on the primal network's +output. For variables with finite `(lvar[i], uvar[i])` it applies + + yᵢ = lvar[i] + (uvar[i] − lvar[i]) · sigmoid_fast(zᵢ) + +so the output is in `(lvar[i], uvar[i])` asymptotically (sigmoid never reaches the +bounds exactly, but gradient is always nonzero). Variables with an infinite bound +(e.g., voltage angles) pass through unchanged. + +Uses logistic sigmoid_fast (not hardsigmoid) so that generators can always recover from +near-zero outputs: hardsigmoid has a dead gradient zone at z < -3 that permanently +traps outputs at the lower bound once entered. Sigmoid'(z) = σ(z)(1-σ(z)) > 0 for +all z, and Adam's gradient normalization restores steps even at deep saturation. +""" +struct BoundedOutput{V<:AbstractVector} <: Lux.AbstractLuxLayer + lo::V # safe lower bound: lvar[i] where finite, 0 elsewhere + width::V # uvar[i] - lvar[i] where both finite, 0 elsewhere + mask::V # 1.0 where both finite, 0.0 elsewhere (Float for GPU broadcasting) +end + +function BoundedOutput(lvar::AbstractVector, uvar::AbstractVector) + @assert length(lvar) == length(uvar) "lvar and uvar must have the same length" + T = promote_type(eltype(lvar), eltype(uvar)) + both = isfinite.(lvar) .& isfinite.(uvar) + lo = T.(ifelse.(both, lvar, zero(T))) + width = T.(ifelse.(both, uvar .- lvar, zero(T))) + mask = T.(both) + return BoundedOutput(lo, width, mask) +end + +Lux.initialparameters(::AbstractRNG, ::BoundedOutput) = NamedTuple() +Lux.initialstates(::AbstractRNG, ::BoundedOutput) = NamedTuple() +Lux.parameterlength(::BoundedOutput) = 0 +Lux.statelength(::BoundedOutput) = 0 + +function (b::BoundedOutput)(z::AbstractMatrix, _ps, st::NamedTuple) + bounded = b.lo .+ b.width .* sigmoid_fast.(z) + y = b.mask .* bounded .+ (one(eltype(b.mask)) .- b.mask) .* z + return y, st +end + +""" + FixRefBus(mask) + +Parameterless Lux layer that forces reference-bus voltage angle(s) to zero by +multiplying the corresponding rows of the output matrix by zero. This satisfies +the `va[ref] = 0` equality constraint architecturally, removing it from the +Lagrangian penalty and freeing network capacity for the remaining 127 variables. + +Construct via `FixRefBus(nvar, ref_bus_idxs)` which builds the mask on CPU; the +mask is transferred to GPU when the Chain is moved with `|> dev`. +""" +struct FixRefBus{V<:AbstractVector} <: Lux.AbstractLuxLayer + mask::V # length nvar; 0 at ref bus angle positions, 1 elsewhere +end + +function FixRefBus(nvar::Int, ref_bus_idxs) + mask = ones(Float32, nvar) + for idx in ref_bus_idxs + mask[idx] = 0f0 + end + return FixRefBus(mask) +end + +Lux.initialparameters(::AbstractRNG, ::FixRefBus) = NamedTuple() +Lux.initialstates(::AbstractRNG, ::FixRefBus) = NamedTuple() +Lux.parameterlength(::FixRefBus) = 0 +Lux.statelength(::FixRefBus) = 0 + +function (f::FixRefBus)(X::AbstractMatrix, _ps, st::NamedTuple) + return X .* f.mask, st +end """ AbstractL2OMethod @@ -39,18 +121,22 @@ struct ALMMethod{T<:Real} <: AbstractPrimalDualMethod τ::T α::T num_equal::Int # TODO: There should be a way to get this from the batch model + ρ_eq_scale::T # extra penalty multiplier for equality constraints (default 1.0) - function ALMMethod(batch_model::BatchModel, max_dual::T, ρmax::T, τ::T, α::T, num_equal::Int) where {T<:Real} - new{T}(batch_model, max_dual, ρmax, τ, α, num_equal) + function ALMMethod(batch_model::BatchModel, max_dual::T, ρmax::T, τ::T, α::T, num_equal::Int, ρ_eq_scale::T) where {T<:Real} + new{T}(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale) end end """ - ALMMethod(; batch_model::BatchModel, num_equal::Int, max_dual::T = 1e6, ρmax::T = 1e6, τ::T = 0.8, α::T = 10.0) + ALMMethod(; batch_model, num_equal, max_dual=1e6, ρmax=1e6, τ=0.8, α=10.0, ρ_eq_scale=1.0) + +Constructor for the ALM method. -Constructor for the Augmented Lagrangian Method (ALM) for primal-dual Learning to Optimize (L2O) methods. -This function initializes the ALM method with a batch model, maximum dual variable values, maximum learning rate `ρ`, -threshold for the parameter updater `τ`, meta learning rate `α`, and the number of equality constraints. +`ρ_eq_scale`: extra multiplier applied to the equality-constraint penalty (and dual +update) relative to the inequality penalty. Values > 1 focus gradient pressure on +power-balance residuals when those equalities are harder to satisfy than the +inequality constraints. Default 1.0 reproduces the standard ALM formulation. """ function ALMMethod(; batch_model::BatchModel, @@ -59,8 +145,9 @@ function ALMMethod(; ρmax::T = 1e6, τ::T = 0.8, α::T = 10.0, + ρ_eq_scale::T = 1.0, ) where {T<:Real} - return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal) + return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale) end """ @@ -139,6 +226,7 @@ function dual_loss(method::ALMMethod) bm = method.batch_model max_dual = method.max_dual num_equal = method.num_equal + ρ_eq_scale = method.ρ_eq_scale return (dual_model, ps_dual, st_dual, data) -> begin Θ, trainer = data ρ = trainer.ρ @@ -151,21 +239,21 @@ function dual_loss(method::ALMMethod) # Get previous dual predictions dual_hat_k, _ = @ignore_derivatives dual_model(Θ, prev_dual_training_state.parameters, prev_dual_training_state.states) - # Separate bound and equality constraints - # # Forward pass for primal model - X̂, _ = trainer.primal_model(θ, primal_training_state.parameters, primal_training_state.states) - gh = constraints!(bm, X̂, Θ) - - @ignore_derivatives gh = trainer.constraints(Θ, trainer.ps_constraints, trainer.st_constraints) + # Separate bound and equality constraints (frozen primal — no gradients needed here) + X̂_dual, _ = @ignore_derivatives trainer.primal_model(Θ, primal_training_state.parameters, primal_training_state.states) + gh = @ignore_derivatives BNK.constraints!(bm, X̂_dual, Θ) gh_bound = gh[1:end-num_equal, :] gh_equal = gh[end-num_equal+1:end, :] dual_hat_bound = dual_hat_k[1:end-num_equal, :] dual_hat_equal = dual_hat_k[end-num_equal+1:end, :] - # Target for dual variables + # ALM dual targets: + # μ_{k+1} ← max(μ_k + ρ·g, 0) (ineq: one-sided clip) + # λ_{k+1} ← λ_k + ρ·ρ_eq_scale·h (eq: symmetric; scale boosts equality pressure) + # The outer clamp is a numerical safeguard against unbounded growth. dual_target = vcat( - min.(max.(dual_hat_bound + ρ .* gh_bound, 0), max_dual), - min.(dual_hat_equal + ρ .* gh_equal, max_dual), + clamp.(dual_hat_bound + ρ .* gh_bound, zero(max_dual), max_dual), + clamp.(dual_hat_equal + (ρ * ρ_eq_scale) .* gh_equal, -max_dual, max_dual), ) loss = mean((dual_hat .- dual_target) .^ 2) @@ -181,33 +269,60 @@ from current dual predictions for the batch model `bm` under parameters `Θ`. """ function primal_loss(method::ALMMethod) bm = method.batch_model + num_equal = method.num_equal + ρ_eq_scale = method.ρ_eq_scale return (model, ps, st, data) -> begin Θ, trainer = data ρ = trainer.ρ num_s = size(Θ, 2) - # Forward pass for prediction X̂, st_new = model(Θ, ps, st) - # Get current dual predictions - dual_hat, _ = @ignore_derivatives trainer.dual_model(Θ, trainer.dual_training_state.parameters, trainer.dual_training_state.states) - - # Calculate violations and objectives + # Frozen dual predictions for this primal step. + dual_hat, _ = @ignore_derivatives trainer.dual_model( + Θ, trainer.dual_training_state.parameters, trainer.dual_training_state.states) + + # Paper PDL formulation (Park & Van Hentenryck 2023): + # L_ρ(y, μ, λ) = f(y) + μᵀg(y) + λᵀh(y) + (ρ/2)·(‖max(g,0)‖² + ‖h‖²) + # Lagrangian linear term uses RAW signed g, h from BNK.constraints! + # (`max(g,0)`/`|h|` from all_violations! is the wrong quantity here: + # μ ≥ 0 with g_i ≤ 0 should contribute *negatively* so the primal keeps + # inequalities slack rather than just non-positive). + # Penalty term squares Vc, which BNK.all_violations! already computes as + # max(g,0) for inequalities and |h| for equalities — squaring gives the + # paper's max(g,0)² + h² exactly. + # ρ_eq_scale > 1 applies extra pressure on equality violations specifically, + # matching the dual_loss update which also uses ρ·ρ_eq_scale for equalities. objs = BNK.objective!(bm, X̂, Θ) - # gh = constraints!(bm, X̂, Θ) - Vc, Vb = BNK.all_violations!(bm, X̂, Θ) - V = vcat(Vb, Vc) - total_loss = ( - sum(abs.(dual_hat .* V)) / num_s + ρ / 2 * sum((V) .^ 2) / num_s + mean(objs) - ) + gh = BNK.constraints!(bm, X̂, Θ) + + # Split by ordering: inequalities first, equalities last (enforced in power.jl). + g = gh[1:end-num_equal, :] + h = gh[end-num_equal+1:end, :] + # Clamp inequality multipliers to μ ≥ 0: the dual network output is unconstrained + # so it can predict negative values, which would flip the sign of the μᵀg term and + # push the primal TOWARD constraint violations. Clamping restores KKT sign correctness. + μ = max.(dual_hat[1:end-num_equal, :], zero(eltype(dual_hat))) + λ = dual_hat[end-num_equal+1:end, :] + + # Compute violations from gh directly — avoids a second BNK.constraints! call + # through all_violations! (which shares bm.cons_out and corrupts the rrule view). + Vc_ineq = max.(g, zero(eltype(g))) + Vc_eq = abs.(h) + Vc = vcat(Vc_ineq, Vc_eq) + + lagrangian_term = (sum(μ .* g) + sum(λ .* h)) / num_s + penalty_term = (ρ / 2) * (sum(Vc_ineq .^ 2) + ρ_eq_scale * sum(Vc_eq .^ 2)) / num_s + total_loss = mean(objs) + lagrangian_term + penalty_term return total_loss, st_new, ( - total_loss = total_loss, - mean_violations = mean(V), - max_violation = maximum(V), - mean_objs = mean(objs), + total_loss = total_loss, + mean_violations = mean(Vc), + max_violation = maximum(Vc), + max_bound_violation = zero(eltype(Vc)), + mean_objs = mean(objs), ) end end @@ -220,12 +335,14 @@ This function computes the maximum violation, mean violations, mean objectives, from the batch states. """ function reconcile_primal(::AbstractL2OTrainer, batch_states::Vector{NamedTuple}) - max_violation = maximum([s.max_violation for s in batch_states]) + # Use mean of mean_violations (not max-of-max) as the ρ-update signal. + # max-of-max is dominated by noisy hard batches and causes ρ to escalate + # before violations have genuinely plateaued, collapsing generators. mean_violations = mean([s.mean_violations for s in batch_states]) mean_objs = mean([s.mean_objs for s in batch_states]) mean_loss = mean([s.total_loss for s in batch_states]) return (; - max_violation = max_violation, + max_violation = mean_violations, # proxy for ρ criterion: bump only when mean stalls mean_violations = mean_violations, mean_objs = mean_objs, total_loss = mean_loss, @@ -239,6 +356,7 @@ Reconciles the state of the dual model after processing a batch of data. This function computes the mean dual loss from the batch states. """ function reconcile_dual(::AbstractL2OTrainer, batch_states::Vector{NamedTuple}) + isempty(batch_states) && return (dual_loss = 0.0,) return (dual_loss = mean([s.dual_loss for s in batch_states]),) end @@ -256,7 +374,7 @@ function update_trainer!( dual_state::NamedTuple, ) # Update primal state - new_max_violations = primal_state.max_violations + new_max_violations = primal_state.max_violation trainer.mean_violations = primal_state.mean_violations trainer.mean_objs = primal_state.mean_objs trainer.total_loss = primal_state.total_loss @@ -280,7 +398,7 @@ Default stopping criterion for the L2O methods. This function checks if the number of iterations has reached a predefined limit (100). """ function _stopping_criterion(iter::Int, ::M, ::N) where {M<:AbstractL2OMethod, N<:AbstractL2OTrainer} - return iter >= 100 ? true : false + return iter >= 2 ? true : false end """ @@ -318,55 +436,248 @@ updates the trainer state. function single_train_step!( method::AbstractPrimalDualMethod, trainer::AbstractPrimalDualTrainer, - data, + data; + L_primal::Int = 1, + L_dual::Int = 1, + use_penalty_only::Bool = false, + use_analytical_dual::Bool = false, ) - iter_primal = 1 - iter_dual = 1 - num_batches = length(data) - current_state_primal = (;) - current_state_dual = (;) - _primal_loss = primal_loss(method) - _dual_loss = dual_loss(method) + _primal_loss = if use_penalty_only + penalty_only_primal_loss(method) + elseif use_analytical_dual + analytical_primal_loss(method) + else + primal_loss(method) + end + _dual_loss = dual_loss(method) train_state_primal = trainer.primal_training_state - train_state_dual = trainer.dual_training_state - - # primal loop - while primal_stopping_criterion(iter_primal, method, trainer) - current_states_primal = Vector{NamedTuple}(undef, num_batches) - iter_batch = 1 - for (θ) in data - _, loss_val, stats, train_state_primal = Training.single_train_step!( - AutoZygote(), # AD backend - _primal_loss, # Loss function - (θ, trainer), # Data - train_state_primal, # Training state + train_state_dual = trainer.dual_training_state + + # ----- primal phase: exactly L_primal gradient steps, dual frozen ----- + # Cycle through data batches (reshuffled each epoch) until L_primal steps done. + primal_states = NamedTuple[] + primal_step = 0 + while primal_step < L_primal + for θ in data + primal_step >= L_primal && break + _, _, stats, train_state_primal = Training.single_train_step!( + AutoZygote(), _primal_loss, (θ, trainer), train_state_primal, ) - current_states_primal[iter_batch] = stats - iter_batch += 1 + push!(primal_states, stats) + primal_step += 1 end - current_state_primal = reconcile_primal(trainer, current_states_primal) - iter_primal += 1 end - - # dual loop - while dual_stopping_criterion(iter_dual, method, trainer) - current_states_dual = Vector{NamedTuple}(undef, num_batches) - iter_batch = 1 - for (θ) in data - _, loss_val, stats, train_state_dual = Training.single_train_step!( - AutoZygote(), # AD backend - _dual_loss, # Loss function - (θ, trainer), # Data - train_state_dual, # Training state + trainer.primal_training_state = train_state_primal + current_state_primal = reconcile_primal(trainer, primal_states) + + # ----- dual phase: exactly L_dual gradient steps, primal frozen ----- + dual_states = NamedTuple[] + dual_step = 0 + while dual_step < L_dual + for θ in data + dual_step >= L_dual && break + _, _, stats, train_state_dual = Training.single_train_step!( + AutoZygote(), _dual_loss, (θ, trainer), train_state_dual, ) - current_states_dual[iter_batch] = stats - iter_batch += 1 + push!(dual_states, stats) + dual_step += 1 + end + end + trainer.dual_training_state = train_state_dual + update_trainer!(method, trainer, current_state_primal, reconcile_dual(trainer, dual_states)) + return +end + +""" + penalty_only_primal_loss(method::ALMMethod) + +Closure that computes a *penalty-only* primal loss (no Lagrangian linear term): + + L(y) = f(y) + (ρ/2) · ‖V(y)‖² + +Used for the primal warm-start phase before the alternating loop, so the primal +network finds a feasible basin against fixed-zero multipliers before the dual +starts producing nonzero multipliers. Without this, the first few outer +iterations can be unstable because the dual fits multipliers against random +primal predictions. +""" +function penalty_only_primal_loss(method::ALMMethod) + bm = method.batch_model + num_equal = method.num_equal + ρ_eq_scale = method.ρ_eq_scale + return (model, ps, st, data) -> begin + Θ, trainer = data + ρ = trainer.ρ + num_s = size(Θ, 2) + X̂, st_new = model(Θ, ps, st) + objs = BNK.objective!(bm, X̂, Θ) + # Use BNK.constraints! (which has an rrule) so that penalty gradients flow + # back through to the network weights. BNK.all_violations! has no rrule and + # would give zero penalty gradient, collapsing generators to pmin. + gh = BNK.constraints!(bm, X̂, Θ) + g = gh[1:end-num_equal, :] + h = gh[end-num_equal+1:end, :] + Vc_ineq = max.(g, zero(eltype(g))) + Vc_eq = abs.(h) + Vc = vcat(Vc_ineq, Vc_eq) + penalty_term = (ρ / 2) * (sum(Vc_ineq .^ 2) + ρ_eq_scale * sum(Vc_eq .^ 2)) / num_s + total_loss = mean(objs) + penalty_term + return total_loss, st_new, ( + total_loss = total_loss, + mean_violations = mean(Vc), + max_violation = maximum(Vc), + max_bound_violation = zero(eltype(Vc)), + mean_objs = mean(objs), + ) + end +end + +""" + analytical_primal_loss(method::ALMMethod) + +Primal loss that applies the ALM dual update analytically at every gradient step, +eliminating the dual tracking gap entirely. + +Instead of using the dual network's output directly (which lags behind the correct +multipliers by a 4000× factor at ρmax=10000), this computes the analytically corrected +multipliers at each step: + + μ_eff = max(μ_net(θ) + ρ·g(y,θ), 0) + λ_eff = clamp(λ_net(θ) + ρ·h(y,θ), -max_dual, max_dual) + +Then uses these in the augmented Lagrangian: + + L = f(y) + μ_eff·g(y) + λ_eff·h(y) + +The penalty term is implicit: μ_eff·g = (μ_net + ρg)·g includes a ρg² term for +violated constraints, equivalent to the standard ALM penalty. The dual network still +trains in parallel (providing a warm-start for test-time generalization) but the primal +gradient is no longer bottlenecked by slow dual convergence. +""" +function analytical_primal_loss(method::ALMMethod) + bm = method.batch_model + num_equal = method.num_equal + ρ_eq_scale = method.ρ_eq_scale + max_dual = method.max_dual + return (model, ps, st, data) -> begin + Θ, trainer = data + ρ = trainer.ρ + num_s = size(Θ, 2) + + X̂, st_new = model(Θ, ps, st) + + objs = BNK.objective!(bm, X̂, Θ) + gh = BNK.constraints!(bm, X̂, Θ) + + g = gh[1:end-num_equal, :] + h = gh[end-num_equal+1:end, :] + + # Dual network provides a warm-start base; analytically correct by ρ·gh. + # @ignore_derivatives: gradient does NOT flow through dual net params, + # but DOES flow through gh (computed from X̂ above), providing the correct + # ρ·g and ρ·h gradient terms without any tracking lag. + dual_hat, _ = @ignore_derivatives trainer.dual_model( + Θ, trainer.dual_training_state.parameters, trainer.dual_training_state.states) + + μ_base = dual_hat[1:end-num_equal, :] + λ_base = dual_hat[end-num_equal+1:end, :] + + # Analytically corrected multipliers (one full ALM update applied immediately) + μ = clamp.(μ_base .+ ρ .* g, zero(max_dual), max_dual) + λ = clamp.(λ_base .+ (ρ * ρ_eq_scale) .* h, -max_dual, max_dual) + + Vc_ineq = max.(g, zero(eltype(g))) + Vc_eq = abs.(h) + Vc = vcat(Vc_ineq, Vc_eq) + + lagrangian_term = (sum(μ .* g) + sum(λ .* h)) / num_s + total_loss = mean(objs) + lagrangian_term + + return total_loss, st_new, ( + total_loss = total_loss, + mean_violations = mean(Vc), + max_violation = maximum(Vc), + max_bound_violation = zero(eltype(Vc)), + mean_objs = mean(objs), + ) + end +end + +""" + train!(method, trainer, data; + K=10, L_primal=500, L_dual=500, warmup_epochs=4, + lr_primal=1e-4, lr_dual=1e-4, lr_decay=0.99, + eval_fn=nothing) + +Top-level training loop matching the PDL paper schedule. + +- `K`: number of outer alternating iterations. +- `L_primal` / `L_dual`: inner epochs (passes through `data`) per phase. +- `warmup_epochs`: number of penalty-only primal epochs before the alternating + loop. Set to 0 to disable. +- `lr_primal`, `lr_dual`: starting Adam learning rates. The trainer's existing + TrainStates are mutated via `Optimisers.adjust!` so this overrides whatever + was set at construction. +- `lr_decay`: multiplicative decay applied when validation loss worsens. +- `eval_fn(iter, trainer) -> Real | nothing`: optional validation callback. If + it returns a Real that's larger than the previous, both lrs are multiplied + by `lr_decay`. +""" +function train!( + method::ALMMethod, trainer::ALMTrainer, data; + K::Int = 10, L_primal::Int = 500, L_dual::Int = 500, + warmup_epochs::Int = 4, + lr_primal::Real = 1e-4, lr_dual::Real = 1e-4, + lr_decay::Real = 0.99, + use_penalty_only::Bool = false, + use_analytical_dual::Bool = false, + eval_fn = nothing, +) + Optimisers.adjust!(trainer.primal_training_state.optimizer_state, lr_primal) + Optimisers.adjust!(trainer.dual_training_state.optimizer_state, lr_dual) + cur_lr_primal = lr_primal + cur_lr_dual = lr_dual + + # ----- warm-start: penalty-only primal (warmup_epochs = gradient steps) ----- + # Run at ρmax (not ρ_init) so the penalty is strong enough to push the primal + # toward feasibility before the dual starts producing multipliers. At ρ_init=10 + # the penalty gradient is ~1000× too weak to reduce power-balance violations. + if warmup_epochs > 0 + warmup_loss = penalty_only_primal_loss(method) + ts_primal = trainer.primal_training_state + ρ_saved = trainer.ρ + trainer.ρ = method.ρmax # full penalty during warmup + warmup_step = 0 + while warmup_step < warmup_epochs + for θ in data + warmup_step >= warmup_epochs && break + _, _, _, ts_primal = Training.single_train_step!( + AutoZygote(), warmup_loss, (θ, trainer), ts_primal, + ) + warmup_step += 1 + end + end + trainer.primal_training_state = ts_primal + trainer.ρ = ρ_saved # restore initial ρ for ALM schedule + end + + # ----- alternating primal/dual loop with optional lr decay ----- + last_val = typemax(Float64) + for outer in 1:K + single_train_step!(method, trainer, data; L_primal = L_primal, L_dual = L_dual, use_penalty_only = use_penalty_only, use_analytical_dual = use_analytical_dual) + if eval_fn !== nothing + v = eval_fn(outer, trainer) + if v isa Real + if v > last_val + cur_lr_primal *= lr_decay + cur_lr_dual *= lr_decay + Optimisers.adjust!(trainer.primal_training_state.optimizer_state, cur_lr_primal) + Optimisers.adjust!(trainer.dual_training_state.optimizer_state, cur_lr_dual) + end + last_val = v + end end - current_state_dual = reconcile_dual(trainer, current_states_dual) - iter_dual += 1 end - # Update trainer state - update_trainer!(method, trainer, current_state_primal, current_state_dual) return end diff --git a/test/power.jl b/test/power.jl index 6644468..91c038d 100644 --- a/test/power.jl +++ b/test/power.jl @@ -78,6 +78,8 @@ function _parse_ac_data_raw(filename; T = Float64) c7 = T((g + g_to)), c8 = T((b + b_to)), rate_a_sq = T(branch["rate_a"]^2), + angmax = T(branch["angmax"]), + angmin = T(branch["angmin"]), ) end for (i, branch_raw) in ref[:branch] ], @@ -111,7 +113,12 @@ function create_parametric_ac_power_model( data = _parse_ac_data(filename, backend, T = T) c = ExaCore(T; backend = backend) - va = variable(c, length(data.bus);) + va = variable( + c, + length(data.bus); + lvar = fill!(similar(data.bus, T), -T(π / 2)), + uvar = fill!(similar(data.bus, T), T(π / 2)), + ) vm = variable( c, length(data.bus); @@ -125,88 +132,117 @@ function create_parametric_ac_power_model( @allowscalar load_multiplier = parameter(c, [1.0 for b in data.bus]) - p = variable(c, length(data.arc); lvar = -data.rate_a, uvar = data.rate_a) - q = variable(c, length(data.arc); lvar = -data.rate_a, uvar = data.rate_a) - o = objective(c, g.cost1 * pg[g.i]^2 + g.cost2 * pg[g.i] + g.cost3 for g in data.gen) - # Reference bus angle ------------------------------------------------------ - c1 = constraint(c, va[i] for i in data.ref_buses) - - # Branch power-flow equations --------------------------------------------- + # Reduced-space formulation: p and q branch flows are eliminated as decision + # variables and expressed analytically from (vm, va) via the AC power flow + # equations. This removes all 4·nbranch branch-flow equality constraints + # (which were the dominant cause of max_viol_eq ≈ 1.0 when p/q were free): + # + # p_from = c5·vm[f]² + c3·vm[f]·vm[t]·cos(θ) + c4·vm[f]·vm[t]·sin(θ) + # q_from = -c6·vm[f]² - c4·vm[f]·vm[t]·cos(θ) + c3·vm[f]·vm[t]·sin(θ) + # p_to = c7·vm[t]² + c1·vm[t]·vm[f]·cos(-θ) + c2·vm[t]·vm[f]·sin(-θ) + # q_to = -c8·vm[t]² - c2·vm[t]·vm[f]·cos(-θ) + c1·vm[t]·vm[f]·sin(-θ) + # + # Constraints: inequalities first (angle_diff × 2, thermal × 2), then + # equalities (ref_bus, power_balance_p, power_balance_q). + + # =========================== INEQUALITIES =============================== + + # Angle difference upper: va_f - va_t ≤ angmax constraint( c, - ( - p[b.f_idx] - b.c5 * vm[b.f_bus]^2 - - b.c3 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) - - b.c4 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) for - b in data.branch - ), + va[b.f_bus] - va[b.t_bus] - b.angmax for b in data.branch; + lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf), ) - + # Angle difference lower: angmin ≤ va_f - va_t constraint( c, - ( - q[b.f_idx] + - b.c6 * vm[b.f_bus]^2 + - b.c4 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) - - b.c3 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) for - b in data.branch - ), + b.angmin - (va[b.f_bus] - va[b.t_bus]) for b in data.branch; + lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf), ) + # Thermal from: p_from² + q_from² ≤ rate_a² constraint( c, ( - p[b.t_idx] - b.c7 * vm[b.t_bus]^2 - - b.c1 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) - - b.c2 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) for - b in data.branch - ), + b.c5 * vm[b.f_bus]^2 + + b.c3 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) + + b.c4 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) + )^2 + ( + -b.c6 * vm[b.f_bus]^2 - + b.c4 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) + + b.c3 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) + )^2 - b.rate_a_sq for b in data.branch; + lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf), ) - + # Thermal to: p_to² + q_to² ≤ rate_a² constraint( c, ( - q[b.t_idx] + - b.c8 * vm[b.t_bus]^2 + - b.c2 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) - - b.c1 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) for - b in data.branch - ), + b.c7 * vm[b.t_bus]^2 + + b.c1 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) + + b.c2 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) + )^2 + ( + -b.c8 * vm[b.t_bus]^2 - + b.c2 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) + + b.c1 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) + )^2 - b.rate_a_sq for b in data.branch; + lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf), ) - # Angle difference limits -------------------------------------------------- - constraint( - c, - va[b.f_bus] - va[b.t_bus] for b in data.branch; - lcon = data.angmin, - ucon = data.angmax, - ) + # ============================ EQUALITIES ================================ - # Apparent power thermal limits ------------------------------------------- - constraint( - c, - p[b.f_idx]^2 + q[b.f_idx]^2 - b.rate_a_sq for b in data.branch; - lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf), - ) - constraint( - c, - p[b.t_idx]^2 + q[b.t_idx]^2 - b.rate_a_sq for b in data.branch; - lcon = fill!(similar(data.branch, Float64, length(data.branch)), -Inf), - ) + # Reference bus angle + c1 = constraint(c, va[i] for i in data.ref_buses) - # Power balance at each bus ----------------------------------------------- + # Power balance: load + shunt + Σ branch flows - Σ generation = 0 load_balance_p = constraint(c, b.pd * load_multiplier[b.i] + b.gs * vm[b.i]^2 for b in data.bus) load_balance_q = constraint(c, b.qd * load_multiplier[b.i] - b.bs * vm[b.i]^2 for b in data.bus) - # Map arc & generator variables into the bus balance equations - constraint!(c, load_balance_p, a.bus => p[a.i] for a in data.arc) - constraint!(c, load_balance_q, a.bus => q[a.i] for a in data.arc) + # From-arc p flowing out of f_bus + constraint!( + c, + load_balance_p, + b.f_bus => b.c5 * vm[b.f_bus]^2 + + b.c3 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) + + b.c4 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) for + b in data.branch + ) + # To-arc p flowing out of t_bus + constraint!( + c, + load_balance_p, + b.t_bus => b.c7 * vm[b.t_bus]^2 + + b.c1 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) + + b.c2 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) for + b in data.branch + ) + # From-arc q flowing out of f_bus + constraint!( + c, + load_balance_q, + b.f_bus => -b.c6 * vm[b.f_bus]^2 - + b.c4 * (vm[b.f_bus] * vm[b.t_bus] * cos(va[b.f_bus] - va[b.t_bus])) + + b.c3 * (vm[b.f_bus] * vm[b.t_bus] * sin(va[b.f_bus] - va[b.t_bus])) for + b in data.branch + ) + # To-arc q flowing out of t_bus + constraint!( + c, + load_balance_q, + b.t_bus => -b.c8 * vm[b.t_bus]^2 - + b.c2 * (vm[b.t_bus] * vm[b.f_bus] * cos(va[b.t_bus] - va[b.f_bus])) + + b.c1 * (vm[b.t_bus] * vm[b.f_bus] * sin(va[b.t_bus] - va[b.f_bus])) for + b in data.branch + ) constraint!(c, load_balance_p, g.bus => -pg[g.i] for g in data.gen) constraint!(c, load_balance_q, g.bus => -qg[g.i] for g in data.gen) - return ExaModel(c; prod = prod), length(data.bus), length(data.gen), length(data.arc) + # ref_bus_idxs: positions in y (1-indexed) of the reference bus voltage angles. + # va occupies positions 1..nbus, so ref bus at internal index i → y[i]. + ref_bus_idxs = Int.(Array(data.ref_buses)) # always CPU — just indices, not model data + return ExaModel(c; prod = prod), length(data.bus), length(data.gen), length(data.arc), ref_bus_idxs end diff --git a/test/runtests.jl b/test/runtests.jl index ef25104..da64f59 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -58,7 +58,7 @@ include("power.jl") rng = Random.default_rng(), T = Float64, ) - model, nbus, ngen, blines = + model, nbus, ngen, blines, _ = create_parametric_ac_power_model(filename; backend = backend, T = T) bm_train = BNK.BatchModel(model, batch_size, config = BNK.BatchModelConfig(:full)) bm_test = BNK.BatchModel(model, dataset_size, config = BNK.BatchModelConfig(:full)) @@ -66,12 +66,31 @@ include("power.jl") nvar = model.meta.nvar ncon = model.meta.ncon nθ = length(model.θ) - num_equal = nbus * 2 - - Θ_train = randn(T, nθ, dataset_size) |> dev_gpu - Θ_test = randn(T, nθ, dataset_size) |> dev_gpu - - primal_model = feed_forward_builder(nθ, nvar, [320, 320]) + # Compute equality count from model metadata — equality iff lcon == ucon. + # Constraint ordering in power.jl puts inequalities first, equalities last, + # so num_equal is the count of equalities at the tail of the constraint vector. + is_equal = model.meta.lcon .== model.meta.ucon + num_equal = count(is_equal) + @test all(is_equal[ncon-num_equal+1:ncon]) # ordering invariant: equalities at the tail + @test !any(is_equal[1:ncon-num_equal]) # ordering invariant: inequalities at the head + @info "Constraint counts" ncon num_equal num_inequal=(ncon-num_equal) + + # Paper-style sampling: load multipliers ~ 1.0 ± 10% + # (The model declares `parameter(c, [1.0 for b in data.bus])` so Θ is a + # per-bus load multiplier centered at 1.0.) + Random.seed!(rng, 0x12345678) + Θ_train = (T(1.0) .+ T(0.1) .* randn(rng, T, nθ, dataset_size)) |> dev_gpu + Random.seed!(rng, 0xCAFEBABE) + Θ_test = (T(1.0) .+ T(0.1) .* randn(rng, T, nθ, dataset_size)) |> dev_gpu + + # Primal architecture: 2-layer MLP + BoundedOutput head to enforce + # variable bounds architecturally (Park & Van Hentenryck 2023 AC-OPF model). + lvar = T.(model.meta.lvar) + uvar = T.(model.meta.uvar) + primal_model = Chain( + feed_forward_builder(nθ, nvar, [320, 320]), + BoundedOutput(lvar, uvar), + ) |> dev_gpu ps_primal, st_primal = Lux.setup(rng, primal_model) ps_primal = ps_primal |> dev_gpu st_primal = st_primal |> dev_gpu @@ -99,24 +118,40 @@ include("power.jl") data = DataLoader((Θ_train); batchsize = batch_size, shuffle = true) .|> dev_gpu - method = ALMMethod(; batch_model=bm_train, num_equal=num_equal) + # Paper hyperparameters for AC-OPF (Park & Van Hentenryck 2023, Table for AC-OPF): + # α=2 (not 10), ρ_max=1e4 — prevents the augmented Lagrangian from blowing up. + method = ALMMethod(; batch_model=bm_train, num_equal=num_equal, + max_dual=1.0e6, ρmax=1.0e4, τ=0.8, α=2.0) trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual) - method_test = ALMMethod(; batch_model=bm_test, num_equal=num_equal) + method_test = ALMMethod(; batch_model=bm_test, num_equal=num_equal, + max_dual=1.0e6, ρmax=1.0e4, τ=0.8, α=2.0) test_primal_loss = primal_loss(method_test) test_dual_loss = dual_loss(method_test) - _, prev_primal_loss_val, _, _ = test_primal_loss(primal_model, ps_primal, st_primal, (Θ_test, trainer)) + prev_primal_loss_val, _, prev_stats = test_primal_loss(primal_model, ps_primal, st_primal, (Θ_test, trainer)) + primal_loss_val = prev_primal_loss_val + dual_loss_val = zero(prev_primal_loss_val) + stats_primal = prev_stats + # L_primal/L_dual now count gradient steps (not epochs); use length(data) per + # outer iter to match the old behaviour of one full pass per outer iteration. + n_steps = length(data) for iter in 1:100 - single_train_step!(method, trainer, data) - # Log - _, primal_loss_val, stats_primal, train_state_primal = test_primal_loss(primal_model, ps_primal, st_primal, (Θ_test, trainer)) - _, dual_loss_val, stats_dual, train_state_dual = test_dual_loss(dual_model, ps_dual, st_dual, (Θ_test, trainer)) - - @info "Validation Testset: Iteration $iter" primal_loss_val stats_primal.max_violation stats_primal.mean_violations stats_primal.mean_objs dual_loss_val + single_train_step!(method, trainer, data; L_primal=n_steps, L_dual=n_steps) + # Log using the trained parameters stored in trainer + ps_primal_trained = trainer.primal_training_state.parameters + st_primal_trained = trainer.primal_training_state.states + ps_dual_trained = trainer.dual_training_state.parameters + st_dual_trained = trainer.dual_training_state.states + primal_loss_val, _, stats_primal = test_primal_loss(primal_model, ps_primal_trained, st_primal_trained, (Θ_test, trainer)) + dual_loss_val, _, stats_dual = test_dual_loss(dual_model, ps_dual_trained, st_dual_trained, (Θ_test, trainer)) + + @info "Validation Testset: Iteration $iter" primal_loss_val stats_primal.max_violation stats_primal.max_bound_violation stats_primal.mean_violations stats_primal.mean_objs dual_loss_val end - # Check that the primal loss is decreasing - @test primal_loss_val < prev_primal_loss_val + # Check that training drove down constraint violations. + # The AL loss itself can grow because the trained dual produces large multipliers + # against any residual violation; mean_violations is the metric ALM directly targets. + @test stats_primal.mean_violations < prev_stats.mean_violations return end From 45e89de548e4ad17e8f45966bfee0bcbd010d465 Mon Sep 17 00:00:00 2001 From: Khai Nguyen Date: Fri, 22 May 2026 22:06:14 -0400 Subject: [PATCH 2/5] Move use_analytical_dual and use_dual_learning into ALMMethod Both flags are now ALMMethod fields (defaults: false, true) instead of train!/single_train_step! kwargs, keeping all method config in one place. - use_analytical_dual: apply ALM dual update analytically per gradient step - use_dual_learning: set false to keep the dual network frozen throughout (skips dual training loop and the per-outer-iter deepcopy of dual state) Remove use_penalty_only from single_train_step! (warmup is handled directly in train! and was never exposed as a kwarg callers needed to set). --- examples/case57_train.jl | 12 +++---- examples/case57_train_twophase.jl | 10 +++--- src/L2OALM.jl | 58 +++++++++++++++---------------- 3 files changed, 38 insertions(+), 42 deletions(-) diff --git a/examples/case57_train.jl b/examples/case57_train.jl index 6ce43f5..d723fbe 100644 --- a/examples/case57_train.jl +++ b/examples/case57_train.jl @@ -34,8 +34,8 @@ const ALPHA = parse(Float64, get(ENV, "PDL_ALPHA", "2.0")) const RHO_EQ_SCALE = parse(Float64, get(ENV, "PDL_RHO_EQ_SCALE", "1.0")) const MAX_DUAL = parse(Float64, get(ENV, "PDL_MAX_DUAL", "1.0e6")) const HIDDEN_MULT = parse(Float64, get(ENV, "PDL_HIDDEN_MULT", "1.2")) -const USE_PENALTY_ONLY = parse(Bool, get(ENV, "PDL_USE_PENALTY_ONLY", "false")) const USE_ANALYTICAL_DUAL = parse(Bool, get(ENV, "PDL_USE_ANALYTICAL_DUAL", "false")) +const USE_DUAL_LEARNING = parse(Bool, get(ENV, "PDL_USE_DUAL_LEARNING", "true")) const T_F = Float32 using L2OALM @@ -145,11 +145,13 @@ function main() # Paper hyperparameters for AC-OPF. method = ALMMethod(; batch_model = bm_train, num_equal = num_equal, - max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE) + max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE, + use_analytical_dual = USE_ANALYTICAL_DUAL, use_dual_learning = USE_DUAL_LEARNING) trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_INIT) method_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal, - max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE) + max_dual = MAX_DUAL, ρmax = RHO_MAX, τ = TAU, α = ALPHA, ρ_eq_scale = RHO_EQ_SCALE, + use_analytical_dual = USE_ANALYTICAL_DUAL, use_dual_learning = USE_DUAL_LEARNING) test_primal_loss = primal_loss(method_test) # ---- CSV logger setup ---- @@ -201,14 +203,12 @@ function main() eval_and_log(0, trainer) # baseline before training - @info "Starting train!" K L_PRIMAL L_DUAL WARMUP_EPOCHS RHO_INIT RHO_MAX TAU ALPHA RHO_EQ_SCALE MAX_DUAL LR_PRIMAL LR_DUAL HIDDEN_MULT h USE_PENALTY_ONLY USE_ANALYTICAL_DUAL + @info "Starting train!" K L_PRIMAL L_DUAL WARMUP_EPOCHS RHO_INIT RHO_MAX TAU ALPHA RHO_EQ_SCALE MAX_DUAL LR_PRIMAL LR_DUAL HIDDEN_MULT h USE_ANALYTICAL_DUAL USE_DUAL_LEARNING train!(method, trainer, data; K = K, L_primal = L_PRIMAL, L_dual = L_DUAL, warmup_epochs = WARMUP_EPOCHS, lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY, - use_penalty_only = USE_PENALTY_ONLY, - use_analytical_dual = USE_ANALYTICAL_DUAL, eval_fn = eval_and_log, ) diff --git a/examples/case57_train_twophase.jl b/examples/case57_train_twophase.jl index 125abf2..7b5bfb7 100644 --- a/examples/case57_train_twophase.jl +++ b/examples/case57_train_twophase.jl @@ -203,10 +203,10 @@ function main() method_p1 = ALMMethod(; batch_model = bm_train, num_equal = num_equal, max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA, - ρ_eq_scale = RHO_EQ_SCALE_P1) + ρ_eq_scale = RHO_EQ_SCALE_P1, use_analytical_dual = true) method_p1_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal, max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA, - ρ_eq_scale = RHO_EQ_SCALE_P1) + ρ_eq_scale = RHO_EQ_SCALE_P1, use_analytical_dual = true) trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_P1) @@ -217,7 +217,6 @@ function main() K = K1, L_primal = L_PRIMAL, L_dual = L_DUAL, warmup_epochs = WARMUP_EPOCHS, lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY, - use_penalty_only = false, use_analytical_dual = true, eval_fn = eval_p1, ) @@ -231,10 +230,10 @@ function main() method_p2 = ALMMethod(; batch_model = bm_train, num_equal = num_equal, max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA, - ρ_eq_scale = RHO_EQ_SCALE_P2) + ρ_eq_scale = RHO_EQ_SCALE_P2, use_analytical_dual = true) method_p2_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal, max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA, - ρ_eq_scale = RHO_EQ_SCALE_P2) + ρ_eq_scale = RHO_EQ_SCALE_P2, use_analytical_dual = true) eval_p2 = make_eval_fn(2, method_p2_test) @@ -242,7 +241,6 @@ function main() K = K2, L_primal = L_PRIMAL, L_dual = L_DUAL, warmup_epochs = 0, # no second warmup — primal already in good basin lr_primal = LR_PRIMAL, lr_dual = LR_DUAL, lr_decay = LR_DECAY, - use_penalty_only = false, use_analytical_dual = true, eval_fn = eval_p2, ) diff --git a/src/L2OALM.jl b/src/L2OALM.jl index fcc90fd..341fd19 100644 --- a/src/L2OALM.jl +++ b/src/L2OALM.jl @@ -122,9 +122,11 @@ struct ALMMethod{T<:Real} <: AbstractPrimalDualMethod α::T num_equal::Int # TODO: There should be a way to get this from the batch model ρ_eq_scale::T # extra penalty multiplier for equality constraints (default 1.0) + use_analytical_dual::Bool # apply ALM dual update analytically per gradient step + use_dual_learning::Bool # train the dual network; false = frozen dual throughout - function ALMMethod(batch_model::BatchModel, max_dual::T, ρmax::T, τ::T, α::T, num_equal::Int, ρ_eq_scale::T) where {T<:Real} - new{T}(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale) + function ALMMethod(batch_model::BatchModel, max_dual::T, ρmax::T, τ::T, α::T, num_equal::Int, ρ_eq_scale::T, use_analytical_dual::Bool, use_dual_learning::Bool) where {T<:Real} + new{T}(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale, use_analytical_dual, use_dual_learning) end end @@ -146,8 +148,10 @@ function ALMMethod(; τ::T = 0.8, α::T = 10.0, ρ_eq_scale::T = 1.0, + use_analytical_dual::Bool = false, + use_dual_learning::Bool = true, ) where {T<:Real} - return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale) + return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale, use_analytical_dual, use_dual_learning) end """ @@ -386,7 +390,9 @@ function update_trainer!( # Update dual state trainer.dual_loss = dual_state.dual_loss - trainer.prev_dual_training_state = deepcopy(trainer.dual_training_state) + if method.use_dual_learning + trainer.prev_dual_training_state = deepcopy(trainer.dual_training_state) + end return end @@ -434,27 +440,17 @@ with the dual model fixed, then the inverse is done with the dual learning metho updates the trainer state. """ function single_train_step!( - method::AbstractPrimalDualMethod, - trainer::AbstractPrimalDualTrainer, + method::ALMMethod, + trainer::ALMTrainer, data; L_primal::Int = 1, L_dual::Int = 1, - use_penalty_only::Bool = false, - use_analytical_dual::Bool = false, ) - _primal_loss = if use_penalty_only - penalty_only_primal_loss(method) - elseif use_analytical_dual - analytical_primal_loss(method) - else - primal_loss(method) - end - _dual_loss = dual_loss(method) + _primal_loss = method.use_analytical_dual ? analytical_primal_loss(method) : primal_loss(method) train_state_primal = trainer.primal_training_state train_state_dual = trainer.dual_training_state # ----- primal phase: exactly L_primal gradient steps, dual frozen ----- - # Cycle through data batches (reshuffled each epoch) until L_primal steps done. primal_states = NamedTuple[] primal_step = 0 while primal_step < L_primal @@ -471,19 +467,23 @@ function single_train_step!( current_state_primal = reconcile_primal(trainer, primal_states) # ----- dual phase: exactly L_dual gradient steps, primal frozen ----- + # Skipped entirely when use_dual_learning = false (dual network stays frozen). dual_states = NamedTuple[] - dual_step = 0 - while dual_step < L_dual - for θ in data - dual_step >= L_dual && break - _, _, stats, train_state_dual = Training.single_train_step!( - AutoZygote(), _dual_loss, (θ, trainer), train_state_dual, - ) - push!(dual_states, stats) - dual_step += 1 + if method.use_dual_learning + _dual_loss = dual_loss(method) + dual_step = 0 + while dual_step < L_dual + for θ in data + dual_step >= L_dual && break + _, _, stats, train_state_dual = Training.single_train_step!( + AutoZygote(), _dual_loss, (θ, trainer), train_state_dual, + ) + push!(dual_states, stats) + dual_step += 1 + end end + trainer.dual_training_state = train_state_dual end - trainer.dual_training_state = train_state_dual update_trainer!(method, trainer, current_state_primal, reconcile_dual(trainer, dual_states)) return end @@ -629,8 +629,6 @@ function train!( warmup_epochs::Int = 4, lr_primal::Real = 1e-4, lr_dual::Real = 1e-4, lr_decay::Real = 0.99, - use_penalty_only::Bool = false, - use_analytical_dual::Bool = false, eval_fn = nothing, ) Optimisers.adjust!(trainer.primal_training_state.optimizer_state, lr_primal) @@ -664,7 +662,7 @@ function train!( # ----- alternating primal/dual loop with optional lr decay ----- last_val = typemax(Float64) for outer in 1:K - single_train_step!(method, trainer, data; L_primal = L_primal, L_dual = L_dual, use_penalty_only = use_penalty_only, use_analytical_dual = use_analytical_dual) + single_train_step!(method, trainer, data; L_primal = L_primal, L_dual = L_dual) if eval_fn !== nothing v = eval_fn(outer, trainer) if v isa Real From b0304f04670071d1eab2ca31afe0edec8501a557 Mon Sep 17 00:00:00 2001 From: Khai Nguyen Date: Fri, 22 May 2026 22:31:49 -0400 Subject: [PATCH 3/5] Default use_analytical_dual to true (best config from experiments) --- examples/case57_train_twophase.jl | 8 ++++---- src/L2OALM.jl | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/case57_train_twophase.jl b/examples/case57_train_twophase.jl index 7b5bfb7..cc81cb0 100644 --- a/examples/case57_train_twophase.jl +++ b/examples/case57_train_twophase.jl @@ -203,10 +203,10 @@ function main() method_p1 = ALMMethod(; batch_model = bm_train, num_equal = num_equal, max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA, - ρ_eq_scale = RHO_EQ_SCALE_P1, use_analytical_dual = true) + ρ_eq_scale = RHO_EQ_SCALE_P1) method_p1_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal, max_dual = MAX_DUAL_P1, ρmax = RHO_P1, τ = TAU, α = ALPHA, - ρ_eq_scale = RHO_EQ_SCALE_P1, use_analytical_dual = true) + ρ_eq_scale = RHO_EQ_SCALE_P1) trainer = ALMTrainer(primal_model, train_state_primal, dual_model, train_state_dual, RHO_P1) @@ -230,10 +230,10 @@ function main() method_p2 = ALMMethod(; batch_model = bm_train, num_equal = num_equal, max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA, - ρ_eq_scale = RHO_EQ_SCALE_P2, use_analytical_dual = true) + ρ_eq_scale = RHO_EQ_SCALE_P2) method_p2_test = ALMMethod(; batch_model = bm_test, num_equal = num_equal, max_dual = MAX_DUAL_P2, ρmax = RHO_MAX_P2, τ = TAU, α = ALPHA, - ρ_eq_scale = RHO_EQ_SCALE_P2, use_analytical_dual = true) + ρ_eq_scale = RHO_EQ_SCALE_P2) eval_p2 = make_eval_fn(2, method_p2_test) diff --git a/src/L2OALM.jl b/src/L2OALM.jl index 341fd19..6d52796 100644 --- a/src/L2OALM.jl +++ b/src/L2OALM.jl @@ -131,7 +131,7 @@ struct ALMMethod{T<:Real} <: AbstractPrimalDualMethod end """ - ALMMethod(; batch_model, num_equal, max_dual=1e6, ρmax=1e6, τ=0.8, α=10.0, ρ_eq_scale=1.0) + ALMMethod(; batch_model, num_equal, max_dual=1e6, ρmax=1e6, τ=0.8, α=10.0, ρ_eq_scale=1.0, use_analytical_dual=true, use_dual_learning=true) Constructor for the ALM method. @@ -148,7 +148,7 @@ function ALMMethod(; τ::T = 0.8, α::T = 10.0, ρ_eq_scale::T = 1.0, - use_analytical_dual::Bool = false, + use_analytical_dual::Bool = true, use_dual_learning::Bool = true, ) where {T<:Real} return ALMMethod(batch_model, max_dual, ρmax, τ, α, num_equal, ρ_eq_scale, use_analytical_dual, use_dual_learning) From d13bc9b65cd1a1624e02656958a4608f9d1053b5 Mon Sep 17 00:00:00 2001 From: Khai Nguyen Date: Fri, 22 May 2026 22:41:32 -0400 Subject: [PATCH 4/5] Write README: installation, quick start, API reference, benchmark results --- README.md | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 191 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a9fa2de..9d9a179 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,197 @@ # L2OALM.jl -Learning To Optimize using the Augmented Lagrangian Primal-Dual Method. +Julia implementation of **Primal-Dual Learning (PDL)** for parametric constrained optimization, following Park & Van Hentenryck, *"Self-Supervised Primal-Dual Learning for Constrained Optimization"* (AAAI 2023). -[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://LearningToOptimize.github.io/L2OALM.jl/stable/) -[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://LearningToOptimize.github.io/L2OALM.jl/dev/) [![Build Status](https://github.com/LearningToOptimize/L2OALM.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/LearningToOptimize/L2OALM.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Coverage](https://codecov.io/gh/LearningToOptimize/L2OALM.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/LearningToOptimize/L2OALM.jl) +--- + +## What it does + +Given a parametric program + +``` +min f(y; θ) + y +s.t. g(y; θ) ≤ 0 [inequalities] + h(y; θ) = 0 [equalities] + lvar ≤ y ≤ uvar [bounds] +``` + +PDL trains two networks jointly — **self-supervised**, no solver in the loop: + +| Network | Maps | Role | +|---------|------|------| +| Primal `ŷ(θ; φ)` | parameters → decisions | Produces a near-feasible, near-optimal solution | +| Dual `λ̂(θ; ψ)` | parameters → multipliers | Predicts Lagrange multipliers (μ for ineq, λ for eq) | + +Training mimics the Augmented Lagrangian Method (ALM): alternate between minimizing the augmented Lagrangian over `φ` and regressing `ψ` onto the ALM multiplier update. + +--- + +## Installation + +```julia +using Pkg +Pkg.add(url="https://github.com/LearningToOptimize/L2OALM.jl") +``` + +`BatchNLPKernels` is a required dependency pinned to its GitHub `main` branch and is resolved automatically. + +--- + +## Quick start + +```julia +using L2OALM, Lux, Optimisers, MLUtils, BatchNLPKernels + +# 1. Build a BatchModel wrapping your ExaModel (see test/power.jl for AC-OPF) +bm_train = BNK.BatchModel(model, batch_size, config=BNK.BatchModelConfig(:full)) +bm_test = BNK.BatchModel(model, test_size, config=BNK.BatchModelConfig(:full)) + +# 2. Define primal and dual networks +primal_net = Chain(Dense(nθ, 512, relu), Dense(512, 512, relu), Dense(512, nvar), + BoundedOutput(lvar, uvar)) # enforces bounds architecturally +dual_net = Chain(Dense(nθ, 512, relu), Dense(512, 512, relu), Dense(512, ncon)) + +ps_p, st_p = Lux.setup(rng, primal_net) +ps_d, st_d = Lux.setup(rng, dual_net) + +# 3. Configure the ALM method +method = ALMMethod(; + batch_model = bm_train, + num_equal = num_equal, # number of equality constraints (tail of constraint vec) + ρmax = 1e4, + max_dual = 1e6, + τ = 0.8, + α = 2.0, + use_analytical_dual = true, # apply ALM update analytically per gradient step (recommended) + use_dual_learning = true, # train the dual network (recommended) +) + +trainer = ALMTrainer(primal_net, + Training.TrainState(primal_net, ps_p, st_p, Optimisers.Adam(1e-4)), + dual_net, + Training.TrainState(dual_net, ps_d, st_d, Optimisers.Adam(1e-3))) + +data = DataLoader(Θ_train; batchsize=200, shuffle=true) + +# 4. Train +train!(method, trainer, data; + K = 100, # outer ALM iterations + L_primal = 2500, # gradient steps per primal phase + L_dual = 2500, # gradient steps per dual phase + warmup_epochs = 25000, # penalty-only warm-start steps before ALM loop + lr_primal = 1e-4, + lr_dual = 1e-3, + lr_decay = 0.99, +) +``` + +--- + +## Key components + +### `BoundedOutput(lvar, uvar)` + +Lux layer that enforces variable bounds architecturally using sigmoid: + +``` +yᵢ = lvar[i] + (uvar[i] − lvar[i]) · σ(zᵢ) +``` + +Guarantees `max_bound_violation ≡ 0` by construction. Uses sigmoid (not hardsigmoid) so gradient is always nonzero — hardsigmoid's zero-gradient zone at `z < −3` permanently traps outputs at the lower bound. + +### `FixRefBus(nvar, ref_bus_idxs)` + +Lux layer that zeroes the reference-bus voltage angle architecturally. GPU-safe — mask lives on the same device as the network. + +### `ALMMethod` + +Immutable configuration struct. Key fields: + +| Field | Default | Description | +|-------|---------|-------------| +| `ρmax` | `1e6` | Maximum penalty parameter | +| `max_dual` | `1e6` | Multiplier clip bound | +| `τ` | `0.8` | Violation ratio threshold for ρ update | +| `α` | `10.0` | ρ growth factor | +| `ρ_eq_scale` | `1.0` | Extra penalty multiplier for equalities only | +| `use_analytical_dual` | `true` | Apply ALM dual update per gradient step | +| `use_dual_learning` | `true` | Train the dual network | + +**`use_analytical_dual`**: instead of using the dual network output directly, computes analytically-corrected multipliers at every primal gradient step: +``` +μ_eff = clamp(μ̂(θ) + ρ·g(ŷ), 0, max_dual) +λ_eff = clamp(λ̂(θ) + ρ·h(ŷ), −max_dual, max_dual) +``` +Eliminates the dual tracking gap (the dual network lags the true multipliers by orders of magnitude at high ρ without this correction). + +**`use_dual_learning`**: set `false` to freeze the dual network — useful for ablations where you want to isolate the effect of the analytical correction alone. + +### `ALMTrainer` + +Mutable state struct holding both networks, their `TrainState`s, the current penalty `ρ`, and a snapshot of the previous dual state used for computing ALM targets. + +### `train!(method, trainer, data; ...)` + +Top-level training loop: +1. **Warm-start** (`warmup_epochs` gradient steps): penalty-only primal loss at `ρmax` to push the network into a feasible basin before the dual starts producing meaningful multipliers. +2. **K outer iterations**: alternate primal phase (`L_primal` steps) and dual phase (`L_dual` steps); update ρ if violations stagnate; optional `eval_fn` callback with learning-rate decay. + +--- + +## Constraint ordering + +Constraints must be ordered as **inequalities first, equalities last**. `num_equal` is the count of equalities at the tail of the constraint vector. See `test/power.jl` for how to build a compliant `ExaModel` for AC-OPF. + +--- + +## AC-OPF benchmark (case57) + +Results on `pglib_opf_case57_ieee` (128 variables, 435 constraints: 320 ineq + 115 eq), 5000 held-out test samples: + +| Config | max_eq | max_ineq | Notes | +|--------|--------|----------|-------| +| `use_analytical_dual=true`, ρmax=1e4, max_dual=1e6 | **1.165** | 0.000 | Best result | +| `use_analytical_dual=false` (dual network only) | ~1.25 | 0.000 | Tracking gap hurts | +| ρmax=1e6, max_dual=1e6 | 1.741 | 0.000 | Oscillates at high ρ | +| max_dual=1e4 (any ρ schedule) | ~1.80 | 0.000 | Gradient saturates at 1.0/step | + +Variable bounds and inequality constraints are satisfied exactly (`max_bound = max_ineq = 0`) in all runs by iter ~10. + +--- + +## Running tests + +```bash +# CPU +julia --project=. -e 'using Pkg; Pkg.test()' + +# GPU (CUDA) +BNK_TEST_CUDA=1 julia --project=. -e 'using Pkg; Pkg.test()' +``` + +The test uses `pglib_opf_case14_ieee` (downloaded automatically via the `PGLib` artifact on first run). + +--- + +## Examples + +- [`examples/case57_train.jl`](examples/case57_train.jl) — single-phase training on case57; all hyperparameters overridable via `PDL_*` env vars. +- [`examples/case57_train_twophase.jl`](examples/case57_train_twophase.jl) — two-phase training: Phase 1 at fixed high ρ (avoids degenerate pg=0 basin), Phase 2 grows ρ with `ρ_eq_scale`. + +--- + +## Reference + +```bibtex +@inproceedings{park2023pdl, + title = {Self-Supervised Primal-Dual Learning for Constrained Optimization}, + author = {Park, Seonho and Van Hentenryck, Pascal}, + booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence}, + year = {2023}, + url = {https://arxiv.org/abs/2208.09046} +} +``` From b4333c462c6740fd9cdd6d1ceca1d95c97ba0d5c Mon Sep 17 00:00:00 2001 From: Andrew Rosemberg Date: Thu, 28 May 2026 11:22:08 -0400 Subject: [PATCH 5/5] Modify CI.yml to update Go versions Updated the CI workflow to include version 1.12 and removed older versions. --- .github/workflows/CI.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index dca863e..2d27e8c 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -23,9 +23,7 @@ jobs: fail-fast: false matrix: version: - - '1.10' - - '1.6' - - 'pre' + - '1.12' os: - ubuntu-latest arch: