From 8d609a41f35f1490f1bc4882fa5fdda0ffa4ca4e Mon Sep 17 00:00:00 2001 From: liobrasil Date: Tue, 10 Mar 2026 22:19:08 -0400 Subject: [PATCH 1/3] test: tighten supervisor recovery regressions --- cadence/tests/scheduled_supervisor_test.cdc | 105 ++++++- ...duler_mixed_population_regression_test.cdc | 292 ++++++++++++++++++ 2 files changed, 388 insertions(+), 9 deletions(-) create mode 100644 cadence/tests/scheduler_mixed_population_regression_test.cdc diff --git a/cadence/tests/scheduled_supervisor_test.cdc b/cadence/tests/scheduled_supervisor_test.cdc index 32438057..bb96543a 100644 --- a/cadence/tests/scheduled_supervisor_test.cdc +++ b/cadence/tests/scheduled_supervisor_test.cdc @@ -599,6 +599,25 @@ fun testStuckYieldVaultDetectionLogic() { log("PASS: Stuck yield vault detection correctly identifies healthy yield vaults") } +/// Returns per-yield-vault recovery event counts from YieldVaultRecovered events. +/// +/// This is used by stress tests to distinguish "all vaults recovered at least once" +/// from "lots of recovery events happened", which can otherwise hide duplicate +/// recovery churn for the same vault IDs. +/// +/// Example: 240 recovery events for 200 vaults can look healthy if the test only +/// checks `events.length >= 200`, even though 40 of those events may be repeats. +access(all) +fun getRecoveredYieldVaultCounts(): {UInt64: Int} { + let counts: {UInt64: Int} = {} + let recoveredEvents = Test.eventsOfType(Type()) + for recoveredAny in recoveredEvents { + let recovered = recoveredAny as! FlowYieldVaultsSchedulerV1.YieldVaultRecovered + counts[recovered.yieldVaultID] = (counts[recovered.yieldVaultID] ?? 0) + 1 + } + return counts +} + /// COMPREHENSIVE TEST: Insufficient Funds -> Failure -> Recovery /// /// This test validates the COMPLETE failure and recovery cycle: @@ -939,9 +958,21 @@ fun testInsufficientFundsAndRecovery() { /// /// Flow: create 200 yield vaults, run 2 scheduling rounds, drain FLOW so executions fail, /// wait for vaults to be marked stuck, refund FLOW, schedule the supervisor, then advance -/// time for ceil(200/MAX_BATCH_SIZE)+10 supervisor ticks. Asserts all 200 vaults are -/// recovered (YieldVaultRecovered events), none still stuck, and all have active schedules. -/// The +10 extra ticks are a buffer so every vault is processed despite scheduler timing. +/// time for enough supervisor ticks to recover all unique vault IDs, plus a short +/// stabilization window. This asserts: +/// - every one of the 200 vault IDs is recovered at least once, +/// - no recovery failures occur, +/// - no vault emits more than one recovery event, +/// - once all vaults are recovered and healthy, extra supervisor ticks do not emit +/// additional recovery events, +/// - none remain stuck, and all have active schedules. +/// +/// Why this was tightened: +/// the earlier version only checked `YieldVaultRecovered.length >= n` after +/// `ceil(n / MAX_BATCH_SIZE) + 10` supervisor ticks. That allowed the test to pass +/// even when some vaults recovered more than once while others had not yet been +/// uniquely validated. This version keeps the same tick budget, but uses it as a +/// timeout ceiling instead of treating it as proof that the recovery set is clean. access(all) fun testSupervisorHandlesManyStuckVaults() { let n = 200 @@ -1024,19 +1055,75 @@ fun testSupervisorHandlesManyStuckVaults() { ) Test.expect(schedSupRes, Test.beSucceeded()) - // 7. Advance time for supervisor ticks (ceil(n/MAX_BATCH_SIZE)+10); each tick processes a batch + // 7. Advance time until every target vault has emitted at least one recovery event. + // + // We still compute ceil(n / MAX_BATCH_SIZE) + 10, but it now acts as a maximum + // allowed budget for supervisor ticks. The loop stops early once all 200 vault IDs + // have been seen at least once. This makes the assertion sensitive to duplicate + // recoveries: repeated events for the same vault no longer help the test finish. + // + // After all unique IDs are observed, run a short stabilization window to check + // that a healthy supervisor does not continue to emit recovery events. let supervisorRunsNeeded = (UInt(n) + UInt(maxBatchSize) - 1) / UInt(maxBatchSize) + let maxSupervisorTicks = supervisorRunsNeeded + 10 var run = 0 as UInt - while run < supervisorRunsNeeded + 10 { + var recoveredCounts = getRecoveredYieldVaultCounts() + while run < maxSupervisorTicks && recoveredCounts.length < n { Test.moveTime(by: 60.0 * 10.0 + 10.0) Test.commitBlock() run = run + 1 + recoveredCounts = getRecoveredYieldVaultCounts() + } + log("testSupervisorHandlesManyStuckVaults: ran \(run.toString()) supervisor ticks to reach \(recoveredCounts.length.toString()) unique recovered vaults") + + let recoveryFailures = Test.eventsOfType(Type()) + Test.assertEqual(0, recoveryFailures.length) + + // Split the validation into: + // - missing recoveries: some vaults never emitted YieldVaultRecovered at all + // - duplicate recoveries: some vaults emitted YieldVaultRecovered more than once + // + // Both matter. Missing recoveries means the supervisor did not cover the whole set. + // Duplicate recoveries means event volume was inflated by churn, which the old + // `recoveredEvents.length >= n` assertion could not distinguish from success. + var missingRecoveries = 0 + var duplicatedRecoveries = 0 + var duplicatedVaults = 0 + for yieldVaultID in yieldVaultIDs { + let recoveryCount = recoveredCounts[yieldVaultID] ?? 0 + if recoveryCount == 0 { + missingRecoveries = missingRecoveries + 1 + } else if recoveryCount > 1 { + duplicatedRecoveries = duplicatedRecoveries + (recoveryCount - 1) + duplicatedVaults = duplicatedVaults + 1 + } } - log("testSupervisorHandlesManyStuckVaults: ran \((supervisorRunsNeeded + 10).toString()) supervisor ticks") + Test.assert( + missingRecoveries == 0, + message: "expected every vault to recover at least once, but \(missingRecoveries.toString()) vaults emitted no YieldVaultRecovered event" + ) + Test.assert( + duplicatedRecoveries == 0, + message: "expected exactly one recovery per vault, but saw \(duplicatedRecoveries.toString()) duplicate recoveries across \(duplicatedVaults.toString()) vaults" + ) - let recoveredEvents = Test.eventsOfType(Type()) - Test.assert(recoveredEvents.length >= n, message: "expected at least \(n.toString()) recovered, got \(recoveredEvents.length.toString())") - log("testSupervisorHandlesManyStuckVaults: recovered \(recoveredEvents.length.toString()) vaults") + // This second guard catches late churn that may start only after the full unique + // set has already been recovered. The duplicate-per-vault check above inspects the + // state up to this point; the stabilization window verifies the system stays quiet + // once recovery should be complete. + let recoveredEventsBeforeStabilization = Test.eventsOfType(Type()).length + var stabilizationTick = 0 + while stabilizationTick < 2 { + Test.moveTime(by: 60.0 * 10.0 + 10.0) + Test.commitBlock() + stabilizationTick = stabilizationTick + 1 + } + let recoveredEventsAfterStabilization = Test.eventsOfType(Type()).length + Test.assert( + recoveredEventsAfterStabilization == recoveredEventsBeforeStabilization, + message: "expected no additional recovery churn after all vaults were recovered; before stabilization: \(recoveredEventsBeforeStabilization.toString()), after: \(recoveredEventsAfterStabilization.toString())" + ) + log("testSupervisorHandlesManyStuckVaults: stable recovery set of \(recoveredCounts.length.toString()) unique vaults with \(recoveredEventsAfterStabilization.toString()) total recovery events") // 8. Health check: none stuck, all have active schedules var stillStuck = 0 diff --git a/cadence/tests/scheduler_mixed_population_regression_test.cdc b/cadence/tests/scheduler_mixed_population_regression_test.cdc new file mode 100644 index 00000000..036758ae --- /dev/null +++ b/cadence/tests/scheduler_mixed_population_regression_test.cdc @@ -0,0 +1,292 @@ +/// Mixed-population regression tests for the scheduler stuck-scan. +/// +/// WHY THIS FILE EXISTS +/// -------------------- +/// The Supervisor does not scan the full registry for stuck vaults. Each run only asks the +/// registry for up to MAX_BATCH_SIZE candidates from the tail of the scan order, then checks +/// those candidates one by one. +/// +/// That optimization breaks down when the scan order contains vaults that can never become +/// stuck. Today, non-recurring vaults are still registered into the same ordering even though +/// isStuckYieldVault() immediately returns false for them. +/// +/// This creates a liveness risk: +/// - more than MAX_BATCH_SIZE non-recurring vaults can occupy the tail, +/// - the Supervisor can keep rescanning those same ineligible IDs, +/// - and a real stuck recurring vault further up the list is never detected or recovered. +/// +/// This file exists to lock that failure mode down as a regression. The main test below +/// intentionally builds that mixed population and asserts the Supervisor should still find +/// the real stuck vault. On the current implementation, that assertion fails, which is the +/// exact bug this test is meant to expose until the scheduler logic is fixed. +import Test +import BlockchainHelpers + +import "test_helpers.cdc" + +import "FlowToken" +import "MOET" +import "YieldToken" +import "MockStrategies" +import "FlowYieldVaultsAutoBalancers" +import "FlowYieldVaultsSchedulerV1" +import "FlowYieldVaultsSchedulerRegistry" + +access(all) let protocolAccount = Test.getAccount(0x0000000000000008) +access(all) let flowYieldVaultsAccount = Test.getAccount(0x0000000000000009) +access(all) let yieldTokenAccount = Test.getAccount(0x0000000000000010) + +access(all) var strategyIdentifier = Type<@MockStrategies.TracerStrategy>().identifier +access(all) var flowTokenIdentifier = Type<@FlowToken.Vault>().identifier +access(all) var yieldTokenIdentifier = Type<@YieldToken.Vault>().identifier +access(all) var moetTokenIdentifier = Type<@MOET.Vault>().identifier +access(all) var snapshot: UInt64 = 0 + +access(all) +fun setup() { + log("Setting up mixed-population scheduler regression test...") + + deployContracts() + mintFlow(to: flowYieldVaultsAccount, amount: 1000.0) + + setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: yieldTokenIdentifier, price: 1.0) + setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: flowTokenIdentifier, price: 1.0) + + let reserveAmount = 100_000_00.0 + setupMoetVault(protocolAccount, beFailed: false) + setupYieldVault(protocolAccount, beFailed: false) + mintFlow(to: protocolAccount, amount: reserveAmount) + mintMoet(signer: protocolAccount, to: protocolAccount.address, amount: reserveAmount, beFailed: false) + mintYield(signer: yieldTokenAccount, to: protocolAccount.address, amount: reserveAmount, beFailed: false) + setMockSwapperLiquidityConnector(signer: protocolAccount, vaultStoragePath: MOET.VaultStoragePath) + setMockSwapperLiquidityConnector(signer: protocolAccount, vaultStoragePath: YieldToken.VaultStoragePath) + setMockSwapperLiquidityConnector(signer: protocolAccount, vaultStoragePath: /storage/flowTokenVault) + + createAndStorePool(signer: protocolAccount, defaultTokenIdentifier: moetTokenIdentifier, beFailed: false) + addSupportedTokenFixedRateInterestCurve( + signer: protocolAccount, + tokenTypeIdentifier: flowTokenIdentifier, + collateralFactor: 0.8, + borrowFactor: 1.0, + yearlyRate: UFix128(0.1), + depositRate: 1_000_000.0, + depositCapacityCap: 1_000_000.0 + ) + + let openRes = _executeTransaction( + "../../lib/FlowALP/cadence/transactions/flow-alp/position/create_position.cdc", + [reserveAmount / 2.0, /storage/flowTokenVault, true], + protocolAccount + ) + Test.expect(openRes, Test.beSucceeded()) + + addStrategyComposer( + signer: flowYieldVaultsAccount, + strategyIdentifier: strategyIdentifier, + composerIdentifier: Type<@MockStrategies.TracerStrategyComposer>().identifier, + issuerStoragePath: MockStrategies.IssuerStoragePath, + beFailed: false + ) + + snapshot = getCurrentBlockHeight() + log("Setup complete") +} + +access(all) +fun cancelSchedulesAndRemoveRecurringConfig(yieldVaultID: UInt64) { + let storagePath = FlowYieldVaultsAutoBalancers.deriveAutoBalancerPath( + id: yieldVaultID, + storage: true + ) as! StoragePath + let res = _executeTransaction( + "../transactions/flow-yield-vaults/admin/cancel_all_scheduled_transactions_and_remove_recurring_config.cdc", + [storagePath], + flowYieldVaultsAccount + ) + Test.expect(res, Test.beSucceeded()) +} + +access(all) +fun isStuckYieldVault(_ yieldVaultID: UInt64): Bool { + let res = _executeScript("../scripts/flow-yield-vaults/is_stuck_yield_vault.cdc", [yieldVaultID]) + Test.expect(res, Test.beSucceeded()) + return res.returnValue! as! Bool +} + +access(all) +fun hasActiveSchedule(_ yieldVaultID: UInt64): Bool { + let res = _executeScript("../scripts/flow-yield-vaults/has_active_schedule.cdc", [yieldVaultID]) + Test.expect(res, Test.beSucceeded()) + return res.returnValue! as! Bool +} + +access(all) +fun getFlowYieldVaultsFlowBalance(): UFix64 { + let res = _executeScript( + "../scripts/flow-yield-vaults/get_flow_balance.cdc", + [flowYieldVaultsAccount.address] + ) + Test.expect(res, Test.beSucceeded()) + return res.returnValue! as! UFix64 +} + +/// Regression test: more than MAX_BATCH_SIZE non-recurring registry entries must not +/// permanently starve stuck recurring vault detection. +/// +/// Setup: +/// 1. Create MAX_BATCH_SIZE + 1 mock vaults and strip their recurring config/schedules. +/// 2. Create one real recurring vault after them so it sits behind those tail entries. +/// 3. Drain FLOW so that recurring vault executes once but fails to reschedule, becoming stuck. +/// 4. Fund and run the Supervisor for several ticks. +/// +/// Expected behavior: +/// - The recurring vault is eventually detected and recovered. +/// - The non-recurring tail entries do not block recovery forever. +access(all) +fun testSupervisorScansPastNonRecurringTailEntries() { + if snapshot != getCurrentBlockHeight() { + Test.reset(to: snapshot) + } + log("\n[TEST] Supervisor scans past non-recurring tail entries...") + + let blockerCount = FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE + 1 + let user = Test.createAccount() + mintFlow(to: user, amount: 2000.0) + grantBeta(flowYieldVaultsAccount, user) + + // Step 1: create more than one full scan batch of normal recurring mock vaults. + // We will convert these into permanently ineligible "blockers" without removing them + // from the registry, so they keep occupying the tail of the scan order. + var idx = 0 + while idx < blockerCount { + let createRes = _executeTransaction( + "../transactions/flow-yield-vaults/create_yield_vault.cdc", + [strategyIdentifier, flowTokenIdentifier, 25.0], + user + ) + Test.expect(createRes, Test.beSucceeded()) + idx = idx + 1 + } + + let blockerIDs = getYieldVaultIDs(address: user.address)! + Test.assertEqual(blockerCount, blockerIDs.length) + + // Step 2: strip schedules and recurring config from those vaults. They stay registered, + // but they can no longer self-schedule and they can never satisfy isStuckYieldVault(). + for blockerID in blockerIDs { + cancelSchedulesAndRemoveRecurringConfig(yieldVaultID: blockerID) + Test.assertEqual(false, hasActiveSchedule(blockerID)) + Test.assertEqual(false, isStuckYieldVault(blockerID)) + } + log("Prepared \(blockerCount.toString()) non-recurring registry entries at the tail") + + // Step 3: create one real recurring vault after the blockers. This vault is the one we + // will intentionally push into a stuck state and expect the Supervisor to recover. + let targetRes = _executeTransaction( + "../transactions/flow-yield-vaults/create_yield_vault.cdc", + [strategyIdentifier, flowTokenIdentifier, 25.0], + user + ) + Test.expect(targetRes, Test.beSucceeded()) + + let allYieldVaultIDs = getYieldVaultIDs(address: user.address)! + var recurringYieldVaultID: UInt64 = 0 + var foundRecurringTarget = false + for yieldVaultID in allYieldVaultIDs { + if !blockerIDs.contains(yieldVaultID) { + recurringYieldVaultID = yieldVaultID + foundRecurringTarget = true + } + } + Test.assert(foundRecurringTarget, message: "Failed to identify the recurring target yield vault") + Test.assertEqual(true, hasActiveSchedule(recurringYieldVaultID)) + + // Sanity check the test setup: the first scan batch should contain only blocker IDs, + // proving the real recurring target starts behind the tail window the Supervisor reads. + let initialCandidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates( + limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE) + ) + Test.assert( + !initialCandidates.contains(recurringYieldVaultID), + message: "Setup failure: target recurring vault should sit behind the first stuck-scan batch" + ) + for candidate in initialCandidates { + Test.assert( + blockerIDs.contains(candidate), + message: "Setup failure: initial tail scan should contain only non-recurring blockers" + ) + } + + setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: flowTokenIdentifier, price: 2.0) + setMockOraclePrice(signer: flowYieldVaultsAccount, forTokenIdentifier: yieldTokenIdentifier, price: 1.5) + + // Step 4: drain the shared FLOW fee vault to the minimum possible residual balance. + // The target vault already has its first schedule funded, so it should execute once and + // then eventually stop self-scheduling, making it a genuine stuck recurring vault. + let balanceBeforeDrain = getFlowYieldVaultsFlowBalance() + let residualBalance = 0.00000001 + if balanceBeforeDrain > residualBalance { + let drainRes = _executeTransaction( + "../transactions/flow-yield-vaults/drain_flow.cdc", + [balanceBeforeDrain - residualBalance], + flowYieldVaultsAccount + ) + Test.expect(drainRes, Test.beSucceeded()) + } + + // Give the already-funded first execution time to run, then keep advancing until the + // vault becomes overdue with no active schedule. A nearly-empty fee vault can still be + // enough for one extra scheduling attempt, so this waits several intervals. + idx = 0 + while idx < 6 && !isStuckYieldVault(recurringYieldVaultID) { + Test.moveTime(by: 60.0 * 10.0 + 10.0) + Test.commitBlock() + idx = idx + 1 + } + + Test.assertEqual(true, isStuckYieldVault(recurringYieldVaultID)) + Test.assertEqual(false, hasActiveSchedule(recurringYieldVaultID)) + + // Step 5: fund the account again and start the Supervisor. A correct implementation + // should eventually scan past the blockers, detect the real stuck recurring vault, and + // recover it. + mintFlow(to: flowYieldVaultsAccount, amount: 200.0) + Test.commitBlock() + + let recoveredEventsBefore = Test.eventsOfType(Type()).length + let detectedEventsBefore = Test.eventsOfType(Type()).length + + let scheduleSupervisorRes = _executeTransaction( + "../transactions/flow-yield-vaults/admin/schedule_supervisor.cdc", + [60.0 * 10.0, UInt8(1), UInt64(800), true], + flowYieldVaultsAccount + ) + Test.expect(scheduleSupervisorRes, Test.beSucceeded()) + + let supervisorTicks = 3 + idx = 0 + while idx < supervisorTicks { + Test.moveTime(by: 60.0 * 10.0 + 10.0) + Test.commitBlock() + idx = idx + 1 + } + + let recoveredEvents = Test.eventsOfType(Type()) + let detectedEvents = Test.eventsOfType(Type()) + log("Recovered events after supervisor ticks: \(recoveredEvents.length.toString())") + log("Detected events after supervisor ticks: \(detectedEvents.length.toString())") + + // These are the core regression assertions. On the current implementation they fail, + // because the Supervisor keeps rescanning the same non-recurring tail entries and never + // reaches the real stuck recurring vault behind them. + Test.assert( + detectedEvents.length > detectedEventsBefore, + message: "Supervisor should eventually detect the stuck recurring vault instead of rescanning the same non-recurring tail entries forever" + ) + Test.assert( + recoveredEvents.length > recoveredEventsBefore, + message: "Supervisor should eventually recover the stuck recurring vault even when more than MAX_BATCH_SIZE non-recurring entries occupy the tail" + ) + Test.assertEqual(false, isStuckYieldVault(recurringYieldVaultID)) + Test.assertEqual(true, hasActiveSchedule(recurringYieldVaultID)) +} From 8447480f21cdb9926f150b7ec5e59356c47d7451 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Wed, 11 Mar 2026 00:21:09 -0400 Subject: [PATCH 2/3] fix: avoid duplicate supervisor recoveries --- .../FlowYieldVaultsAutoBalancers.cdc | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc index c9325872..770d4d25 100644 --- a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc +++ b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc @@ -68,11 +68,20 @@ access(all) contract FlowYieldVaultsAutoBalancers { return nil } - /// Checks if an AutoBalancer has at least one active (Scheduled) transaction. + /// Checks if an AutoBalancer has at least one active internally-managed transaction. /// Used by Supervisor to detect stuck yield vaults that need recovery. /// + /// A transaction is considered active when it is: + /// - still `Scheduled`, or + /// - already marked `Executed` by FlowTransactionScheduler, but the AutoBalancer has not + /// yet advanced its last rebalance timestamp past that transaction's scheduled time. + /// + /// The second case matters because FlowTransactionScheduler flips status to `Executed` + /// before the handler actually runs. Without treating that in-flight window as active, + /// the Supervisor can falsely classify healthy vaults as stuck and recover them twice. + /// /// @param id: The yield vault/AutoBalancer ID - /// @return Bool: true if there's at least one Scheduled transaction, false otherwise + /// @return Bool: true if there's at least one active internally-managed transaction, false otherwise /// access(all) fun hasActiveSchedule(id: UInt64): Bool { let autoBalancer = self.borrowAutoBalancer(id: id) @@ -80,10 +89,20 @@ access(all) contract FlowYieldVaultsAutoBalancers { return false } + let lastRebalanceTimestamp = autoBalancer!.getLastRebalanceTimestamp() let txnIDs = autoBalancer!.getScheduledTransactionIDs() for txnID in txnIDs { - if autoBalancer!.borrowScheduledTransaction(id: txnID)?.status() == FlowTransactionScheduler.Status.Scheduled { - return true + if let scheduledTxn = autoBalancer!.borrowScheduledTransaction(id: txnID) { + if let status = scheduledTxn.status() { + if status == FlowTransactionScheduler.Status.Scheduled { + return true + } + + if status == FlowTransactionScheduler.Status.Executed + && scheduledTxn.timestamp > lastRebalanceTimestamp { + return true + } + } } } return false @@ -110,7 +129,7 @@ access(all) contract FlowYieldVaultsAutoBalancers { return false // Not configured for recurring, can't be "stuck" } - // Check if there's an active schedule + // Check if there's an active schedule or an in-flight due execution if self.hasActiveSchedule(id: id) { return false // Has active schedule, not stuck } From 66ccb02dc8ccab717307af6546374cbc31ce6fa6 Mon Sep 17 00:00:00 2001 From: liobrasil Date: Wed, 11 Mar 2026 00:55:32 -0400 Subject: [PATCH 3/3] fix: restrict supervisor stuck scan to recurring vaults --- .../FlowYieldVaultsAutoBalancers.cdc | 10 +++- .../FlowYieldVaultsSchedulerRegistry.cdc | 53 +++++++++++++------ .../contracts/FlowYieldVaultsSchedulerV1.cdc | 5 +- docs/IMPLEMENTATION_SUMMARY.md | 8 +-- docs/SCHEDULED_REBALANCING_GUIDE.md | 4 +- ...autobalancer-restart-recurring-proposal.md | 2 +- docs/rebalancing_architecture.md | 11 ++-- 7 files changed, 60 insertions(+), 33 deletions(-) diff --git a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc index 770d4d25..bf857ba2 100644 --- a/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc +++ b/cadence/contracts/FlowYieldVaultsAutoBalancers.cdc @@ -232,8 +232,14 @@ access(all) contract FlowYieldVaultsAutoBalancers { let scheduleCap = self.account.capabilities.storage .issue(storagePath) - // Register yield vault in registry for global mapping of live yield vault IDs - FlowYieldVaultsSchedulerRegistry.register(yieldVaultID: uniqueID.id, handlerCap: handlerCap, scheduleCap: scheduleCap) + // Register yield vault in registry for global mapping of live yield vault IDs. + // Only recurring vaults participate in stuck-scan ordering. + FlowYieldVaultsSchedulerRegistry.register( + yieldVaultID: uniqueID.id, + handlerCap: handlerCap, + scheduleCap: scheduleCap, + participatesInStuckScan: recurringConfig != nil + ) // Start the native AutoBalancer self-scheduling chain if recurringConfig was provided // This schedules the first rebalance; subsequent ones are scheduled automatically diff --git a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc index a8bdbe97..6c26b201 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc @@ -16,6 +16,7 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { /* --- TYPES --- */ /// Node in the simulated doubly-linked list used for O(1) stuck-scan ordering. + /// Only recurring, scan-eligible vaults participate in this list. /// `prev` points toward the head (most recently executed); `next` points toward the tail (oldest/least recently executed). access(all) struct ListNode { access(all) var prev: UInt64? @@ -79,10 +80,11 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { /// Stored as a dictionary for O(1) add/remove; iteration gives the pending set access(self) var pendingQueue: {UInt64: Bool} - /// Simulated doubly-linked list for O(1) stuck-scan ordering. - /// listHead = most recently executed vault ID (or nil if empty). - /// listTail = least recently executed vault ID — getStuckScanCandidates walks from here. - /// On reportExecution a vault is snipped from its current position and moved to head in O(1). + /// Simulated doubly-linked list for O(1) stuck-scan ordering across recurring scan participants. + /// listHead = most recently executed recurring vault ID (or nil if empty). + /// listTail = least recently executed recurring vault ID — getStuckScanCandidates walks from here. + /// On reportExecution a recurring participant is snipped from its current position and moved to head in O(1). + /// If a vault later disables recurring config, its stale list entry is pruned lazily during candidate walks. access(self) var listNodes: {UInt64: ListNode} access(self) var listHead: UInt64? access(self) var listTail: UInt64? @@ -136,10 +138,12 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { /* --- ACCOUNT-LEVEL FUNCTIONS --- */ /// Register a YieldVault and store its handler and schedule capabilities (idempotent) + /// `participatesInStuckScan` should be true only for vaults that currently have recurring config. access(account) fun register( yieldVaultID: UInt64, handlerCap: Capability, - scheduleCap: Capability + scheduleCap: Capability, + participatesInStuckScan: Bool ) { pre { handlerCap.check(): "Invalid handler capability provided for yieldVaultID \(yieldVaultID)" @@ -148,20 +152,23 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { self.yieldVaultRegistry[yieldVaultID] = true self.handlerCaps[yieldVaultID] = handlerCap self.scheduleCaps[yieldVaultID] = scheduleCap - // New vaults go to the head; they haven't executed yet but are freshly registered. + + // Only recurring vaults participate in stuck-scan ordering. // If already in the list (idempotent re-register), remove first to avoid duplicates. if self.listNodes[yieldVaultID] != nil { self._listRemove(id: yieldVaultID) } - self._listInsertAtHead(id: yieldVaultID) + if participatesInStuckScan { + self._listInsertAtHead(id: yieldVaultID) + } emit YieldVaultRegistered(yieldVaultID: yieldVaultID) } - /// Called on every execution. Moves yieldVaultID to the head (most recently executed) - /// so the Supervisor scans from the tail (least recently executed) for stuck detection — O(1). - /// If the list entry is unexpectedly missing, reinsert it to restore the ordering structure. + /// Called on every execution. Moves scan-participating yieldVaultID to the head + /// (most recently executed) so the Supervisor scans recurring participants from the tail + /// (least recently executed) for stuck detection — O(1). access(account) fun reportExecution(yieldVaultID: UInt64) { - if !(self.yieldVaultRegistry[yieldVaultID] ?? false) { + if !(self.yieldVaultRegistry[yieldVaultID] ?? false) || self.listNodes[yieldVaultID] == nil { return } let _ = self._listRemove(id: yieldVaultID) @@ -270,18 +277,30 @@ access(all) contract FlowYieldVaultsSchedulerRegistry { return self.pendingQueue.length } - /// Returns up to `limit` vault IDs starting from the tail (least recently executed). + /// Returns up to `limit` recurring scan participants starting from the tail + /// (least recently executed among recurring participants). + /// Stale entries whose recurring config has been removed are pruned lazily as the walk proceeds. /// Supervisor should only scan these for stuck detection instead of all registered vaults. /// @param limit: Maximum number of IDs to return (caller typically passes MAX_BATCH_SIZE) access(all) fun getStuckScanCandidates(limit: UInt): [UInt64] { var result: [UInt64] = [] var current = self.listTail - var count: UInt = 0 - while count < limit { + while UInt(result.length) < limit { if let id = current { - result.append(id) - current = self.listNodes[id]?.prev - count = count + 1 + let previous = self.listNodes[id]?.prev + let scheduleCap = self.scheduleCaps[id] + let isRecurringParticipant = + scheduleCap != nil + && scheduleCap!.check() + && scheduleCap!.borrow()?.getRecurringConfig() != nil + + if isRecurringParticipant { + result.append(id) + } else { + self.dequeuePending(yieldVaultID: id) + let _ = self._listRemove(id: id) + } + current = previous } else { break } diff --git a/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc b/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc index 27aceb70..ff17ee04 100644 --- a/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc +++ b/cadence/contracts/FlowYieldVaultsSchedulerV1.cdc @@ -160,7 +160,8 @@ access(all) contract FlowYieldVaultsSchedulerV1 { /// Detects and recovers stuck yield vaults by directly calling their scheduleNextRebalance(). /// /// Detection methods: - /// 1. State-based: Scans for registered yield vaults with no active schedule that are overdue + /// 1. State-based: Scans recurring yield vaults in stuck-scan order for candidates with + /// no active schedule that are overdue /// /// Recovery method: /// - Uses Schedule capability to call AutoBalancer.scheduleNextRebalance() directly @@ -172,7 +173,7 @@ access(all) contract FlowYieldVaultsSchedulerV1 { /// "priority": UInt8 (0=High,1=Medium,2=Low) - for Supervisor self-rescheduling /// "executionEffort": UInt64 - for Supervisor self-rescheduling /// "recurringInterval": UFix64 (for Supervisor self-rescheduling) - /// "scanForStuck": Bool (default true - scan up to MAX_BATCH_SIZE least-recently-executed vaults for stuck ones) + /// "scanForStuck": Bool (default true - scan up to MAX_BATCH_SIZE least-recently-executed recurring scan participants for stuck ones) /// } access(FlowTransactionScheduler.Execute) fun executeTransaction(id: UInt64, data: AnyStruct?) { let cfg = data as? {String: AnyStruct} ?? {} diff --git a/docs/IMPLEMENTATION_SUMMARY.md b/docs/IMPLEMENTATION_SUMMARY.md index b6a73b31..c81ab1e7 100644 --- a/docs/IMPLEMENTATION_SUMMARY.md +++ b/docs/IMPLEMENTATION_SUMMARY.md @@ -22,7 +22,7 @@ This document reflects the current scheduler architecture in this repository. 2. Direct AutoBalancer capabilities, with no scheduling wrapper layer. 3. Native self-scheduling for healthy recurring AutoBalancers. 4. Recovery-only Supervisor with bounded scanning and bounded pending-queue processing. -5. LRU stuck-scan ordering, so the longest-idle vaults are checked first. +5. LRU stuck-scan ordering across recurring scan participants, so the longest-idle recurring vaults are checked first. ### Main Components @@ -40,7 +40,7 @@ FlowYieldVaults Account | +-- handlerCaps | +-- scheduleCaps | +-- pendingQueue - | +-- listNodes / listHead / listTail (LRU stuck-scan order) + | +-- listNodes / listHead / listTail (LRU stuck-scan order for recurring participants) | +-- supervisorCap | +-- FlowYieldVaultsSchedulerV1 @@ -75,7 +75,7 @@ FlowYieldVaults Account Each Supervisor run has two bounded steps: 1. Stuck detection: - - reads up to `MAX_BATCH_SIZE` least-recently-executed vault IDs from `getStuckScanCandidates(...)` + - reads up to `MAX_BATCH_SIZE` least-recently-executed recurring-participant vault IDs from `getStuckScanCandidates(...)` - checks whether each candidate is overdue and lacks an active schedule - enqueues stuck vaults into `pendingQueue` @@ -98,7 +98,7 @@ If the Supervisor itself is configured with a recurring interval, it self-resche - registered vault tracking - pending queue - handler/schedule capability storage - - LRU stuck-scan ordering + - LRU stuck-scan ordering for recurring participants - `FlowYieldVaultsSchedulerV1.cdc` - Supervisor recovery handler diff --git a/docs/SCHEDULED_REBALANCING_GUIDE.md b/docs/SCHEDULED_REBALANCING_GUIDE.md index 687f176b..c47fb5a4 100644 --- a/docs/SCHEDULED_REBALANCING_GUIDE.md +++ b/docs/SCHEDULED_REBALANCING_GUIDE.md @@ -162,10 +162,10 @@ flow scripts execute cadence/scripts/flow-yield-vaults/get_pending_count.cdc ### What It Does The Supervisor handles two recovery scenarios per run: -1. **Stuck detection**: Scans up to `MAX_BATCH_SIZE` vault candidates using `getStuckScanCandidates()`, which returns vaults ordered least-recently-executed first (LRU). Stuck vaults are enqueued in `pendingQueue`. +1. **Stuck detection**: Scans up to `MAX_BATCH_SIZE` recurring scan participants using `getStuckScanCandidates()`, which returns recurring vaults ordered least-recently-executed first (LRU). Stuck vaults are enqueued in `pendingQueue`. 2. **Pending processing**: Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run). When scheduled with a recurring interval, the Supervisor keeps self-rescheduling even if a given run finds no work. -Each AutoBalancer reports back to the registry after every execution via `RegistryReportCallback`, which calls `reportExecution()` to move the vault to the most-recently-executed end of the internal list. Because stuck scanning starts from the least-recently-executed tail, the Supervisor still prioritises the longest-idle vaults first. +Each recurring AutoBalancer reports back to the registry after every execution via `RegistryReportCallback`, which calls `reportExecution()` to move the vault to the most-recently-executed end of the internal list. Because stuck scanning starts from the least-recently-executed recurring tail, the Supervisor prioritises the longest-idle recurring vaults first. ### When It's Needed diff --git a/docs/autobalancer-restart-recurring-proposal.md b/docs/autobalancer-restart-recurring-proposal.md index 6aa42dcb..111b5fed 100644 --- a/docs/autobalancer-restart-recurring-proposal.md +++ b/docs/autobalancer-restart-recurring-proposal.md @@ -2,7 +2,7 @@ > Historical note: this proposal describes the recovery design that was later implemented. > Current code names are `FlowYieldVaultsSchedulerV1` and `FlowYieldVaultsSchedulerRegistry`. -> Current stuck detection scans up to `MAX_BATCH_SIZE` least-recently-executed vaults from +> Current stuck detection scans up to `MAX_BATCH_SIZE` least-recently-executed recurring scan participants from > the registry's LRU ordering, not the full registered set. ## Problem Statement diff --git a/docs/rebalancing_architecture.md b/docs/rebalancing_architecture.md index 5d2acd8d..81972ed6 100644 --- a/docs/rebalancing_architecture.md +++ b/docs/rebalancing_architecture.md @@ -45,7 +45,7 @@ - `yieldVaultRegistry`: registered yield vault IDs - `handlerCaps`: direct capabilities to AutoBalancers (no wrapper) - `pendingQueue`: yield vaults needing (re)seeding; processing is bounded by `MAX_BATCH_SIZE = 5` per Supervisor run - - `stuckScanOrder`: LRU-ordered list of vault IDs for stuck detection; vaults call `reportExecution()` on each run to move themselves to the most-recently-executed end, so the Supervisor always scans the longest-idle vaults first + - `stuckScanOrder`: LRU-ordered list of recurring scan participants for stuck detection; recurring vaults call `reportExecution()` on each run to move themselves to the most-recently-executed end, so the Supervisor scans the longest-idle recurring vaults first - `supervisorCap`: capability for Supervisor self-scheduling - **FlowYieldVaultsSchedulerV1** provides: - `Supervisor`: recovery handler for failed schedules @@ -125,7 +125,8 @@ When `_initNewAutoBalancer()` is called: FlowYieldVaultsSchedulerRegistry.register( yieldVaultID: uniqueID.id, handlerCap: handlerCap, - scheduleCap: scheduleCap + scheduleCap: scheduleCap, + participatesInStuckScan: recurringConfig != nil ) autoBalancerRef.scheduleNextRebalance(whileExecuting: nil) ``` @@ -162,7 +163,7 @@ fun executeTransaction(id: UInt64, data: AnyStruct?) { The Supervisor runs two steps per execution: **Step 1 – Stuck detection** (when `scanForStuck == true`): -Fetches up to `MAX_BATCH_SIZE` candidates from `getStuckScanCandidates(limit:)`, which returns vault IDs starting from the least-recently-executed tail of `stuckScanOrder`. Vaults that are stuck (recurring config set, no active schedule, overdue) are enqueued into `pendingQueue`. +Fetches up to `MAX_BATCH_SIZE` candidates from `getStuckScanCandidates(limit:)`, which returns recurring scan participants starting from the least-recently-executed tail of `stuckScanOrder`. Vaults that are stuck (recurring config set, no active schedule, overdue) are enqueued into `pendingQueue`. **Step 2 – Pending processing**: Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run via `getPendingYieldVaultIDsPaginated(page: 0, size: UInt(MAX_BATCH_SIZE))`). @@ -170,7 +171,7 @@ Seeds vaults from `pendingQueue` (up to `MAX_BATCH_SIZE` per run via `getPending ```cadence access(FlowTransactionScheduler.Execute) fun executeTransaction(id: UInt64, data: AnyStruct?) { - // STEP 1: scan least-recently-executed vaults for stuck detection + // STEP 1: scan least-recently-executed recurring participants for stuck detection let candidates = FlowYieldVaultsSchedulerRegistry.getStuckScanCandidates( limit: UInt(FlowYieldVaultsSchedulerRegistry.MAX_BATCH_SIZE)) for yieldVaultID in candidates { @@ -195,7 +196,7 @@ fun executeTransaction(id: UInt64, data: AnyStruct?) { } ``` -Each AutoBalancer sets a shared `RegistryReportCallback` capability at creation time. On every execution it calls `FlowYieldVaultsSchedulerRegistry.reportExecution(yieldVaultID:)`, which moves the vault to the head of `stuckScanOrder` so the least-recently-executed tail remains the next stuck-scan priority. +Each AutoBalancer sets a shared `RegistryReportCallback` capability at creation time. On every execution, recurring scan participants call `FlowYieldVaultsSchedulerRegistry.reportExecution(yieldVaultID:)`, which moves the vault to the head of `stuckScanOrder` so the least-recently-executed recurring tail remains the next stuck-scan priority. ---