Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions cadence/contracts/FlowYieldVaultsAutoBalancers.cdc
Original file line number Diff line number Diff line change
Expand Up @@ -68,22 +68,41 @@ access(all) contract FlowYieldVaultsAutoBalancers {
return nil
}

/// Checks if an AutoBalancer has at least one active (Scheduled) transaction.
/// Checks if an AutoBalancer has at least one active internally-managed transaction.
/// Used by Supervisor to detect stuck yield vaults that need recovery.
///
/// A transaction is considered active when it is:
/// - still `Scheduled`, or
/// - already marked `Executed` by FlowTransactionScheduler, but the AutoBalancer has not
/// yet advanced its last rebalance timestamp past that transaction's scheduled time.
///
/// The second case matters because FlowTransactionScheduler flips status to `Executed`
/// before the handler actually runs. Without treating that in-flight window as active,
/// the Supervisor can falsely classify healthy vaults as stuck and recover them twice.
///
/// @param id: The yield vault/AutoBalancer ID
/// @return Bool: true if there's at least one Scheduled transaction, false otherwise
/// @return Bool: true if there's at least one active internally-managed transaction, false otherwise
///
access(all) fun hasActiveSchedule(id: UInt64): Bool {
let autoBalancer = self.borrowAutoBalancer(id: id)
if autoBalancer == nil {
return false
}

let lastRebalanceTimestamp = autoBalancer!.getLastRebalanceTimestamp()
let txnIDs = autoBalancer!.getScheduledTransactionIDs()
for txnID in txnIDs {
if autoBalancer!.borrowScheduledTransaction(id: txnID)?.status() == FlowTransactionScheduler.Status.Scheduled {
return true
if let scheduledTxn = autoBalancer!.borrowScheduledTransaction(id: txnID) {
if let status = scheduledTxn.status() {
if status == FlowTransactionScheduler.Status.Scheduled {
return true
}

if status == FlowTransactionScheduler.Status.Executed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same issue we faced in onflow/FlowYieldVaultsEVM#70
The problem with the fix is that if the transaction panics, lastRebalanceTimestamp is not updated. This leaves the rebalancer permanently stuck, because the transaction is marked Executed but lastRebalanceTimestamp was never advanced.
You might want to consider a grace period based fix.

&& scheduledTxn.timestamp > lastRebalanceTimestamp {
return true
}
}
}
}
return false
Expand All @@ -110,7 +129,7 @@ access(all) contract FlowYieldVaultsAutoBalancers {
return false // Not configured for recurring, can't be "stuck"
}

// Check if there's an active schedule
// Check if there's an active schedule or an in-flight due execution
if self.hasActiveSchedule(id: id) {
return false // Has active schedule, not stuck
}
Expand Down Expand Up @@ -213,8 +232,14 @@ access(all) contract FlowYieldVaultsAutoBalancers {
let scheduleCap = self.account.capabilities.storage
.issue<auth(DeFiActions.Schedule) &DeFiActions.AutoBalancer>(storagePath)

// Register yield vault in registry for global mapping of live yield vault IDs
FlowYieldVaultsSchedulerRegistry.register(yieldVaultID: uniqueID.id, handlerCap: handlerCap, scheduleCap: scheduleCap)
// Register yield vault in registry for global mapping of live yield vault IDs.
// Only recurring vaults participate in stuck-scan ordering.
FlowYieldVaultsSchedulerRegistry.register(
yieldVaultID: uniqueID.id,
handlerCap: handlerCap,
scheduleCap: scheduleCap,
participatesInStuckScan: recurringConfig != nil
)

// Start the native AutoBalancer self-scheduling chain if recurringConfig was provided
// This schedules the first rebalance; subsequent ones are scheduled automatically
Expand Down
53 changes: 36 additions & 17 deletions cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
/* --- TYPES --- */

/// Node in the simulated doubly-linked list used for O(1) stuck-scan ordering.
/// Only recurring, scan-eligible vaults participate in this list.
/// `prev` points toward the head (most recently executed); `next` points toward the tail (oldest/least recently executed).
access(all) struct ListNode {
access(all) var prev: UInt64?
Expand Down Expand Up @@ -79,10 +80,11 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
/// Stored as a dictionary for O(1) add/remove; iteration gives the pending set
access(self) var pendingQueue: {UInt64: Bool}

/// Simulated doubly-linked list for O(1) stuck-scan ordering.
/// listHead = most recently executed vault ID (or nil if empty).
/// listTail = least recently executed vault ID — getStuckScanCandidates walks from here.
/// On reportExecution a vault is snipped from its current position and moved to head in O(1).
/// Simulated doubly-linked list for O(1) stuck-scan ordering across recurring scan participants.
/// listHead = most recently executed recurring vault ID (or nil if empty).
/// listTail = least recently executed recurring vault ID — getStuckScanCandidates walks from here.
/// On reportExecution a recurring participant is snipped from its current position and moved to head in O(1).
/// If a vault later disables recurring config, its stale list entry is pruned lazily during candidate walks.
access(self) var listNodes: {UInt64: ListNode}
access(self) var listHead: UInt64?
access(self) var listTail: UInt64?
Expand Down Expand Up @@ -136,10 +138,12 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
/* --- ACCOUNT-LEVEL FUNCTIONS --- */

/// Register a YieldVault and store its handler and schedule capabilities (idempotent)
/// `participatesInStuckScan` should be true only for vaults that currently have recurring config.
access(account) fun register(
yieldVaultID: UInt64,
handlerCap: Capability<auth(FlowTransactionScheduler.Execute) &{FlowTransactionScheduler.TransactionHandler}>,
scheduleCap: Capability<auth(DeFiActions.Schedule) &DeFiActions.AutoBalancer>
scheduleCap: Capability<auth(DeFiActions.Schedule) &DeFiActions.AutoBalancer>,
participatesInStuckScan: Bool
) {
pre {
handlerCap.check(): "Invalid handler capability provided for yieldVaultID \(yieldVaultID)"
Expand All @@ -148,20 +152,23 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
self.yieldVaultRegistry[yieldVaultID] = true
self.handlerCaps[yieldVaultID] = handlerCap
self.scheduleCaps[yieldVaultID] = scheduleCap
// New vaults go to the head; they haven't executed yet but are freshly registered.

// Only recurring vaults participate in stuck-scan ordering.
// If already in the list (idempotent re-register), remove first to avoid duplicates.
if self.listNodes[yieldVaultID] != nil {
self._listRemove(id: yieldVaultID)
}
self._listInsertAtHead(id: yieldVaultID)
if participatesInStuckScan {
self._listInsertAtHead(id: yieldVaultID)
}
emit YieldVaultRegistered(yieldVaultID: yieldVaultID)
}

/// Called on every execution. Moves yieldVaultID to the head (most recently executed)
/// so the Supervisor scans from the tail (least recently executed) for stuck detection — O(1).
/// If the list entry is unexpectedly missing, reinsert it to restore the ordering structure.
/// Called on every execution. Moves scan-participating yieldVaultID to the head
/// (most recently executed) so the Supervisor scans recurring participants from the tail
/// (least recently executed) for stuck detection — O(1).
access(account) fun reportExecution(yieldVaultID: UInt64) {
if !(self.yieldVaultRegistry[yieldVaultID] ?? false) {
if !(self.yieldVaultRegistry[yieldVaultID] ?? false) || self.listNodes[yieldVaultID] == nil {
return
}
let _ = self._listRemove(id: yieldVaultID)
Expand Down Expand Up @@ -270,18 +277,30 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
return self.pendingQueue.length
}

/// Returns up to `limit` vault IDs starting from the tail (least recently executed).
/// Returns up to `limit` recurring scan participants starting from the tail
/// (least recently executed among recurring participants).
/// Stale entries whose recurring config has been removed are pruned lazily as the walk proceeds.
/// Supervisor should only scan these for stuck detection instead of all registered vaults.
/// @param limit: Maximum number of IDs to return (caller typically passes MAX_BATCH_SIZE)
access(all) fun getStuckScanCandidates(limit: UInt): [UInt64] {
var result: [UInt64] = []
var current = self.listTail
var count: UInt = 0
while count < limit {
while UInt(result.length) < limit {
if let id = current {
result.append(id)
current = self.listNodes[id]?.prev
count = count + 1
let previous = self.listNodes[id]?.prev
let scheduleCap = self.scheduleCaps[id]
let isRecurringParticipant =
scheduleCap != nil
&& scheduleCap!.check()
&& scheduleCap!.borrow()?.getRecurringConfig() != nil

if isRecurringParticipant {
result.append(id)
} else {
self.dequeuePending(yieldVaultID: id)
let _ = self._listRemove(id: id)
}
current = previous
} else {
break
}
Expand Down
5 changes: 3 additions & 2 deletions cadence/contracts/FlowYieldVaultsSchedulerV1.cdc
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ access(all) contract FlowYieldVaultsSchedulerV1 {
/// Detects and recovers stuck yield vaults by directly calling their scheduleNextRebalance().
///
/// Detection methods:
/// 1. State-based: Scans for registered yield vaults with no active schedule that are overdue
/// 1. State-based: Scans recurring yield vaults in stuck-scan order for candidates with
/// no active schedule that are overdue
///
/// Recovery method:
/// - Uses Schedule capability to call AutoBalancer.scheduleNextRebalance() directly
Expand All @@ -172,7 +173,7 @@ access(all) contract FlowYieldVaultsSchedulerV1 {
/// "priority": UInt8 (0=High,1=Medium,2=Low) - for Supervisor self-rescheduling
/// "executionEffort": UInt64 - for Supervisor self-rescheduling
/// "recurringInterval": UFix64 (for Supervisor self-rescheduling)
/// "scanForStuck": Bool (default true - scan up to MAX_BATCH_SIZE least-recently-executed vaults for stuck ones)
/// "scanForStuck": Bool (default true - scan up to MAX_BATCH_SIZE least-recently-executed recurring scan participants for stuck ones)
/// }
access(FlowTransactionScheduler.Execute) fun executeTransaction(id: UInt64, data: AnyStruct?) {
let cfg = data as? {String: AnyStruct} ?? {}
Expand Down
105 changes: 96 additions & 9 deletions cadence/tests/scheduled_supervisor_test.cdc
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,25 @@ fun testStuckYieldVaultDetectionLogic() {
log("PASS: Stuck yield vault detection correctly identifies healthy yield vaults")
}

/// Returns per-yield-vault recovery event counts from YieldVaultRecovered events.
///
/// This is used by stress tests to distinguish "all vaults recovered at least once"
/// from "lots of recovery events happened", which can otherwise hide duplicate
/// recovery churn for the same vault IDs.
///
/// Example: 240 recovery events for 200 vaults can look healthy if the test only
/// checks `events.length >= 200`, even though 40 of those events may be repeats.
access(all)
fun getRecoveredYieldVaultCounts(): {UInt64: Int} {
    // Tally YieldVaultRecovered events per vault ID so stress tests can
    // distinguish "every vault recovered once" from "many recovery events,
    // some of which are duplicates for the same vault".
    var tally: {UInt64: Int} = {}
    let emitted = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>())
    for rawEvent in emitted {
        let event = rawEvent as! FlowYieldVaultsSchedulerV1.YieldVaultRecovered
        let seenSoFar = tally[event.yieldVaultID] ?? 0
        tally[event.yieldVaultID] = seenSoFar + 1
    }
    return tally
}

/// COMPREHENSIVE TEST: Insufficient Funds -> Failure -> Recovery
///
/// This test validates the COMPLETE failure and recovery cycle:
Expand Down Expand Up @@ -939,9 +958,21 @@ fun testInsufficientFundsAndRecovery() {
///
/// Flow: create 200 yield vaults, run 2 scheduling rounds, drain FLOW so executions fail,
/// wait for vaults to be marked stuck, refund FLOW, schedule the supervisor, then advance
/// time for ceil(200/MAX_BATCH_SIZE)+10 supervisor ticks. Asserts all 200 vaults are
/// recovered (YieldVaultRecovered events), none still stuck, and all have active schedules.
/// The +10 extra ticks are a buffer so every vault is processed despite scheduler timing.
/// time for enough supervisor ticks to recover all unique vault IDs, plus a short
/// stabilization window. This asserts:
/// - every one of the 200 vault IDs is recovered at least once,
/// - no recovery failures occur,
/// - no vault emits more than one recovery event,
/// - once all vaults are recovered and healthy, extra supervisor ticks do not emit
/// additional recovery events,
/// - none remain stuck, and all have active schedules.
///
/// Why this was tightened:
/// the earlier version only checked `YieldVaultRecovered.length >= n` after
/// `ceil(n / MAX_BATCH_SIZE) + 10` supervisor ticks. That allowed the test to pass
/// even when some vaults recovered more than once while others had not yet been
/// uniquely validated. This version keeps the same tick budget, but uses it as a
/// timeout ceiling instead of treating it as proof that the recovery set is clean.
access(all)
fun testSupervisorHandlesManyStuckVaults() {
let n = 200
Expand Down Expand Up @@ -1024,19 +1055,75 @@ fun testSupervisorHandlesManyStuckVaults() {
)
Test.expect(schedSupRes, Test.beSucceeded())

// 7. Advance time for supervisor ticks (ceil(n/MAX_BATCH_SIZE)+10); each tick processes a batch
// 7. Advance time until every target vault has emitted at least one recovery event.
//
// We still compute ceil(n / MAX_BATCH_SIZE) + 10, but it now acts as a maximum
// allowed budget for supervisor ticks. The loop stops early once all 200 vault IDs
// have been seen at least once. This makes the assertion sensitive to duplicate
// recoveries: repeated events for the same vault no longer help the test finish.
//
// After all unique IDs are observed, run a short stabilization window to check
// that a healthy supervisor does not continue to emit recovery events.
let supervisorRunsNeeded = (UInt(n) + UInt(maxBatchSize) - 1) / UInt(maxBatchSize)
let maxSupervisorTicks = supervisorRunsNeeded + 10
var run = 0 as UInt
while run < supervisorRunsNeeded + 10 {
var recoveredCounts = getRecoveredYieldVaultCounts()
while run < maxSupervisorTicks && recoveredCounts.length < n {
Test.moveTime(by: 60.0 * 10.0 + 10.0)
Test.commitBlock()
run = run + 1
recoveredCounts = getRecoveredYieldVaultCounts()
}
log("testSupervisorHandlesManyStuckVaults: ran \(run.toString()) supervisor ticks to reach \(recoveredCounts.length.toString()) unique recovered vaults")

let recoveryFailures = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecoveryFailed>())
Test.assertEqual(0, recoveryFailures.length)

// Split the validation into:
// - missing recoveries: some vaults never emitted YieldVaultRecovered at all
// - duplicate recoveries: some vaults emitted YieldVaultRecovered more than once
//
// Both matter. Missing recoveries means the supervisor did not cover the whole set.
// Duplicate recoveries means event volume was inflated by churn, which the old
// `recoveredEvents.length >= n` assertion could not distinguish from success.
var missingRecoveries = 0
var duplicatedRecoveries = 0
var duplicatedVaults = 0
for yieldVaultID in yieldVaultIDs {
let recoveryCount = recoveredCounts[yieldVaultID] ?? 0
if recoveryCount == 0 {
missingRecoveries = missingRecoveries + 1
} else if recoveryCount > 1 {
duplicatedRecoveries = duplicatedRecoveries + (recoveryCount - 1)
duplicatedVaults = duplicatedVaults + 1
}
}
log("testSupervisorHandlesManyStuckVaults: ran \((supervisorRunsNeeded + 10).toString()) supervisor ticks")
Test.assert(
missingRecoveries == 0,
message: "expected every vault to recover at least once, but \(missingRecoveries.toString()) vaults emitted no YieldVaultRecovered event"
)
Test.assert(
duplicatedRecoveries == 0,
message: "expected exactly one recovery per vault, but saw \(duplicatedRecoveries.toString()) duplicate recoveries across \(duplicatedVaults.toString()) vaults"
)

let recoveredEvents = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>())
Test.assert(recoveredEvents.length >= n, message: "expected at least \(n.toString()) recovered, got \(recoveredEvents.length.toString())")
log("testSupervisorHandlesManyStuckVaults: recovered \(recoveredEvents.length.toString()) vaults")
// This second guard catches late churn that may start only after the full unique
// set has already been recovered. The duplicate-per-vault check above inspects the
// state up to this point; the stabilization window verifies the system stays quiet
// once recovery should be complete.
let recoveredEventsBeforeStabilization = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>()).length
var stabilizationTick = 0
while stabilizationTick < 2 {
Test.moveTime(by: 60.0 * 10.0 + 10.0)
Test.commitBlock()
stabilizationTick = stabilizationTick + 1
}
let recoveredEventsAfterStabilization = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>()).length
Test.assert(
recoveredEventsAfterStabilization == recoveredEventsBeforeStabilization,
message: "expected no additional recovery churn after all vaults were recovered; before stabilization: \(recoveredEventsBeforeStabilization.toString()), after: \(recoveredEventsAfterStabilization.toString())"
)
log("testSupervisorHandlesManyStuckVaults: stable recovery set of \(recoveredCounts.length.toString()) unique vaults with \(recoveredEventsAfterStabilization.toString()) total recovery events")

// 8. Health check: none stuck, all have active schedules
var stillStuck = 0
Expand Down
Loading