Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions cadence/contracts/FlowYieldVaultsAutoBalancers.cdc
Original file line number Diff line number Diff line change
Expand Up @@ -68,22 +68,41 @@ access(all) contract FlowYieldVaultsAutoBalancers {
return nil
}

/// Checks if an AutoBalancer has at least one active (Scheduled) transaction.
/// Checks if an AutoBalancer has at least one active internally-managed transaction.
/// Used by Supervisor to detect stuck yield vaults that need recovery.
///
/// A transaction is considered active when it is:
/// - still `Scheduled`, or
/// - already marked `Executed` by FlowTransactionScheduler, but the AutoBalancer has not
/// yet advanced its last rebalance timestamp past that transaction's scheduled time.
///
/// The second case matters because FlowTransactionScheduler flips status to `Executed`
/// before the handler actually runs. Without treating that in-flight window as active,
/// the Supervisor can falsely classify healthy vaults as stuck and recover them twice.
///
/// @param id: The yield vault/AutoBalancer ID
/// @return Bool: true if there's at least one Scheduled transaction, false otherwise
/// @return Bool: true if there's at least one active internally-managed transaction, false otherwise
///
access(all) fun hasActiveSchedule(id: UInt64): Bool {
let autoBalancer = self.borrowAutoBalancer(id: id)
if autoBalancer == nil {
return false
}

let lastRebalanceTimestamp = autoBalancer!.getLastRebalanceTimestamp()
let txnIDs = autoBalancer!.getScheduledTransactionIDs()
for txnID in txnIDs {
if autoBalancer!.borrowScheduledTransaction(id: txnID)?.status() == FlowTransactionScheduler.Status.Scheduled {
return true
if let scheduledTxn = autoBalancer!.borrowScheduledTransaction(id: txnID) {
if let status = scheduledTxn.status() {
if status == FlowTransactionScheduler.Status.Scheduled {
return true
}

if status == FlowTransactionScheduler.Status.Executed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same issue we faced in onflow/FlowYieldVaultsEVM#70
The problem with the fix is that if the transaction panics, lastRebalanceTimestamp is not updated. This leaves the rebalancer permanently stuck, because the transaction is marked Executed but lastRebalanceTimestamp was never advanced.
You might want to consider a grace period based fix.

&& scheduledTxn.timestamp > lastRebalanceTimestamp {
return true
}
}
}
}
return false
Expand All @@ -110,7 +129,7 @@ access(all) contract FlowYieldVaultsAutoBalancers {
return false // Not configured for recurring, can't be "stuck"
}

// Check if there's an active schedule
// Check if there's an active schedule or an in-flight due execution
if self.hasActiveSchedule(id: id) {
return false // Has active schedule, not stuck
}
Expand Down Expand Up @@ -213,8 +232,14 @@ access(all) contract FlowYieldVaultsAutoBalancers {
let scheduleCap = self.account.capabilities.storage
.issue<auth(DeFiActions.Schedule) &DeFiActions.AutoBalancer>(storagePath)

// Register yield vault in registry for global mapping of live yield vault IDs
FlowYieldVaultsSchedulerRegistry.register(yieldVaultID: uniqueID.id, handlerCap: handlerCap, scheduleCap: scheduleCap)
// Register yield vault in registry for global mapping of live yield vault IDs.
// Only recurring vaults participate in stuck-scan ordering.
FlowYieldVaultsSchedulerRegistry.register(
yieldVaultID: uniqueID.id,
handlerCap: handlerCap,
scheduleCap: scheduleCap,
participatesInStuckScan: recurringConfig != nil
)

// Start the native AutoBalancer self-scheduling chain if recurringConfig was provided
// This schedules the first rebalance; subsequent ones are scheduled automatically
Expand Down
53 changes: 36 additions & 17 deletions cadence/contracts/FlowYieldVaultsSchedulerRegistry.cdc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
/* --- TYPES --- */

/// Node in the simulated doubly-linked list used for O(1) stuck-scan ordering.
/// Only recurring, scan-eligible vaults participate in this list.
/// `prev` points toward the head (most recently executed); `next` points toward the tail (oldest/least recently executed).
access(all) struct ListNode {
access(all) var prev: UInt64?
Expand Down Expand Up @@ -79,10 +80,11 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
/// Stored as a dictionary for O(1) add/remove; iteration gives the pending set
access(self) var pendingQueue: {UInt64: Bool}

/// Simulated doubly-linked list for O(1) stuck-scan ordering.
/// listHead = most recently executed vault ID (or nil if empty).
/// listTail = least recently executed vault ID — getStuckScanCandidates walks from here.
/// On reportExecution a vault is snipped from its current position and moved to head in O(1).
/// Simulated doubly-linked list for O(1) stuck-scan ordering across recurring scan participants.
/// listHead = most recently executed recurring vault ID (or nil if empty).
/// listTail = least recently executed recurring vault ID — getStuckScanCandidates walks from here.
/// On reportExecution a recurring participant is snipped from its current position and moved to head in O(1).
/// If a vault later disables recurring config, its stale list entry is pruned lazily during candidate walks.
access(self) var listNodes: {UInt64: ListNode}
access(self) var listHead: UInt64?
access(self) var listTail: UInt64?
Expand Down Expand Up @@ -136,10 +138,12 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
/* --- ACCOUNT-LEVEL FUNCTIONS --- */

/// Register a YieldVault and store its handler and schedule capabilities (idempotent)
/// `participatesInStuckScan` should be true only for vaults that currently have recurring config.
access(account) fun register(
yieldVaultID: UInt64,
handlerCap: Capability<auth(FlowTransactionScheduler.Execute) &{FlowTransactionScheduler.TransactionHandler}>,
scheduleCap: Capability<auth(DeFiActions.Schedule) &DeFiActions.AutoBalancer>
scheduleCap: Capability<auth(DeFiActions.Schedule) &DeFiActions.AutoBalancer>,
participatesInStuckScan: Bool
) {
pre {
handlerCap.check(): "Invalid handler capability provided for yieldVaultID \(yieldVaultID)"
Expand All @@ -148,20 +152,23 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
self.yieldVaultRegistry[yieldVaultID] = true
self.handlerCaps[yieldVaultID] = handlerCap
self.scheduleCaps[yieldVaultID] = scheduleCap
// New vaults go to the head; they haven't executed yet but are freshly registered.

// Only recurring vaults participate in stuck-scan ordering.
// If already in the list (idempotent re-register), remove first to avoid duplicates.
if self.listNodes[yieldVaultID] != nil {
self._listRemove(id: yieldVaultID)
}
self._listInsertAtHead(id: yieldVaultID)
if participatesInStuckScan {
self._listInsertAtHead(id: yieldVaultID)
}
emit YieldVaultRegistered(yieldVaultID: yieldVaultID)
}

/// Called on every execution. Moves yieldVaultID to the head (most recently executed)
/// so the Supervisor scans from the tail (least recently executed) for stuck detection — O(1).
/// If the list entry is unexpectedly missing, reinsert it to restore the ordering structure.
/// Called on every execution. Moves scan-participating yieldVaultID to the head
/// (most recently executed) so the Supervisor scans recurring participants from the tail
/// (least recently executed) for stuck detection — O(1).
access(account) fun reportExecution(yieldVaultID: UInt64) {
if !(self.yieldVaultRegistry[yieldVaultID] ?? false) {
if !(self.yieldVaultRegistry[yieldVaultID] ?? false) || self.listNodes[yieldVaultID] == nil {
return
}
let _ = self._listRemove(id: yieldVaultID)
Expand Down Expand Up @@ -270,18 +277,30 @@ access(all) contract FlowYieldVaultsSchedulerRegistry {
return self.pendingQueue.length
}

/// Returns up to `limit` vault IDs starting from the tail (least recently executed).
/// Returns up to `limit` recurring scan participants starting from the tail
/// (least recently executed among recurring participants).
/// Stale entries whose recurring config has been removed are pruned lazily as the walk proceeds.
/// Supervisor should only scan these for stuck detection instead of all registered vaults.
/// @param limit: Maximum number of IDs to return (caller typically passes MAX_BATCH_SIZE)
access(all) fun getStuckScanCandidates(limit: UInt): [UInt64] {
var result: [UInt64] = []
var current = self.listTail
var count: UInt = 0
while count < limit {
while UInt(result.length) < limit {
if let id = current {
result.append(id)
current = self.listNodes[id]?.prev
count = count + 1
let previous = self.listNodes[id]?.prev
let scheduleCap = self.scheduleCaps[id]
let isRecurringParticipant =
scheduleCap != nil
&& scheduleCap!.check()
&& scheduleCap!.borrow()?.getRecurringConfig() != nil

if isRecurringParticipant {
result.append(id)
} else {
self.dequeuePending(yieldVaultID: id)
let _ = self._listRemove(id: id)
}
current = previous
} else {
break
}
Expand Down
5 changes: 3 additions & 2 deletions cadence/contracts/FlowYieldVaultsSchedulerV1.cdc
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ access(all) contract FlowYieldVaultsSchedulerV1 {
/// Detects and recovers stuck yield vaults by directly calling their scheduleNextRebalance().
///
/// Detection methods:
/// 1. State-based: Scans for registered yield vaults with no active schedule that are overdue
/// 1. State-based: Scans recurring yield vaults in stuck-scan order for candidates with
/// no active schedule that are overdue
///
/// Recovery method:
/// - Uses Schedule capability to call AutoBalancer.scheduleNextRebalance() directly
Expand All @@ -172,7 +173,7 @@ access(all) contract FlowYieldVaultsSchedulerV1 {
/// "priority": UInt8 (0=High,1=Medium,2=Low) - for Supervisor self-rescheduling
/// "executionEffort": UInt64 - for Supervisor self-rescheduling
/// "recurringInterval": UFix64 (for Supervisor self-rescheduling)
/// "scanForStuck": Bool (default true - scan up to MAX_BATCH_SIZE least-recently-executed vaults for stuck ones)
/// "scanForStuck": Bool (default true - scan up to MAX_BATCH_SIZE least-recently-executed recurring scan participants for stuck ones)
/// }
access(FlowTransactionScheduler.Execute) fun executeTransaction(id: UInt64, data: AnyStruct?) {
let cfg = data as? {String: AnyStruct} ?? {}
Expand Down
105 changes: 96 additions & 9 deletions cadence/tests/scheduled_supervisor_test.cdc
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,25 @@ fun testStuckYieldVaultDetectionLogic() {
log("PASS: Stuck yield vault detection correctly identifies healthy yield vaults")
}

/// Returns per-yield-vault recovery event counts from YieldVaultRecovered events.
///
/// This is used by stress tests to distinguish "all vaults recovered at least once"
/// from "lots of recovery events happened", which can otherwise hide duplicate
/// recovery churn for the same vault IDs.
///
/// Example: 240 recovery events for 200 vaults can look healthy if the test only
/// checks `events.length >= 200`, even though 40 of those events may be repeats.
access(all)
fun getRecoveredYieldVaultCounts(): {UInt64: Int} {
    // Tally YieldVaultRecovered events per vault ID so stress tests can
    // distinguish "every vault recovered once" from "many recovery events,
    // some of which are duplicates for the same vault".
    var tally: {UInt64: Int} = {}
    let emitted = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>())
    for rawEvent in emitted {
        let event = rawEvent as! FlowYieldVaultsSchedulerV1.YieldVaultRecovered
        let seenSoFar = tally[event.yieldVaultID] ?? 0
        tally[event.yieldVaultID] = seenSoFar + 1
    }
    return tally
}

/// COMPREHENSIVE TEST: Insufficient Funds -> Failure -> Recovery
///
/// This test validates the COMPLETE failure and recovery cycle:
Expand Down Expand Up @@ -939,9 +958,21 @@ fun testInsufficientFundsAndRecovery() {
///
/// Flow: create 200 yield vaults, run 2 scheduling rounds, drain FLOW so executions fail,
/// wait for vaults to be marked stuck, refund FLOW, schedule the supervisor, then advance
/// time for ceil(200/MAX_BATCH_SIZE)+10 supervisor ticks. Asserts all 200 vaults are
/// recovered (YieldVaultRecovered events), none still stuck, and all have active schedules.
/// The +10 extra ticks are a buffer so every vault is processed despite scheduler timing.
/// time for enough supervisor ticks to recover all unique vault IDs, plus a short
/// stabilization window. This asserts:
/// - every one of the 200 vault IDs is recovered at least once,
/// - no recovery failures occur,
/// - no vault emits more than one recovery event,
/// - once all vaults are recovered and healthy, extra supervisor ticks do not emit
/// additional recovery events,
/// - none remain stuck, and all have active schedules.
///
/// Why this was tightened:
/// the earlier version only checked `YieldVaultRecovered.length >= n` after
/// `ceil(n / MAX_BATCH_SIZE) + 10` supervisor ticks. That allowed the test to pass
/// even when some vaults recovered more than once while others had not yet been
/// uniquely validated. This version keeps the same tick budget, but uses it as a
/// timeout ceiling instead of treating it as proof that the recovery set is clean.
access(all)
fun testSupervisorHandlesManyStuckVaults() {
let n = 200
Expand Down Expand Up @@ -1024,19 +1055,75 @@ fun testSupervisorHandlesManyStuckVaults() {
)
Test.expect(schedSupRes, Test.beSucceeded())

// 7. Advance time for supervisor ticks (ceil(n/MAX_BATCH_SIZE)+10); each tick processes a batch
// 7. Advance time until every target vault has emitted at least one recovery event.
//
// We still compute ceil(n / MAX_BATCH_SIZE) + 10, but it now acts as a maximum
// allowed budget for supervisor ticks. The loop stops early once all 200 vault IDs
// have been seen at least once. This makes the assertion sensitive to duplicate
// recoveries: repeated events for the same vault no longer help the test finish.
//
// After all unique IDs are observed, run a short stabilization window to check
// that a healthy supervisor does not continue to emit recovery events.
let supervisorRunsNeeded = (UInt(n) + UInt(maxBatchSize) - 1) / UInt(maxBatchSize)
let maxSupervisorTicks = supervisorRunsNeeded + 10
var run = 0 as UInt
while run < supervisorRunsNeeded + 10 {
var recoveredCounts = getRecoveredYieldVaultCounts()
while run < maxSupervisorTicks && recoveredCounts.length < n {
Test.moveTime(by: 60.0 * 10.0 + 10.0)
Test.commitBlock()
run = run + 1
recoveredCounts = getRecoveredYieldVaultCounts()
}
log("testSupervisorHandlesManyStuckVaults: ran \(run.toString()) supervisor ticks to reach \(recoveredCounts.length.toString()) unique recovered vaults")

let recoveryFailures = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecoveryFailed>())
Test.assertEqual(0, recoveryFailures.length)

// Split the validation into:
// - missing recoveries: some vaults never emitted YieldVaultRecovered at all
// - duplicate recoveries: some vaults emitted YieldVaultRecovered more than once
//
// Both matter. Missing recoveries means the supervisor did not cover the whole set.
// Duplicate recoveries means event volume was inflated by churn, which the old
// `recoveredEvents.length >= n` assertion could not distinguish from success.
var missingRecoveries = 0
var duplicatedRecoveries = 0
var duplicatedVaults = 0
for yieldVaultID in yieldVaultIDs {
let recoveryCount = recoveredCounts[yieldVaultID] ?? 0
if recoveryCount == 0 {
missingRecoveries = missingRecoveries + 1
} else if recoveryCount > 1 {
duplicatedRecoveries = duplicatedRecoveries + (recoveryCount - 1)
duplicatedVaults = duplicatedVaults + 1
}
}
log("testSupervisorHandlesManyStuckVaults: ran \((supervisorRunsNeeded + 10).toString()) supervisor ticks")
Test.assert(
missingRecoveries == 0,
message: "expected every vault to recover at least once, but \(missingRecoveries.toString()) vaults emitted no YieldVaultRecovered event"
)
Test.assert(
duplicatedRecoveries == 0,
message: "expected exactly one recovery per vault, but saw \(duplicatedRecoveries.toString()) duplicate recoveries across \(duplicatedVaults.toString()) vaults"
)

let recoveredEvents = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>())
Test.assert(recoveredEvents.length >= n, message: "expected at least \(n.toString()) recovered, got \(recoveredEvents.length.toString())")
log("testSupervisorHandlesManyStuckVaults: recovered \(recoveredEvents.length.toString()) vaults")
// This second guard catches late churn that may start only after the full unique
// set has already been recovered. The duplicate-per-vault check above inspects the
// state up to this point; the stabilization window verifies the system stays quiet
// once recovery should be complete.
let recoveredEventsBeforeStabilization = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>()).length
var stabilizationTick = 0
while stabilizationTick < 2 {
Test.moveTime(by: 60.0 * 10.0 + 10.0)
Test.commitBlock()
stabilizationTick = stabilizationTick + 1
}
let recoveredEventsAfterStabilization = Test.eventsOfType(Type<FlowYieldVaultsSchedulerV1.YieldVaultRecovered>()).length
Test.assert(
recoveredEventsAfterStabilization == recoveredEventsBeforeStabilization,
message: "expected no additional recovery churn after all vaults were recovered; before stabilization: \(recoveredEventsBeforeStabilization.toString()), after: \(recoveredEventsAfterStabilization.toString())"
)
log("testSupervisorHandlesManyStuckVaults: stable recovery set of \(recoveredCounts.length.toString()) unique vaults with \(recoveredEventsAfterStabilization.toString()) total recovery events")

// 8. Health check: none stuck, all have active schedules
var stillStuck = 0
Expand Down
Loading